In [None]:
import pandas as pd 
import numpy as np 
import sklearn 
from scipy import stats 
import matplotlib.pyplot as plt 
import os 
import seaborn as sns 

## For bag of Words 
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
from torchsummary import summary

## For Label Encoding 
from sklearn.preprocessing import LabelEncoder 

## Text Preprocessing 
import re
import string 
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 

## TF-IDF (Term Frequency-Inverse Document Frequencies)
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer 

#for model-building
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score

# bag of words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

## apply a pipeline 
from sklearn.pipeline import Pipeline 

## other pipelines 
from datetime import datetime

In [None]:
# Lift pandas' display limits so wide frames and long text cells render untruncated.
for display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth'):
    pd.set_option(display_option, None)

In [None]:
# Location of the screening export. The original machine-specific location stays
# the default so existing runs are unaffected; set the SYSTEMATIC_REVIEW_CSV
# environment variable to read the file from somewhere else.
path = os.environ.get(
    "SYSTEMATIC_REVIEW_CSV",
    "C:\\Users\\tadnan\\OneDrive - Michigan Technological University\\Systematic Review\\Final_files_.csv",
)

data = pd.read_csv(path)
# Preview only: display.max_rows is set to None in this notebook, so a bare
# `data` would render every row of the file.
data.head()

## Exploratory Data Analysis 

### Column and Info Check 

In [None]:
# Summary of the freshly loaded frame: column names, non-null counts and dtypes.
data.info()

In [None]:
# Raw column labels as read from the CSV (before any whitespace is stripped).
data.columns 

## Remove additional spaces 

In [None]:
## Column names may carry stray leading/trailing blanks; normalise them first
data = data.rename(columns=str.strip)
## Trim the labels too, so that e.g. "Yes " and "Yes" become a single class
data = data.assign(Target=data['Target'].str.strip())
data

In [None]:
## Drop unnecessary columns (bibliographic metadata)
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell re-runnable: a second run used to raise KeyError because the
# columns were already gone.
columns_to_drop = [
    'Unnamed: 0', 'Authors', 'Author full names', 'Author(s) ID', 'Title',
    'Year', 'Source title', 'Volume', 'Issue', 'Art. No.', 'Page start',
    'Page end', 'Page count', 'Cited by', 'DOI', 'Link',
    'Author Keywords', 'PubMed ID', 'Abbreviated Source Title',
    'Document Type', 'Publication Stage', 'Open Access', 'Source', 'EID',
    'Unnamed: 25', 'Reasons',
]
data = data.drop(columns=columns_to_drop, errors='ignore')


In [None]:
# Check which columns are left after the metadata columns were dropped.
data.head(5)

In [None]:
# Distinct label values present in Target after whitespace stripping.
data['Target'].unique()

## Handle Missing Values

In [None]:
# Per-column flag: True where the column holds at least one missing value.
data.isnull().any()

In [None]:
# Keep only rows in which every remaining column has a value.
data = data.dropna(axis=0, how='any')
data.head(10)

In [None]:
# (rows, columns) left once incomplete rows have been removed.
data.shape

In [None]:
### Select random samples
# A fixed seed makes the sampled subset (and the CSV written below) identical on
# every run; without it each execution trained and evaluated on different rows.
# min() avoids the ValueError that sample() raises when fewer than SAMPLE_SIZE
# rows are left after cleaning.
SAMPLE_SIZE = 500
SAMPLE_SEED = 42
data = data.sample(n=min(SAMPLE_SIZE, len(data)), random_state=SAMPLE_SEED)
data.to_csv("Final_500_labels.csv")

In [None]:
# Preview of the sampled subset (the index still carries the original row labels).
data.head(10)

In [None]:
# Re-check for missing values in the sampled subset.
data.isnull().any()

## Label Encoding Target 

In [None]:
### Encode the "Target" labels as integers
label_encoder = LabelEncoder()
encoded_target = label_encoder.fit_transform(data['Target'])
data['Target'] = encoded_target
print(type(data))
data.shape

In [None]:
## df = data[data['Target'] == 1] 
## type(df)
## df.shape

## Class Distribution 

In [None]:
# Number of rows per encoded class.
# Stored as target_counts rather than X, because X is assigned the feature
# series later in the notebook and one name should not mean two things.
target_counts = data['Target'].value_counts()
print(target_counts)

In [None]:
## Count how many rows fall in each Target class
value_counts = data['Target'].value_counts()

## Draw the counts as a bar chart on an explicit Axes
fig, ax = plt.subplots()
sns.barplot(x=value_counts.index, y=value_counts.values, ax=ax)
ax.set_xlabel('Target')
ax.set_ylabel('Count')
ax.set_title("Target Value Counts")
plt.show()

## Text Preprocessing

Clean and transform the raw data into suitable data for further processing. 

Read this to grasp the text preprocessing ideas from this link: https://www.linkedin.com/pulse/text-preprocessing-natural-language-processing-nlp-germec-phd/ 

1. Tokenization: Break the text into smaller units
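2. Normalization: Convert the text into a standard, consistent form (for example, lowercasing everything and trimming extra whitespace). 
3. Stemming: Reduce words to a crude base form by chopping off suffixes, which simplifies the vocabulary. For example, "running" can be stemmed to "run". 
4. Lemmatization: Reduce words to their dictionary form (lemma) using the word's part of speech, e.g. "better" becomes "good". Like stemming, this shrinks the vocabulary, but the result is always a real word. 
5. Stopword removal: Remove very common words (such as "the", "is", "and") that carry little meaning on their own. 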
6. Punctuation removal: Remove commas, periods, question marks, or other punctuations from your text. 
7. Spelling correction: Correct spelling errors or typos. 

In [None]:
## Create a local directory for NLTK data
# `os` is already imported in the notebook's first cell, so the duplicate import
# is gone. exist_ok=True makes makedirs a no-op when the folder already exists,
# replacing the separate exists() check (and its check-then-create gap).
nltk_data_path = os.path.expanduser('~/nltk_data')
os.makedirs(nltk_data_path, exist_ok=True)

In [None]:
## Download the necessary NLTK resources to the local directory
# "wordnet" was misspelled as "wordnert", so the corpus WordNetLemmatizer
# depends on was never fetched.
# "punkt_tab" and "averaged_perceptron_tagger_eng" are the resource names that
# recent NLTK releases (3.9+) look up for word_tokenize / pos_tag; the older
# names are kept for earlier releases.
NLTK_RESOURCES = (
    "stopwords",
    "punkt",
    "punkt_tab",
    "wordnet",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
)
for resource in NLTK_RESOURCES:
    nltk.download(resource, download_dir=nltk_data_path)

In [None]:
## Set NLTK data path to the local path directory
# Guarded so that re-running the cell does not append the same entry again.
if nltk_data_path not in nltk.data.path:
    nltk.data.path.append(nltk_data_path)

## Convert to lowercase, strip and remove the punctuation 

In [None]:
# Define your preprocessing functions
# Patterns are compiled once here instead of on every call; all of them are raw
# strings (the original '\s+' relied on an invalid escape sequence).
_HTML_TAG_RE = re.compile(r'<.*?>')
_CITATION_RE = re.compile(r'\[[0-9]*\]')
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))
_NON_WORD_RE = re.compile(r'[^\w\s]')
_DIGIT_RE = re.compile(r'\d')
_WHITESPACE_RE = re.compile(r'\s+')


def preprocess(text):
    """Normalise raw text to lowercase words separated by single spaces.

    Steps, in order: lowercase and trim; remove ``<...>`` tags; replace
    bracketed numeric markers such as ``[12]`` with a space; replace ASCII
    punctuation with a space; delete any remaining non-word, non-space
    character; replace digits with a space; collapse whitespace runs and trim.

    Args:
        text: The string to clean. Must be a ``str``.

    Returns:
        The cleaned string, without leading or trailing whitespace. It is empty
        when nothing but punctuation, digits or whitespace was supplied.
    """
    text = text.lower().strip()
    text = _HTML_TAG_RE.sub('', text)
    # Must run before punctuation stripping: once the brackets have been turned
    # into spaces this pattern can no longer match.
    text = _CITATION_RE.sub(' ', text)
    text = _PUNCTUATION_RE.sub(' ', text)
    # Catches symbols outside string.punctuation (e.g. typographic quotes).
    text = _NON_WORD_RE.sub('', text)
    text = _DIGIT_RE.sub(' ', text)
    # The substitutions above can leave a space at either end; trim it so the
    # result is clean even when the caller does not re-split it.
    return _WHITESPACE_RE.sub(' ', text).strip()

## STOPWORD REMOVAL

In [None]:
# Filled on first use so the stop-word list is read and converted only once,
# not once per document.
_STOPWORD_CACHE = {}


def stopword(string):
    """Remove English stop words from a whitespace-separated string.

    Args:
        string: Text whose tokens are separated by whitespace. Matching is
            case-sensitive, so the text is expected to be lowercased already.

    Returns:
        The remaining tokens joined by single spaces ("" if none remain).
    """
    stop_words = _STOPWORD_CACHE.get('english')
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        _STOPWORD_CACHE['english'] = stop_words
    return ' '.join(word for word in string.split() if word not in stop_words)

## Lemmatization 

In [None]:
# Initialize the lemmatizer
# One module-level instance; lemmatizer() below reuses it for every token.
wl = WordNetLemmatizer()

In [None]:
# Helper that translates an NLTK part-of-speech tag into a WordNet POS constant
def get_wordnet_pos(tag):
    """Map a POS tag to the WordNet constant selected by its first letter.

    Tags starting with J, V, N or R map to adjective, verb, noun and adverb
    respectively; every other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    return wordnet.NOUN

In [None]:
# Tokenize the sentence, tag each token, then lemmatize it with the matching POS
def lemmatizer(string):
    """Return `string` with every token replaced by its lemma, joined by spaces."""
    lemmas = []
    for token, pos_tag in nltk.pos_tag(word_tokenize(string)):
        # The tagger's label picks the WordNet POS used for the lookup.
        lemmas.append(wl.lemmatize(token, get_wordnet_pos(pos_tag)))
    return " ".join(lemmas)

## Final Preprocessing 

In [None]:
def finalpreprocess(string):
    """Run the full cleaning chain: preprocess -> stopword -> lemmatizer.

    Anything that is not a string, or is a string made only of whitespace,
    yields "" without being processed.
    """
    is_usable_text = isinstance(string, str) and bool(string.strip())
    if not is_usable_text:
        return ""
    cleaned = preprocess(string)
    without_stopwords = stopword(cleaned)
    return lemmatizer(without_stopwords)

def apply_preprocessing(row):
    """Clean one value with finalpreprocess, tolerating per-row failures.

    Args:
        row: The raw cell value (normally the abstract text).

    Returns:
        The cleaned text, or "" when processing this particular value failed
        (the value and the exception are printed).

    Raises:
        LookupError: Re-raised when the failure is a bare LookupError, which is
            how NLTK reports a missing corpus or model. That is a setup problem
            affecting every row, so blanking each row one by one would only
            hide it until a later step fails on empty text.
    """
    try:
        return finalpreprocess(row)
    except Exception as e:
        # Exact type check on purpose: KeyError/IndexError subclass LookupError
        # but are ordinary per-row failures and stay best-effort.
        if type(e) is LookupError:
            raise
        print(f"Error processing row: {row}")
        print(f"Exception: {e}")
        return ""

In [None]:
# Clean every abstract; apply_preprocessing is passed directly, no wrapper lambda needed
data['Clean_Text_Abstract'] = data['Abstract'].apply(apply_preprocessing)

In [None]:
## Get the positional index of the "Abstract" column
# Inspection only; the returned position is not stored or used by later cells shown here.
data.columns.get_loc('Abstract')

In [None]:
# Put the cleaned text first, keep every other column in its current order,
# then discard the raw 'Abstract' column that the cleaned text replaces.
leading_column = 'Clean_Text_Abstract'
other_columns = [name for name in data.columns if name != leading_column]
data = data.reindex(columns=[leading_column] + other_columns).drop(columns=['Abstract'])

data.head(5)

In [None]:
# Sanity check that `data` is still a DataFrame after the reindex/drop.
type(data)

In [None]:
# Missing-value check on the final frame (cleaned text + remaining columns).
data.isnull().any()

## X and Y 

In [None]:
# Feature series: the cleaned abstract text that is passed to the vectorizer below.
X=data['Clean_Text_Abstract']

In [None]:
# Label series: the integer-encoded Target column.
Y=data['Target']
Y.head(10)

## Word Vectorization 

It’s difficult to work with text data while building Machine learning models since these models need well-defined numerical data. The process to convert text data into numerical data/vector, is called vectorization or in the NLP world, word embedding. Bag-of-Words(BoW) and Word Embedding (with Word2Vec) are two well-known methods for converting text data to numerical data.

There are a few versions of Bag of Words, corresponding to different words scoring methods. We use the Sklearn library to calculate the BoW numerical values using these approaches: 

Count vectors: It builds a vocabulary from a corpus of documents and counts how many times the words appear in each document. 

Term Frequency-Inverse Document Frequencies (tf-Idf): Count vectors might not be the best representation for converting text data to numerical data. So, instead of simple counting, we can also use an advanced variant of the Bag-of-Words that uses the term frequency–inverse document frequency (or Tf-Idf). Basically, the value of a word increases proportionally to count in the document, but it is inversely proportional to the frequency of the word in the corpus. 

Word2Vec: One of the major drawbacks of Bag-of-Words techniques is that they can't capture the meaning of words or the relations between them. Word2Vec is one of the most popular techniques for learning word embeddings; it uses a shallow neural network that is capable of capturing the context of a word in a document, semantic and syntactic similarity, relations with other words, etc.

use TF-IDF (Term-Frequency-Inverse Document Frequencies): 
Basic Steps: 
1. Instantiate Vectorization 
2. Fit and transform the text data 
3. Convert the TF-IDF to an array 
4. Get the feature names (words) 

In [None]:
# Instantiate the TfidfVectorizer with maximum nr words and ngrams (1: single words, 2: two words in a row)
# NOTE(review): the vectorizer is fitted on every abstract before the train/test
# split further down, so its vocabulary and IDF weights also reflect the test
# documents. Fitting on the training texts only (or wrapping vectorizer and
# classifier in a Pipeline) would avoid that leakage -- confirm and restructure.
vectorizer = TfidfVectorizer(max_features=15000, ngram_range=(1,2))

# Fit and transform the text data
X_vector = vectorizer.fit_transform(X)

# Convert the TF-IDF matrix to a dense array (display only; the split below uses the sparse X_vector)
tfidf_array = X_vector.toarray()

# Get the feature names (words)
feature_names = vectorizer.get_feature_names_out()

# Display the results
FEATURE_PREVIEW_COUNT = 50
print("\nTF-IDF Array:")
print(tfidf_array)
print("\nTotal Number of Features:", len(feature_names))
print("\nFeature Names:")
# Up to 15,000 names were printed one per line; show a sample instead.
# The full list remains available in `feature_names`.
for feature in feature_names[:FEATURE_PREVIEW_COUNT]:
    print(feature)

In [None]:
# (documents, TF-IDF features) of the vectorized corpus.
X_vector.shape

In [None]:
# Shows the container type returned by fit_transform for the TF-IDF matrix.
type(X_vector)

## Train Test Split 

In [None]:
# Hold out 30% of the rows for testing; stratifying on the labels keeps the
# class ratio the same in both parts (Y is the Target column of `data`).
X_train, X_test, y_train, y_test = train_test_split(
    X_vector,
    Y,
    test_size=0.30,
    random_state=42,
    stratify=Y,
)

In [None]:
# Training split: feature-matrix shape and label-vector shape (row counts must match).
print(X_train.shape, y_train.shape) 

In [None]:
# Test split: feature-matrix shape and label-vector shape (row counts must match).
print(X_test.shape, y_test.shape)

In [None]:
# Check class distribution in training data
print("Class distribution in y_train:", pd.Series(y_train).value_counts())

In [None]:
# Share of each class's rows that ended up in the training split; with the
# stratified test_size=0.30 split above this should be close to 0.70 for every class.
y_train.value_counts()/Y.value_counts()

## Now you can see the representation of the train and test datasets by Target

## Apply Machine Learning 

In [None]:
# Train Logistic Regression model on the TF-IDF features
# (the variable name lr_w2v is kept so any later cell that refers to it keeps
# working, even though no Word2Vec embedding is involved in this notebook).
lr_w2v = LogisticRegression(solver='liblinear', C=10, penalty='l2')
lr_w2v.fit(X_train, y_train)

# Predict labels and class-1 probabilities for the test dataset
y_predict = lr_w2v.predict(X_test)
y_prob = lr_w2v.predict_proba(X_test)[:, 1]

# Evaluate Logistic Regression model
# The heading used to say "(W2v)", which mislabelled the feature set.
print("Logistic Regression Model (TF-IDF)")
print("Classification Report:\n", classification_report(y_test, y_predict))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_predict))
# y_prob was computed but never reported. The single-column score is only a
# valid ROC AUC input for a two-class problem, hence the guard.
if len(lr_w2v.classes_) == 2:
    print("ROC AUC:", roc_auc_score(y_test, y_prob))