### Download the data

In [1]:
!wget https://raw.githubusercontent.com/suvigyajain0101/CaseStudies/main/AdverseEventClassification/Data/AE_Data.csv

--2022-08-24 18:42:15--  https://raw.githubusercontent.com/suvigyajain0101/CaseStudies/main/AdverseEventClassification/Data/AE_Data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5998096 (5.7M) [text/plain]
Saving to: ‘AE_Data.csv’


2022-08-24 18:42:16 (87.9 MB/s) - ‘AE_Data.csv’ saved [5998096/5998096]



### Import Libraries

In [2]:
import pandas as pd
import numpy as np
import re

In [3]:
WORDS_TO_REMOVE = ['##padding##', 'ti-', 'ti -']
EXPERIMENT_RESULTS = []

In [4]:
df = pd.read_csv('/content/AE_Data.csv')
df.head()

Unnamed: 0,title,abstract,label
0,antimicrobial impacts of essential oils on foo...,the antimicrobial activity of twelve essential...,0
1,purification and characterization of a cystein...,antimicrobial peptide (amp) crustin is a type ...,0
2,telavancin activity tested against gram-positi...,objectives: to reassess the activity of telava...,0
3,the in vitro antimicrobial activity of cymbopo...,background: it is well known that cymbopogon (...,0
4,screening currency notes for microbial pathoge...,fomites are a well-known source of microbial i...,0


In [5]:
df['label'].value_counts()

0    3851
1     294
Name: label, dtype: int64

### Helper Functions

In [6]:
def parse_confusion_matrix(x):
  '''
  Function parses confusion matrix and 
  returns TP, TN, FP and FN for a binary classification model
  '''
  tp = x[0][0]
  tn = x[1][1]
  fp = x[1][0]
  fn = x[0][1]

  return [tp, tn, fp, fn]

def parse_clf_report(x):
  '''
  Functinon parses classificatin report dictionary
  '''

  f1 = x['weighted avg']['f1-score']
  precision = x['weighted avg']['precision']
  recall = x['weighted avg']['recall']
  positive_f1 = x['1']['f1-score']
  positive_precision = x['1']['precision']
  positive_recall = x['1']['recall']

  return [f1, precision, recall, positive_f1, positive_precision, positive_recall]

def prepare_exp_report(master_list, exp_name, confsn_mat, clf_rpt):
  '''
  Function prepares experiment report, 
  in a nutshell it's just concataneting few lists
  '''
  # x is the forest here
  return master_list.append([exp_name] + confsn_mat + clf_rpt)

### Data Cleaning

1. Combine Title and Abstract
2. Lower case entire corpus
2. Remove newline and tabs from the dataset
3. Remove brackets, #, colons, 'TI" (title identifier), '##PADDING##'
5. Lemmatize and remove stopwords
2. Remove records with less than 10 words

In [7]:
df['text'] = df['title'] + ' ' + df['abstract']
df.head()

Unnamed: 0,title,abstract,label,text
0,antimicrobial impacts of essential oils on foo...,the antimicrobial activity of twelve essential...,0,antimicrobial impacts of essential oils on foo...
1,purification and characterization of a cystein...,antimicrobial peptide (amp) crustin is a type ...,0,purification and characterization of a cystein...
2,telavancin activity tested against gram-positi...,objectives: to reassess the activity of telava...,0,telavancin activity tested against gram-positi...
3,the in vitro antimicrobial activity of cymbopo...,background: it is well known that cymbopogon (...,0,the in vitro antimicrobial activity of cymbopo...
4,screening currency notes for microbial pathoge...,fomites are a well-known source of microbial i...,0,screening currency notes for microbial pathoge...


In [8]:
df.replace(r'\n','', regex=True).iloc[4140, :]['text']

'TI  - [AUTO-INFECTION (INTESTINAL) IN RADIATION SICKNESS AND ITS PREVENTION IN WISTAR WHITE RATS]. ##PADDING##'

In [9]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [10]:
eng_stopwords = stopwords.words('english')
stemmer = WordNetLemmatizer()

joined_words_to_remove = '|'.join(WORDS_TO_REMOVE)


def clean_text(x):
  # Lower case the text
  lower_x = x.lower()

  # Remove line breaks and tabs
  no_break_x = re.sub("\n|\r|\t", " ", lower_x)

  # Remove specific words
  no_waste_words_x = re.sub(joined_words_to_remove, " ", no_break_x)

  # Remove all non alphabet, numeral and space characters
  alpha_x = re.sub('[^0-9a-zA-Z ]+', ' ', no_waste_words_x)

  # Remove stopwords and lemmatize the word. Join at the end will also remove multi-spaces
  lemma_x = ' '.join([stemmer.lemmatize(word) for word in alpha_x.split() if word not in eng_stopwords])

  return lemma_x

Let's test the function on few examples

In [11]:
for sample_text in df.sample(5)['text'].values:
  print('ORIGINAL TEXT : ', sample_text)
  print('-'*100)
  print('CLEANED TEXT : ', clean_text(sample_text))
  print('\n')
  print('*'*100)

ORIGINAL TEXT :  quantitative determination by real-time pcr of four vaginal lactobacillus species,other_species andother_species indicates an inverse relationship betweenother_species andother_species.
 background: most studies of the vaginal microflora have been based on culture or  on qualitative molecular techniques. here we applied existing real-time pcr formats forother_species,other_species andother_species and developed new formats forother_species,other_species andother_species to obtain a quantitative non culture-based determination of these species in 71 vaginal samples from 32 pregnant and 28 non-pregnant women aged between 18 and 45 years.  results: the 71 vaginal microflora samples of these women were categorized, using the ison and hay criteria, as refined by verhelst et al. (2005), as follows: grade ia: 8 samples, grade iab: 10, grade ib: 13, grade i-like: 10, grade ii: 11, grade iii: 12 and grade iv: 7.l. crispatus was found in all but 5 samples and was the most freque

In [12]:
# Apply cleaning function to the text field
df['clean_text'] = df['text'].apply(lambda x : clean_text(x))

# Get the length and drop records less than 10 words
df['text_len'] = df['clean_text'].str.split().apply(len)

cleaned_df = df[df['text_len'] > 10][['clean_text', 'label']]

In [13]:
cleaned_df.head()

Unnamed: 0,clean_text,label
0,antimicrobial impact essential oil food borne ...,0
1,purification characterization cysteine rich 14...,0
2,telavancin activity tested gram positive clini...,0
3,vitro antimicrobial activity cymbopogon essent...,0
4,screening currency note microbial pathogen ant...,0


In [14]:
print('Total records retained after data cleaning : ', cleaned_df.shape[0])
cleaned_df['label'].value_counts()

Total records retained after data cleaning :  4013


0    3719
1     294
Name: label, dtype: int64

### TF-IDF Vectorizer

Convert text to features. We'll use TF-IDF score to give the score to the word in the corpus. 

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfconverter = TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
X = tfidfconverter.fit_transform(cleaned_df['clean_text']).toarray()
y = cleaned_df['label'].values

Since the data is unbalanced, we need to split the data into train-test in such a way that those represent the actual data. That's where stratified sampling comes in

In [16]:
from sklearn.model_selection import train_test_split

test_split = 0.2

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_split, stratify=y)

In [17]:
print('Label Distribution in the training data')
print(np.unique(y_train, return_counts=True))
print('*'*50)
print('Label Distribution in the testing data')
print(np.unique(y_test, return_counts=True))

Label Distribution in the training data
(array([0, 1]), array([2975,  235]))
**************************************************
Label Distribution in the testing data
(array([0, 1]), array([744,  59]))


Now that the text has been converted into features, we can model the data 

## Machine Learning Models

The dataset at hand is so imbalanced that accuracy on the predictions is not a good metric to judge a model. We'll use Classification Report, and more importantly F1 Score for model comparison. Also to note, we want to reduce False negatives as much as we can, we don't want to classify a doc non-adverse event if it's in fact a adverse-event related

In [18]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

### Experiment 1 - Random Forest

In [19]:
from sklearn.ensemble import RandomForestClassifier

# Experiment Name
exp_name1 = 'Random Forest'

# Initialize the model
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)

# Fit the model to the data
classifier.fit(X_train, y_train)

# Generate predictions
y_pred = classifier.predict(X_test)

# Prepare Classification Report
exp_confusion_matrix = confusion_matrix(y_test,y_pred)
exp_clf_report = classification_report(y_test,y_pred, output_dict=True)
print(exp_confusion_matrix)
print('\n')
print(pd.DataFrame(exp_clf_report))

prepare_exp_report(EXPERIMENT_RESULTS, 
                    exp_name1, 
                    parse_confusion_matrix(exp_confusion_matrix), 
                    parse_clf_report(exp_clf_report))

[[742   2]
 [ 30  29]]


                    0          1  accuracy   macro avg  weighted avg
precision    0.961140   0.935484  0.960149    0.948312      0.959255
recall       0.997312   0.491525  0.960149    0.744419      0.960149
f1-score     0.978892   0.644444  0.960149    0.811668      0.954318
support    744.000000  59.000000  0.960149  803.000000    803.000000


Performance on 0s is satisfactory, but 1s are pretty terrible!

### Experiment 2 - Multinomial NB

In [20]:
from sklearn.naive_bayes import MultinomialNB

# Model Name
exp_name2 = 'Multinomial Naive Bayes'

# Initialize the model
classifier = MultinomialNB()

# Fit the model to the data
classifier.fit(X_train, y_train)

# Generate predictions
y_pred = classifier.predict(X_test)

# Prepare Classification Report
exp_confusion_matrix = confusion_matrix(y_test,y_pred)
exp_clf_report = classification_report(y_test,y_pred, output_dict=True)
print(exp_confusion_matrix)
print('\n')
print(pd.DataFrame(exp_clf_report))

prepare_exp_report(EXPERIMENT_RESULTS, 
                    exp_name2, 
                    parse_confusion_matrix(exp_confusion_matrix), 
                    parse_clf_report(exp_clf_report))

[[736   8]
 [ 38  21]]


                    0          1  accuracy   macro avg  weighted avg
precision    0.950904   0.724138  0.942715    0.837521      0.934243
recall       0.989247   0.355932  0.942715    0.672590      0.942715
f1-score     0.969697   0.477273  0.942715    0.723485      0.933516
support    744.000000  59.000000  0.942715  803.000000    803.000000


Even worse!

### Experiment 3 - SVM : SGD Classifier

In [21]:
from sklearn.linear_model import SGDClassifier

# Model Name
exp_name3 = 'SVM'

# Initialize the model
classifier = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=5, random_state=42)

# Fit the model to the data
classifier.fit(X_train, y_train)

# Generate predictions
y_pred = classifier.predict(X_test)

# Prepare Classification Report
exp_confusion_matrix = confusion_matrix(y_test,y_pred)
exp_clf_report = classification_report(y_test,y_pred, output_dict=True)
print(exp_confusion_matrix)
print('\n')
print(pd.DataFrame(exp_clf_report))

prepare_exp_report(EXPERIMENT_RESULTS, 
                    exp_name3, 
                    parse_confusion_matrix(exp_confusion_matrix), 
                    parse_clf_report(exp_clf_report))

[[741   3]
 [ 48  11]]


                    0          1  accuracy   macro avg  weighted avg
precision    0.939163   0.785714  0.936488    0.862439      0.927889
recall       0.995968   0.186441  0.936488    0.591204      0.936488
f1-score     0.966732   0.301370  0.936488    0.634051      0.917845
support    744.000000  59.000000  0.936488  803.000000    803.000000




Better performance than MNB, but worse than Random Forest

### Experiment 4 - K-Nearest Neighbors

In [22]:
from sklearn.neighbors import KNeighborsClassifier

# Model Name
exp_name4 = 'k-NN'

# Initialize the model
classifier = KNeighborsClassifier(n_neighbors=2)

# Fit the model to the data
classifier.fit(X_train, y_train)

# Generate predictions
y_pred = classifier.predict(X_test)

# Prepare Classification Report
exp_confusion_matrix = confusion_matrix(y_test,y_pred)
exp_clf_report = classification_report(y_test,y_pred, output_dict=True)
print(exp_confusion_matrix)
print('\n')
print(pd.DataFrame(exp_clf_report))

prepare_exp_report(EXPERIMENT_RESULTS, 
                    exp_name4, 
                    parse_confusion_matrix(exp_confusion_matrix), 
                    parse_clf_report(exp_clf_report))

[[739   5]
 [ 36  23]]


                    0          1  accuracy   macro avg  weighted avg
precision    0.953548   0.821429  0.948941    0.887488      0.943841
recall       0.993280   0.389831  0.948941    0.691555      0.948941
f1-score     0.973009   0.528736  0.948941    0.750872      0.940366
support    744.000000  59.000000  0.948941  803.000000    803.000000


Not a good idea, TBH!

### Experiment 5 - Decision Trees

In [23]:
from sklearn.tree import DecisionTreeClassifier

# Model Name
exp_name5 = 'Decision Tree'

# Initialize the model
classifier = DecisionTreeClassifier(max_depth=5)

# Fit the model to the data
classifier.fit(X_train, y_train)

# Generate predictions
y_pred = classifier.predict(X_test)

# Prepare Classification Report
exp_confusion_matrix = confusion_matrix(y_test,y_pred)
exp_clf_report = classification_report(y_test,y_pred, output_dict=True)
print(exp_confusion_matrix)
print('\n')
print(pd.DataFrame(exp_clf_report))

prepare_exp_report(EXPERIMENT_RESULTS, 
                    exp_name5, 
                    parse_confusion_matrix(exp_confusion_matrix), 
                    parse_clf_report(exp_clf_report))

[[728  16]
 [ 33  26]]


                    0          1  accuracy   macro avg  weighted avg
precision    0.956636   0.619048  0.938979    0.787842      0.931832
recall       0.978495   0.440678  0.938979    0.709586      0.938979
f1-score     0.967442   0.514851  0.938979    0.741147      0.934188
support    744.000000  59.000000  0.938979  803.000000    803.000000


A single tree is not working better than a Random Forest. Proves some theories 😀

### Experiment 6 - AdaBoost (discrete SAMME)

In [24]:
from sklearn.ensemble import AdaBoostClassifier

# Experiment Name
exp_name6 = 'Adaboost (Discrete SAMME)'

# Base Model for boosting
base_model = DecisionTreeClassifier(max_depth=9, min_samples_leaf=1)

# Initialize the model
classifier = AdaBoostClassifier(
    base_estimator=base_model,
    learning_rate=1.0,
    n_estimators=400,
    algorithm="SAMME",
)

# Fit the model to the data
classifier.fit(X_train, y_train)

# Generate predictions
y_pred = classifier.predict(X_test)

# Prepare Classification Report
exp_confusion_matrix = confusion_matrix(y_test,y_pred)
exp_clf_report = classification_report(y_test,y_pred, output_dict=True)
print(exp_confusion_matrix)
print('\n')
print(pd.DataFrame(exp_clf_report))

prepare_exp_report(EXPERIMENT_RESULTS, 
                    exp_name6, 
                    parse_confusion_matrix(exp_confusion_matrix), 
                    parse_clf_report(exp_clf_report))

[[729  15]
 [ 23  36]]


                    0          1  accuracy   macro avg  weighted avg
precision    0.969415   0.705882  0.952677    0.837649      0.950052
recall       0.979839   0.610169  0.952677    0.795004      0.952677
f1-score     0.974599   0.654545  0.952677    0.814572      0.951083
support    744.000000  59.000000  0.952677  803.000000    803.000000


That's a considerable improvement!

### Experiment 7 - AdaBoost (real SAMME.R)

In [25]:
from sklearn.ensemble import AdaBoostClassifier

# Experiment Name
exp_name7 = 'Adaboost (Real SAMME.R)'

# Base Model for boosting
base_model = DecisionTreeClassifier(max_depth=9, min_samples_leaf=1)

# Initialize the model
classifier = AdaBoostClassifier(
    base_estimator=base_model,
    learning_rate=1.0,
    n_estimators=400,
    algorithm="SAMME.R",
)

# Fit the model to the data
classifier.fit(X_train, y_train)

# Generate predictions
y_pred = classifier.predict(X_test)

# Prepare Classification Report
exp_confusion_matrix = confusion_matrix(y_test,y_pred)
exp_clf_report = classification_report(y_test,y_pred, output_dict=True)
print(exp_confusion_matrix)
print('\n')
print(pd.DataFrame(exp_clf_report))

prepare_exp_report(EXPERIMENT_RESULTS, 
                    exp_name7, 
                    parse_confusion_matrix(exp_confusion_matrix), 
                    parse_clf_report(exp_clf_report))

[[738   6]
 [ 31  28]]


                    0          1  accuracy   macro avg  weighted avg
precision    0.959688   0.823529  0.953923    0.891609      0.949684
recall       0.991935   0.474576  0.953923    0.733256      0.953923
f1-score     0.975545   0.602151  0.953923    0.788848      0.948110
support    744.000000  59.000000  0.953923  803.000000    803.000000


Worse than discrete SAMME algorithm, but still better than other models

### Experiment 8 - Voting Classifier

Voting Classifiers take votes from individual algorithms (panel experts!) about the classification and make decision based on the voting type (Hard or Soft). 

Difference b/w Hard and Soft Voting - Hard voting involves summing the predictions for each class label and predicting the class label with the most votes. Soft voting involves summing the predicted probabilities (or probability-like scores) for each class label and predicting the class label with the largest probability.

In [26]:
from sklearn.ensemble import VotingClassifier

# Experiment Name
exp_name8 = 'Voting Classifier'

# Model 1 : Adaboost with SAMME.R
base_model = DecisionTreeClassifier(max_depth=9, min_samples_leaf=1)

adaboost_clf = AdaBoostClassifier(
    base_estimator=base_model,
    learning_rate=1.0,
    n_estimators=400,
    algorithm="SAMME.R",
)

# Model 2 : Decision Trees
dt_clf = DecisionTreeClassifier(max_depth=5)

# Model 3 : Random Forest
rf_clf = RandomForestClassifier(n_estimators=1000, random_state=0)

# Initialize the classifer
voting_clf = VotingClassifier(estimators=[('Adaboost', adaboost_clf), ('DTree', dt_clf), ('RF', rf_clf)], voting='hard')

# Fit the model to the data
voting_clf.fit(X_train, y_train)

# Generate predictions
y_pred = voting_clf.predict(X_test)

# Prepare Classification Report
exp_confusion_matrix = confusion_matrix(y_test,y_pred)
exp_clf_report = classification_report(y_test,y_pred, output_dict=True)
print(exp_confusion_matrix)
print('\n')
print(pd.DataFrame(exp_clf_report))

prepare_exp_report(EXPERIMENT_RESULTS, 
                    exp_name8, 
                    parse_confusion_matrix(exp_confusion_matrix), 
                    parse_clf_report(exp_clf_report))

[[739   5]
 [ 30  29]]


                    0          1  accuracy   macro avg  weighted avg
precision    0.960988   0.852941  0.956413    0.906965      0.953050
recall       0.993280   0.491525  0.956413    0.742402      0.956413
f1-score     0.976867   0.623656  0.956413    0.800262      0.950915
support    744.000000  59.000000  0.956413  803.000000    803.000000


### Experiment 9 - Grid Search with Voting Classifier

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV

# Experiment Name
exp_name9 = 'GridSearch with VotingClassifier'

eclf = VotingClassifier(estimators=[ 
    ('svm', SVC(probability=True)),
    ('lr', LogisticRegression()),
    ('rf', RandomForestClassifier())
    ], voting='soft')

# Define Param grid
params = {'lr__C': [1.0, 100.0],
          'svm__C': [2,3,4],
          'rf__n_estimators' : [50,200,500]}

# Initialize Grid Search 
grid = GridSearchCV(estimator=eclf, param_grid=params)

# Fit the model to the data
grid.fit(X_train, y_train)

# Generate predictions
y_pred = grid.predict(X_test)

# Prepare Classification Report
exp_confusion_matrix = confusion_matrix(y_test,y_pred)
exp_clf_report = classification_report(y_test,y_pred, output_dict=True)
print(exp_confusion_matrix)
print('\n')
print(pd.DataFrame(exp_clf_report))

prepare_exp_report(EXPERIMENT_RESULTS, 
                    exp_name9, 
                    parse_confusion_matrix(exp_confusion_matrix), 
                    parse_clf_report(exp_clf_report))

In [33]:
# Prepare Classification Report
exp_confusion_matrix = confusion_matrix(y_test,y_pred)
exp_clf_report = classification_report(y_test,y_pred, output_dict=True)
print(exp_confusion_matrix)
print('\n')
print(pd.DataFrame(exp_clf_report))

prepare_exp_report(EXPERIMENT_RESULTS, 
                    exp_name9, 
                    parse_confusion_matrix(exp_confusion_matrix), 
                    parse_clf_report(exp_clf_report))

[[735   9]
 [ 26  33]]


                    0          1  accuracy   macro avg  weighted avg
precision    0.965834   0.785714  0.956413    0.875774      0.952600
recall       0.987903   0.559322  0.956413    0.773613      0.956413
f1-score     0.976744   0.653465  0.956413    0.815105      0.952991
support    744.000000  59.000000  0.956413  803.000000    803.000000


In [35]:
SCHEMA = ['Experiment Name', 'True Positives', 'True Negatives', 'False Positives', 
          'False Negatives', 'Overall F1 Score','Overall Precision','Overall Recall',
          'F1 for Positives Records','Precision for Positive Records', 'Recall for Positive Records']

 
pd.DataFrame(EXPERIMENT_RESULTS, columns=SCHEMA)

Unnamed: 0,Experiment Name,True Positives,True Negatives,False Positives,False Negatives,Overall F1 Score,Overall Precision,Overall Recall,F1 for Positives Records,Precision for Positive Records,Recall for Positive Records
0,Random Forest,742,29,30,2,0.954318,0.959255,0.960149,0.644444,0.935484,0.491525
1,Multinomial Naive Bayes,736,21,38,8,0.933516,0.934243,0.942715,0.477273,0.724138,0.355932
2,SVM,741,11,48,3,0.917845,0.927889,0.936488,0.30137,0.785714,0.186441
3,k-NN,739,23,36,5,0.940366,0.943841,0.948941,0.528736,0.821429,0.389831
4,Decision Tree,728,26,33,16,0.934188,0.931832,0.938979,0.514851,0.619048,0.440678
5,Adaboost (Discrete SAMME),729,36,23,15,0.951083,0.950052,0.952677,0.654545,0.705882,0.610169
6,Adaboost (Real SAMME.R),738,28,31,6,0.94811,0.949684,0.953923,0.602151,0.823529,0.474576
7,Voting Classifier,739,29,30,5,0.950915,0.95305,0.956413,0.623656,0.852941,0.491525
8,GridSearch with VotingClassifier,735,33,26,9,0.952991,0.9526,0.956413,0.653465,0.785714,0.559322
