### Download the data

In [28]:
!wget https://raw.githubusercontent.com/suvigyajain0101/CaseStudies/main/AdverseEventClassification/Data/AE_Data.csv

--2022-08-23 22:11:46--  https://raw.githubusercontent.com/suvigyajain0101/CaseStudies/main/AdverseEventClassification/Data/AE_Data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5998096 (5.7M) [text/plain]
Saving to: ‘AE_Data.csv.3’


2022-08-23 22:11:47 (93.8 MB/s) - ‘AE_Data.csv.3’ saved [5998096/5998096]



### Import Libraries

In [29]:
import pandas as pd
import numpy as np
import re

In [30]:
WORDS_TO_REMOVE = ['##padding##', 'ti-', 'ti -']
EXPERIMENT_RESULTS = []

In [31]:
df = pd.read_csv('/content/AE_Data.csv')
df.head()

Unnamed: 0,title,abstract,label
0,antimicrobial impacts of essential oils on foo...,the antimicrobial activity of twelve essential...,0
1,purification and characterization of a cystein...,antimicrobial peptide (amp) crustin is a type ...,0
2,telavancin activity tested against gram-positi...,objectives: to reassess the activity of telava...,0
3,the in vitro antimicrobial activity of cymbopo...,background: it is well known that cymbopogon (...,0
4,screening currency notes for microbial pathoge...,fomites are a well-known source of microbial i...,0


In [32]:
df['label'].value_counts()

0    3851
1     294
Name: label, dtype: int64

### Helper Functions

In [33]:
def parse_confusion_matrix(x):
  '''
  Function parses confusion matrix and 
  returns TP, TN, FP and FN for a binary classification model
  '''
  tp = x[0][0]
  tn = x[1][1]
  fp = x[1][0]
  fn = x[0][1]

  return [tp, tn, fp, fn]

def parse_clf_report(x):
  '''
  Functinon parses classificatin report dictionary
  '''

  f1 = x['weighted avg']['f1-score']
  precision = x['weighted avg']['precision']
  recall = x['weighted avg']['recall']
  positive_f1 = x['1']['f1-score']
  positive_precision = x['1']['precision']
  positive_recall = x['1']['recall']

  return [f1, precision, recall, positive_f1, positive_precision, positive_recall]

def prepare_exp_report(master_list, exp_name, confsn_mat, clf_rpt):
  '''
  Function prepares experiment report, 
  in a nutshell it's just concataneting few lists
  '''
  # x is the forest here
  return master_list.append([exp_name] + confsn_mat + clf_rpt)

### Data Cleaning

1. Combine Title and Abstract
2. Lower case entire corpus
2. Remove newline and tabs from the dataset
3. Remove brackets, #, colons, 'TI" (title identifier), '##PADDING##'
5. Lemmatize and remove stopwords
2. Remove records with less than 10 words

In [34]:
df['text'] = df['title'] + ' ' + df['abstract']
df.head()

Unnamed: 0,title,abstract,label,text
0,antimicrobial impacts of essential oils on foo...,the antimicrobial activity of twelve essential...,0,antimicrobial impacts of essential oils on foo...
1,purification and characterization of a cystein...,antimicrobial peptide (amp) crustin is a type ...,0,purification and characterization of a cystein...
2,telavancin activity tested against gram-positi...,objectives: to reassess the activity of telava...,0,telavancin activity tested against gram-positi...
3,the in vitro antimicrobial activity of cymbopo...,background: it is well known that cymbopogon (...,0,the in vitro antimicrobial activity of cymbopo...
4,screening currency notes for microbial pathoge...,fomites are a well-known source of microbial i...,0,screening currency notes for microbial pathoge...


In [35]:
df.replace(r'\n','', regex=True).iloc[4140, :]['text']

'TI  - [AUTO-INFECTION (INTESTINAL) IN RADIATION SICKNESS AND ITS PREVENTION IN WISTAR WHITE RATS]. ##PADDING##'

In [36]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [37]:
eng_stopwords = stopwords.words('english')
stemmer = WordNetLemmatizer()

joined_words_to_remove = '|'.join(WORDS_TO_REMOVE)


def clean_text(x):
  # Lower case the text
  lower_x = x.lower()

  # Remove line breaks and tabs
  no_break_x = re.sub("\n|\r|\t", " ", lower_x)

  # Remove specific words
  no_waste_words_x = re.sub(joined_words_to_remove, " ", no_break_x)

  # Remove all non alphabet, numeral and space characters
  alpha_x = re.sub('[^0-9a-zA-Z ]+', ' ', no_waste_words_x)

  # Remove stopwords and lemmatize the word. Join at the end will also remove multi-spaces
  lemma_x = ' '.join([stemmer.lemmatize(word) for word in alpha_x.split() if word not in eng_stopwords])

  return lemma_x

Let's test the function on few examples

In [38]:
for sample_text in df.sample(5)['text'].values:
  print('ORIGINAL TEXT : ', sample_text)
  print('-'*100)
  print('CLEANED TEXT : ', clean_text(sample_text))
  print('\n')
  print('*'*100)

ORIGINAL TEXT :  TI  - Role of unbalanced growth of gram-negative bacteria in ileal ulcer formation in rats treated with a nonsteroidal anti-inflammatory drug.
 AB  - Nonsteroidal anti-inflammatory drugs (NSAIDs) induced formation of intestinal ulcers as side effects, in which an unbalanced increase in the number of gram-negative bacteria in the small intestine plays an important role. To clarify how intestinal microflora are influenced by NSAIDs, we examined the effects of 5-bromo-2-(4-fluorophenyl)-3-(4-methylsulfonylphenyl) thiophene (BFMeT), an NSAID, on intestinal motility and on the growth of Escherichia coli and Lactobacillus acidophilus. Transit index, a marker of peristalsis, was not different in BFMeT-treated and solvent-treated rats, indicating that BFMeT increased the number of gram-negative bacteria without suppression of peristalsis. The factors that affect the growth of intestinal bacteria were not found in intestinal contents of BFMeT-treated rats, because the growth of

In [39]:
# Apply cleaning function to the text field
df['clean_text'] = df['text'].apply(lambda x : clean_text(x))

# Get the length and drop records less than 10 words
df['text_len'] = df['clean_text'].str.split().apply(len)

cleaned_df = df[df['text_len'] > 10][['clean_text', 'label']]

In [40]:
cleaned_df.head()

Unnamed: 0,clean_text,label
0,antimicrobial impact essential oil food borne ...,0
1,purification characterization cysteine rich 14...,0
2,telavancin activity tested gram positive clini...,0
3,vitro antimicrobial activity cymbopogon essent...,0
4,screening currency note microbial pathogen ant...,0


In [41]:
print('Total records retained after data cleaning : ', cleaned_df.shape[0])
cleaned_df['label'].value_counts()

Total records retained after data cleaning :  4013


0    3719
1     294
Name: label, dtype: int64

### TF-IDF Vectorizer

Convert text to features. We'll use TF-IDF score to give the score to the word in the corpus. 

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfconverter = TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
X = tfidfconverter.fit_transform(cleaned_df['clean_text']).toarray()
y = cleaned_df['label'].values

Since the data is unbalanced, we need to split the data into train-test in such a way that those represent the actual data. That's where stratified sampling comes in

In [43]:
from sklearn.model_selection import train_test_split

test_split = 0.2

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_split, stratify=y)

In [44]:
print('Label Distribution in the training data')
print(np.unique(y_train, return_counts=True))
print('*'*50)
print('Label Distribution in the testing data')
print(np.unique(y_test, return_counts=True))

Label Distribution in the training data
(array([0, 1]), array([2975,  235]))
**************************************************
Label Distribution in the testing data
(array([0, 1]), array([744,  59]))


Now that the text has been converted into features, we can model the data 

## Machine Learning Models

The dataset at hand is so imbalanced that accuracy on the predictions is not a good metric to judge a model. We'll use Classification Report, and more importantly F1 Score for model comparison. Also to note, we want to reduce False negatives as much as we can, we don't want to classify a doc non-adverse event if it's in fact a adverse-event related

In [45]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

### Experiment 1 - Random Forest

In [46]:
from sklearn.ensemble import RandomForestClassifier

# Experiment Name
exp_name1 = 'Random Forest'

# Initialize the model
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)

# Fit the model to the data
classifier.fit(X_train, y_train)

# Generate predictions
y_pred = classifier.predict(X_test)

# Prepare Classification Report
exp_confusion_matrix = confusion_matrix(y_test,y_pred)
exp_clf_report = classification_report(y_test,y_pred, output_dict=True)
print(exp_confusion_matrix)
print('\n')
print(pd.DataFrame(exp_clf_report))

prepare_exp_report(EXPERIMENT_RESULTS, 
                    exp_name1, 
                    parse_confusion_matrix(exp_confusion_matrix), 
                    parse_clf_report(exp_clf_report))

[[742   2]
 [ 30  29]]


                    0          1  accuracy   macro avg  weighted avg
precision    0.961140   0.935484  0.960149    0.948312      0.959255
recall       0.997312   0.491525  0.960149    0.744419      0.960149
f1-score     0.978892   0.644444  0.960149    0.811668      0.954318
support    744.000000  59.000000  0.960149  803.000000    803.000000


Performance on 0s is satisfactory, but 1s are pretty terrible!

### Experiment 2 - Multinomial NB

In [47]:
from sklearn.naive_bayes import MultinomialNB

# Model Name
exp_name2 = 'Multinomial Naive Bayes'

# Initialize the model
classifier = MultinomialNB()

# Fit the model to the data
classifier.fit(X_train, y_train)

# Generate predictions
y_pred = classifier.predict(X_test)

# Prepare Classification Report
exp_confusion_matrix = confusion_matrix(y_test,y_pred)
exp_clf_report = classification_report(y_test,y_pred, output_dict=True)
print(exp_confusion_matrix)
print('\n')
print(pd.DataFrame(exp_clf_report))

prepare_exp_report(EXPERIMENT_RESULTS, 
                    exp_name2, 
                    parse_confusion_matrix(exp_confusion_matrix), 
                    parse_clf_report(exp_clf_report))

[[734  10]
 [ 39  20]]


                    0          1  accuracy   macro avg  weighted avg
precision    0.949547   0.666667  0.938979    0.808107      0.928763
recall       0.986559   0.338983  0.938979    0.662771      0.938979
f1-score     0.967699   0.449438  0.938979    0.708569      0.929620
support    744.000000  59.000000  0.938979  803.000000    803.000000


Even worse!

### Experiment 3 - SVM : SGD Classifier

In [48]:
from sklearn.linear_model import SGDClassifier

# Model Name
exp_name3 = 'SVM'

# Initialize the model
classifier = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=5, random_state=42)

# Fit the model to the data
classifier.fit(X_train, y_train)

# Generate predictions
y_pred = classifier.predict(X_test)

# Prepare Classification Report
exp_confusion_matrix = confusion_matrix(y_test,y_pred)
exp_clf_report = classification_report(y_test,y_pred, output_dict=True)
print(exp_confusion_matrix)
print('\n')
print(pd.DataFrame(exp_clf_report))

prepare_exp_report(EXPERIMENT_RESULTS, 
                    exp_name3, 
                    parse_confusion_matrix(exp_confusion_matrix), 
                    parse_clf_report(exp_clf_report))

[[742   2]
 [ 51   8]]


                    0          1  accuracy   macro avg  weighted avg
precision    0.935687   0.800000  0.933998    0.867844      0.925718
recall       0.997312   0.135593  0.933998    0.566453      0.933998
f1-score     0.965517   0.231884  0.933998    0.598701      0.911614
support    744.000000  59.000000  0.933998  803.000000    803.000000




Better performance than MNB, but worse than Random Forest

### Experiment 4 - K-Nearest Neighbors

In [49]:
from sklearn.neighbors import KNeighborsClassifier

# Model Name
exp_name4 = 'k-NN'

# Initialize the model
classifier = KNeighborsClassifier(n_neighbors=2)

# Fit the model to the data
classifier.fit(X_train, y_train)

# Generate predictions
y_pred = classifier.predict(X_test)

# Prepare Classification Report
exp_confusion_matrix = confusion_matrix(y_test,y_pred)
exp_clf_report = classification_report(y_test,y_pred, output_dict=True)
print(exp_confusion_matrix)
print('\n')
print(pd.DataFrame(exp_clf_report))

prepare_exp_report(EXPERIMENT_RESULTS, 
                    exp_name4, 
                    parse_confusion_matrix(exp_confusion_matrix), 
                    parse_clf_report(exp_clf_report))

[[733  11]
 [ 35  24]]


                    0          1  accuracy   macro avg  weighted avg
precision    0.954427   0.685714  0.942715    0.820071      0.934684
recall       0.985215   0.406780  0.942715    0.695997      0.942715
f1-score     0.969577   0.510638  0.942715    0.740108      0.935856
support    744.000000  59.000000  0.942715  803.000000    803.000000


Not a good idea, TBH!

### Experiment 5 - Decision Trees

In [50]:
from sklearn.tree import DecisionTreeClassifier

# Model Name
exp_name5 = 'Decision Tree'

# Initialize the model
classifier = DecisionTreeClassifier(max_depth=5)

# Fit the model to the data
classifier.fit(X_train, y_train)

# Generate predictions
y_pred = classifier.predict(X_test)

# Prepare Classification Report
exp_confusion_matrix = confusion_matrix(y_test,y_pred)
exp_clf_report = classification_report(y_test,y_pred, output_dict=True)
print(exp_confusion_matrix)
print('\n')
print(pd.DataFrame(exp_clf_report))

prepare_exp_report(EXPERIMENT_RESULTS, 
                    exp_name5, 
                    parse_confusion_matrix(exp_confusion_matrix), 
                    parse_clf_report(exp_clf_report))

[[732  12]
 [ 42  17]]


                    0          1  accuracy   macro avg  weighted avg
precision    0.945736   0.586207  0.932752    0.765972      0.919320
recall       0.983871   0.288136  0.932752    0.636003      0.932752
f1-score     0.964427   0.386364  0.932752    0.675395      0.921954
support    744.000000  59.000000  0.932752  803.000000    803.000000


A single tree is not working better than a Random Forest. Proves some theories 😀

### Experiment 6 - AdaBoost (discrete SAMME)

In [51]:
from sklearn.ensemble import AdaBoostClassifier

# Experiment Name
exp_name6 = 'Adaboost (Discrete SAMME)'

# Base Model for boosting
base_model = DecisionTreeClassifier(max_depth=9, min_samples_leaf=1)

# Initialize the model
classifier = AdaBoostClassifier(
    base_estimator=base_model,
    learning_rate=1.0,
    n_estimators=400,
    algorithm="SAMME",
)

# Fit the model to the data
classifier.fit(X_train, y_train)

# Generate predictions
y_pred = classifier.predict(X_test)

# Prepare Classification Report
exp_confusion_matrix = confusion_matrix(y_test,y_pred)
exp_clf_report = classification_report(y_test,y_pred, output_dict=True)
print(exp_confusion_matrix)
print('\n')
print(pd.DataFrame(exp_clf_report))

prepare_exp_report(EXPERIMENT_RESULTS, 
                    exp_name6, 
                    parse_confusion_matrix(exp_confusion_matrix), 
                    parse_clf_report(exp_clf_report))

[[733  11]
 [ 26  33]]


                    0          1  accuracy   macro avg  weighted avg
precision    0.965744   0.750000  0.953923    0.857872      0.949893
recall       0.985215   0.559322  0.953923    0.772269      0.953923
f1-score     0.975383   0.640777  0.953923    0.808080      0.950798
support    744.000000  59.000000  0.953923  803.000000    803.000000


That's a considerable improvement!

### Experiment 7 - AdaBoost (real SAMME.R)

In [52]:
from sklearn.ensemble import AdaBoostClassifier

# Experiment Name
exp_name7 = 'Adaboost (Real SAMME.R)'

# Base Model for boosting
base_model = DecisionTreeClassifier(max_depth=9, min_samples_leaf=1)

# Initialize the model
classifier = AdaBoostClassifier(
    base_estimator=base_model,
    learning_rate=1.0,
    n_estimators=400,
    algorithm="SAMME.R",
)

# Fit the model to the data
classifier.fit(X_train, y_train)

# Generate predictions
y_pred = classifier.predict(X_test)

# Prepare Classification Report
exp_confusion_matrix = confusion_matrix(y_test,y_pred)
exp_clf_report = classification_report(y_test,y_pred, output_dict=True)
print(exp_confusion_matrix)
print('\n')
print(pd.DataFrame(exp_clf_report))

prepare_exp_report(EXPERIMENT_RESULTS, 
                    exp_name7, 
                    parse_confusion_matrix(exp_confusion_matrix), 
                    parse_clf_report(exp_clf_report))

[[740   4]
 [ 26  33]]


                    0          1  accuracy   macro avg  weighted avg
precision    0.966057   0.891892   0.96264    0.928975      0.960608
recall       0.994624   0.559322   0.96264    0.776973      0.962640
f1-score     0.980132   0.687500   0.96264    0.833816      0.958631
support    744.000000  59.000000   0.96264  803.000000    803.000000


Worse than discrete SAMME algorithm, but still better than other models

### Experiment 8 - Voting Classifier

Voting Classifiers take votes from individual algorithms (panel experts!) about the classification and make decision based on the voting type (Hard or Soft). 

Difference b/w Hard and Soft Voting - Hard voting involves summing the predictions for each class label and predicting the class label with the most votes. Soft voting involves summing the predicted probabilities (or probability-like scores) for each class label and predicting the class label with the largest probability.

In [53]:
from sklearn.ensemble import VotingClassifier

# Experiment Name
exp_name8 = 'Voting Classifier'

# Model 1 : Adaboost with SAMME.R
base_model = DecisionTreeClassifier(max_depth=9, min_samples_leaf=1)

adaboost_clf = AdaBoostClassifier(
    base_estimator=base_model,
    learning_rate=1.0,
    n_estimators=400,
    algorithm="SAMME.R",
)

# Model 2 : Decision Trees
dt_clf = DecisionTreeClassifier(max_depth=5)

# Model 3 : Random Forest
rf_clf = RandomForestClassifier(n_estimators=1000, random_state=0)

# Initialize the classifer
voting_clf = VotingClassifier(estimators=[('Adaboost', adaboost_clf), ('DTree', dt_clf), ('RF', rf_clf)], voting='hard')

# Fit the model to the data
voting_clf.fit(X_train, y_train)

# Generate predictions
y_pred = voting_clf.predict(X_test)

# Prepare Classification Report
exp_confusion_matrix = confusion_matrix(y_test,y_pred)
exp_clf_report = classification_report(y_test,y_pred, output_dict=True)
print(exp_confusion_matrix)
print('\n')
print(pd.DataFrame(exp_clf_report))

prepare_exp_report(EXPERIMENT_RESULTS, 
                    exp_name8, 
                    parse_confusion_matrix(exp_confusion_matrix), 
                    parse_clf_report(exp_clf_report))

[[741   3]
 [ 28  31]]


                    0          1  accuracy   macro avg  weighted avg
precision    0.963589   0.911765  0.961395    0.937677      0.959781
recall       0.995968   0.525424  0.961395    0.760696      0.961395
f1-score     0.979511   0.666667  0.961395    0.823089      0.956525
support    744.000000  59.000000  0.961395  803.000000    803.000000


In [55]:
SCHEMA = ['Experiment Name', 'True Positives', 'True Negatives', 'False Positives', 
          'False Negatives', 'Overall F1 Score','Overall Precision','Overall Recall',
          'F1 for Positives Records','Precision for Positive Records', 'Recall for Positive Records']


pd.DataFrame(EXPERIMENT_RESULTS, columns=SCHEMA)

Unnamed: 0,Experiment Name,True Positives,True Negatives,False Positives,False Negatives,Overall F1 Score,Overall Precision,Overall Recall,F1 for Positives Records,Precision for Positive Records,Recall for Positive Records
0,Random Forest,742,29,30,2,0.954318,0.959255,0.960149,0.644444,0.935484,0.491525
1,Multinomial Naive Bayes,734,20,39,10,0.92962,0.928763,0.938979,0.449438,0.666667,0.338983
2,SVM,742,8,51,2,0.911614,0.925718,0.933998,0.231884,0.8,0.135593
3,k-NN,733,24,35,11,0.935856,0.934684,0.942715,0.510638,0.685714,0.40678
4,Decision Tree,732,17,42,12,0.921954,0.91932,0.932752,0.386364,0.586207,0.288136
5,Adaboost (Discrete SAMME),733,33,26,11,0.950798,0.949893,0.953923,0.640777,0.75,0.559322
6,Adaboost (Real SAMME.R),740,33,26,4,0.958631,0.960608,0.96264,0.6875,0.891892,0.559322
7,Voting Classifier,741,31,28,3,0.956525,0.959781,0.961395,0.666667,0.911765,0.525424
