### Download the data

In [None]:
!wget https://raw.githubusercontent.com/suvigyajain0101/CaseStudies/main/AdverseEventClassification/Data/AE_Data.csv

--2022-08-25 20:27:54--  https://raw.githubusercontent.com/suvigyajain0101/CaseStudies/main/AdverseEventClassification/Data/AE_Data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5998096 (5.7M) [text/plain]
Saving to: ‘AE_Data.csv’


2022-08-25 20:27:55 (71.4 MB/s) - ‘AE_Data.csv’ saved [5998096/5998096]



### Import Libraries

In [None]:
import pandas as pd
import numpy as np
import re

In [None]:
# Tokens stripped from the lower-cased text by clean_text; the entries are
# combined into a single regular expression before use.
WORDS_TO_REMOVE = ['##padding##', 'ti-', 'ti -']
# Accumulates one row of metrics per experiment (filled by prepare_exp_report)
# and is turned into the summary DataFrame at the end of the notebook.
EXPERIMENT_RESULTS = []

In [None]:
# Read the CSV fetched by the wget cell. The path is relative so it resolves
# against the notebook's working directory, which is where wget saves the file,
# instead of assuming the Colab-specific '/content' directory.
df = pd.read_csv('AE_Data.csv')
df.head()

Unnamed: 0,title,abstract,label
0,antimicrobial impacts of essential oils on foo...,the antimicrobial activity of twelve essential...,0
1,purification and characterization of a cystein...,antimicrobial peptide (amp) crustin is a type ...,0
2,telavancin activity tested against gram-positi...,objectives: to reassess the activity of telava...,0
3,the in vitro antimicrobial activity of cymbopo...,background: it is well known that cymbopogon (...,0
4,screening currency notes for microbial pathoge...,fomites are a well-known source of microbial i...,0


In [None]:
df['label'].value_counts()

0    3851
1     294
Name: label, dtype: int64

### Helper Functions

In [None]:
def parse_confusion_matrix(x):
  '''
  Unpack a binary-classification confusion matrix into its four counts.

  x is flattened with ravel() and its four values are read, in order, as
  TN, FP, FN, TP. Unpacking raises ValueError if x does not hold exactly
  four values.

  Returns the counts re-ordered as [TP, TN, FP, FN].
  '''
  flat_counts = x.ravel()
  true_neg, false_pos, false_neg, true_pos = flat_counts

  return [true_pos, true_neg, false_pos, false_neg]

def parse_clf_report(x):
  '''
  Pull the headline metrics out of a classification-report dictionary.

  x must contain a 'weighted avg' entry and a '1' (positive class) entry,
  each holding 'f1-score', 'precision' and 'recall'.

  Returns a flat list: [f1, precision, recall] of the weighted average
  followed by [f1, precision, recall] of the positive class.
  '''
  metric_order = ('f1-score', 'precision', 'recall')

  # Weighted-average metrics first, then the positive-class metrics
  overall_metrics = [x['weighted avg'][metric] for metric in metric_order]
  positive_metrics = [x['1'][metric] for metric in metric_order]

  return overall_metrics + positive_metrics

def prepare_exp_report(master_list, exp_name, confsn_mat, clf_rpt):
  '''
  Record one experiment's metrics as a row of master_list.

  The row is [exp_name, *confsn_mat, *clf_rpt]. If master_list already holds
  a row whose first element equals exp_name, that row is overwritten in
  place, so re-running a cell does not log the same experiment twice.
  Otherwise the row is appended.

  Args:
    master_list: list of result rows; modified in place.
    exp_name: experiment label stored as the first element of the row.
    confsn_mat: iterable of confusion-matrix counts.
    clf_rpt: iterable of classification-report metrics.

  Returns:
    None. The result is communicated by mutating master_list.
  '''
  # Unpacking accepts any iterable (list, tuple, array), not just lists
  row = [exp_name, *confsn_mat, *clf_rpt]

  for position, existing_row in enumerate(master_list):
    if len(existing_row) > 0 and existing_row[0] == exp_name:
      # Same experiment logged before: replace instead of duplicating
      master_list[position] = row
      return None

  master_list.append(row)
  return None

### Data Cleaning

1. Combine Title and Abstract
2. Lower case entire corpus
3. Remove newline and tabs from the dataset
4. Remove brackets, #, colons, 'TI' (title identifier), '##PADDING##'
5. Lemmatize and remove stopwords
6. Remove records with 10 or fewer words

In [None]:
df['text'] = df['title'] + ' ' + df['abstract']
df.head()

Unnamed: 0,title,abstract,label,text
0,antimicrobial impacts of essential oils on foo...,the antimicrobial activity of twelve essential...,0,antimicrobial impacts of essential oils on foo...
1,purification and characterization of a cystein...,antimicrobial peptide (amp) crustin is a type ...,0,purification and characterization of a cystein...
2,telavancin activity tested against gram-positi...,objectives: to reassess the activity of telava...,0,telavancin activity tested against gram-positi...
3,the in vitro antimicrobial activity of cymbopo...,background: it is well known that cymbopogon (...,0,the in vitro antimicrobial activity of cymbopo...
4,screening currency notes for microbial pathoge...,fomites are a well-known source of microbial i...,0,screening currency notes for microbial pathoge...


In [None]:
df.replace(r'\n','', regex=True).iloc[4140, :]['text']

'TI  - [AUTO-INFECTION (INTESTINAL) IN RADIATION SICKNESS AND ITS PREVENTION IN WISTAR WHITE RATS]. ##PADDING##'

In [None]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
eng_stopwords = stopwords.words('english')
stemmer = WordNetLemmatizer()

# Frozen-set copy of the stop-word list so the per-word membership test in
# clean_text is a hash lookup rather than a list scan
_stopword_set = frozenset(eng_stopwords)


def _removal_pattern(term):
  '''
  Turn one WORDS_TO_REMOVE entry into a regular-expression fragment.

  The entry is escaped so characters such as '#' and '-' match literally,
  whitespace inside the entry matches any run of whitespace (including
  none), and an entry that starts with a letter or digit only matches when
  it is not preceded by a word character - so 'ti-' no longer matches the
  tail of 'anti-' or 'multi-'.
  '''
  stripped = term.strip()
  pattern = r'\s*'.join(re.escape(piece) for piece in stripped.split())
  if stripped[:1].isalnum():
    pattern = r'(?<!\w)' + pattern
  return pattern


# Blank entries are skipped: an empty alternative would match everywhere
joined_words_to_remove = '|'.join(
    _removal_pattern(term) for term in WORDS_TO_REMOVE if term.strip())


def clean_text(x):
  '''
  Normalise one document before vectorisation.

  Steps: lower-case, replace line breaks and tabs with spaces, strip the
  WORDS_TO_REMOVE tokens, replace every character that is not an ASCII
  letter, digit or space with a space, drop English stop words and
  lemmatize the remaining words.

  Args:
    x: the raw text. Anything that is not a str (e.g. a NaN produced when a
      field is missing) is treated as empty text.

  Returns:
    The cleaned words joined by single spaces ('' for non-string input).
  '''
  if not isinstance(x, str):
    return ''

  # Lower case the text
  lower_x = x.lower()

  # Remove line breaks and tabs
  no_break_x = re.sub("\n|\r|\t", " ", lower_x)

  # Remove specific words (skipped when there is nothing to remove, because
  # substituting an empty pattern would insert spaces between all characters)
  if joined_words_to_remove:
    no_waste_words_x = re.sub(joined_words_to_remove, " ", no_break_x)
  else:
    no_waste_words_x = no_break_x

  # Remove all non alphabet, numeral and space characters
  alpha_x = re.sub('[^0-9a-zA-Z ]+', ' ', no_waste_words_x)

  # Remove stopwords and lemmatize the word. Join at the end will also remove multi-spaces
  lemma_x = ' '.join(stemmer.lemmatize(word) for word in alpha_x.split() if word not in _stopword_set)

  return lemma_x

Let's test the function on a few examples

In [None]:
for sample_text in df.sample(5)['text'].values:
  print('ORIGINAL TEXT : ', sample_text)
  print('-'*100)
  print('CLEANED TEXT : ', clean_text(sample_text))
  print('\n')
  print('*'*100)

ORIGINAL TEXT :  binding of streptococcal antigens to muscle tissue in vitro.
 antigens extracted from cells of streptococcus pyogenes t6 and streptococcus mutans strains aht, bht, 10449, omz175, and k1r adsorbed to the sarcolemmal sheath of cardiac muscle cells in vitro. similar preparations from s. salivarius, s. sanguis, staphylococcus aureus, and lactobacillus casei had weak or negligible tissue-binding activity. tissue-bound bacterial antigens were detected with homologous rabbit antisera with both indirect immunofluorescence tests and an indirect radioimmunoassay. serological cross-reactivity was observed between the  tissue-binding factors of s. pyogenes and s. mutans cells but not between the bacteria and muscle tissue. in a comparative study of extraction procedures, the  greatest yield of tissue-binding factors was obtained from group a streptococci by cell disruption in buffer at 4 degrees c. hot aqueous phenol and hot water extracts were inactive. antibodies specific for th

In [None]:
# Clean every document; clean_text is passed directly as the callable
df['clean_text'] = df['text'].apply(clean_text)

# Number of words in each cleaned document
df['text_len'] = df['clean_text'].str.split().apply(len)

# Keep only documents with more than 10 words, and only the modelling columns
has_enough_words = df['text_len'] > 10
cleaned_df = df.loc[has_enough_words, ['clean_text', 'label']]

In [None]:
cleaned_df.head()

Unnamed: 0,clean_text,label
0,antimicrobial impact essential oil food borne ...,0
1,purification characterization cysteine rich 14...,0
2,telavancin activity tested gram positive clini...,0
3,vitro antimicrobial activity cymbopogon essent...,0
4,screening currency note microbial pathogen ant...,0


In [None]:
print('Total records retained after data cleaning : ', cleaned_df.shape[0])
cleaned_df['label'].value_counts()

Total records retained after data cleaning :  4013


0    3719
1     294
Name: label, dtype: int64

### TF-IDF Vectorizer

Convert text to features. We'll use TF-IDF scores to weight each word in the corpus.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split, so test documents contribute to the vocabulary and IDF
# weights - consider fitting on the training texts only.
tfidfconverter = TfidfVectorizer(
    max_features=1500,
    min_df=5,
    max_df=0.7,
    stop_words=stopwords.words('english'),
)
tfidf_matrix = tfidfconverter.fit_transform(cleaned_df['clean_text'])

# Dense feature matrix and label vector used for the train/test split
X = tfidf_matrix.toarray()
y = cleaned_df['label'].values

Since the data is imbalanced, we need to split it into train and test sets in such a way that both preserve the class distribution of the full dataset. That's where stratified sampling comes in.

In [None]:
from sklearn.model_selection import train_test_split

test_split = 0.2
# Fixed seed so the split - and every metric computed from it - is identical
# on each run of the notebook
split_seed = 42

# stratify=y keeps the label proportions the same in the train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_split, stratify=y, random_state=split_seed)

In [None]:
print('Label Distribution in the training data')
print(np.unique(y_train, return_counts=True))
print('*'*50)
print('Label Distribution in the testing data')
print(np.unique(y_test, return_counts=True))

Label Distribution in the training data
(array([0, 1]), array([2975,  235]))
**************************************************
Label Distribution in the testing data
(array([0, 1]), array([744,  59]))


Now that the text has been converted into features, we can model the data 

## Machine Learning Models

The dataset at hand is so imbalanced that accuracy is not a good metric for judging a model. We'll use the classification report and, more importantly, the F1 score for model comparison. Also note that we want to reduce false negatives as much as we can: we don't want to classify a document as a non-adverse event if it is in fact adverse-event related.

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

### Experiment 1 - Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Experiment Name
exp_name1 = 'Random Forest'

# Initialize the model
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)

# Fit the model to the data
classifier.fit(X_train, y_train)

# Generate predictions
y_pred = classifier.predict(X_test)

# Prepare Classification Report
exp_confusion_matrix = confusion_matrix(y_test,y_pred)
exp_clf_report = classification_report(y_test,y_pred, output_dict=True)
print(exp_confusion_matrix)
print('\n')
print(pd.DataFrame(exp_clf_report))

prepare_exp_report(EXPERIMENT_RESULTS, 
                    exp_name1, 
                    parse_confusion_matrix(exp_confusion_matrix), 
                    parse_clf_report(exp_clf_report))

[[742   2]
 [ 28  31]]


                    0          1  accuracy   macro avg  weighted avg
precision    0.963636   0.939394   0.96264    0.951515      0.961855
recall       0.997312   0.525424   0.96264    0.761368      0.962640
f1-score     0.980185   0.673913   0.96264    0.827049      0.957682
support    744.000000  59.000000   0.96264  803.000000    803.000000


Performance on 0s is satisfactory, but 1s are pretty terrible!

### Experiment 2 - Multinomial NB

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Model Name
exp_name2 = 'Multinomial Naive Bayes'

# Initialize the model
classifier = MultinomialNB()

# Fit the model to the data
classifier.fit(X_train, y_train)

# Generate predictions
y_pred = classifier.predict(X_test)

# Prepare Classification Report
exp_confusion_matrix = confusion_matrix(y_test,y_pred)
exp_clf_report = classification_report(y_test,y_pred, output_dict=True)
print(exp_confusion_matrix)
print('\n')
print(pd.DataFrame(exp_clf_report))

prepare_exp_report(EXPERIMENT_RESULTS, 
                    exp_name2, 
                    parse_confusion_matrix(exp_confusion_matrix), 
                    parse_clf_report(exp_clf_report))

[[736   8]
 [ 39  20]]


                    0          1  accuracy   macro avg  weighted avg
precision    0.949677   0.714286  0.941469    0.831982      0.932382
recall       0.989247   0.338983  0.941469    0.664115      0.941469
f1-score     0.969059   0.459770  0.941469    0.714414      0.931639
support    744.000000  59.000000  0.941469  803.000000    803.000000


Even worse!

### Experiment 3 - SVM : SGD Classifier

In [None]:
from sklearn.linear_model import SGDClassifier

# Model Name
exp_name3 = 'SVM'

# Initialize the model
classifier = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=5, random_state=42)

# Fit the model to the data
classifier.fit(X_train, y_train)

# Generate predictions
y_pred = classifier.predict(X_test)

# Prepare Classification Report
exp_confusion_matrix = confusion_matrix(y_test,y_pred)
exp_clf_report = classification_report(y_test,y_pred, output_dict=True)
print(exp_confusion_matrix)
print('\n')
print(pd.DataFrame(exp_clf_report))

prepare_exp_report(EXPERIMENT_RESULTS, 
                    exp_name3, 
                    parse_confusion_matrix(exp_confusion_matrix), 
                    parse_clf_report(exp_clf_report))

[[741   3]
 [ 40  19]]


                    0          1  accuracy   macro avg  weighted avg
precision    0.948784   0.863636  0.946451    0.906210      0.942527
recall       0.995968   0.322034  0.946451    0.659001      0.946451
f1-score     0.971803   0.469136  0.946451    0.720470      0.934870
support    744.000000  59.000000  0.946451  803.000000    803.000000




Better performance than MNB, but worse than Random Forest

### Experiment 4 - K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Model Name
exp_name4 = 'k-NN'

# Initialize the model
classifier = KNeighborsClassifier(n_neighbors=2)

# Fit the model to the data
classifier.fit(X_train, y_train)

# Generate predictions
y_pred = classifier.predict(X_test)

# Prepare Classification Report
exp_confusion_matrix = confusion_matrix(y_test,y_pred)
exp_clf_report = classification_report(y_test,y_pred, output_dict=True)
print(exp_confusion_matrix)
print('\n')
print(pd.DataFrame(exp_clf_report))

prepare_exp_report(EXPERIMENT_RESULTS, 
                    exp_name4, 
                    parse_confusion_matrix(exp_confusion_matrix), 
                    parse_clf_report(exp_clf_report))

[[736   8]
 [ 38  21]]


                    0          1  accuracy   macro avg  weighted avg
precision    0.950904   0.724138  0.942715    0.837521      0.934243
recall       0.989247   0.355932  0.942715    0.672590      0.942715
f1-score     0.969697   0.477273  0.942715    0.723485      0.933516
support    744.000000  59.000000  0.942715  803.000000    803.000000


Not a good idea, TBH!

### Experiment 5 - Decision Trees

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Model Name
exp_name5 = 'Decision Tree'

# Initialize the model; random_state is fixed so the fitted tree and its
# metrics are the same on every run
classifier = DecisionTreeClassifier(max_depth=5, random_state=0)

# Fit the model to the data
classifier.fit(X_train, y_train)

# Generate predictions
y_pred = classifier.predict(X_test)

# Prepare Classification Report
exp_confusion_matrix = confusion_matrix(y_test,y_pred)
exp_clf_report = classification_report(y_test,y_pred, output_dict=True)
print(exp_confusion_matrix)
print('\n')
print(pd.DataFrame(exp_clf_report))

# Log this experiment's metrics for the final comparison table
prepare_exp_report(EXPERIMENT_RESULTS,
                    exp_name5,
                    parse_confusion_matrix(exp_confusion_matrix),
                    parse_clf_report(exp_clf_report))

[[726  18]
 [ 37  22]]


                    0          1  accuracy   macro avg  weighted avg
precision    0.951507   0.550000  0.931507    0.750754      0.922007
recall       0.975806   0.372881  0.931507    0.674344      0.931507
f1-score     0.963504   0.444444  0.931507    0.703974      0.925366
support    744.000000  59.000000  0.931507  803.000000    803.000000


A single tree is not working better than a Random Forest. Proves some theories 😀

### Experiment 6 - AdaBoost (discrete SAMME)

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Experiment Name
exp_name6 = 'Adaboost (Discrete SAMME)'

# Base Model for boosting (seeded for reproducibility)
base_model = DecisionTreeClassifier(max_depth=9, min_samples_leaf=1, random_state=0)

# Initialize the model; random_state is fixed so repeated runs give the same
# ensemble and metrics.
# NOTE(review): newer scikit-learn releases rename `base_estimator` to
# `estimator` - confirm against the installed version.
classifier = AdaBoostClassifier(
    base_estimator=base_model,
    learning_rate=1.0,
    n_estimators=400,
    algorithm="SAMME",
    random_state=0,
)

# Fit the model to the data
classifier.fit(X_train, y_train)

# Generate predictions
y_pred = classifier.predict(X_test)

# Prepare Classification Report
exp_confusion_matrix = confusion_matrix(y_test,y_pred)
exp_clf_report = classification_report(y_test,y_pred, output_dict=True)
print(exp_confusion_matrix)
print('\n')
print(pd.DataFrame(exp_clf_report))

# Log this experiment's metrics for the final comparison table
prepare_exp_report(EXPERIMENT_RESULTS,
                    exp_name6,
                    parse_confusion_matrix(exp_confusion_matrix),
                    parse_clf_report(exp_clf_report))

[[737   7]
 [ 22  37]]


                    0          1  accuracy   macro avg  weighted avg
precision    0.971014   0.840909  0.963885    0.905962      0.961455
recall       0.990591   0.627119  0.963885    0.808855      0.963885
f1-score     0.980705   0.718447  0.963885    0.849576      0.961436
support    744.000000  59.000000  0.963885  803.000000    803.000000


That's a considerable improvement!

### Experiment 7 - AdaBoost (real SAMME.R)

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Experiment Name
exp_name7 = 'Adaboost (Real SAMME.R)'

# Base Model for boosting (seeded for reproducibility)
base_model = DecisionTreeClassifier(max_depth=9, min_samples_leaf=1, random_state=0)

# Initialize the model; random_state is fixed so repeated runs give the same
# ensemble and metrics.
# NOTE(review): newer scikit-learn releases rename `base_estimator` to
# `estimator` and deprecate "SAMME.R" - confirm against the installed version.
classifier = AdaBoostClassifier(
    base_estimator=base_model,
    learning_rate=1.0,
    n_estimators=400,
    algorithm="SAMME.R",
    random_state=0,
)

# Fit the model to the data
classifier.fit(X_train, y_train)

# Generate predictions
y_pred = classifier.predict(X_test)

# Prepare Classification Report
exp_confusion_matrix = confusion_matrix(y_test,y_pred)
exp_clf_report = classification_report(y_test,y_pred, output_dict=True)
print(exp_confusion_matrix)
print('\n')
print(pd.DataFrame(exp_clf_report))

# Log this experiment's metrics for the final comparison table
prepare_exp_report(EXPERIMENT_RESULTS,
                    exp_name7,
                    parse_confusion_matrix(exp_confusion_matrix),
                    parse_clf_report(exp_clf_report))

[[739   5]
 [ 25  34]]


                    0          1  accuracy   macro avg  weighted avg
precision    0.967277   0.871795   0.96264    0.919536      0.960262
recall       0.993280   0.576271   0.96264    0.784775      0.962640
f1-score     0.980106   0.693878   0.96264    0.836992      0.959076
support    744.000000  59.000000   0.96264  803.000000    803.000000


Worse than discrete SAMME algorithm, but still better than other models

### Experiment 8 - Voting Classifier

Voting Classifiers take votes from individual algorithms (a panel of experts!) about the classification and make a decision based on the voting type (Hard or Soft). 

Difference b/w Hard and Soft Voting - Hard voting involves summing the predictions for each class label and predicting the class label with the most votes. Soft voting involves summing the predicted probabilities (or probability-like scores) for each class label and predicting the class label with the largest probability.

In [None]:
from sklearn.ensemble import VotingClassifier

# Experiment Name
exp_name8 = 'Voting Classifier'

# Model 1 : Adaboost with SAMME.R (base tree and booster both seeded)
# NOTE(review): newer scikit-learn releases rename `base_estimator` to
# `estimator` and deprecate "SAMME.R" - confirm against the installed version.
base_model = DecisionTreeClassifier(max_depth=9, min_samples_leaf=1, random_state=0)

adaboost_clf = AdaBoostClassifier(
    base_estimator=base_model,
    learning_rate=1.0,
    n_estimators=400,
    algorithm="SAMME.R",
    random_state=0,
)

# Model 2 : Decision Trees (seeded so all three voters are reproducible)
dt_clf = DecisionTreeClassifier(max_depth=5, random_state=0)

# Model 3 : Random Forest
rf_clf = RandomForestClassifier(n_estimators=1000, random_state=0)

# Initialize the classifier; hard voting predicts the majority class label
voting_clf = VotingClassifier(estimators=[('Adaboost', adaboost_clf), ('DTree', dt_clf), ('RF', rf_clf)], voting='hard')

# Fit the model to the data
voting_clf.fit(X_train, y_train)

# Generate predictions
y_pred = voting_clf.predict(X_test)

# Prepare Classification Report
exp_confusion_matrix = confusion_matrix(y_test,y_pred)
exp_clf_report = classification_report(y_test,y_pred, output_dict=True)
print(exp_confusion_matrix)
print('\n')
print(pd.DataFrame(exp_clf_report))

# Log this experiment's metrics for the final comparison table
prepare_exp_report(EXPERIMENT_RESULTS,
                    exp_name8,
                    parse_confusion_matrix(exp_confusion_matrix),
                    parse_clf_report(exp_clf_report))

[[736   8]
 [ 25  34]]


                    0          1  accuracy   macro avg  weighted avg
precision    0.967148   0.809524  0.958904    0.888336      0.955567
recall       0.989247   0.576271  0.958904    0.782759      0.958904
f1-score     0.978073   0.673267  0.958904    0.825670      0.955678
support    744.000000  59.000000  0.958904  803.000000    803.000000


### Experiment 9 - Grid Search with Voting Classifier

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV

# Experiment Name
exp_name9 = 'GridSearch with VotingClassifier'

# Soft-voting ensemble of three probabilistic classifiers.
# - SVC and RandomForest are seeded so repeated runs give the same result.
# - LogisticRegression gets a higher max_iter because the default iteration
#   limit was reached on this data (convergence warnings).
eclf = VotingClassifier(estimators=[
    ('svm', SVC(probability=True, random_state=0)),
    ('lr', LogisticRegression(max_iter=1000)),
    ('rf', RandomForestClassifier(random_state=0))
    ], voting='soft')

# Define Param grid
params = {'lr__C': [1.0, 100.0],
          'svm__C': [2,3,4],
          'rf__n_estimators' : [50,200,500]}

# Initialize Grid Search. Candidates are ranked by F1 of the positive class
# rather than the default accuracy, which is uninformative on data this
# imbalanced.
grid = GridSearchCV(estimator=eclf, param_grid=params, scoring='f1')

# Fit the model to the data
grid.fit(X_train, y_train)

# Generate predictions (uses the best parameter combination, refit on all of X_train)
y_pred = grid.predict(X_test)

# Prepare Classification Report
exp_confusion_matrix = confusion_matrix(y_test,y_pred)
exp_clf_report = classification_report(y_test,y_pred, output_dict=True)
print(exp_confusion_matrix)
print('\n')
print(pd.DataFrame(exp_clf_report))

# Log this experiment's metrics for the final comparison table
prepare_exp_report(EXPERIMENT_RESULTS,
                    exp_name9,
                    parse_confusion_matrix(exp_confusion_matrix),
                    parse_clf_report(exp_clf_report))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

[[734  10]
 [ 21  38]]


                    0          1  accuracy   macro avg  weighted avg
precision    0.972185   0.791667  0.961395    0.881926      0.958922
recall       0.986559   0.644068  0.961395    0.815313      0.961395
f1-score     0.979320   0.710280  0.961395    0.844800      0.959552
support    744.000000  59.000000  0.961395  803.000000    803.000000


In [None]:
# Re-display the Experiment 9 metrics from the current y_pred.
# The grid-search cell has already recorded this experiment in
# EXPERIMENT_RESULTS, so there is no need to log it again here.
exp_confusion_matrix = confusion_matrix(y_test,y_pred)
exp_clf_report = classification_report(y_test,y_pred, output_dict=True)
print(exp_confusion_matrix)
print('\n')
print(pd.DataFrame(exp_clf_report))

[[734  10]
 [ 21  38]]


                    0          1  accuracy   macro avg  weighted avg
precision    0.972185   0.791667  0.961395    0.881926      0.958922
recall       0.986559   0.644068  0.961395    0.815313      0.961395
f1-score     0.979320   0.710280  0.961395    0.844800      0.959552
support    744.000000  59.000000  0.961395  803.000000    803.000000


In [None]:
# Column names for the experiment summary table. The order must mirror the
# rows built by prepare_exp_report: the experiment name, then the
# [tp, tn, fp, fn] list from parse_confusion_matrix, then the weighted-average
# and positive-class [f1, precision, recall] values from parse_clf_report.
SCHEMA = ['Experiment Name', 'True Positives', 'True Negatives', 'False Positives', 
          'False Negatives', 'Overall F1 Score','Overall Precision','Overall Recall',
          'F1 for Positives Records','Precision for Positive Records', 'Recall for Positive Records']

# One row per logged experiment
pd.DataFrame(EXPERIMENT_RESULTS, columns=SCHEMA)

Unnamed: 0,Experiment Name,True Positives,True Negatives,False Positives,False Negatives,Overall F1 Score,Overall Precision,Overall Recall,F1 for Positives Records,Precision for Positive Records,Recall for Positive Records
0,Random Forest,742,31,28,2,0.957682,0.961855,0.96264,0.673913,0.939394,0.525424
1,Multinomial Naive Bayes,736,20,39,8,0.931639,0.932382,0.941469,0.45977,0.714286,0.338983
2,SVM,741,19,40,3,0.93487,0.942527,0.946451,0.469136,0.863636,0.322034
3,k-NN,736,21,38,8,0.933516,0.934243,0.942715,0.477273,0.724138,0.355932
4,Decision Tree,726,22,37,18,0.925366,0.922007,0.931507,0.444444,0.55,0.372881
5,Adaboost (Discrete SAMME),737,37,22,7,0.961436,0.961455,0.963885,0.718447,0.840909,0.627119
6,Adaboost (Real SAMME.R),739,34,25,5,0.959076,0.960262,0.96264,0.693878,0.871795,0.576271
7,Voting Classifier,736,34,25,8,0.955678,0.955567,0.958904,0.673267,0.809524,0.576271
8,GridSearch with VotingClassifier,734,38,21,10,0.959552,0.958922,0.961395,0.71028,0.791667,0.644068
9,GridSearch with VotingClassifier,734,38,21,10,0.959552,0.958922,0.961395,0.71028,0.791667,0.644068
