In [1]:
from google.colab import drive
drive.mount("/gdrive")
%cd /gdrive

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive


## Imports and other pre-reqs

In [2]:
!pip install spacy
!python -m spacy download en_core_web_lg

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [0]:
import pandas as pd
import os

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV

import spacy
nlp = spacy.load("en_core_web_lg", disable = ['parser', 'ner'])

## Load  data

In [0]:
# read the data into a pandas dataframe
def data2df (path, label):
    file, text = [], []
    for f in os.listdir(path):
        file.append(f)
        with open(os.path.join(path,f), 'r', encoding='utf-8', errors='ignore') as fhr:
            t = fhr.read()
            text.append(t)
    return(pd.DataFrame({'file': file, 'text': text, 'class':label}))

In [0]:
dfneg = data2df("/gdrive/My Drive/CIS_508/Colab Notebooks/CIS_509/Asmt5/HealthProNonPro/NonPro", 0) # NEG
dfpos = data2df("/gdrive/My Drive/CIS_508/Colab Notebooks/CIS_509/Asmt5/HealthProNonPro/Pro", 1) # POS

df = pd.concat([dfpos, dfneg], axis=0)

In [6]:
df.sample(10)

Unnamed: 0,file,text,class
1790,ans171.txt,Calcification of the kidneys or nephrocalcinos...,1
1609,a24591.txt,He could have the male equivalent of a yeast i...,0
38,a31582.txt,lose weight by eating only healthy foods (frui...,0
606,ans645.txt,In the view that you have previous history of ...,1
586,ans626.txt,"In most programs, donation is restricted to fa...",1
1282,ans1254.txt,Thank you for your question. There is no signi...,1
386,a54316.txt,hamburgers and french fries work for me,0
731,a69421.txt,Good luck.\n\nDont forget to bring lots of boo...,0
922,ans924.txt,Nausea and vomiting commonly occurs between 5-...,1
816,a69500.txt,depends on what you did to make you think you ...,0


In [0]:
# setup the data
X, y = df['text'], df['class']

## Preprocessing using Spacy

In [0]:
def customtokenizer(doc):
  tokens = [token.lemma_.lower() 
              for token in doc 
                if (
                    len(token) >= 3 and
                    len(token) <= 60 and
                    not token.is_punct and
                    not token.is_space and
                    not token.is_stop
                )
              ]
  return " ".join(tokens)

In [0]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=1)

Xtrain = Xtrain.copy()
Xtest = Xtest.copy()
ytrain = ytrain.copy()
ytest = ytest.copy()

In [11]:
XtrainCorpus = nlp.pipe(Xtrain)
cleanXtrainCorpus = [customtokenizer(doc) for doc in XtrainCorpus]
len(cleanXtrainCorpus), len(ytrain)

(2928, 2928)

In [12]:
XtestCorpus = nlp.pipe(Xtest)
cleanXtestCorpus = [customtokenizer(doc) for doc in XtestCorpus]
len(cleanXtestCorpus), len(ytest)

(733, 733)

## Model building and evaluation

In [0]:
tfidf = TfidfVectorizer(
        binary=False, sublinear_tf=False, # tf - bow
        use_idf=True, smooth_idf=True, # idf  - with smoothing
        norm='l2', # tfidf - l2 norm
        lowercase=True, stop_words='english', 
        #token_pattern='(?u)\\b\\w\\w+\\b', 
        min_df=1, max_df=1.0, max_features=None, 
        ngram_range=(1, 1)
    )

nb = MultinomialNB(
        alpha=1.0, # laplace add-one smoothing
        fit_prior=True, # learn class prior-probabilities from data
        class_prior=None # none - go with whatever fit-prior says
    )

In [0]:
clf = Pipeline(steps = [
                        ('tfidf', tfidf),
                        ('nb', nb)
                        ])

In [15]:
clf.fit(cleanXtrainCorpus, ytrain)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('nb',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [0]:
ypred = clf.predict(cleanXtestCorpus)

In [17]:
print(confusion_matrix(ytest,ypred))
print(classification_report(ytest, ypred))
print(accuracy_score(ytest, ypred))

[[313  45]
 [  2 373]]
              precision    recall  f1-score   support

           0       0.99      0.87      0.93       358
           1       0.89      0.99      0.94       375

    accuracy                           0.94       733
   macro avg       0.94      0.93      0.94       733
weighted avg       0.94      0.94      0.94       733

0.9358799454297408


In [0]:
def calc_metrics(TN, FP, FN, TP):

  acc = (TN + TP)/(TN + FP + FN + TP)

  #Class 0 = NEG class = NonPro
  #Class 1 = POS class = Pro

  c0_prec = TN/(TN + FN)                
  c1_prec = TP/(TP + FP)                
  c0_recall = TN/(TN + FP)
  c1_recall = TP/(TP + FN)
  c0_f1 = (2 * c0_prec * c0_recall)/(c0_prec + c0_recall) 
  c1_f1 = (2 * c1_prec * c1_recall)/(c1_prec + c1_recall)
  
  print("\n Overall accuracy:", round(acc,2))
  print("_"*45)
  df = pd.DataFrame({"Class":     ['NonPro', 'Pro'], 
                     "Precision": [round(c0_prec,2) , round(c1_prec,2) ], 
                     "Recall":    [round(c0_recall,2) , round(c1_recall,2) ], 
                     "F1-score":  [round(c0_f1,2) , round(c1_f1,2) ]}, index = None)
  print("\n",df)
  return

In [19]:
TN, FP, FN, TP = confusion_matrix(y_true = ytest, y_pred = ypred).ravel()
calc_metrics(TN, FP, FN, TP)


 Overall accuracy: 0.94
_____________________________________________

     Class  Precision  Recall  F1-score
0  NonPro       0.99    0.87      0.93
1     Pro       0.89    0.99      0.94


## HyperParametertuning with GridSearch

In [0]:
param_grid = {
    'tfidf__norm':['l2'],
    'tfidf__sublinear_tf':['True'],
    'tfidf__ngram_range':[(1,1), (1,2), (1,3)],
    'nb__alpha': [1.0, 0.1, 0.01] 
}

gscv = GridSearchCV(clf,
                    param_grid,
                    cv = 4,
                    return_train_score = False)

In [31]:
gscv.fit(cleanXtrainCorpus, ytrain)
gscv.best_estimator_ , gscv.best_params_, gscv.best_score_

(Pipeline(memory=None,
          steps=[('tfidf',
                  TfidfVectorizer(analyzer='word', binary=False,
                                  decode_error='strict',
                                  dtype=<class 'numpy.float64'>,
                                  encoding='utf-8', input='content',
                                  lowercase=True, max_df=1.0, max_features=None,
                                  min_df=1, ngram_range=(1, 1), norm='l2',
                                  preprocessor=None, smooth_idf=True,
                                  stop_words='english', strip_accents=None,
                                  sublinear_tf='True',
                                  token_pattern='(?u)\\b\\w\\w+\\b',
                                  tokenizer=None, use_idf=True,
                                  vocabulary=None)),
                 ('nb',
                  MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True))],
          verbose=False),
 {'nb__alpha': 0.1,
 

In [0]:
ypred1 = gscv.best_estimator_.predict(cleanXtestCorpus)

In [25]:
print(confusion_matrix(ytest,ypred1))
print(classification_report(ytest, ypred1))
print(accuracy_score(ytest, ypred1))

[[319  39]
 [  3 372]]
              precision    recall  f1-score   support

           0       0.99      0.89      0.94       358
           1       0.91      0.99      0.95       375

    accuracy                           0.94       733
   macro avg       0.95      0.94      0.94       733
weighted avg       0.95      0.94      0.94       733

0.9427012278308322


In [26]:
TN, FP, FN, TP = confusion_matrix(y_true = ytest, y_pred = ypred1).ravel()
calc_metrics(TN, FP, FN, TP)


 Overall accuracy: 0.94
_____________________________________________

     Class  Precision  Recall  F1-score
0  NonPro       0.99    0.89      0.94
1     Pro       0.91    0.99      0.95
