In [30]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import spacy
import scipy 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split

In [31]:
def readfiles(path1,path2,path3):
    """Load the review table and the two sentiment lexicons.

    Parameters
    ----------
    path1 : path of the CSV file; its first row is used as the header.
    path2 : path of the positive-word list, one entry per line.
    path3 : path of the negative-word list, one entry per line.

    Returns
    -------
    tuple
        ``(data, positive_words, negative_words)`` where ``data`` is the
        DataFrame read from ``path1`` and the two word lists hold every line
        of the respective file with surrounding whitespace stripped (blank
        lines therefore become empty strings).
    """
    def _stripped_lines(path):
        # One list entry per line of the file, whitespace-trimmed.
        with open(path, 'r') as handle:
            return list(map(str.strip, handle))

    reviews = pd.read_csv(path1, header=0)
    return reviews, _stripped_lines(path2), _stripped_lines(path3)


# lemmatize words and get word sentiment
def sent_words(text, nlp, positive_words, negative_words):
    """Lemmatize ``text`` and count its positive / negative lexicon hits.

    Parameters
    ----------
    text : str
        Raw text; it is lower-cased before being handed to ``nlp``.
    nlp : callable
        Called as ``nlp(text)``; must return an iterable of tokens exposing
        ``is_punct``, ``is_space``, ``lemma_`` and ``text``.
    positive_words, negative_words : iterable of str
        Sentiment lexicons. Lists are accepted; they are converted to sets
        here so each lookup is a hash probe rather than a linear scan.

    Returns
    -------
    dict
        ``{"cleaned_text": <lemmas joined by single spaces>,
        "pos": <number of tokens whose lemma is in positive_words>,
        "neg": <number of tokens whose lemma is in negative_words>}``
    """
    # Callers may already pass sets; only convert when needed.
    if not isinstance(positive_words, (set, frozenset)):
        positive_words = set(positive_words)
    if not isinstance(negative_words, (set, frozenset)):
        negative_words = set(negative_words)

    tokens = []
    pos_hits = 0
    neg_hits = 0
    for token in nlp(text.lower()):
        # Punctuation and whitespace tokens carry no content.
        if token.is_punct or token.is_space:
            continue
        lemma = token.lemma_
        if lemma in positive_words:
            pos_hits += 1
        if lemma in negative_words:
            neg_hits += 1
        # '-PRON-' is a placeholder lemma, not a word: keep the surface form.
        tokens.append(lemma if lemma != '-PRON-' else token.text)

    return {"cleaned_text": " ".join(tokens),
            "pos": pos_hits,
            "neg": neg_hits}


def _cross_validate_and_report(clf, dtm, metrics, model_name, labels=None):
    """Run 5-fold cross-validation for ``clf`` and print the mean test scores.

    Parameters
    ----------
    clf : estimator passed to ``cross_validate``.
    dtm : feature matrix, one row per sample.
    metrics : scoring specification forwarded to ``cross_validate``; it must
        produce the ``precision_macro``, ``recall_macro``, ``f1_macro`` and
        ``accuracy`` test scores, because those four are always printed.
    model_name : str
        Name inserted into the printed header line.
    labels : array-like, optional
        Target labels aligned with the rows of ``dtm``. When omitted, the
        notebook-level ``data["recommend_or_not"]`` is used, which is what the
        benchmark functions always did.
    """
    if labels is None:
        # Backward-compatible fallback on the notebook-level DataFrame.
        labels = data["recommend_or_not"]
    cv = cross_validate(clf, dtm, labels, scoring=metrics, cv=5)
    print("\nAverage testing performance of " + model_name + " benchmarking model:")
    for metric_name in ("precision_macro", "recall_macro", "f1_macro", "accuracy"):
        print(metric_name + ":", np.mean(cv["test_" + metric_name]))


def Naive_Bayes_Model(dtm,metrics,labels=None):
    """Benchmark a default ``MultinomialNB`` with 5-fold cross-validation.

    Prints the mean of each test metric and returns ``None``. ``labels``
    defaults to the notebook-level ``data["recommend_or_not"]``.
    """
    _cross_validate_and_report(MultinomialNB(), dtm, metrics,
                               "Naive Bayes", labels)


def SVM_Model(dtm,metrics,labels=None):
    """Benchmark a default ``svm.LinearSVC`` with 5-fold cross-validation.

    Prints the mean of each test metric and returns ``None``. ``labels``
    defaults to the notebook-level ``data["recommend_or_not"]``.
    """
    _cross_validate_and_report(svm.LinearSVC(), dtm, metrics,
                               "SVM", labels)


def DecisionTree_Model(dtm,metrics,labels=None):
    """Benchmark a default ``tree.DecisionTreeClassifier`` with 5-fold CV.

    Prints the mean of each test metric and returns ``None``. ``labels``
    defaults to the notebook-level ``data["recommend_or_not"]``.
    """
    _cross_validate_and_report(tree.DecisionTreeClassifier(), dtm, metrics,
                               "Decision Tree", labels)


def KNN_Model(dtm,metrics,labels=None):
    """Benchmark a 20-neighbour ``KNeighborsClassifier`` with 5-fold CV.

    Prints the mean of each test metric and returns ``None``. ``labels``
    defaults to the notebook-level ``data["recommend_or_not"]``.
    """
    _cross_validate_and_report(KNeighborsClassifier(n_neighbors=20), dtm,
                               metrics, "KNN", labels)


def neural_network_Model(dtm,metrics,labels=None):
    """Benchmark an ``MLPClassifier`` with 5-fold cross-validation.

    The model is a multi-layer perceptron (not a CNN) configured with the
    'lbfgs' solver, ``alpha=1e-5``, ``hidden_layer_sizes=(5, 2)`` and a fixed
    ``random_state=1``. Prints the mean of each test metric and returns
    ``None``. ``labels`` defaults to the notebook-level
    ``data["recommend_or_not"]``.
    """
    clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                        hidden_layer_sizes=(5, 2), random_state=1)
    _cross_validate_and_report(clf, dtm, metrics, "Neural Network", labels)

    
def advanced_NB(dtm,metrics,data_new,reviews=None,labels=None):
    """Tune a TF-IDF + Naive Bayes pipeline, then evaluate the enriched model.

    Step 1 grid-searches ``tfidf__min_df`` and ``clf__alpha`` (5-fold,
    ``f1_macro``) on the raw review text and prints the best setting.
    Step 2 rebuilds a TF-IDF matrix from ``data_new["cleaned_text"]`` with the
    selected ``min_df``, appends the ``neg`` and ``pos`` count columns, and
    prints the mean 5-fold scores of ``MultinomialNB`` with the selected alpha.

    Note that the hyper-parameters are tuned on the raw text but applied to
    the lemmatized text plus the two count features.

    Parameters
    ----------
    dtm : unused; kept so existing calls keep working.
    metrics : scoring specification forwarded to ``cross_validate``; it must
        yield precision_macro, recall_macro, f1_macro and accuracy.
    data_new : DataFrame with columns ``cleaned_text``, ``neg`` and ``pos``,
        row-aligned with the reviews and labels.
    reviews : sequence of str, optional
        Raw review texts; defaults to the notebook-level ``data["review"]``.
    labels : array-like, optional
        Targets; defaults to the notebook-level ``data["recommend_or_not"]``.

    Returns
    -------
    None. Results are printed.
    """
    # Backward-compatible fallback on the notebook-level DataFrame.
    if reviews is None:
        reviews = data["review"]
    if labels is None:
        labels = data["recommend_or_not"]

    # parameter tuning
    text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                         ('clf', MultinomialNB())])
    parameters = {'tfidf__min_df': [10, 15, 18],
                  'tfidf__stop_words': ["english"],
                  'clf__alpha': [0.5, 1.0, 1.5]}
    gs_clf = GridSearchCV(text_clf, param_grid=parameters,
                          scoring="f1_macro", cv=5)
    gs_clf = gs_clf.fit(reviews, labels)
    print("\nBest parameters for Naive Bayes:")
    for param_name, param_value in gs_clf.best_params_.items():
        print(param_name,": ",param_value)
    print("best f1_macro:", gs_clf.best_score_)

    # lemmatized text + word-sentiment counts, using the tuned min_df
    tfidf_vect = TfidfVectorizer(stop_words='english',
                                 min_df=gs_clf.best_params_['tfidf__min_df'])
    dtm2 = tfidf_vect.fit_transform(data_new["cleaned_text"])
    dtm3 = scipy.sparse.hstack([dtm2, data_new[["neg", "pos"]].values])

    # Run 5-fold cross validation to show the generalizability of the modified NB model
    clf = MultinomialNB(alpha=gs_clf.best_params_['clf__alpha'])
    cv = cross_validate(clf, dtm3, labels, scoring=metrics, cv=5)

    print("\nAverage performance of modified NB model:")
    print('Precision_macro:', np.mean(cv['test_precision_macro']))
    print('Recall_macro:', np.mean(cv['test_recall_macro']))
    print('F1_macro:', np.mean(cv['test_f1_macro']))
    print('Accuracy:', np.mean(cv['test_accuracy']))
    
    
def advanced_SVM(dtm,metrics,data_new,reviews=None,labels=None):
    """Tune a TF-IDF + linear SVM pipeline, then evaluate the enriched model.

    Step 1 grid-searches ``tfidf__min_df`` and ``clf__C`` (5-fold,
    ``f1_macro``) on the raw review text and prints the best setting.
    Step 2 rebuilds a TF-IDF matrix from ``data_new["cleaned_text"]`` with the
    selected ``min_df``, appends the ``neg`` and ``pos`` count columns, and
    prints the mean 5-fold scores of ``svm.LinearSVC`` with the selected C.

    Note that the hyper-parameters are tuned on the raw text but applied to
    the lemmatized text plus the two count features.

    Parameters
    ----------
    dtm : unused; kept so existing calls keep working.
    metrics : scoring specification forwarded to ``cross_validate``; it must
        yield precision_macro, recall_macro, f1_macro and accuracy.
    data_new : DataFrame with columns ``cleaned_text``, ``neg`` and ``pos``,
        row-aligned with the reviews and labels.
    reviews : sequence of str, optional
        Raw review texts; defaults to the notebook-level ``data["review"]``.
    labels : array-like, optional
        Targets; defaults to the notebook-level ``data["recommend_or_not"]``.

    Returns
    -------
    None. Results are printed.
    """
    # Backward-compatible fallback on the notebook-level DataFrame.
    if reviews is None:
        reviews = data["review"]
    if labels is None:
        labels = data["recommend_or_not"]

    # parameter tuning
    text_clf = Pipeline([('tfidf', TfidfVectorizer()),
                         ('clf', svm.LinearSVC())])
    parameters = {'tfidf__min_df': [1, 2, 5],
                  'tfidf__stop_words': ["english"],
                  'clf__C': [1.0, 1.5, 2.0]}
    gs_clf = GridSearchCV(text_clf, param_grid=parameters,
                          scoring="f1_macro", cv=5)
    gs_clf = gs_clf.fit(reviews, labels)
    print("\nBest parameters for SVM:")
    for param_name, param_value in gs_clf.best_params_.items():
        print(param_name,": ",param_value)
    print("best f1_macro:", gs_clf.best_score_)

    # lemmatized text + word-sentiment counts, using the tuned min_df
    tfidf_vect = TfidfVectorizer(stop_words='english',
                                 min_df=gs_clf.best_params_['tfidf__min_df'])
    dtm2 = tfidf_vect.fit_transform(data_new["cleaned_text"])
    dtm3 = scipy.sparse.hstack([dtm2, data_new[["neg", "pos"]].values])

    # Run 5-fold cross validation to show the generalizability of the modified SVM model
    clf = svm.LinearSVC(C=gs_clf.best_params_['clf__C'])
    cv = cross_validate(clf, dtm3, labels, scoring=metrics, cv=5)

    print("\nAverage performance of modified SVM model:")
    print('Precision_macro:', np.mean(cv['test_precision_macro']))
    print('Recall_macro:', np.mean(cv['test_recall_macro']))
    print('F1_macro:', np.mean(cv['test_f1_macro']))
    print('Accuracy:', np.mean(cv['test_accuracy']))
    
    
def run5models(data,positive_words,negative_words):
    """Run the five benchmark models, then the two tuned/enriched models.

    The benchmarks are evaluated on a TF-IDF matrix (English stop words
    removed) built from ``data["review"]``. Afterwards every review is
    lemmatized and scored against the sentiment lexicons with ``sent_words``,
    and the resulting frame is handed to ``advanced_NB`` and ``advanced_SVM``.

    Parameters
    ----------
    data : DataFrame with a ``review`` column.
    positive_words, negative_words : iterable of str
        Sentiment lexicons.

    Notes
    -----
    The model functions are called without labels, so they read the targets
    from the notebook-level ``data`` variable rather than from this function's
    ``data`` argument; both must refer to the same, identically ordered frame.

    Returns
    -------
    None. All results are printed by the called functions.
    """
    tfidf_vect = TfidfVectorizer(stop_words='english')
    dtm = tfidf_vect.fit_transform(data["review"])
    metrics = ["precision_macro", "recall_macro", "f1_macro", "accuracy"]
    Naive_Bayes_Model(dtm,metrics)
    SVM_Model(dtm,metrics)
    DecisionTree_Model(dtm,metrics)
    KNN_Model(dtm,metrics)
    neural_network_Model(dtm,metrics)

    nlp = spacy.load('en_core_web_sm')
    # Hash-based lexicons: one conversion here instead of a linear list scan
    # for every token of every review.
    positive_lookup = frozenset(positive_words)
    negative_lookup = frozenset(negative_words)
    # lemmatize words and get pos/neg words; build the frame once from a list
    # of dicts instead of creating one pd.Series per row inside .apply().
    records = [sent_words(review, nlp, positive_lookup, negative_lookup)
               for review in data["review"]]
    data_new = pd.DataFrame(records, index=data.index)
    advanced_NB(dtm,metrics,data_new)
    advanced_SVM(dtm,metrics,data_new)

In [32]:
# Entry cell: load the inputs and run every model; all results are printed.
if __name__ == "__main__":
    # Input files: the review CSV plus the positive and negative word lists.
    path1 = "BIA-660_Project_Data_Subset.csv"         # you need to change path here #
    path2 = "positive-words.txt"                      # you need to change path here #
    path3 = "negative-words.txt"                      # you need to change path here #
    # NOTE: `data` must stay bound at notebook level under exactly this name.
    # The model functions read data["review"] and data["recommend_or_not"]
    # from it when they are called without explicit reviews/labels.
    data,positive_words,negative_words=readfiles(path1,path2,path3)
    run5models(data,positive_words,negative_words)


Average testing performance of Naive Bayes benchmarking model:
precision_macro: 0.8933685132197031
recall_macro: 0.505149043740077
f1_macro: 0.46288797337694537
accuracy: 0.826902320561252

Average testing performance of SVM benchmarking model:
precision_macro: 0.8583517286454294
recall_macro: 0.7724811320904322
f1_macro: 0.8055278591517678
accuracy: 0.9007771181867241

Average testing performance of Decision Tree benchmarking model:
precision_macro: 0.7258732133813233
recall_macro: 0.7112573684302754
f1_macro: 0.7178115477527787
accuracy: 0.8421606853750674

Average testing performance of KNN benchmarking model:
precision_macro: 0.8574484092410332
recall_macro: 0.5857507790737317
f1_macro: 0.6048888041971707
accuracy: 0.8511612924986508

Average testing performance of Neural Network benchmarking model:
precision_macro: 0.8266447161780068
recall_macro: 0.767677365225045
f1_macro: 0.7918162395053189
accuracy: 0.8901237857528332

 Best parameters for Naive Bayes:
clf__alpha :  0.5
tfidf




Average performance of modified SVM model:
Precision_macro: 0.841503954019245
Recall_macro: 0.7767752610232522
F1_macro: 0.8032095192137012
Accuracy: 0.8969201969778737
