In [15]:
import string
import numpy as np
import pandas as pd
import nltk
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from HappyFunTokenizer import HappyFunTokenizer
from sklearn_pandas import DataFrameMapper
import os 
import spacy

In [13]:
def get_rel_path(file_name):
    absolutepath = os.path.abspath('')
    #print(absolutepath)
    fileDirectory = os.path.dirname(absolutepath)
    file_path = os.path.join(fileDirectory, 'Data/ExtraCredit/' + str(file_name))   
    #print(file_path)
    return file_path

In [2]:
# load preprocessed dataset from csv
file = get_rel_path("data_extracredit.csv")
df_data = pd.read_csv(file)

In [3]:
from sklearn.utils import shuffle
df_data = shuffle(df_data)
df_data.reset_index(inplace=True, drop=True)

In [4]:
df_data = df_data.dropna()
df_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14549,14550,14551,14552,14553,14554,Sentiment,Pos:Neg Ratio,Length,Noun Phrases
0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,6.000000,40.0,7
2,0.045057,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,1.777778,118.0,30
3,0.152409,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,4.666667,62.0,17
5,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,14.000000,247.0,55
6,0.025640,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,11.285714,346.0,78
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10386,0.097558,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,2.625000,121.0,27
10387,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,0.800000,22.0,4
10388,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,6.000000,76.0,21
10389,0.241159,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,1.500000,33.0,12


# Model Training & Evaluation

In [18]:
# Define a function to take as input training and testing vectors and labels
# Allow this to be extensible to let multiple classifiers be used here
def buildClassifiers(clf, X_train, X_test, y_train, y_test):

    #clf.fit(X_train.todense(), y_train)
    #y_pred = clf.predict(X_test.todense())
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)
    precision = precision_score(y_test, y_pred, average="macro", zero_division=0)
    recall = recall_score(y_test, y_pred, average="macro", zero_division=0)
    accuracy = accuracy_score(y_test, y_pred)
    return f1, precision, recall, accuracy

def train_and_evaluate(X_train, X_test, y_train, y_test):
    names = ['Naive_Bayes', 'Decision_Tree', 'Linear SVM', 'Random Forest']
    classifiers = [GaussianNB(), 
                   DecisionTreeClassifier(random_state=42),
                  LinearSVC(dual=True, C= 0.024, max_iter=3000),
                  #RandomForestClassifier(max_depth=2, random_state=0)
                   RandomForestClassifier()
                  ]
    aList, bList, cList, dList = list(), list(), list(), list()
    for name, clf in zip(names, classifiers):
        print('Now classifying', name)
        f1, precision, recall, accuracy = buildClassifiers(clf, X_train, X_test,  
                                                     y_train, y_test)
        aList.append(f1)
        bList.append(precision)
        cList.append(recall)
        dList.append(accuracy)

        print("\tF1 for {}:\t\t".format(name), np.mean(aList))
        print("\tPrecision for {}:\t".format(name), np.mean(bList))
        print("\tRecall for {}:\t\t".format(name), np.mean(cList))
        print("\tAccuracy for {}:\t\t".format(name), np.mean(dList))
        print()
    return aList, bList, cList, dList

## Baseline
    X = tf-idf matrix (negation)

In [20]:
X = df_data.drop(['Sentiment', 'Pos:Neg Ratio', 'Length', 'Noun Phrases'], axis = 1)
y = df_data['Sentiment']
X.shape

(8781, 14555)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2,random_state=109)

overall_f1, overall_precision, overall_recall, overall_accuracy = train_and_evaluate(X_train, X_test, 
                                                                   y_train, y_test)

Now classifying Naive_Bayes
	F1 for Naive_Bayes:		 0.7309179086590009
	Precision for Naive_Bayes:	 0.73264745458623
	Recall for Naive_Bayes:		 0.7272009073487427
	Accuracy for Naive_Bayes:		 0.732498577120091

Now classifying Decision_Tree
	F1 for Decision_Tree:		 0.7554937493581644
	Precision for Decision_Tree:	 0.7560365842619606
	Recall for Decision_Tree:		 0.7541577039905094
	Accuracy for Decision_Tree:		 0.7561183836084234

Now classifying Linear SVM
	F1 for Linear SVM:		 0.7843032020682474
	Precision for Linear SVM:	 0.7850356605038273
	Recall for Linear SVM:		 0.7825622172683068
	Accuracy for Linear SVM:		 0.7848605577689242

Now classifying Random Forest
	F1 for Random Forest:		 0.8000987426811885
	Precision for Random Forest:	 0.801177143784245
	Recall for Random Forest:		 0.7980621064570379
	Accuracy for Random Forest:		 0.8006545247581103



## IMPROVED SYSTEM
    Features:
        - tf-idf matrix (negation)
        - pos:neg words ratio
        - review length 
        - # of noun phrases 

### Negation + Pos:Neg Ratio 

In [22]:
df = df_data.drop(['Sentiment', 'Length', 'Noun Phrases'], axis =1)
X = df.to_numpy()
y = df_data["Sentiment"].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2,random_state=109)
overall_f1, overall_precision, overall_recall, overall_accuracy = train_and_evaluate(X_train, X_test, y_train, y_test)

Now classifying Naive_Bayes
	F1 for Naive_Bayes:		 0.7479565156440133
	Precision for Naive_Bayes:	 0.7472519100013661
	Recall for Naive_Bayes:		 0.7453778664267929
	Accuracy for Naive_Bayes:		 0.7484348321001707

Now classifying Decision_Tree
	F1 for Decision_Tree:		 0.7478103787472419
	Precision for Decision_Tree:	 0.7469860784920356
	Recall for Decision_Tree:		 0.7467698515128998
	Accuracy for Decision_Tree:		 0.7478656801365964

Now classifying Linear SVM
	F1 for Linear SVM:		 0.77022871355725
	Precision for Linear SVM:	 0.7692731576992697
	Recall for Linear SVM:		 0.7692190195592753
	Accuracy for Linear SVM:		 0.7702523240371845

Now classifying Random Forest
	F1 for Random Forest:		 0.7921248856973422
	Precision for Random Forest:	 0.7918975697275566
	Recall for Random Forest:		 0.790676209472408
	Accuracy for Random Forest:		 0.7922595332953898



### Negation + Pos:Neg Ratio + Noun Phrases

In [23]:
df = df_data.drop(['Sentiment', 'Length'], axis =1)
X = df.to_numpy()
y = df_data["Sentiment"].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2,random_state=109)
overall_f1, overall_precision, overall_recall, overall_accuracy = train_and_evaluate(X_train, X_test, y_train, y_test)

Now classifying Naive_Bayes
	F1 for Naive_Bayes:		 0.7580544167705129
	Precision for Naive_Bayes:	 0.7566450493240775
	Recall for Naive_Bayes:		 0.7564074986637465
	Accuracy for Naive_Bayes:		 0.7581104154809334

Now classifying Decision_Tree
	F1 for Decision_Tree:		 0.7562714079852527
	Precision for Decision_Tree:	 0.7551340061694402
	Recall for Decision_Tree:		 0.7557651192198886
	Accuracy for Decision_Tree:		 0.7561183836084234

Now classifying Linear SVM




	F1 for Linear SVM:		 0.776248752776073
	Precision for Linear SVM:	 0.7750865369825887
	Recall for Linear SVM:		 0.7755976203823206
	Accuracy for Linear SVM:		 0.7761335609941188

Now classifying Random Forest
	F1 for Random Forest:		 0.7945015452784358
	Precision for Random Forest:	 0.7940649218853776
	Recall for Random Forest:		 0.7933239469670303
	Accuracy for Random Forest:		 0.794536141149687



### All features: Negation + Pos:Neg Ratio + Review length + Noun phrases

In [24]:
df = df_data.drop(['Sentiment'], axis =1)
X = df.to_numpy()
y = df_data["Sentiment"].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2,random_state=109)
overall_f1, overall_precision, overall_recall, overall_accuracy = train_and_evaluate(X_train, X_test, y_train, y_test)

Now classifying Naive_Bayes
	F1 for Naive_Bayes:		 0.794264275209732
	Precision for Naive_Bayes:	 0.7937137951823019
	Recall for Naive_Bayes:		 0.7921630359680343
	Accuracy for Naive_Bayes:		 0.794536141149687

Now classifying Decision_Tree
	F1 for Decision_Tree:		 0.7766423630707935
	Precision for Decision_Tree:	 0.7758433770705526
	Recall for Decision_Tree:		 0.7757994707132335
	Accuracy for Decision_Tree:		 0.7766078542970973

Now classifying Linear SVM




	F1 for Linear SVM:		 0.7899543170267979
	Precision for Linear SVM:	 0.7900541047156797
	Recall for Linear SVM:		 0.7883533010894269
	Accuracy for Linear SVM:		 0.7901726427622843

Now classifying Random Forest
	F1 for Random Forest:		 0.8046288777035132
	Precision for Random Forest:	 0.8051963927361755
	Recall for Random Forest:		 0.8027140612981867
	Accuracy for Random Forest:		 0.8049231644849175

