In [26]:
import string
import numpy as np
import pandas as pd
import nltk
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from HappyFunTokenizer import HappyFunTokenizer
from sklearn_pandas import DataFrameMapper
import os 
import spacy

In [19]:
def get_rel_path(file_name):
    absolutepath = os.path.abspath('')
    #print(absolutepath)
    fileDirectory = os.path.dirname(absolutepath)
    file_path = os.path.join(fileDirectory, 'Data/' + str(file_name))   
    #print(file_path)
    return file_path

In [21]:
# load preprocessed dataset from csv
file = get_rel_path("data.csv")
df_data = pd.read_csv(file)

In [22]:
from sklearn.utils import shuffle
df_data = shuffle(df_data)
df_data.reset_index(inplace=True, drop=True)
df_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27994,27995,27996,27997,27998,27999,Sentiment,Pos:Neg Ratio,Length,Noun Phrases
0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,3.137931,491,112
1,0.0,0.0,0.0,0.0,0.0,0.000000,0.032258,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,0.243902,856,196
2,0.0,0.0,0.0,0.0,0.0,0.000000,0.035964,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,3.250000,345,86
3,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,0.343066,783,187
4,0.0,0.0,0.0,0.0,0.0,0.000000,0.049475,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.541667,909,217
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.0,0.0,0.0,0.0,0.0,0.000000,0.066302,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.613636,663,140
1996,0.0,0.0,0.0,0.0,0.0,0.021153,0.021478,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.863636,899,217
1997,0.0,0.0,0.0,0.0,0.0,0.000000,0.032394,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,0.733119,2039,479
1998,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,1.644068,634,157


# Model Training & Evaluation

In [29]:
# Define a function to take as input training and testing vectors and labels
# Allow this to be extensible to let multiple classifiers be used here
def buildClassifiers(clf, X_train, X_test, y_train, y_test):

    #clf.fit(X_train.todense(), y_train)
    #y_pred = clf.predict(X_test.todense())
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)
    precision = precision_score(y_test, y_pred, average="macro", zero_division=0)
    recall = recall_score(y_test, y_pred, average="macro", zero_division=0)
    accuracy = accuracy_score(y_test, y_pred)

    return f1, precision, recall, accuracy

def train_and_evaluate(X_train, X_test, y_train, y_test):
    names = ['Naive_Bayes', 'Decision_Tree', 'Linear SVM', 'Random Forest']
    classifiers = [GaussianNB(), 
                   DecisionTreeClassifier(random_state=42),
                  LinearSVC(dual=True, C= 0.024, max_iter=3000),
                  #RandomForestClassifier(max_depth=2, random_state=0)
                   RandomForestClassifier()
                  ]
    aList, bList, cList, dList = list(), list(), list(), list()
    for name, clf in zip(names, classifiers):
        print('Now classifying', name)
        f1, precision, recall, accuracy = buildClassifiers(clf, X_train, X_test,  
                                                     y_train, y_test)
        aList.append(f1)
        bList.append(precision)
        cList.append(recall)
        dList.append(accuracy)

        print("\tF1 for {}:\t\t".format(name), np.mean(aList))
        print("\tPrecision for {}:\t".format(name), np.mean(bList))
        print("\tRecall for {}:\t\t".format(name), np.mean(cList))
        print("\tAccuracy for {}:\t\t".format(name), np.mean(dList))
        print()
    return aList, bList, cList, dList

## Baseline
    X = tf-idf matrix (negation)

In [40]:
X = df_data.drop(['Sentiment', 'Pos:Neg Ratio', 'Length', 'Noun Phrases'], axis = 1)
y = df_data['Sentiment']
X.shape

(2000, 28000)

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2,random_state=109)

overall_f1, overall_precision, overall_recall, overall_accuracy = train_and_evaluate(X_train, X_test, 
                                                                   y_train, y_test)

Now classifying Naive_Bayes
	F1 for Naive_Bayes:		 0.6864666981577705
	Precision for Naive_Bayes:	 0.6873015873015873
	Recall for Naive_Bayes:		 0.6848370927318296
	Accuracy for Naive_Bayes:		 0.6875

Now classifying Decision_Tree
	F1 for Decision_Tree:		 0.6383615542070904
	Precision for Decision_Tree:	 0.6386507936507937
	Recall for Decision_Tree:		 0.637531328320802
	Accuracy for Decision_Tree:		 0.6387499999999999

Now classifying Linear SVM
	F1 for Linear SVM:		 0.6838345613478992
	Precision for Linear SVM:	 0.6853619354343993
	Recall for Linear SVM:		 0.684252297410192
	Accuracy for Linear SVM:		 0.6841666666666666

Now classifying Random Forest
	F1 for Random Forest:		 0.6952137750795861
	Precision for Random Forest:	 0.6979216439029581
	Recall for Random Forest:		 0.6965225563909774
	Accuracy for Random Forest:		 0.6956249999999999



## IMPROVED SYSTEM
    Features:
        - tf-idf matrix (negation)
        - pos:neg words ratio
        - review length 
        - # of noun phrases 

### Negation + Pos:Neg Ratio 

In [42]:
df = df_data.drop(['Sentiment', 'Length', 'Noun Phrases'], axis =1)
X = df.to_numpy()
y = df_data["Sentiment"].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2,random_state=109)
overall_f1, overall_precision, overall_recall, overall_accuracy = train_and_evaluate(X_train, X_test, y_train, y_test)

Now classifying Naive_Bayes
	F1 for Naive_Bayes:		 0.6864666981577705
	Precision for Naive_Bayes:	 0.6873015873015873
	Recall for Naive_Bayes:		 0.6848370927318296
	Accuracy for Naive_Bayes:		 0.6875

Now classifying Decision_Tree
	F1 for Decision_Tree:		 0.6357333490788852
	Precision for Decision_Tree:	 0.6368838763575606
	Recall for Decision_Tree:		 0.6356516290726817
	Accuracy for Decision_Tree:		 0.63625

Now classifying Linear SVM
	F1 for Linear SVM:		 0.669112912529379
	Precision for Linear SVM:	 0.6740516993986111
	Recall for Linear SVM:		 0.6711779448621554
	Accuracy for Linear SVM:		 0.6699999999999999

Now classifying Random Forest
	F1 for Random Forest:		 0.6890669547444879
	Precision for Random Forest:	 0.6951807574130955
	Recall for Random Forest:		 0.6919172932330828
	Accuracy for Random Forest:		 0.69



### Negation + Pos:Neg Ratio + Noun Phrases

In [43]:
df = df_data.drop(['Sentiment', 'Length'], axis =1)
X = df.to_numpy()
y = df_data["Sentiment"].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2,random_state=109)
overall_f1, overall_precision, overall_recall, overall_accuracy = train_and_evaluate(X_train, X_test, y_train, y_test)

Now classifying Naive_Bayes
	F1 for Naive_Bayes:		 0.7501563477173235
	Precision for Naive_Bayes:	 0.75
	Recall for Naive_Bayes:		 0.7506265664160401
	Accuracy for Naive_Bayes:		 0.75

Now classifying Decision_Tree
	F1 for Decision_Tree:		 0.6926740079213258
	Precision for Decision_Tree:	 0.6927771108443377
	Recall for Decision_Tree:		 0.693233082706767
	Accuracy for Decision_Tree:		 0.6925

Now classifying Linear SVM




	F1 for Linear SVM:		 0.6054135413106375
	Precision for Linear SVM:	 0.695283882366653
	Recall for Linear SVM:		 0.6468253968253969
	Accuracy for Linear SVM:		 0.6391666666666667

Now classifying Random Forest
	F1 for Random Forest:		 0.6431537515329377
	Precision for Random Forest:	 0.7131461774355019
	Recall for Random Forest:		 0.6755639097744361
	Accuracy for Random Forest:		 0.66875



### All features: Negation + Pos:Neg Ratio + Review length + Noun phrases

In [44]:
df = df_data.drop(['Sentiment'], axis =1)
X = df.to_numpy()
y = df_data["Sentiment"].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2,random_state=109)
overall_f1, overall_precision, overall_recall, overall_accuracy = train_and_evaluate(X_train, X_test, y_train, y_test)

Now classifying Naive_Bayes
	F1 for Naive_Bayes:		 0.7520960611309474
	Precision for Naive_Bayes:	 0.7571961956658846
	Recall for Naive_Bayes:		 0.7555137844611528
	Accuracy for Naive_Bayes:		 0.7525

Now classifying Decision_Tree
	F1 for Decision_Tree:		 0.6919061169933347
	Precision for Decision_Tree:	 0.6969137782090893
	Recall for Decision_Tree:		 0.6954887218045113
	Accuracy for Decision_Tree:		 0.6924999999999999

Now classifying Linear SVM




	F1 for Linear SVM:		 0.5632481457921666
	Precision for Linear SVM:	 0.5437758521393928
	Recall for Linear SVM:		 0.6303258145363408
	Accuracy for Linear SVM:		 0.62

Now classifying Random Forest
	F1 for Random Forest:		 0.6185852650664514
	Precision for Random Forest:	 0.6054328992055548
	Recall for Random Forest:		 0.6697994987468672
	Accuracy for Random Forest:		 0.66125

