In [1]:
import string
import numpy as np
import pandas as pd
import nltk
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
from HappyFunTokenizer import HappyFunTokenizer
from sklearn_pandas import DataFrameMapper
import os 
import spacy

In [19]:
def get_rel_path(file_name):
    absolutepath = os.path.abspath('')
    #print(absolutepath)
    fileDirectory = os.path.dirname(absolutepath)
    file_path = os.path.join(fileDirectory, 'Data/' + str(file_name))   
    #print(file_path)
    return file_path

In [21]:
# load preprocessed dataset from csv
file = get_rel_path("data.csv")
df_data = pd.read_csv(file)

In [22]:
from sklearn.utils import shuffle
df_data = shuffle(df_data)
df_data.reset_index(inplace=True, drop=True)
df_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27994,27995,27996,27997,27998,27999,Sentiment,Pos:Neg Ratio,Length,Noun Phrases
0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,3.137931,491,112
1,0.0,0.0,0.0,0.0,0.0,0.000000,0.032258,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,0.243902,856,196
2,0.0,0.0,0.0,0.0,0.0,0.000000,0.035964,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,3.250000,345,86
3,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,0.343066,783,187
4,0.0,0.0,0.0,0.0,0.0,0.000000,0.049475,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.541667,909,217
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.0,0.0,0.0,0.0,0.0,0.000000,0.066302,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.613636,663,140
1996,0.0,0.0,0.0,0.0,0.0,0.021153,0.021478,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.863636,899,217
1997,0.0,0.0,0.0,0.0,0.0,0.000000,0.032394,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,0.733119,2039,479
1998,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,1.644068,634,157


# Model Training & Evaluation

In [14]:
# Define a function to take as input training and testing vectors and labels
# Allow this to be extensible to let multiple classifiers be used here
def buildClassifiers(clf, X_train, X_test, y_train, y_test):

    #clf.fit(X_train.todense(), y_train)
    #y_pred = clf.predict(X_test.todense())
    
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)
    precision = precision_score(y_test, y_pred, average="macro", zero_division=0)
    recall = recall_score(y_test, y_pred, average="macro", zero_division=0)

    return f1, precision, recall

def train_and_evaluate(X_train, X_test, y_train, y_test):
    names = ['Naive_Bayes', 'Decision_Tree', 'Linear SVM', 'Random Forest']
    classifiers = [GaussianNB(), 
                   DecisionTreeClassifier(random_state=42),
                  LinearSVC(dual=True, C= 0.024, max_iter=3000),
                  #RandomForestClassifier(max_depth=2, random_state=0)
                   RandomForestClassifier()
                  ]
    aList, bList, cList = list(), list(), list()
    for name, clf in zip(names, classifiers):
        print('Now classifying', name)
        f1, precision, recall = buildClassifiers(clf, X_train, X_test,  
                                                     y_train, y_test)
        aList.append(f1)
        bList.append(precision)
        cList.append(recall)

        print("\tF1 for {}:\t\t".format(name), np.mean(aList))
        print("\tPrecision for {}:\t".format(name), np.mean(bList))
        print("\tRecall for {}:\t\t".format(name), np.mean(cList))
        print()
    return aList, bList, cList

## Baseline
    X = tf-idf matrix (negation)

In [11]:
X = df_data.drop(['Sentiment', 'Pos:Neg Ratio', 'Length', 'Noun Phrases'], axis = 1)
y = df_data['Sentiment']
X.shape

(2000, 28000)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2,random_state=109)

overall_f1, overall_precision, overall_recall = train_and_evaluate(X_train, X_test, 
                                                                   y_train, y_test)

Now classifying Naive_Bayes
	F1 for Naive_Bayes:		 0.6828861727086961
	Precision for Naive_Bayes:	 0.6880573616715068
	Recall for Naive_Bayes:		 0.6830519074421513

Now classifying Decision_Tree
	F1 for Decision_Tree:		 0.6452237552088567
	Precision for Decision_Tree:	 0.6477786808357533
	Recall for Decision_Tree:		 0.6453095684803002

Now classifying Linear SVM
	F1 for Linear SVM:		 0.7067818291541829
	Precision for Linear SVM:	 0.7090827421792424
	Recall for Linear SVM:		 0.7071711486345632

Now classifying Random Forest
	F1 for Random Forest:		 0.7274759866437404
	Precision for Random Forest:	 0.7302719067820137
	Recall for Random Forest:		 0.7282207629768604



## IMPROVED SYSTEM
    Features:
        - tf-idf matrix (negation)
        - pos:neg words ratio
        - review length 
        - # of noun phrases 

### Negation + Pos:Neg Ratio 

In [16]:
df = df_data.drop(['Sentiment', 'Length', 'Noun Phrases'], axis =1)
X = df.to_numpy()
y = df_data["Sentiment"].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2,random_state=109)
overall_f1, overall_precision, overall_recall = train_and_evaluate(X_train, X_test, y_train, y_test)

Now classifying Naive_Bayes
	F1 for Naive_Bayes:		 0.6828861727086961
	Precision for Naive_Bayes:	 0.6880573616715068
	Recall for Naive_Bayes:		 0.6830519074421513

Now classifying Decision_Tree
	F1 for Decision_Tree:		 0.6576482499333784
	Precision for Decision_Tree:	 0.6601344500665226
	Recall for Decision_Tree:		 0.657567229518449

Now classifying Linear SVM
	F1 for Linear SVM:		 0.665830275513127
	Precision for Linear SVM:	 0.6749494932375416
	Recall for Linear SVM:		 0.6686679174484053

Now classifying Random Forest
	F1 for Random Forest:		 0.6999856708015796
	Precision for Random Forest:	 0.7070835141589253
	Recall for Random Forest:		 0.7022983114446528



### Negation + Pos:Neg Ratio + Noun Phrases

In [18]:
df = df_data.drop(['Sentiment', 'Length'], axis =1)
X = df.to_numpy()
y = df_data["Sentiment"].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2,random_state=109)
overall_f1, overall_precision, overall_recall = train_and_evaluate(X_train, X_test, y_train, y_test)

Now classifying Naive_Bayes
	F1 for Naive_Bayes:		 0.7124299179328837
	Precision for Naive_Bayes:	 0.7133413461538461
	Recall for Naive_Bayes:		 0.7131332082551595

Now classifying Decision_Tree
	F1 for Decision_Tree:		 0.6811887083101755
	Precision for Decision_Tree:	 0.6819816790592514
	Recall for Decision_Tree:		 0.6818323952470293

Now classifying Linear SVM




	F1 for Linear SVM:		 0.6822591703645541
	Precision for Linear SVM:	 0.6872933415950566
	Recall for Linear SVM:		 0.6845945382530747

Now classifying Random Forest
	F1 for Random Forest:		 0.7077747217199072
	Precision for Random Forest:	 0.713039107561852
	Recall for Random Forest:		 0.7101000625390868



### All features: Negation + Pos:Neg Ratio + Review length + Noun phrases

In [15]:
df = df_data.drop(['Sentiment'], axis =1)
X = df.to_numpy()
y = df_data["Sentiment"].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2,random_state=109)
overall_f1, overall_precision, overall_recall = train_and_evaluate(X_train, X_test, y_train, y_test)

Now classifying Naive_Bayes
	F1 for Naive_Bayes:		 0.791653400936331
	Precision for Naive_Bayes:	 0.7991811616315736
	Recall for Naive_Bayes:		 0.7943089430894309

Now classifying Decision_Tree
	F1 for Decision_Tree:		 0.7107618710521308
	Precision for Decision_Tree:	 0.7144406522103608
	Recall for Decision_Tree:		 0.7119136960600376

Now classifying Linear SVM




	F1 for Linear SVM:		 0.6150037343064362
	Precision for Linear SVM:	 0.6771826570291294
	Recall for Linear SVM:		 0.6493016468626225

Now classifying Random Forest
	F1 for Random Forest:		 0.6580510434633945
	Precision for Random Forest:	 0.7054765660407681
	Recall for Random Forest:		 0.6841463414634146

