## TEXT MINING C. Sc. 83040 : FINAL PROJECT

### SENTIMENT ANALYSIS OF YELP DATASET
#### TEAM MEMBERS : VISHAL BHARTI

In [2]:
## Import the libraries

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import gensim
from nltk.corpus import stopwords
from collections import Counter
from nltk import word_tokenize
from timeit import default_timer as timer
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from scipy.sparse import csr_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from scipy.sparse import vstack
import itertools
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input, BatchNormalization
from keras.metrics import categorical_accuracy
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from nltk import word_tokenize
import scipy.sparse
from nltk.sentiment.util import mark_negation
import plotly.plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import Scatter, Figure, Layout, Bar

init_notebook_mode(connected=True)

Using TensorFlow backend.


In [71]:
# read the pre-processed data
df_final = pd.read_pickle("final_df.pkl")

In [3]:
df_final.head()

Unnamed: 0,review_id,user_id,business_id,stars,date,text,useful,funny,cool,rating,name,latitude,longitude,city,state
299999,vXMQhG5JtNwSRb6iql7iig,GK07iEy8UllYo113DlNnww,HrG_BxmOMPbqstycmzORzw,1,2013-01-09,They did some transmission work for me that in...,3,2,1,negative,"""Ken's Transmission""",33.640787,-112.02529,Phoenix,AZ
254005,W_rauRWpM3ok_diREwEM2A,sjzv-c1k_HGGT9vZbfimWw,HONwpNQ2fmwMTOIZu0VI1A,1,2014-03-15,Poor service when it comes to scheduling. Base...,0,0,0,negative,"""Integrative Family Medicine Limited""",36.078796,-115.242975,Las Vegas,NV
110374,h2F3EgNUdcggV8XrW2VQdg,KL-JE4VkGW02LzeSlW3e6Q,NWlNMG_eBIvDjCcHK46eDQ,2,2015-10-14,"Went on a Tuesday night, and it was really emp...",2,1,1,negative,"""The Haymaker Restaurant""",33.640232,-111.979545,Phoenix,AZ
110373,L3r_OGsUqObVUEyP9uR_Bw,AlYZFOW_Xqi0qXelUrrHVw,NWlNMG_eBIvDjCcHK46eDQ,2,2015-12-13,"Meh\n\nI can imagine going here for the game, ...",3,2,0,negative,"""The Haymaker Restaurant""",33.640232,-111.979545,Phoenix,AZ
110372,u0LWSgqpthGe3R5YlWR6mw,hNY3RdZK7dT43dznSxiA5A,NWlNMG_eBIvDjCcHK46eDQ,2,2016-11-19,Went here last night for dinner. Service was l...,0,0,0,negative,"""The Haymaker Restaurant""",33.640232,-111.979545,Phoenix,AZ


## Classification into three classes 'positive', 'neutral' and 'negative'

#### NOTE : 
* Each classifier takes around 10 mins to run for cases where the negations are not handled and round 60 mins when negations are handled (including the times for generating TFIDF vectors while handling negations). 
* To fetch the saved results, the cells following the classifier function call can be executed. This cells need not be run when running the classifiers, as the results are returned by the functions. The function calls that load the saved results assume that the 'Results' directory is inside current working directory.
* The results in the write and those in some of these cells might have slight variation due to the random components in the algorithms. 

---
### 1. Run Naïve Bayes, Linear SVM, Logistic Regression and MLP algorithm on using tfidf vectors with unigrams (U), stop words removed (SWR), punctuations removed (PR) (tfidfvectorizer default setting) and vary the feature size in 500, 1000, 1500, 2500 and 5000.

In [72]:
# Note : Takes around 10 mins to run
## Function that runs the classifiers on unigram features with stop words and punctuations removed
def runClassifiersU(showRes = False):
    
    mf_array = [500,1000,1500,2500,5000]
    nbdict = {}
    svmdict = {}
    lrdict = {}
    mlpdict = {}
    
    for MaxFeatures in mf_array:
        print("\nRunning classifiers for feature size : {}".format(MaxFeatures))
        # Initialize the tfidf vectorizer 
        tfidf_vect = TfidfVectorizer(stop_words="english", max_features = MaxFeatures, norm='l2')
        # Run the vectorizer on the yelp reviews
        tokens = tfidf_vect.fit_transform(df_final["text"].tolist())
        labels = df_final["rating"].tolist()

        # Split into training and test set 70:30 split
        trainX = vstack([tokens[0:70000], tokens[100000:170000], tokens[200000:270000]])
        testX = vstack([tokens[70000:100000], tokens[170000:200000], tokens[270000:]])
        trainY = labels[0:70000] + labels[100000:170000] + labels[200000:270000] 
        testY = labels[70000:100000] + labels[170000:200000] + labels[270000:]

        # run naive bayes classifier
        start_time = timer()
        clf = MultinomialNB().fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYMNB = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYMNB),3)
        precision = round(sklearn.metrics.precision_score(testY, predYMNB, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYMNB, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYMNB, average='macro'),3)
        nbdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]
        
        # if showRes is true the confusion matrix for each run is printed
        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYMNB, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)    
            print("*************Naive Bayes Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # run linear SVM classifier
        start_time = timer()
        clf = SGDClassifier().fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYSVC = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYSVC),3)
        precision = round(sklearn.metrics.precision_score(testY, predYSVC, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYSVC, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYSVC, average='macro'),3)
        svmdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYSVC, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            print("\n*************Linear SVM Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # run logistic regression classifier
        start_time = timer()
        clf = SGDClassifier('log').fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYLR = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYLR),3)
        precision = round(sklearn.metrics.precision_score(testY, predYLR, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYLR, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYLR, average='macro'),3)
        lrdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYLR, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            print("\n*************Logistic Regression Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # transform the labels into one hot vector form
        label_encoder = LabelEncoder()
        trainYI = label_encoder.fit_transform(trainY)
        onehot_encoder = OneHotEncoder(sparse=False)
        onehot_encoder = OneHotEncoder(sparse=False)
        trainYI = trainYI.reshape(len(trainYI), 1)
        trainYC = onehot_encoder.fit_transform(trainYI)

        testYI = label_encoder.fit_transform(testY)
        testYI = testYI.reshape(len(testYI), 1)
        testYC = onehot_encoder.fit_transform(testYI)

        # define the model
        x     = Input(shape = (trainX.shape[1], ), dtype = 'float32', sparse = True)     
        d1    = Dense(256, activation='relu')(x)
        b1    = BatchNormalization()(d1)
        d2    = Dropout(0.5)(b1)
        d3    = Dense(256, activation='relu')(d2)
        b2    = BatchNormalization()(d2)
        d4    = Dropout(0.5)(b2)
        out   = Dense(3, activation = 'softmax')(d4)
        model = Model(x,out)

        # train the model
        model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
        start_time = timer()
        model.fit(x=trainX, y=trainYC, batch_size=1000, epochs=20, verbose=0)
        train_time = round(timer()-start_time,2)

        # Predict on the test using the trained model
        start_time = timer()
        pred_probs = model.predict(x=testX, batch_size=1000, verbose=0)
        test_time = round(timer()-start_time,2)
        # process the predicted probailities to get the final labels encodedLabs
        encodedLabs = (label_encoder.inverse_transform(np.array([[int(p.argmax())] for p in pred_probs], dtype=np.int64))).tolist()
        # flatten the list
        predLabs = list(itertools.chain.from_iterable(encodedLabs))
        accuracy = round(sklearn.metrics.accuracy_score(testY, predLabs),3)
        precision = round(sklearn.metrics.precision_score(testY, predLabs, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predLabs, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predLabs, average='macro'),3)
        mlpdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            # Print the performance metrics
            print("\n*************MLP Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

            # print the confusion matrix
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predLabs, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            display(df_confusion)
            
    return nbdict, svmdict, lrdict, mlpdict

In [73]:
start_time = timer()
nbdict, svmdict, lrdict, mlpdict = runClassifiersU()
print("Finished in : {} seconds".format(round(timer()-start_time,2)))


Running classifiers for feature size : 500

Running classifiers for feature size : 1000

Running classifiers for feature size : 1500

Running classifiers for feature size : 2500

Running classifiers for feature size : 5000
Finished in : 673.2 seconds


In [3]:
### RUN THIS CELL TO LOAD THE SAVED RESULTS
### **This assumes the Results folder is in the current working directory

## Save the results
# np.save("D:\\Text Mining\\Final Project\\Results\\U_SWR_PR\\nbdict.npy", nbdict)
# np.save("D:\\Text Mining\\Final Project\\Results\\U_SWR_PR\\svmdict.npy", svmdict)
# np.save("D:\\Text Mining\\Final Project\\Results\\U_SWR_PR\\lrdict.npy", lrdict)
# np.save("D:\\Text Mining\\Final Project\\Results\\U_SWR_PR\\mlpdict.npy", mlpdict)

## To load results from file. These are in the same format as the results from the above function call.
nbdict = np.load("D:\\Text Mining\\Final Project\\Results\\U_SWR_PR\\nbdict.npy").item()
svmdict = np.load("D:\\Text Mining\\Final Project\\Results\\U_SWR_PR\\svmdict.npy").item()
lrdict = np.load("D:\\Text Mining\\Final Project\\Results\\U_SWR_PR\\lrdict.npy").item()
mlpdict = np.load("D:\\Text Mining\\Final Project\\Results\\U_SWR_PR\\mlpdict.npy").item()

#### Plots for accuracy, training time, test time and f1-score for the used classifiers

In [74]:
## PLOT THE ACCURACY
# Create traces
trace0 = Scatter(
    y = [nbdict['500'][0]*100, nbdict['1000'][0]*100, nbdict['1500'][0]*100, nbdict['2500'][0]*100, nbdict['5000'][0]*100],
    x = [500,1000,1500,2500,5000],
    mode = 'lines+markers',
    legendgroup="1",
    name = 'Naive Bayes',
    marker= {'color': 'rgb(0, 93, 171)'}
)
trace1 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [svmdict['500'][0]*100, svmdict['1000'][0]*100, svmdict['1500'][0]*100, svmdict['2500'][0]*100, svmdict['5000'][0]*100],
    mode = 'lines+markers',
    legendgroup="2",
    marker= {'color': 'rgb(255, 127, 14)'},
    name = 'Linear SVM'
)
trace2 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [lrdict['500'][0]*100, lrdict['1000'][0]*100, lrdict['1500'][0]*100, lrdict['2500'][0]*100, lrdict['5000'][0]*100],
    mode = 'lines+markers',
    name = 'Logistic Regression',
    legendgroup="3",
    marker= {'color': 'rgb(74, 174, 74)'}
)
trace3 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [mlpdict['500'][0]*100, mlpdict['1000'][0]*100, mlpdict['1500'][0]*100, mlpdict['2500'][0]*100, mlpdict['5000'][0]*100],
    mode = 'lines+markers',
    name = 'Neural net',
    legendgroup="4",
    marker= {'color': 'rgb(214, 39, 40)'}
)

layout = Layout(
    title= 'Comparison of classification accuracy for different algorithms (U_SWR_PR)',
    hovermode= 'closest',
    autosize=False,
    width=1000,
    height=700,
    xaxis= dict(
        title= 'Feature size',
        ticklen= 5,
        zeroline= False,
        gridwidth= 2,
    ),
    yaxis=dict(
        title= 'Accuracy in percentage',
        ticklen= 5,
        range=[40,100],
        gridwidth= 2
    )
)

data = [trace0, trace1, trace2, trace3]
fig = Figure(data=data, layout=layout)
iplot(fig)

In [6]:
## PLOT THE TRAINING TIME
# Create traces
trace0 = Scatter(
    y = [nbdict['500'][4], nbdict['1000'][4], nbdict['1500'][4], nbdict['2500'][4], nbdict['5000'][4]],
    x = [500,1000,1500,2500,5000],
    legendgroup="1",
    mode = 'lines+markers',
    name = 'Naive Bayes',
    marker= {'color': 'rgb(0, 93, 171)'}
    #5799C7
)
trace1 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [svmdict['500'][4], svmdict['1000'][4], svmdict['1500'][4], svmdict['2500'][4], svmdict['5000'][4]],
    legendgroup="2",
    mode = 'lines+markers',
    name = 'Linear SVM',
    marker= {'color': 'rgb(255, 127, 14)'}
)
trace2 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [lrdict['500'][4], lrdict['1000'][4], lrdict['1500'][4], lrdict['2500'][4], lrdict['5000'][4]],
    mode = 'lines+markers',
    legendgroup="3",
    name = 'Logistic Regression',
    marker= {'color': 'rgb(74, 174, 74)'}
)
trace3 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [mlpdict['500'][4], mlpdict['1000'][4], mlpdict['1500'][4], mlpdict['2500'][4], mlpdict['5000'][4]],
    mode = 'lines+markers',
    legendgroup="4",
    name = 'Neural net',
    marker= {'color': 'rgb(214, 39, 40)'}
)

layout = Layout(
    title= 'Comparison of training time for different algorithms (U_SWR_PR)',
    hovermode= 'closest',
    autosize=False,
    width=1000,
    height=700,
    xaxis= dict(
        title= 'Feature size',
        ticklen= 5,
        zeroline= False,
        gridwidth= 2,
    ),
    yaxis=dict(
        title= 'Training time in seconds (log-scale)',
        type='log',
        ticklen= 5,
        gridwidth= 2,
    )
)

data = [trace0,trace1,trace2,trace3]
fig = Figure(data=data, layout=layout)
iplot(fig)

In [7]:
## PLOT THE TESTING TIME
# Create traces
trace0 = Scatter(
    y = [nbdict['500'][5], nbdict['1000'][5], nbdict['1500'][5], nbdict['2500'][5], nbdict['5000'][5]],
    x = [500,1000,1500,2500,5000],
    mode = 'lines+markers',
    name = 'Naive Bayes'
)
trace1 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [svmdict['500'][5], svmdict['1000'][5], svmdict['1500'][5], svmdict['2500'][5], svmdict['5000'][5]],
    mode = 'lines+markers',
    name = 'Linear SVM'
)
trace2 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [lrdict['500'][5], lrdict['1000'][5], lrdict['1500'][5], lrdict['2500'][5], lrdict['5000'][5]],
    mode = 'lines+markers',
    name = 'Logistic Regression'
)
trace3 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [mlpdict['500'][5], mlpdict['1000'][5], mlpdict['1500'][5], mlpdict['2500'][5], mlpdict['5000'][5]],
    mode = 'lines+markers',
    name = 'Neural net'
)

layout = Layout(
    title= 'Comparison of testing time for different algorithms (U+SWR+PR)',
    hovermode= 'closest',
    autosize=False,
    width=1000,
    height=700,
    xaxis= dict(
        title= 'Feature size',
        ticklen= 5,
        zeroline= False,
        gridwidth= 2,
    ),
    yaxis=dict(
        title= 'Testing time in seconds (log-scale)',
        ticklen= 5,
        gridwidth= 2,
        type='log'
    )
)

data = [trace0,trace1,trace2,trace3]
fig = Figure(data=data, layout=layout)
iplot(fig)

In [8]:
## PLOT THE F1-SCORE
# Create traces
trace0 = Scatter(
    y = [nbdict['500'][2]*100, nbdict['1000'][3]*100, nbdict['1500'][3]*100, nbdict['2500'][3]*100, nbdict['5000'][3]*100],
    x = [500,1000,1500,2500,5000],
    mode = 'lines+markers',
    name = 'Naive Bayes'
)
trace1 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [svmdict['500'][3]*100, svmdict['1000'][3]*100, svmdict['1500'][3]*100, svmdict['2500'][3]*100, svmdict['5000'][3]*100],
    mode = 'lines+markers',
    name = 'Linear SVM'
)
trace2 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [lrdict['500'][3]*100, lrdict['1000'][3]*100, lrdict['1500'][3]*100, lrdict['2500'][3]*100, lrdict['5000'][3]*100],
    mode = 'lines+markers',
    name = 'Logistic Regression'
)
trace3 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [mlpdict['500'][3]*100, mlpdict['1000'][3]*100, mlpdict['1500'][3]*100, mlpdict['2500'][3]*100, mlpdict['5000'][3]*100],
    mode = 'lines+markers',
    name = 'Neural net'
)

layout = Layout(
    title= 'Comparison of classification F1-score for different algorithms (U+SWR+PR)',
    hovermode= 'closest',
    autosize=False,
    width=1000,
    height=700,
    xaxis= dict(
        title= 'Feature size',
        ticklen= 5,
        zeroline= False,
        gridwidth= 2,
    ),
    yaxis=dict(
        title= 'F1-score in percentage',
        ticklen= 5,
        range=[40,100],
        gridwidth= 2,
    )
)

data = [trace0,trace1,trace2,trace3]
fig = Figure(data=data, layout=layout)
iplot(fig)

### 2. Run Naïve Bayes, Linear SVM, Logistic Regression and MLP algorithm on using tfidf vectors with unigrams, stop words removed, punctuations removed (tfidfvectorizer default setting), setting the feature size to 5000 and varying the minimum document frequency threshold in 5,10,15 and 20.

In [77]:
def runClassifiers_df(showRes = False):
    df_array = [5,10,15,20]
    nbdict = {}
    svmdict = {}
    lrdict = {}
    mlpdict = {}
    
    for mdf in df_array:
        print("Running classifiers with min document frequency threshold : {}".format(mdf))
        tfidf_vect = TfidfVectorizer(stop_words="english", max_features = 5000, norm='l2', min_df=mdf)
        tokens = tfidf_vect.fit_transform(df_final["text"].tolist())
        labels = df_final["rating"].tolist()

        trainX = vstack([tokens[0:70000], tokens[100000:170000], tokens[200000:270000]])
        testX = vstack([tokens[70000:100000], tokens[170000:200000], tokens[270000:]])
        trainY = labels[0:70000] + labels[100000:170000] + labels[200000:270000] 
        testY = labels[70000:100000] + labels[170000:200000] + labels[270000:]

        # run naive bayes classifier
        start_time = timer()
        clf = MultinomialNB().fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYMNB = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYMNB),3)
        precision = round(sklearn.metrics.precision_score(testY, predYMNB, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYMNB, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYMNB, average='macro'),3)
        nbdict[str(mdf)] = [accuracy, precision, recall, f1score, train_time, test_time]
        

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYMNB, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)    
            print("*************Naive Bayes Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # run linear SVM classifier
        start_time = timer()
        clf = SGDClassifier().fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYSVC = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYSVC),3)
        precision = round(sklearn.metrics.precision_score(testY, predYSVC, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYSVC, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYSVC, average='macro'),3)
        svmdict[str(mdf)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYSVC, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            print("\n*************Linear SVM Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # run linear SVM classifier
        start_time = timer()
        clf = SGDClassifier('log').fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYLR = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYLR),3)
        precision = round(sklearn.metrics.precision_score(testY, predYLR, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYLR, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYLR, average='macro'),3)
        lrdict[str(mdf)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYLR, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            print("\n*************Logitic Regression Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # transform the labels into one hot vector form
        label_encoder = LabelEncoder()
        trainYI = label_encoder.fit_transform(trainY)
        onehot_encoder = OneHotEncoder(sparse=False)
        onehot_encoder = OneHotEncoder(sparse=False)
        trainYI = trainYI.reshape(len(trainYI), 1)
        trainYC = onehot_encoder.fit_transform(trainYI)

        testYI = label_encoder.fit_transform(testY)
        testYI = testYI.reshape(len(testYI), 1)
        testYC = onehot_encoder.fit_transform(testYI)

        # define the model
        x     = Input(shape = (trainX.shape[1], ), dtype = 'float32', sparse = True)     
        d1    = Dense(256, activation='relu')(x)
        b1    = BatchNormalization()(d1)
        d2    = Dropout(0.5)(b1)
        d3    = Dense(256, activation='relu')(d2)
        b2    = BatchNormalization()(d2)
        d4    = Dropout(0.5)(b2)
        out   = Dense(3, activation = 'softmax')(d4)
        model = Model(x,out)

        # train the model
        model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
        start_time = timer()
        model.fit(x=trainX, y=trainYC, batch_size=1000, epochs=20, verbose=0)
        train_time = round(timer()-start_time,2)

        # Predict on the test using the trained model
        start_time = timer()
        pred_probs = model.predict(x=testX, batch_size=1000, verbose=0)
        test_time = round(timer()-start_time,2)
        # process the predicted probailities to get the final labels encodedLabs
        encodedLabs = (label_encoder.inverse_transform(np.array([[int(p.argmax())] for p in pred_probs], dtype=np.int64))).tolist()
        # flatten the list
        predLabs = list(itertools.chain.from_iterable(encodedLabs))
        accuracy = round(sklearn.metrics.accuracy_score(testY, predLabs),3)
        precision = round(sklearn.metrics.precision_score(testY, predLabs, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predLabs, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predLabs, average='macro'),3)
        mlpdict[str(mdf)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            # Print the performance metrics
            print("\n*************MLP Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

            # print the confusion matrix
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predLabs, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            display(df_confusion)
            
    return nbdict, svmdict, lrdict, mlpdict

In [78]:
start_time = timer()
nbdict2, svmdict2, lrdict2, mlpdict2 = runClassifiers_df()
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Running classifiers with min document frequency threshold : 5
Running classifiers with min document frequency threshold : 10
Running classifiers with min document frequency threshold : 15
Running classifiers with min document frequency threshold : 20
Finished in : 629.22 seconds


In [9]:
### RUN THIS CELL TO LOAD THE SAVED RESULTS
### **This assumes the Results folder is in the current working directory

# np.save("Results\\U_SWR_PR_DF\\nbdict.npy", nbdict2)
# np.save("Results\\U_SWR_PR_DF\\svmdict.npy", svmdict2)
# np.save("Results\\U_SWR_PR_DF\\lrdict.npy", lrdict2)
# np.save("Results\\U_SWR_PR_DF\\mlpdict.npy", mlpdict2)

## to load results from file
nbdict2 = np.load("Results\\U_SWR_PR_DF\\nbdict.npy").item()
svmdict2 = np.load("Results\\U_SWR_PR_DF\\svmdict.npy").item()
lrdict2 = np.load("Results\\U_SWR_PR_DF\\lrdict.npy").item()
mlpdict2 = np.load("Results\\U_SWR_PR_DF\\mlpdict.npy").item()

#### Plot the accuracy and f1-score.

In [10]:
## PLOT THE ACCURACY
# Create traces
trace0 = Scatter(
    y = [nbdict2['5'][0]*100, nbdict2['10'][0]*100, nbdict2['15'][0]*100, nbdict2['20'][0]*100],
    x = [5,10,15,20],
    mode = 'lines+markers',
    name = 'Naive Bayes'
)
trace1 = Scatter(
    x = [5,10,15,20],
    y = [svmdict2['5'][0]*100, svmdict2['10'][0]*100, svmdict2['15'][0]*100, svmdict2['20'][0]*100],
    mode = 'lines+markers',
    name = 'Linear SVM'
)
trace2 = Scatter(
    x = [5,10,15,20],
    y = [lrdict2['5'][0]*100, lrdict2['10'][0]*100, lrdict2['15'][0]*100, lrdict2['20'][0]*100],
    mode = 'lines+markers',
    name = 'Logistic Regression'
)
trace3 = Scatter(
    x = [5,10,15,20],
    y = [mlpdict2['5'][0]*100, mlpdict2['10'][0]*100, mlpdict2['15'][0]*100, mlpdict2['20'][0]*100],
    mode = 'lines+markers',
    name = 'Neural net'
)

layout = Layout(
    title= 'Comparison of classification accuracy for different algorithms (U_SWR_PR_DF)',
    hovermode= 'closest',
    autosize=False,
    width=1000,
    height=700,
    xaxis= dict(
        title= 'Minimum document frequency',
        ticklen= 5,
        zeroline= False,
        gridwidth= 2,
    ),
    yaxis=dict(
        title= 'Accuracy in percentage',
        ticklen= 5,
        gridwidth= 2,
        range=[40,100]
    )
)

data = [trace0,trace1,trace2,trace3]
fig = Figure(data=data, layout=layout)
iplot(fig)

In [11]:
## PLOT THE F1-score
# Create traces
trace0 = Scatter(
    y = [nbdict2['5'][3]*100, nbdict2['10'][3]*100, nbdict2['15'][3]*100, nbdict2['20'][3]*100],
    x = [5,10,15,20],
    mode = 'lines+markers',
    name = 'Naive Bayes'
)
trace1 = Scatter(
    x = [5,10,15,20],
    y = [svmdict2['5'][3]*100, svmdict2['10'][3]*100, svmdict2['15'][3]*100, svmdict2['20'][3]*100],
    mode = 'lines+markers',
    name = 'Linear SVM'
)
trace2 = Scatter(
    x = [5,10,15,20],
    y = [lrdict2['5'][3]*100, lrdict2['10'][3]*100, lrdict2['15'][3]*100, lrdict2['20'][3]*100],
    mode = 'lines+markers',
    name = 'Logistic Regression'
)
trace3 = Scatter(
    x = [5,10,15,20],
    y = [mlpdict2['5'][3]*100, mlpdict2['10'][3]*100, mlpdict2['15'][3]*100, mlpdict2['20'][3]*100],
    mode = 'lines+markers',
    name = 'Neural net'
)

layout = Layout(
    title= 'Comparison of classification F1-score for different algorithms (U_SWR_PR_DF)',
    hovermode= 'closest',
    autosize=False,
    width=1000,
    height=700,
    xaxis= dict(
        title= 'Minimum document frequency',
        ticklen= 5,
        zeroline= False,
        gridwidth= 2,
    ),
    yaxis=dict(
        title= 'F1-score in percentage',
        range=[40,100],
        ticklen= 5,
        gridwidth= 2,
    )
)

data = [trace0,trace1,trace2,trace3]
fig = Figure(data=data, layout=layout)
iplot(fig)

### 3. Run Naïve Bayes, Linear SVM, Logistic Regression and MLP algorithm on using tfidf vectors with unigrams (U), stop words removed(SWR) and vary the feature size in 500, 1000, 1500, 2500 and 5000, while also handling negations(HN).

**NOTE**: The mark negation function takes about 10 mins to run for each feature size. For 5 feature sizes it takes around 50 mins to get the tfidf vectors with negations marked and another 10 mins to run the classifiers. 

In [75]:
start_time = timer()
tfidf_vect = TfidfVectorizer(analyzer="word", tokenizer=lambda text: mark_negation(word_tokenize(text)), stop_words="english", max_features = 500, norm='l2')
tokens500 = tfidf_vect.fit_transform(df_final["text"].tolist())
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Finished in : 588.45 seconds


In [9]:
start_time = timer()
tfidf_vect = TfidfVectorizer(analyzer="word", tokenizer=lambda text: mark_negation(word_tokenize(text)), stop_words="english", max_features = 1000, norm='l2')
tokens1000 = tfidf_vect.fit_transform(df_final["text"].tolist())
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Finished in : 575.82 seconds


In [24]:
start_time = timer()
tfidf_vect = TfidfVectorizer(analyzer="word", tokenizer=lambda text: mark_negation(word_tokenize(text)), stop_words="english", max_features = 1500, norm='l2')
tokens1500 = tfidf_vect.fit_transform(df_final["text"].tolist())
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Finished in : 577.43 seconds


In [26]:
start_time = timer()
tfidf_vect = TfidfVectorizer(analyzer="word", tokenizer=lambda text: mark_negation(word_tokenize(text)), stop_words="english", max_features = 2500, norm='l2')
tokens2500 = tfidf_vect.fit_transform(df_final["text"].tolist())
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Finished in : 580.49 seconds


In [28]:
start_time = timer()
tfidf_vect = TfidfVectorizer(analyzer="word", tokenizer=lambda text: mark_negation(word_tokenize(text)), stop_words="english", max_features = 5000, norm='l2')
tokens5000 = tfidf_vect.fit_transform(df_final["text"].tolist())
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Finished in : 577.16 seconds


In [24]:
def runClassifiersUMN(showRes = False):
    
    mf_array = [500,1000,1500,2500,5000]
    nbdict = {}
    svmdict = {}
    lrdict = {}
    mlpdict = {}
    
    
    for MaxFeatures in mf_array:
        if MaxFeatures == 500:
            tokens = tokens500
        elif MaxFeatures == 1000:
            tokens = tokens1000
        elif MaxFeatures == 1500:
            tokens = tokens1500
        elif MaxFeatures == 2500:
            tokens = tokens2500
        elif MaxFeatures == 5000:
            tokens = tokens5000
        
        print("Running classifiers for feature size : {}".format(MaxFeatures))
                
        labels = df_final["rating"].tolist()

        trainX = vstack([tokens[0:70000], tokens[100000:170000], tokens[200000:270000]])
        testX = vstack([tokens[70000:100000], tokens[170000:200000], tokens[270000:]])
        trainY = labels[0:70000] + labels[100000:170000] + labels[200000:270000] 
        testY = labels[70000:100000] + labels[170000:200000] + labels[270000:]

        # run naive bayes classifier
        start_time = timer()
        clf = MultinomialNB().fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYMNB = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYMNB),3)
        precision = round(sklearn.metrics.precision_score(testY, predYMNB, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYMNB, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYMNB, average='macro'),3)
        nbdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]
        

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYMNB, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)    
            print("*************Naive Bayes Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # run linear SVM classifier
        start_time = timer()
        clf = SGDClassifier().fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYSVC = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYSVC),3)
        precision = round(sklearn.metrics.precision_score(testY, predYSVC, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYSVC, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYSVC, average='macro'),3)
        svmdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYSVC, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            print("\n*************Linear SVM Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # run linear SVM classifier
        start_time = timer()
        clf = SGDClassifier('log').fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYLR = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYLR),3)
        precision = round(sklearn.metrics.precision_score(testY, predYLR, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYLR, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYLR, average='macro'),3)
        lrdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYLR, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            print("\n*************Logitic Regression Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # transform the labels into one hot vector form
        label_encoder = LabelEncoder()
        trainYI = label_encoder.fit_transform(trainY)
        onehot_encoder = OneHotEncoder(sparse=False)
        onehot_encoder = OneHotEncoder(sparse=False)
        trainYI = trainYI.reshape(len(trainYI), 1)
        trainYC = onehot_encoder.fit_transform(trainYI)

        testYI = label_encoder.fit_transform(testY)
        testYI = testYI.reshape(len(testYI), 1)
        testYC = onehot_encoder.fit_transform(testYI)

        # define the model
        x     = Input(shape = (trainX.shape[1], ), dtype = 'float32', sparse = True)     
        d1    = Dense(256, activation='relu')(x)
        b1    = BatchNormalization()(d1)
        d2    = Dropout(0.5)(b1)
        d3    = Dense(256, activation='relu')(d2)
        b2    = BatchNormalization()(d2)
        d4    = Dropout(0.5)(b2)
        out   = Dense(3, activation = 'softmax')(d4)
        model = Model(x,out)

        # train the model
        model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
        start_time = timer()
        model.fit(x=trainX, y=trainYC, batch_size=1000, epochs=20, verbose=0)
        train_time = round(timer()-start_time,2)

        # Predict on the test using the trained model
        start_time = timer()
        pred_probs = model.predict(x=testX, batch_size=1000, verbose=0)
        test_time = round(timer()-start_time,2)
        # process the predicted probailities to get the final labels encodedLabs
        encodedLabs = (label_encoder.inverse_transform(np.array([[int(p.argmax())] for p in pred_probs], dtype=np.int64))).tolist()
        # flatten the list
        predLabs = list(itertools.chain.from_iterable(encodedLabs))
        accuracy = round(sklearn.metrics.accuracy_score(testY, predLabs),3)
        precision = round(sklearn.metrics.precision_score(testY, predLabs, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predLabs, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predLabs, average='macro'),3)
        mlpdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            # Print the performance metrics
            print("\n*************MLP Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

            # print the confusion matrix
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predLabs, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            display(df_confusion)
            
    return nbdict, svmdict, lrdict, mlpdict

In [25]:
start_time = timer()
nbdict3, svmdict3, lrdict3, mlpdict3 = runClassifiersUMN()
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Running classifiers for feature size : 500
Running classifiers for feature size : 1000
Running classifiers for feature size : 1500
Running classifiers for feature size : 2500
Running classifiers for feature size : 5000
Finished in : 562.18 seconds


In [12]:
### RUN THIS CELL TO LOAD THE SAVED RESULTS
### **This assumes the Results folder is in the current working directory

## Save the results
# np.save("Results\\U_SWR_HN\\nbdict.npy", nbdict3)
# np.save("Results\\U_SWR_HN\\svmdict.npy", svmdict3)
# np.save("Results\\U_SWR_HN\\lrdict.npy", lrdict3)
# np.save("Results\\U_SWR_HN\\mlpdict.npy", mlpdict3)

## to load results from file
nbdict3 = np.load("Results\\U_SWR_HN\\nbdict.npy").item()
svmdict3 = np.load("Results\\U_SWR_HN\\svmdict.npy").item()
lrdict3 = np.load("Results\\U_SWR_HN\\lrdict.npy").item()
mlpdict3 = np.load("Results\\U_SWR_HN\\mlpdict.npy").item()

#### Plot the accuracy and f1-score.

In [26]:
## PLOT THE ACCURACY
# Create traces
trace0 = Scatter(
    y = [nbdict3['500'][0]*100, nbdict3['1000'][0]*100, nbdict3['1500'][0]*100, nbdict3['2500'][0]*100, nbdict3['5000'][0]*100],
    x = [500,1000,1500,2500,5000],
    mode = 'lines+markers',
    name = 'Naive Bayes'
)
trace1 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [svmdict3['500'][0]*100, svmdict3['1000'][0]*100, svmdict3['1500'][0]*100, svmdict3['2500'][0]*100, svmdict3['5000'][0]*100],
    mode = 'lines+markers',
    name = 'Linear SVM'
)
trace2 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [lrdict3['500'][0]*100, lrdict3['1000'][0]*100, lrdict3['1500'][0]*100, lrdict3['2500'][0]*100, lrdict3['5000'][0]*100],
    mode = 'lines+markers',
    name = 'Logistic Regression'
)
trace3 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [mlpdict3['500'][0]*100, mlpdict3['1000'][0]*100, mlpdict3['1500'][0]*100, mlpdict3['2500'][0]*100, mlpdict3['5000'][0]*100],
    mode = 'lines+markers',
    name = 'Neural net'
)

layout = Layout(
    title= 'Comparison of classification accuracy for different algorithms (U_SWR_HN)',
    hovermode= 'closest',
    width=1000,
    height=700,
    xaxis= dict(
        title= 'Feature size',
        ticklen= 5,
        zeroline= False,
        gridwidth= 2,
    ),
    yaxis=dict(
        title= 'Accuracy in percentage',
        ticklen= 5,
        range=[40,100],
        gridwidth= 2,
    )
)

data = [trace0,trace1,trace2,trace3]
fig = Figure(data=data, layout=layout)
iplot(fig)

In [27]:
## PLOT THE F1-SCORE
# Create traces
trace0 = Scatter(
    y = [nbdict3['500'][2]*100, nbdict3['1000'][3]*100, nbdict3['1500'][3]*100, nbdict3['2500'][3]*100, nbdict3['5000'][3]*100],
    x = [500,1000,1500,2500,5000],
    mode = 'lines+markers',
    name = 'Naive Bayes'
)
trace1 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [svmdict3['500'][3]*100, svmdict3['1000'][3]*100, svmdict3['1500'][3]*100, svmdict3['2500'][3]*100, svmdict3['5000'][3]*100],
    mode = 'lines+markers',
    name = 'Linear SVM'
)
trace2 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [lrdict3['500'][3]*100, lrdict3['1000'][3]*100, lrdict3['1500'][3]*100, lrdict3['2500'][3]*100, lrdict3['5000'][3]*100],
    mode = 'lines+markers',
    name = 'Logistic Regression'
)
trace3 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [mlpdict3['500'][3]*100, mlpdict3['1000'][3]*100, mlpdict3['1500'][3]*100, mlpdict3['2500'][3]*100, mlpdict3['5000'][3]*100],
    mode = 'lines+markers',
    name = 'Neural net'
)

layout = Layout(
    title= 'Comparison of classification F1-score for different algorithms (U_SWR_HN)',
    hovermode= 'closest',
    width=1000,
    height=700,
    xaxis= dict(
        title= 'Feature size',
        ticklen= 5,
        zeroline= False,
        gridwidth= 2,
    ),
    yaxis=dict(
        title= 'F1-score in percentage',
        ticklen= 5,
        range=[40,100],
        gridwidth= 2,
    )
)

data = [trace0,trace1,trace2,trace3]
fig = Figure(data=data, layout=layout)
iplot(fig)

### 4. Run Naïve Bayes, Linear SVM, Logistic Regression and MLP algorithm on using tfidf vectors with bigrams(only) (B), stop words removed (SWR), punctuations removed (tfidfvectorizer default setting) (PR) and vary the feature size in 500, 1000, 1500, 2500 and 5000.

In [29]:
def runClassifiersB(showRes = False):
    
    mf_array = [500,1000,1500,2500,5000]
    nbdict = {}
    svmdict = {}
    lrdict = {}
    mlpdict = {}
    
    for MaxFeatures in mf_array:
        print("Running classifiers for feature size : {}".format(MaxFeatures))
        tfidf_vect = TfidfVectorizer(stop_words="english", max_features = MaxFeatures, ngram_range=(2, 2), norm='l2')
        tokens = tfidf_vect.fit_transform(df_final["text"].tolist())
        labels = df_final["rating"].tolist()

        trainX = vstack([tokens[0:70000], tokens[100000:170000], tokens[200000:270000]])
        testX = vstack([tokens[70000:100000], tokens[170000:200000], tokens[270000:]])
        trainY = labels[0:70000] + labels[100000:170000] + labels[200000:270000] 
        testY = labels[70000:100000] + labels[170000:200000] + labels[270000:]

        # run naive bayes classifier
        start_time = timer()
        clf = MultinomialNB().fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYMNB = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYMNB),3)
        precision = round(sklearn.metrics.precision_score(testY, predYMNB, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYMNB, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYMNB, average='macro'),3)
        nbdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]
        

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYMNB, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)    
            print("*************Naive Bayes Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # run linear SVM classifier
        start_time = timer()
        clf = SGDClassifier().fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYSVC = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYSVC),3)
        precision = round(sklearn.metrics.precision_score(testY, predYSVC, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYSVC, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYSVC, average='macro'),3)
        svmdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYSVC, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            print("\n*************Linear SVM Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # run logistic regression classifier
        start_time = timer()
        clf = SGDClassifier('log').fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYLR = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYLR),3)
        precision = round(sklearn.metrics.precision_score(testY, predYLR, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYLR, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYLR, average='macro'),3)
        lrdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYLR, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            print("\n*************Logistic Regression Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # transform the labels into one hot vector form
        label_encoder = LabelEncoder()
        trainYI = label_encoder.fit_transform(trainY)
        onehot_encoder = OneHotEncoder(sparse=False)
        onehot_encoder = OneHotEncoder(sparse=False)
        trainYI = trainYI.reshape(len(trainYI), 1)
        trainYC = onehot_encoder.fit_transform(trainYI)

        testYI = label_encoder.fit_transform(testY)
        testYI = testYI.reshape(len(testYI), 1)
        testYC = onehot_encoder.fit_transform(testYI)

        # define the model
        x     = Input(shape = (trainX.shape[1], ), dtype = 'float32', sparse = True)     
        d1    = Dense(256, activation='relu')(x)
        b1    = BatchNormalization()(d1)
        d2    = Dropout(0.5)(b1)
        d3    = Dense(256, activation='relu')(d2)
        b2    = BatchNormalization()(d2)
        d4    = Dropout(0.5)(b2)
        out   = Dense(3, activation = 'softmax')(d4)
        model = Model(x,out)

        # train the model
        model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
        start_time = timer()
        model.fit(x=trainX, y=trainYC, batch_size=1000, epochs=20, verbose=0)
        train_time = round(timer()-start_time,2)

        # Predict on the test using the trained model
        start_time = timer()
        pred_probs = model.predict(x=testX, batch_size=1000, verbose=0)
        test_time = round(timer()-start_time,2)
        # process the predicted probailities to get the final labels encodedLabs
        encodedLabs = (label_encoder.inverse_transform(np.array([[int(p.argmax())] for p in pred_probs], dtype=np.int64))).tolist()
        # flatten the list
        predLabs = list(itertools.chain.from_iterable(encodedLabs))
        accuracy = round(sklearn.metrics.accuracy_score(testY, predLabs),3)
        precision = round(sklearn.metrics.precision_score(testY, predLabs, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predLabs, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predLabs, average='macro'),3)
        mlpdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            # Print the performance metrics
            print("\n*************MLP Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

            # print the confusion matrix
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predLabs, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            display(df_confusion)
            
    return nbdict, svmdict, lrdict, mlpdict

In [30]:
start_time = timer()
nbdict4, svmdict4, lrdict4, mlpdict4 = runClassifiersB()
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Running classifiers for feature size : 500
Running classifiers for feature size : 1000
Running classifiers for feature size : 1500
Running classifiers for feature size : 2500
Running classifiers for feature size : 5000
Finished in : 694.44 seconds


In [16]:
### RUN THIS CELL TO LOAD THE SAVED RESULTS
### **This assumes the Results folder is in the current working directory

## Save the results
# np.save("Results\\B_SWR_PR\\nbdict.npy", nbdict4)
# np.save("Results\\B_SWR_PR\\svmdict.npy", svmdict4)
# np.save("Results\\B_SWR_PR\\lrdict.npy", lrdict4)
# np.save("Results\\B_SWR_PR\\mlpdict.npy", mlpdict4)

# to load results from file
nbdict4 = np.load("Results\\B_SWR_PR\\nbdict.npy").item()
svmdict4 = np.load("Results\\B_SWR_PR\\svmdict.npy").item()
lrdict4 = np.load("Results\\B_SWR_PR\\lrdict.npy").item()
mlpdict4 = np.load("Results\\B_SWR_PR\\mlpdict.npy").item()

#### Plot the accuracy and f1-score.

In [28]:
## PLOT THE ACCURACY
# Create traces
trace0 = Scatter(
    y = [nbdict4['500'][0]*100, nbdict4['1000'][0]*100, nbdict4['1500'][0]*100, nbdict4['2500'][0]*100, nbdict4['5000'][0]*100],
    x = [500,1000,1500,2500,5000],
    mode = 'lines+markers',
    name = 'Naive Bayes'
)
trace1 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [svmdict4['500'][0]*100, svmdict4['1000'][0]*100, svmdict4['1500'][0]*100, svmdict4['2500'][0]*100, svmdict4['5000'][0]*100],
    mode = 'lines+markers',
    name = 'Linear SVM'
)
trace2 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [lrdict4['500'][0]*100, lrdict4['1000'][0]*100, lrdict4['1500'][0]*100, lrdict4['2500'][0]*100, lrdict4['5000'][0]*100],
    mode = 'lines+markers',
    name = 'Logistic Regression'
)
trace3 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [mlpdict4['500'][0]*100, mlpdict4['1000'][0]*100, mlpdict4['1500'][0]*100, mlpdict4['2500'][0]*100, mlpdict4['5000'][0]*100],
    mode = 'lines+markers',
    name = 'Neural net'
)

layout = Layout(
    title= 'Comparison of classification accuracy for different algorithms (B_SWR_PR)',
    hovermode= 'closest',
    width=1000,
    height=700,    
    xaxis= dict(
        title= 'Feature size',
        ticklen= 5,
        zeroline= False,
        gridwidth= 2,
    ),
    yaxis=dict(
        title= 'Accuracy in percentage',
        range=[40,100],
        ticklen= 5,
        gridwidth= 2,
    )
)

data = [trace0,trace1,trace2,trace3]
fig = Figure(data=data, layout=layout)
iplot(fig)

In [29]:
## PLOT THE ACCURACY
# Create traces
trace0 = Scatter(
    y = [nbdict4['500'][3]*100, nbdict4['1000'][3]*100, nbdict4['1500'][3]*100, nbdict4['2500'][3]*100, nbdict4['5000'][3]*100],
    x = [500,1000,1500,2500,5000],
    mode = 'lines+markers',
    name = 'Naive Bayes'
)
trace1 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [svmdict4['500'][3]*100, svmdict4['1000'][3]*100, svmdict4['1500'][3]*100, svmdict4['2500'][3]*100, svmdict4['5000'][3]*100],
    mode = 'lines+markers',
    name = 'Linear SVM'
)
trace2 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [lrdict4['500'][3]*100, lrdict4['1000'][3]*100, lrdict4['1500'][3]*100, lrdict4['2500'][3]*100, lrdict4['5000'][3]*100],
    mode = 'lines+markers',
    name = 'Logistic Regression'
)
trace3 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [mlpdict4['500'][3]*100, mlpdict4['1000'][3]*100, mlpdict4['1500'][3]*100, mlpdict4['2500'][3]*100, mlpdict4['5000'][3]*100],
    mode = 'lines+markers',
    name = 'Neural net'
)

layout = Layout(
    title= 'Comparison of classification f1-score for different algorithms (B_SWR_PR)',
    hovermode= 'closest',
    width=1000,
    height=700,
    xaxis= dict(
        title= 'Feature size',
        ticklen= 5,
        zeroline= False,
        gridwidth= 2,
    ),
    yaxis=dict(
        title= 'F1-score in percentage',
        ticklen= 5,
        gridwidth= 2,
        range=[40,100]
    )
)

data = [trace0,trace1,trace2,trace3]
fig = Figure(data=data, layout=layout)
iplot(fig)

### 5. Run Naïve Bayes, Linear SVM, Logistic Regression and MLP algorithm on using tfidf vectors with bigrams(only) (B), stop words removed (SWR) and vary the feature size in 500, 1000, 1500, 2500 and 5000, while handling negations (HN).

**NOTE**: The mark negation function takes about 10 mins to run for each feature size. For 5 feature sizes it takes around 50 mins to get the tfidf vectors with negations marked and another 10 mins to run the classifiers.

In [47]:
start_time = timer()
tfidf_vect = TfidfVectorizer(analyzer="word", tokenizer=lambda text: mark_negation(word_tokenize(text)), stop_words="english", max_features = 500, ngram_range=(2, 2), norm='l2')
tokens500B = tfidf_vect.fit_transform(df_final["text"].tolist())
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Finished in : 632.1 seconds


In [49]:
start_time = timer()
tfidf_vect = TfidfVectorizer(analyzer="word", tokenizer=lambda text: mark_negation(word_tokenize(text)), stop_words="english", max_features = 1000, ngram_range=(2, 2), norm='l2')
tokens1000B = tfidf_vect.fit_transform(df_final["text"].tolist())
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Finished in : 642.98 seconds


In [53]:
start_time = timer()
tfidf_vect = TfidfVectorizer(analyzer="word", tokenizer=lambda text: mark_negation(word_tokenize(text)), stop_words="english", max_features = 1500, ngram_range=(2, 2), norm='l2')
tokens1500B = tfidf_vect.fit_transform(df_final["text"].tolist())
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Finished in : 644.36 seconds


In [57]:
start_time = timer()
tfidf_vect = TfidfVectorizer(analyzer="word", tokenizer=lambda text: mark_negation(word_tokenize(text)), stop_words="english", max_features = 2500, ngram_range=(2, 2), norm='l2')
tokens2500B = tfidf_vect.fit_transform(df_final["text"].tolist())
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Finished in : 624.81 seconds


In [59]:
start_time = timer()
tfidf_vect = TfidfVectorizer(analyzer="word", tokenizer=lambda text: mark_negation(word_tokenize(text)), stop_words="english", max_features = 5000, ngram_range=(2, 2), norm='l2')
tokens5000B = tfidf_vect.fit_transform(df_final["text"].tolist())
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Finished in : 625.79 seconds


In [37]:
def runClassifiersBN(showRes = False):
    
    mf_array = [500,1000,1500,2500,5000]
    nbdict = {}
    svmdict = {}
    lrdict = {}
    mlpdict = {}
    
    for MaxFeatures in mf_array:
        if MaxFeatures == 500:
            tokens = tokens500B
        elif MaxFeatures == 1000:
            tokens = tokens1000B
        elif MaxFeatures == 1500:
            tokens = tokens1500B
        elif MaxFeatures == 2500:
            tokens = tokens2500B
        elif MaxFeatures == 5000:
            tokens = tokens5000B
            
        print("Running classifiers for feature size : {}".format(MaxFeatures))
                
        labels = df_final["rating"].tolist()

        trainX = vstack([tokens[0:70000], tokens[100000:170000], tokens[200000:270000]])
        testX = vstack([tokens[70000:100000], tokens[170000:200000], tokens[270000:]])
        trainY = labels[0:70000] + labels[100000:170000] + labels[200000:270000] 
        testY = labels[70000:100000] + labels[170000:200000] + labels[270000:]

        # run naive bayes classifier
        start_time = timer()
        clf = MultinomialNB().fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYMNB = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYMNB),3)
        precision = round(sklearn.metrics.precision_score(testY, predYMNB, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYMNB, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYMNB, average='macro'),3)
        nbdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]
        

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYMNB, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)    
            print("*************Naive Bayes Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # run linear SVM classifier
        start_time = timer()
        clf = SGDClassifier().fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYSVC = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYSVC),3)
        precision = round(sklearn.metrics.precision_score(testY, predYSVC, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYSVC, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYSVC, average='macro'),3)
        svmdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYSVC, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            print("\n*************Linear SVM Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # run logistic regression classifier
        start_time = timer()
        clf = SGDClassifier('log').fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYLR = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYLR),3)
        precision = round(sklearn.metrics.precision_score(testY, predYLR, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYLR, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYLR, average='macro'),3)
        lrdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYLR, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            print("\n*************Logitic Regression Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # transform the labels into one hot vector form
        label_encoder = LabelEncoder()
        trainYI = label_encoder.fit_transform(trainY)
        onehot_encoder = OneHotEncoder(sparse=False)
        onehot_encoder = OneHotEncoder(sparse=False)
        trainYI = trainYI.reshape(len(trainYI), 1)
        trainYC = onehot_encoder.fit_transform(trainYI)

        testYI = label_encoder.fit_transform(testY)
        testYI = testYI.reshape(len(testYI), 1)
        testYC = onehot_encoder.fit_transform(testYI)

        # define the model
        x     = Input(shape = (trainX.shape[1], ), dtype = 'float32', sparse = True)     
        d1    = Dense(256, activation='relu')(x)
        b1    = BatchNormalization()(d1)
        d2    = Dropout(0.5)(b1)
        d3    = Dense(256, activation='relu')(d2)
        b2    = BatchNormalization()(d2)
        d4    = Dropout(0.5)(b2)
        out   = Dense(3, activation = 'softmax')(d4)
        model = Model(x,out)

        # train the model
        model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
        start_time = timer()
        model.fit(x=trainX, y=trainYC, batch_size=1000, epochs=20, verbose=0)
        train_time = round(timer()-start_time,2)

        # Predict on the test using the trained model
        start_time = timer()
        pred_probs = model.predict(x=testX, batch_size=1000, verbose=0)
        test_time = round(timer()-start_time,2)
        # process the predicted probailities to get the final labels encodedLabs
        encodedLabs = (label_encoder.inverse_transform(np.array([[int(p.argmax())] for p in pred_probs], dtype=np.int64))).tolist()
        # flatten the list
        predLabs = list(itertools.chain.from_iterable(encodedLabs))
        accuracy = round(sklearn.metrics.accuracy_score(testY, predLabs),3)
        precision = round(sklearn.metrics.precision_score(testY, predLabs, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predLabs, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predLabs, average='macro'),3)
        mlpdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            # Print the performance metrics
            print("\n*************MLP Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

            # print the confusion matrix
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predLabs, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            display(df_confusion)
            
    return nbdict, svmdict, lrdict, mlpdict

In [38]:
start_time = timer()
nbdict5, svmdict5, lrdict5, mlpdict5 = runClassifiersBN()
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Running classifiers for feature size : 500
Running classifiers for feature size : 1000
Running classifiers for feature size : 1500
Running classifiers for feature size : 2500
Running classifiers for feature size : 5000
Finished in : 374.73 seconds


In [20]:
### RUN THIS CELL TO LOAD THE SAVED RESULTS
### **This assumes the Results folder is in the current working directory

# Save the results
# np.save("Results\\B_SWR_HN\\nbdict.npy", nbdict5)
# np.save("Results\\B_SWR_HN\\svmdict.npy", svmdict5)
# np.save("Results\\B_SWR_HN\\lrdict.npy", lrdict5)
# np.save("Results\\B_SWR_HN\\mlpdict.npy", mlpdict5)

## to load results from file
nbdict5 = np.load("Results\\B_SWR_HN\\nbdict.npy").item()
svmdict5 = np.load("Results\\B_SWR_HN\\svmdict.npy").item()
lrdict5 = np.load("Results\\B_SWR_HN\\lrdict.npy").item()
mlpdict5 = np.load("Results\\B_SWR_HN\\mlpdict.npy").item()

#### Plot the accuracy and f1-score.

In [34]:
## PLOT THE ACCURACY
# Create traces
trace0 = Scatter(
    y = [nbdict5['500'][0]*100, nbdict5['1000'][0]*100, nbdict5['1500'][0]*100, nbdict5['2500'][0]*100, nbdict5['5000'][0]*100],
    x = [500,1000,1500,2500,5000],
    mode = 'lines+markers',
    name = 'Naive Bayes'
)
trace1 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [svmdict5['500'][0]*100, svmdict5['1000'][0]*100, svmdict5['1500'][0]*100, svmdict5['2500'][0]*100, svmdict5['5000'][0]*100],
    mode = 'lines+markers',
    name = 'Linear SVM'
)
trace2 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [lrdict5['500'][0]*100, lrdict5['1000'][0]*100, lrdict5['1500'][0]*100, lrdict5['2500'][0]*100, lrdict5['5000'][0]*100],
    mode = 'lines+markers',
    name = 'Logistic Regression'
)
trace3 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [mlpdict5['500'][0]*100, mlpdict5['1000'][0]*100, mlpdict5['1500'][0]*100, mlpdict5['2500'][0]*100, mlpdict5['5000'][0]*100],
    mode = 'lines+markers',
    name = 'Neural net'
)

layout = Layout(
    title= 'Comparison of classification accuracy for different algorithms (B_SWR_HN)',
    hovermode= 'closest',
    width=1000,
    height=700,
    xaxis= dict(
        title= 'Feature size',
        ticklen= 5,
        zeroline= False,
        gridwidth= 2,
    ),
    yaxis=dict(
        title= 'Accuracy in percentage',
        ticklen= 5,
        gridwidth= 2,
        range=[40,100]
    )
)

data = [trace0,trace1,trace2,trace3]
fig = Figure(data=data, layout=layout)
iplot(fig)

In [33]:
## PLOT THE F1-score
# Create traces
trace0 = Scatter(
    y = [nbdict5['500'][3]*100, nbdict5['1000'][3]*100, nbdict5['1500'][3]*100, nbdict5['2500'][3]*100, nbdict5['5000'][3]*100],
    x = [500,1000,1500,2500,5000],
    mode = 'lines+markers',
    name = 'Naive Bayes'
)
trace1 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [svmdict5['500'][3]*100, svmdict5['1000'][3]*100, svmdict5['1500'][3]*100, svmdict5['2500'][3]*100, svmdict5['5000'][3]*100],
    mode = 'lines+markers',
    name = 'Linear SVM'
)
trace2 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [lrdict5['500'][3]*100, lrdict5['1000'][3]*100, lrdict5['1500'][3]*100, lrdict5['2500'][3]*100, lrdict5['5000'][3]*100],
    mode = 'lines+markers',
    name = 'Logistic Regression'
)
trace3 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [mlpdict5['500'][3]*100, mlpdict5['1000'][3]*100, mlpdict5['1500'][3]*100, mlpdict5['2500'][3]*100, mlpdict5['5000'][3]*100],
    mode = 'lines+markers',
    name = 'Neural net'
)

layout = Layout(
    title= 'Comparison of classification F1-score for different algorithms (B_SWR_HN)',
    hovermode= 'closest',
    width=1000,
    height=700,
    xaxis= dict(
        title= 'Feature size',
        ticklen= 5,
        zeroline= False,
        gridwidth= 2,
    ),
    yaxis=dict(
        title= 'F1-score in percentage',
        ticklen= 5,
        gridwidth= 2,
        range=[40,100]
    )
)

data = [trace0,trace1,trace2,trace3]
fig = Figure(data=data, layout=layout)
iplot(fig)

### 6. Run Naïve Bayes, Linear SVM, Logistic Regression and MLP algorithm on using tfidf vectors with unigrams+bigrams (UB), stop words removed (SWR), punctuations removed (tfidfvectorizer default setting) (PR) and vary the feature size in 500, 1000, 1500, 2500 and 5000.

In [40]:
def runClassifiersUB(showRes = False):
    
    mf_array = [500,1000,1500,2500,5000]
    nbdict = {}
    svmdict = {}
    lrdict = {}
    mlpdict = {}
    
    for MaxFeatures in mf_array:
        print("Running classifiers for feature size : {}".format(MaxFeatures))
        tfidf_vect = TfidfVectorizer(stop_words="english", max_features = MaxFeatures, ngram_range=(1, 2), norm='l2')
        tokens = tfidf_vect.fit_transform(df_final["text"].tolist())
        labels = df_final["rating"].tolist()

        trainX = vstack([tokens[0:70000], tokens[100000:170000], tokens[200000:270000]])
        testX = vstack([tokens[70000:100000], tokens[170000:200000], tokens[270000:]])
        trainY = labels[0:70000] + labels[100000:170000] + labels[200000:270000] 
        testY = labels[70000:100000] + labels[170000:200000] + labels[270000:]

        # run naive bayes classifier
        start_time = timer()
        clf = MultinomialNB().fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYMNB = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYMNB),3)
        precision = round(sklearn.metrics.precision_score(testY, predYMNB, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYMNB, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYMNB, average='macro'),3)
        nbdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]
        

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYMNB, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)    
            print("*************Naive Bayes Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # run linear SVM classifier
        start_time = timer()
        clf = SGDClassifier().fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYSVC = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYSVC),3)
        precision = round(sklearn.metrics.precision_score(testY, predYSVC, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYSVC, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYSVC, average='macro'),3)
        svmdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYSVC, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            print("\n*************Linear SVM Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # run logistic regression classifier
        start_time = timer()
        clf = SGDClassifier('log').fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYLR = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYLR),3)
        precision = round(sklearn.metrics.precision_score(testY, predYLR, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYLR, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYLR, average='macro'),3)
        lrdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYLR, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            print("\n*************Logitic Regression Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # transform the labels into one hot vector form
        label_encoder = LabelEncoder()
        trainYI = label_encoder.fit_transform(trainY)
        onehot_encoder = OneHotEncoder(sparse=False)
        onehot_encoder = OneHotEncoder(sparse=False)
        trainYI = trainYI.reshape(len(trainYI), 1)
        trainYC = onehot_encoder.fit_transform(trainYI)

        testYI = label_encoder.fit_transform(testY)
        testYI = testYI.reshape(len(testYI), 1)
        testYC = onehot_encoder.fit_transform(testYI)

        # define the model
        x     = Input(shape = (trainX.shape[1], ), dtype = 'float32', sparse = True)     
        d1    = Dense(256, activation='relu')(x)
        b1    = BatchNormalization()(d1)
        d2    = Dropout(0.5)(b1)
        d3    = Dense(256, activation='relu')(d2)
        b2    = BatchNormalization()(d2)
        d4    = Dropout(0.5)(b2)
        out   = Dense(3, activation = 'softmax')(d4)
        model = Model(x,out)

        # train the model
        model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
        start_time = timer()
        model.fit(x=trainX, y=trainYC, batch_size=1000, epochs=20, verbose=0)
        train_time = round(timer()-start_time,2)

        # Predict on the test using the trained model
        start_time = timer()
        pred_probs = model.predict(x=testX, batch_size=1000, verbose=0)
        test_time = round(timer()-start_time,2)
        # process the predicted probailities to get the final labels encodedLabs
        encodedLabs = (label_encoder.inverse_transform(np.array([[int(p.argmax())] for p in pred_probs], dtype=np.int64))).tolist()
        # flatten the list
        predLabs = list(itertools.chain.from_iterable(encodedLabs))
        accuracy = round(sklearn.metrics.accuracy_score(testY, predLabs),3)
        precision = round(sklearn.metrics.precision_score(testY, predLabs, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predLabs, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predLabs, average='macro'),3)
        mlpdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            # Print the performance metrics
            print("\n*************MLP Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

            # print the confusion matrix
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predLabs, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            display(df_confusion)
            
    return nbdict, svmdict, lrdict, mlpdict

In [41]:
start_time = timer()
nbdict6, svmdict6, lrdict6, mlpdict6 = runClassifiersUB()
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Running classifiers for feature size : 500
Running classifiers for feature size : 1000
Running classifiers for feature size : 1500
Running classifiers for feature size : 2500
Running classifiers for feature size : 5000
Finished in : 972.61 seconds


In [24]:
### RUN THIS CELL TO LOAD THE SAVED RESULTS
### **This assumes the Results folder is in the current working directory

# Save the results
# np.save("Results\\UB_SWR_PR\\nbdict.npy", nbdict6)
# np.save("Results\\UB_SWR_PR\\svmdict.npy", svmdict6)
# np.save("Results\\UB_SWR_PR\\lrdict.npy", lrdict6)
# np.save("Results\\UB_SWR_PR\\mlpdict.npy", mlpdict6)

## to load results from file
nbdict6 = np.load("Results\\UB_SWR_PR\\nbdict.npy").item()
svmdict6 = np.load("Results\\UB_SWR_PR\\svmdict.npy").item()
lrdict6 = np.load("Results\\UB_SWR_PR\\lrdict.npy").item()
mlpdict6 = np.load("Results\\UB_SWR_PR\\mlpdict.npy").item()

#### Plot the accuracy and f1-score.

In [32]:
## PLOT THE ACCURACY
# Create traces
trace0 = Scatter(
    y = [nbdict6['500'][0]*100, nbdict6['1000'][0]*100, nbdict6['1500'][0]*100, nbdict6['2500'][0]*100, nbdict6['5000'][0]*100],
    x = [500,1000,1500,2500,5000],
    mode = 'lines+markers',
    name = 'Naive Bayes'
)
trace1 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [svmdict6['500'][0]*100, svmdict6['1000'][0]*100, svmdict6['1500'][0]*100, svmdict6['2500'][0]*100, svmdict6['5000'][0]*100],
    mode = 'lines+markers',
    name = 'Linear SVM'
)
trace2 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [lrdict6['500'][0]*100, lrdict6['1000'][0]*100, lrdict6['1500'][0]*100, lrdict6['2500'][0]*100, lrdict6['5000'][0]*100],
    mode = 'lines+markers',
    name = 'Logistic Regression'
)
trace3 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [mlpdict6['500'][0]*100, mlpdict6['1000'][0]*100, mlpdict6['1500'][0]*100, mlpdict6['2500'][0]*100, mlpdict6['5000'][0]*100],
    mode = 'lines+markers',
    name = 'Neural net'
)

layout = Layout(
    title= 'Comparison of classification accuracy for different algorithms (UB_SWR_PR)',
    hovermode= 'closest',
    width=1000,
    height=700,
    xaxis= dict(
        title= 'Feature size',
        ticklen= 5,
        zeroline= False,
        gridwidth= 2,
    ),
    yaxis=dict(
        title= 'Accuracy in percentage',
        ticklen= 5,
        gridwidth= 2,
        range=[40,100]
    )
)

data = [trace0,trace1,trace2,trace3]
fig = Figure(data=data, layout=layout)
iplot(fig)

In [35]:
## PLOT THE F1-score
# Create traces
trace0 = Scatter(
    y = [nbdict4['500'][3]*100, nbdict4['1000'][3]*100, nbdict4['1500'][3]*100, nbdict4['2500'][3]*100, nbdict4['5000'][3]*100],
    x = [500,1000,1500,2500,5000],
    mode = 'lines+markers',
    name = 'Naive Bayes'
)
trace1 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [svmdict4['500'][3]*100, svmdict4['1000'][3]*100, svmdict4['1500'][3]*100, svmdict4['2500'][3]*100, svmdict4['5000'][3]*100],
    mode = 'lines+markers',
    name = 'Linear SVM'
)
trace2 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [lrdict4['500'][3]*100, lrdict4['1000'][3]*100, lrdict4['1500'][3]*100, lrdict4['2500'][3]*100, lrdict4['5000'][3]*100],
    mode = 'lines+markers',
    name = 'Logistic Regression'
)
trace3 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [mlpdict4['500'][3]*100, mlpdict4['1000'][3]*100, mlpdict4['1500'][3]*100, mlpdict4['2500'][3]*100, mlpdict4['5000'][3]*100],
    mode = 'lines+markers',
    name = 'Neural net'
)

layout = Layout(
    title= 'Comparison of classification f1-score for different algorithms (UB_SWR_PR)',
    hovermode= 'closest',
    width=1000,
    height=700,
    xaxis= dict(
        title= 'Feature size',
        ticklen= 5,
        zeroline= False,
        gridwidth= 2,
    ),
    yaxis=dict(
        title= 'F1-score in percentage',
        ticklen= 5,
        gridwidth= 2,
        range=[40,100]
    )
)

data = [trace0,trace1,trace2,trace3]
fig = Figure(data=data, layout=layout)
iplot(fig)

### 7. Run Naïve Bayes, Linear SVM, Logistic Regression and MLP algorithm on using tfidf vectors with unigrams+bigrams (UB), stop words removed (SWR) and vary the feature size in 500, 1000, 1500, 2500 and 5000, while handling the negations (HN).

**NOTE**: The mark negation function takes a about 10 mins to run for each feature size. For 5 feature sizes it takes around 50 mins to get the tfidf vectors with negations marked and another 10 mins to run the classifiers.

In [69]:
start_time = timer()
tfidf_vect = TfidfVectorizer(analyzer="word", tokenizer=lambda text: mark_negation(word_tokenize(text)), stop_words="english", max_features = 500, ngram_range=(1, 2), norm='l2')
tokens500UB = tfidf_vect.fit_transform(df_final["text"].tolist())
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Finished in : 650.57 seconds


In [73]:
start_time = timer()
tfidf_vect = TfidfVectorizer(analyzer="word", tokenizer=lambda text: mark_negation(word_tokenize(text)), stop_words="english", max_features = 1000, ngram_range=(1, 2), norm='l2')
tokens1000UB = tfidf_vect.fit_transform(df_final["text"].tolist())
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Finished in : 646.99 seconds


In [84]:
start_time = timer()
tfidf_vect = TfidfVectorizer(analyzer="word", tokenizer=lambda text: mark_negation(word_tokenize(text)), stop_words="english", max_features = 1500, ngram_range=(1, 2), norm='l2')
tokens1500UB = tfidf_vect.fit_transform(df_final["text"].tolist())
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Finished in : 668.92 seconds


In [79]:
start_time = timer()
tfidf_vect = TfidfVectorizer(analyzer="word", tokenizer=lambda text: mark_negation(word_tokenize(text)), stop_words="english", max_features = 2500, ngram_range=(1, 2), norm='l2')
tokens2500UB = tfidf_vect.fit_transform(df_final["text"].tolist())
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Finished in : 663.37 seconds


In [81]:
start_time = timer()
tfidf_vect = TfidfVectorizer(analyzer="word", tokenizer=lambda text: mark_negation(word_tokenize(text)), stop_words="english", max_features = 5000, ngram_range=(1, 2), norm='l2')
tokens5000UB = tfidf_vect.fit_transform(df_final["text"].tolist())
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Finished in : 660.59 seconds


In [44]:
def runClassifiersUBN(showRes = False):
    
    mf_array = [500,1000,1500,2500,5000]
    nbdict = {}
    svmdict = {}
    lrdict = {}
    mlpdict = {}
    
    for MaxFeatures in mf_array:
        if MaxFeatures == 500:
            tokens = tokens500UB
        elif MaxFeatures == 1000:
            tokens = tokens1000UB
        elif MaxFeatures == 1500:
            tokens = tokens1500UB
        elif MaxFeatures == 2500:
            tokens = tokens2500UB
        elif MaxFeatures == 5000:
            tokens = tokens5000UB
            
        print("Running classifiers for feature size : {}".format(MaxFeatures))
                
        labels = df_final["rating"].tolist()

        trainX = vstack([tokens[0:70000], tokens[100000:170000], tokens[200000:270000]])
        testX = vstack([tokens[70000:100000], tokens[170000:200000], tokens[270000:]])
        trainY = labels[0:70000] + labels[100000:170000] + labels[200000:270000] 
        testY = labels[70000:100000] + labels[170000:200000] + labels[270000:]

        # run naive bayes classifier
        start_time = timer()
        clf = MultinomialNB().fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYMNB = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYMNB),3)
        precision = round(sklearn.metrics.precision_score(testY, predYMNB, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYMNB, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYMNB, average='macro'),3)
        nbdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]
        

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYMNB, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)    
            print("*************Naive Bayes Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # run linear SVM classifier
        start_time = timer()
        clf = SGDClassifier().fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYSVC = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYSVC),3)
        precision = round(sklearn.metrics.precision_score(testY, predYSVC, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYSVC, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYSVC, average='macro'),3)
        svmdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYSVC, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            print("\n*************Linear SVM Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # run logistic regression classifier
        start_time = timer()
        clf = SGDClassifier('log').fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYLR = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYLR),3)
        precision = round(sklearn.metrics.precision_score(testY, predYLR, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYLR, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYLR, average='macro'),3)
        lrdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYLR, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            print("\n*************Logitic Regression Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # transform the labels into one hot vector form
        label_encoder = LabelEncoder()
        trainYI = label_encoder.fit_transform(trainY)
        onehot_encoder = OneHotEncoder(sparse=False)
        onehot_encoder = OneHotEncoder(sparse=False)
        trainYI = trainYI.reshape(len(trainYI), 1)
        trainYC = onehot_encoder.fit_transform(trainYI)

        testYI = label_encoder.fit_transform(testY)
        testYI = testYI.reshape(len(testYI), 1)
        testYC = onehot_encoder.fit_transform(testYI)

        # define the model
        x     = Input(shape = (trainX.shape[1], ), dtype = 'float32', sparse = True)     
        d1    = Dense(256, activation='relu')(x)
        b1    = BatchNormalization()(d1)
        d2    = Dropout(0.5)(b1)
        d3    = Dense(256, activation='relu')(d2)
        b2    = BatchNormalization()(d2)
        d4    = Dropout(0.5)(b2)
        out   = Dense(3, activation = 'softmax')(d4)
        model = Model(x,out)

        # train the model
        model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
        start_time = timer()
        model.fit(x=trainX, y=trainYC, batch_size=1000, epochs=20, verbose=0)
        train_time = round(timer()-start_time,2)

        # Predict on the test using the trained model
        start_time = timer()
        pred_probs = model.predict(x=testX, batch_size=1000, verbose=0)
        test_time = round(timer()-start_time,2)
        # process the predicted probailities to get the final labels encodedLabs
        encodedLabs = (label_encoder.inverse_transform(np.array([[int(p.argmax())] for p in pred_probs], dtype=np.int64))).tolist()
        # flatten the list
        predLabs = list(itertools.chain.from_iterable(encodedLabs))
        accuracy = round(sklearn.metrics.accuracy_score(testY, predLabs),3)
        precision = round(sklearn.metrics.precision_score(testY, predLabs, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predLabs, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predLabs, average='macro'),3)
        mlpdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            # Print the performance metrics
            print("\n*************MLP Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

            # print the confusion matrix
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predLabs, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            display(df_confusion)
            
    return nbdict, svmdict, lrdict, mlpdict

In [45]:
start_time = timer()
nbdict7, svmdict7, lrdict7, mlpdict7 = runClassifiersUBN()
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Running classifiers for feature size : 500
Running classifiers for feature size : 1000
Running classifiers for feature size : 1500
Running classifiers for feature size : 2500
Running classifiers for feature size : 5000
Finished in : 593.92 seconds


In [36]:
### RUN THIS CELL TO LOAD THE SAVED RESULTS
### **This assumes the Results folder is in the current working directory

# Save the results
# np.save("Results\\UB_SWR_HN\\nbdict.npy", nbdict7)
# np.save("Results\\UB_SWR_HN\\svmdict.npy", svmdict7)
# np.save("Results\\UB_SWR_HN\\lrdict.npy", lrdict7)
# np.save("Results\\UB_SWR_HN\\mlpdict.npy", mlpdict7)

## to load results from file
nbdict7 = np.load("Results\\UB_SWR_HN\\nbdict.npy").item()
svmdict7 = np.load("Results\\UB_SWR_HN\\svmdict.npy").item()
lrdict7 = np.load("Results\\UB_SWR_HN\\lrdict.npy").item()
mlpdict7 = np.load("Results\\UB_SWR_HN\\mlpdict.npy").item()

#### Plot the accuracy and f1-score.

In [38]:
## PLOT THE ACCURACY
# Create traces
trace0 = Scatter(
    y = [nbdict7['500'][0]*100, nbdict7['1000'][0]*100, nbdict7['1500'][0]*100, nbdict7['2500'][0]*100, nbdict7['5000'][0]*100],
    x = [500,1000,1500,2500,5000],
    mode = 'lines+markers',
    name = 'Naive Bayes'
)
trace1 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [svmdict7['500'][0]*100, svmdict7['1000'][0]*100, svmdict7['1500'][0]*100, svmdict7['2500'][0]*100, svmdict7['5000'][0]*100],
    mode = 'lines+markers',
    name = 'Linear SVM'
)
trace2 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [lrdict7['500'][0]*100, lrdict7['1000'][0]*100, lrdict7['1500'][0]*100, lrdict7['2500'][0]*100, lrdict7['5000'][0]*100],
    mode = 'lines+markers',
    name = 'Logistic Regression'
)
trace3 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [mlpdict7['500'][0]*100, mlpdict7['1000'][0]*100, mlpdict7['1500'][0]*100, mlpdict7['2500'][0]*100, mlpdict7['5000'][0]*100],
    mode = 'lines+markers',
    name = 'Neural net'
)

layout = Layout(
    title= 'Comparison of classification accuracy for different algorithms (UB_SWR_HN)',
    hovermode= 'closest',
    width=1000,
    height=700,
    xaxis= dict(
        title= 'Feature size',
        ticklen= 5,
        zeroline= False,
        gridwidth= 2,
    ),
    yaxis=dict(
        title= 'Accuracy in percentage',
        ticklen= 5,
        gridwidth= 2,
        range=[40,100]
    )
)

data = [trace0,trace1,trace2,trace3]
fig = Figure(data=data, layout=layout)
iplot(fig)

In [40]:
## PLOT THE F1-score
# Create traces
trace0 = Scatter(
    y = [nbdict7['500'][3]*100, nbdict7['1000'][3]*100, nbdict7['1500'][3]*100, nbdict7['2500'][3]*100, nbdict7['5000'][3]*100],
    x = [500,1000,1500,2500,5000],
    mode = 'lines+markers',
    name = 'Naive Bayes'
)
trace1 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [svmdict7['500'][3]*100, svmdict7['1000'][3]*100, svmdict7['1500'][3]*100, svmdict7['2500'][3]*100, svmdict7['5000'][3]*100],
    mode = 'lines+markers',
    name = 'Linear SVM'
)
trace2 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [lrdict7['500'][3]*100, lrdict7['1000'][3]*100, lrdict7['1500'][3]*100, lrdict7['2500'][3]*100, lrdict7['5000'][3]*100],
    mode = 'lines+markers',
    name = 'Logistic Regression'
)
trace3 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [mlpdict7['500'][3]*100, mlpdict7['1000'][3]*100, mlpdict7['1500'][3]*100, mlpdict7['2500'][3]*100, mlpdict7['5000'][3]*100],
    mode = 'lines+markers',
    name = 'Neural net'
)

layout = Layout(
    title= 'Comparison of classification f1-score for different algorithms (UB_SWR_HN)',
    hovermode= 'closest',
    width=1000,
    height=700,
    xaxis= dict(
        title= 'Feature size',
        ticklen= 5,
        zeroline= False,
        gridwidth= 2,
    ),
    yaxis=dict(
        title= 'F1-score in percentage',
        ticklen= 5,
        gridwidth= 2,
        range=[40,100]
    )
)

data = [trace0,trace1,trace2,trace3]
fig = Figure(data=data, layout=layout)
iplot(fig)

### 8. Run Naïve Bayes, Linear SVM, Logistic Regression and MLP algorithm by using a trained (on the same yelp data) word2vec vector model. The final resulting vector is the frequency weighted average of all word vectors.

In [79]:
# Tokenize the reviews
st = timer()
tokenized_corpus = [word_tokenize(w) for w in df_final["text"]]
timer() - st

480.6981029508379

In [80]:
# Train a word2vec model on the tokenized corpus
st = timer()
model = gensim.models.Word2Vec(tokenized_corpus, size=100, window=10, min_count=5, workers=12)
timer() - st

130.34646401556165

In [81]:
# Generate word vectors for each review and
# get a weighted sum by frequency of words
stop_words = set(stopwords.words('english'))

start_time = timer()
word_vecs = []
for i in range(len(df_final["text"])):
    # Filter out stop words or words not in the trained model
    word = [w for w in tokenized_corpus[i] if w not in stop_words and w.isalpha() and w in model.wv.vocab]
    C = Counter(word)
    if len(word) == 0:
        # If none of the words in the review are in the model
        # the final vector is assigned as a zero vector
        word_vecs.append(np.zeros((100,)))
    else:
        word_vecs.append(np.average([model.wv.get_vector(w) * C[w] for w in word], axis=0, weights=[C[w] for w in word]))
        
print("Finished in : {} seconds\n".format(round(timer()-start_time,2)))

Finished in : 163.64 seconds



In [82]:
# We skip the naive bayes algorithm as it doesn't work with negative values
def runClassifiersW2V(showRes = False):
    
    labels = df_final["rating"].tolist()

    trainX = np.asarray(word_vecs[0:70000] + word_vecs[100000:170000] + word_vecs[200000:270000])
    testX = np.asarray(word_vecs[70000:100000] + word_vecs[170000:200000] + word_vecs[270000:])
    trainY = labels[0:70000] + labels[100000:170000] + labels[200000:270000] 
    testY = labels[70000:100000] + labels[170000:200000] + labels[270000:]

    # run linear SVM classifier
    start_time = timer()
    clf = SGDClassifier().fit(trainX, trainY)
    train_time = round(timer()-start_time,2)
    start_time = timer()
    predYSVC = clf.predict(testX)
    test_time = round(timer()-start_time,2)
    accuracy = round(sklearn.metrics.accuracy_score(testY, predYSVC),3)
    precision = round(sklearn.metrics.precision_score(testY, predYSVC, average='macro'),3)
    recall = round(sklearn.metrics.recall_score(testY, predYSVC, average='macro'),3)
    f1score = round(sklearn.metrics.f1_score(testY, predYSVC, average='macro'),3)
    svmlist = [accuracy, precision, recall, f1score, train_time, test_time]

    if showRes:
        test_y = pd.Series(testY, name='Actual')
        pred_y = pd.Series(predYSVC, name='Predicted')
        df_confusion = pd.crosstab(test_y, pred_y)
        print("\n*************Linear SVM Classfier*************")
        print("Accuracy: {}".format(accuracy))
        print("Macro averaged precision score: {}".format(precision))
        print("Macro averaged recall score: {}".format(recall))
        print("Macro averaged f-1 score: {}".format(f1score))
        display(df_confusion)

    # run logistic regression classifier
    start_time = timer()
    clf = SGDClassifier('log').fit(trainX, trainY)
    train_time = round(timer()-start_time,2)
    start_time = timer()
    predYLR = clf.predict(testX)
    test_time = round(timer()-start_time,2)
    accuracy = round(sklearn.metrics.accuracy_score(testY, predYLR),3)
    precision = round(sklearn.metrics.precision_score(testY, predYLR, average='macro'),3)
    recall = round(sklearn.metrics.recall_score(testY, predYLR, average='macro'),3)
    f1score = round(sklearn.metrics.f1_score(testY, predYLR, average='macro'),3)
    lrlist = [accuracy, precision, recall, f1score, train_time, test_time]

    if showRes:
        test_y = pd.Series(testY, name='Actual')
        pred_y = pd.Series(predYLR, name='Predicted')
        df_confusion = pd.crosstab(test_y, pred_y)
        print("\n*************Logistic Regression Classfier*************")
        print("Accuracy: {}".format(accuracy))
        print("Macro averaged precision score: {}".format(precision))
        print("Macro averaged recall score: {}".format(recall))
        print("Macro averaged f-1 score: {}".format(f1score))
        display(df_confusion)

    # transform the labels into one hot vector form
    label_encoder = LabelEncoder()
    trainYI = label_encoder.fit_transform(trainY)
    onehot_encoder = OneHotEncoder(sparse=False)
    onehot_encoder = OneHotEncoder(sparse=False)
    trainYI = trainYI.reshape(len(trainYI), 1)
    trainYC = onehot_encoder.fit_transform(trainYI)

    testYI = label_encoder.fit_transform(testY)
    testYI = testYI.reshape(len(testYI), 1)
    testYC = onehot_encoder.fit_transform(testYI)

    # define the model
    model = Sequential()
    model.add(Dense(256, input_dim=trainX.shape[1], activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(256, input_dim=50, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(3, activation = 'softmax'))

    # train the model
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    start_time = timer()
    model.fit(x=trainX, y=trainYC, batch_size=1000, epochs=20, verbose=0)
    train_time = round(timer()-start_time,2)

    # Predict on the test using the trained model
    start_time = timer()
    pred_probs = model.predict(x=testX, batch_size=1000, verbose=0)
    test_time = round(timer()-start_time,2)
    # process the predicted probailities to get the final labels encodedLabs
    encodedLabs = (label_encoder.inverse_transform(np.array([[int(p.argmax())] for p in pred_probs], dtype=np.int64))).tolist()
    # flatten the list
    predLabs = list(itertools.chain.from_iterable(encodedLabs))
    accuracy = round(sklearn.metrics.accuracy_score(testY, predLabs),3)
    precision = round(sklearn.metrics.precision_score(testY, predLabs, average='macro'),3)
    recall = round(sklearn.metrics.recall_score(testY, predLabs, average='macro'),3)
    f1score = round(sklearn.metrics.f1_score(testY, predLabs, average='macro'),3)
    mlplist = [accuracy, precision, recall, f1score, train_time, test_time]

    if showRes:
        # Print the performance metrics
        print("\n*************MLP Classfier*************")
        print("Accuracy: {}".format(accuracy))
        print("Macro averaged precision score: {}".format(precision))
        print("Macro averaged recall score: {}".format(recall))
        print("Macro averaged f-1 score: {}".format(f1score))

        # print the confusion matrix
        test_y = pd.Series(testY, name='Actual')
        pred_y = pd.Series(predLabs, name='Predicted')
        df_confusion = pd.crosstab(test_y, pred_y)
        display(df_confusion)
            
    return svmlist, lrlist, mlplist

In [83]:
st = timer()
svmlist, lrlist, mlplist = runClassifiersW2V(True)
print("Finished in : {} seconds\n".format(round(timer()-st,2)))


*************Linear SVM Classfier*************
Accuracy: 0.534
Macro averaged precision score: 0.576
Macro averaged recall score: 0.534
Macro averaged f-1 score: 0.484


Predicted,negative,neutral,positive
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
negative,26833,1071,2096
neutral,18762,4144,7094
positive,11277,1615,17108



*************Logistic Regression Classfier*************
Accuracy: 0.461
Macro averaged precision score: 0.568
Macro averaged recall score: 0.461
Macro averaged f-1 score: 0.407


Predicted,negative,neutral,positive
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
negative,9710,1344,18946
neutral,3244,4084,22672
positive,1087,1235,27678



*************MLP Classfier*************
Accuracy: 0.674
Macro averaged precision score: 0.679
Macro averaged recall score: 0.674
Macro averaged f-1 score: 0.673


Predicted,negative,neutral,positive
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
negative,23511,4787,1702
neutral,8070,17615,4315
positive,4207,6262,19531


Finished in : 50.43 seconds



In [41]:
### RUN THIS CELL TO LOAD THE SAVED RESULTS
### **This assumes the Results folder is in the current working directory

# Save the results
# np.array(svmlist).dump(open('Results\\Word2Vec\\svmlist.npy', 'wb'))
# np.array(lrlist).dump(open('Results\\Word2Vec\\lrlist.npy', 'wb'))
# np.array(mlplist).dump(open('Results\\Word2Vec\\mlplist.npy', 'wb'))

# Load the results
svmlist = np.load(open('Results\\Word2Vec\\svmlist.npy', 'rb'))
lrlist = np.load(open('Results\\Word2Vec\\lrlist.npy', 'rb'))
mlplist = np.load(open('Results\\Word2Vec\\mlplist.npy', 'rb'))

#### Plot the results

In [42]:
## PLOT THE ACCURACY PRECISION RECALL AND F1-SCORE
# Create traces
trace0 = Bar(
    y = [svmlist[0]*100, svmlist[1]*100, svmlist[2]*100, svmlist[3]*100],
    x = ['Accuracy', 'Precision', 'Recall', 'F1-score'],
    text = [round(svmlist[0]*100,2), round(svmlist[1]*100,2), round(svmlist[2]*100,2), round(svmlist[3]*100,2)],
    textfont=dict(family='Calibri', size=14, color='black'),
    textposition = 'auto',
    name = 'Linear SVM',
    opacity=0.8
)

trace1 = Bar(
    y = [lrlist[0]*100, lrlist[1]*100, lrlist[2]*100, lrlist[3]*100],
    x = ['Accuracy', 'Precision', 'Recall', 'F1-score'],
    text = [round(lrlist[0]*100,2), round(lrlist[1]*100,2), round(lrlist[2]*100,2), round(lrlist[3]*100,2)],
    textposition = 'auto',
    textfont=dict(family='Calibri', size=14, color='black'),
    name = 'Logistic Regression',
    opacity=0.8
)

trace2 = Bar(
    x = ['Accuracy', 'Precision', 'Recall', 'F1-score'],
    y = [mlplist[0]*100, mlplist[1]*100, mlplist[2]*100, mlplist[3]*100],
    text = [round(mlplist[0]*100,2), round(mlplist[1]*100,2), round(mlplist[2]*100,2), round(mlplist[3]*100,2)],
    textfont=dict(family='Calibri', size=14, color='black'),
    textposition = 'auto',
    name = 'Neural Net',
    opacity=0.8
)

data = [trace0, trace1, trace2]
layout = Layout(title="Comparing the accuracy, precision, recall and accuracy of different classifiers (W2V_2C)", 
               yaxis=dict(title="Percent value"))
fig = Figure(data=data,layout=layout)
iplot(fig)

## Classification into two classes - 'positive' and 'negative'

---
### 1. Run Naïve Bayes, Linear SVM, Logistic Regression and MLP algorithm on using tfidf vectors with unigrams (U), stop words removed (SWR), punctuations removed (PR) (tfidfvectorizer default setting) and vary the feature size in 500, 1000, 1500, 2500 and 5000.

In [26]:
df_posneg = df_final.loc[(df_final["rating"] == 'negative') | (df_final["rating"] == 'positive')]

In [27]:
## Function that runs the classifiers on unigram features with stop words and punctuations removed
def runClassifiersU2C(showRes = False):
    
    mf_array = [500,1000,1500,2500,5000]
    nbdict = {}
    svmdict = {}
    lrdict = {}
    mlpdict = {}
    
    for MaxFeatures in mf_array:
        print("\nRunning classifiers for feature size : {}".format(MaxFeatures))
        # Initialize the tfidf vectorizer 
        tfidf_vect = TfidfVectorizer(stop_words="english", max_features = MaxFeatures, norm='l2')
        # Run the vectorizer on the yelp reviews
        tokens = tfidf_vect.fit_transform(df_posneg["text"].tolist())
        labels = df_posneg["rating"].tolist()

        # Split into training and test set 70:30 split
        trainX = vstack([tokens[0:70000], tokens[100000:170000]])
        testX = vstack([tokens[70000:100000], tokens[170000:]])
        trainY = labels[0:70000] + labels[100000:170000] 
        testY = labels[70000:100000] + labels[170000:]

        # run naive bayes classifier
        start_time = timer()
        clf = MultinomialNB().fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYMNB = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYMNB),3)
        precision = round(sklearn.metrics.precision_score(testY, predYMNB, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYMNB, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYMNB, average='macro'),3)
        nbdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]
        
        # if showRes is true the confusion matrix for each run is printed
        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYMNB, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)    
            print("*************Naive Bayes Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # run linear SVM classifier
        start_time = timer()
        clf = SGDClassifier().fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYSVC = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYSVC),3)
        precision = round(sklearn.metrics.precision_score(testY, predYSVC, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYSVC, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYSVC, average='macro'),3)
        svmdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYSVC, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            print("\n*************Linear SVM Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # run logistic regression classifier
        start_time = timer()
        clf = SGDClassifier('log').fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYLR = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYLR),3)
        precision = round(sklearn.metrics.precision_score(testY, predYLR, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYLR, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYLR, average='macro'),3)
        lrdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYLR, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            print("\n*************Logistic Regression Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # transform the labels into one hot vector form
        label_encoder = LabelEncoder()
        trainYI = label_encoder.fit_transform(trainY)
        onehot_encoder = OneHotEncoder(sparse=False)
        onehot_encoder = OneHotEncoder(sparse=False)
        trainYI = trainYI.reshape(len(trainYI), 1)
        trainYC = onehot_encoder.fit_transform(trainYI)

        testYI = label_encoder.fit_transform(testY)
        testYI = testYI.reshape(len(testYI), 1)
        testYC = onehot_encoder.fit_transform(testYI)

        # define the model
        x     = Input(shape = (trainX.shape[1], ), dtype = 'float32', sparse = True)     
        d1    = Dense(256, activation='relu')(x)
        b1    = BatchNormalization()(d1)
        d2    = Dropout(0.5)(b1)
        d3    = Dense(256, activation='relu')(d2)
        b2    = BatchNormalization()(d2)
        d4    = Dropout(0.5)(b2)
        out   = Dense(2, activation = 'softmax')(d4)
        model = Model(x,out)

        # train the model
        model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
        start_time = timer()
        model.fit(x=trainX, y=trainYC, batch_size=1000, epochs=20, verbose=0)
        train_time = round(timer()-start_time,2)

        # Predict on the test using the trained model
        start_time = timer()
        pred_probs = model.predict(x=testX, batch_size=1000, verbose=0)
        test_time = round(timer()-start_time,2)
        # process the predicted probailities to get the final labels encodedLabs
        encodedLabs = (label_encoder.inverse_transform(np.array([[int(p.argmax())] for p in pred_probs], dtype=np.int64))).tolist()
        # flatten the list
        predLabs = list(itertools.chain.from_iterable(encodedLabs))
        accuracy = round(sklearn.metrics.accuracy_score(testY, predLabs),3)
        precision = round(sklearn.metrics.precision_score(testY, predLabs, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predLabs, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predLabs, average='macro'),3)
        mlpdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            # Print the performance metrics
            print("\n*************MLP Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

            # print the confusion matrix
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predLabs, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            display(df_confusion)
            
    return nbdict, svmdict, lrdict, mlpdict

In [28]:
start_time = timer()
nbdict, svmdict, lrdict, mlpdict = runClassifiersU2C()
print("Finished in : {} seconds".format(round(timer()-start_time,2)))


Running classifiers for feature size : 500

Running classifiers for feature size : 1000

Running classifiers for feature size : 1500

Running classifiers for feature size : 2500

Running classifiers for feature size : 5000
Finished in : 438.31 seconds


In [43]:
### RUN THIS CELL TO LOAD THE SAVED RESULTS
### **This assumes the Results folder is in the current working directory

## Save the results
# np.save("Results\\Two_class\\U_SWR_PR\\nbdict.npy", nbdict)
# np.save("Results\\Two_class\\U_SWR_PR\\svmdict.npy", svmdict)
# np.save("Results\\Two_class\\U_SWR_PR\\lrdict.npy", lrdict)
# np.save("Results\\Two_class\\U_SWR_PR\\mlpdict.npy", mlpdict)

## to load results from file
nbdict = np.load("Results\\Two_class\\U_SWR_PR\\nbdict.npy").item()
svmdict = np.load("Results\\Two_class\\U_SWR_PR\\svmdict.npy").item()
lrdict = np.load("Results\\Two_class\\U_SWR_PR\\lrdict.npy").item()
mlpdict = np.load("Results\\Two_class\\U_SWR_PR\\mlpdict.npy").item()

#### Plot the accuracy, training time, testing time and f1-score.

In [44]:
## PLOT THE ACCURACY
# Create traces
trace0 = Scatter(
    y = [nbdict['500'][0]*100, nbdict['1000'][0]*100, nbdict['1500'][0]*100, nbdict['2500'][0]*100, nbdict['5000'][0]*100],
    x = [500,1000,1500,2500,5000],
    mode = 'lines+markers',
    legendgroup="1",
    name = 'Naive Bayes',
    marker= {'color': 'rgb(0, 93, 171)'}
)
trace1 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [svmdict['500'][0]*100, svmdict['1000'][0]*100, svmdict['1500'][0]*100, svmdict['2500'][0]*100, svmdict['5000'][0]*100],
    mode = 'lines+markers',
    legendgroup="2",
    marker= {'color': 'rgb(255, 127, 14)'},
    name = 'Linear SVM'
)
trace2 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [lrdict['500'][0]*100, lrdict['1000'][0]*100, lrdict['1500'][0]*100, lrdict['2500'][0]*100, lrdict['5000'][0]*100],
    mode = 'lines+markers',
    name = 'Logistic Regression',
    legendgroup="3",
    marker= {'color': 'rgb(74, 174, 74)'}
)
trace3 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [mlpdict['500'][0]*100, mlpdict['1000'][0]*100, mlpdict['1500'][0]*100, mlpdict['2500'][0]*100, mlpdict['5000'][0]*100],
    mode = 'lines+markers',
    name = 'Neural net',
    legendgroup="4",
    marker= {'color': 'rgb(214, 39, 40)'}
)

layout = Layout(
    title= 'Comparison of classification accuracy for different algorithms (2C_U_SWR_PR)',
    hovermode= 'closest',
    autosize=False,
    width=1000,
    height=700,
    xaxis= dict(
        title= 'Feature size',
        ticklen= 5,
        zeroline= False,
        gridwidth= 2,
    ),
    yaxis=dict(
        title= 'Accuracy in percentage',
        ticklen= 5,
        range=[40,100],
        gridwidth= 2
    )
)

data = [trace0, trace1, trace2, trace3]
fig = Figure(data=data, layout=layout)
iplot(fig)

In [45]:
## PLOT THE TRAINING TIME
# Create traces
trace0 = Scatter(
    y = [nbdict['500'][4], nbdict['1000'][4], nbdict['1500'][4], nbdict['2500'][4], nbdict['5000'][4]],
    x = [500,1000,1500,2500,5000],
    legendgroup="1",
    mode = 'lines+markers',
    name = 'Naive Bayes',
    marker= {'color': 'rgb(0, 93, 171)'}
    #5799C7
)
trace1 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [svmdict['500'][4], svmdict['1000'][4], svmdict['1500'][4], svmdict['2500'][4], svmdict['5000'][4]],
    legendgroup="2",
    mode = 'lines+markers',
    name = 'Linear SVM',
    marker= {'color': 'rgb(255, 127, 14)'}
)
trace2 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [lrdict['500'][4], lrdict['1000'][4], lrdict['1500'][4], lrdict['2500'][4], lrdict['5000'][4]],
    mode = 'lines+markers',
    legendgroup="3",
    name = 'Logistic Regression',
    marker= {'color': 'rgb(74, 174, 74)'}
)
trace3 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [mlpdict['500'][4], mlpdict['1000'][4], mlpdict['1500'][4], mlpdict['2500'][4], mlpdict['5000'][4]],
    mode = 'lines+markers',
    legendgroup="4",
    name = 'Neural net',
    marker= {'color': 'rgb(214, 39, 40)'}
)

layout = Layout(
    title= 'Comparison of training time for different algorithms (2C_U_SWR_PR)',
    hovermode= 'closest',
    autosize=False,
    width=1000,
    height=700,
    xaxis= dict(
        title= 'Feature size',
        ticklen= 5,
        zeroline= False,
        gridwidth= 2,
    ),
    yaxis=dict(
        title= 'Training time in seconds (log-scale)',
        type='log',
        ticklen= 5,
        gridwidth= 2,
    )
)

data = [trace0,trace1,trace2,trace3]
fig = Figure(data=data, layout=layout)
iplot(fig)

In [46]:
## PLOT THE TESTING TIME
# Create traces
trace0 = Scatter(
    y = [nbdict['500'][5], nbdict['1000'][5], nbdict['1500'][5], nbdict['2500'][5], nbdict['5000'][5]],
    x = [500,1000,1500,2500,5000],
    mode = 'lines+markers',
    name = 'Naive Bayes'
)
trace1 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [svmdict['500'][5], svmdict['1000'][5], svmdict['1500'][5], svmdict['2500'][5], svmdict['5000'][5]],
    mode = 'lines+markers',
    name = 'Linear SVM'
)
trace2 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [lrdict['500'][5], lrdict['1000'][5], lrdict['1500'][5], lrdict['2500'][5], lrdict['5000'][5]],
    mode = 'lines+markers',
    name = 'Logistic Regression'
)
trace3 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [mlpdict['500'][5], mlpdict['1000'][5], mlpdict['1500'][5], mlpdict['2500'][5], mlpdict['5000'][5]],
    mode = 'lines+markers',
    name = 'Neural net'
)

layout = Layout(
    title= 'Comparison of testing time for different algorithms (2C_U_SWR_PR)',
    hovermode= 'closest',
    autosize=False,
    width=1000,
    height=700,
    xaxis= dict(
        title= 'Feature size',
        ticklen= 5,
        zeroline= False,
        gridwidth= 2,
    ),
    yaxis=dict(
        title= 'Testing time in seconds (log-scale)',
        ticklen= 5,
        gridwidth= 2,
        type='log'
    )
)

data = [trace0,trace1,trace2,trace3]
fig = Figure(data=data, layout=layout)
iplot(fig)

In [47]:
## PLOT THE F1-SCORE
# Create traces
trace0 = Scatter(
    y = [nbdict['500'][2]*100, nbdict['1000'][3]*100, nbdict['1500'][3]*100, nbdict['2500'][3]*100, nbdict['5000'][3]*100],
    x = [500,1000,1500,2500,5000],
    mode = 'lines+markers',
    name = 'Naive Bayes'
)
trace1 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [svmdict['500'][3]*100, svmdict['1000'][3]*100, svmdict['1500'][3]*100, svmdict['2500'][3]*100, svmdict['5000'][3]*100],
    mode = 'lines+markers',
    name = 'Linear SVM'
)
trace2 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [lrdict['500'][3]*100, lrdict['1000'][3]*100, lrdict['1500'][3]*100, lrdict['2500'][3]*100, lrdict['5000'][3]*100],
    mode = 'lines+markers',
    name = 'Logistic Regression'
)
trace3 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [mlpdict['500'][3]*100, mlpdict['1000'][3]*100, mlpdict['1500'][3]*100, mlpdict['2500'][3]*100, mlpdict['5000'][3]*100],
    mode = 'lines+markers',
    name = 'Neural net'
)

layout = Layout(
    title= 'Comparison of classification F1-score for different algorithms (2C_U_SWR_PR)',
    hovermode= 'closest',
    autosize=False,
    width=1000,
    height=700,
    xaxis= dict(
        title= 'Feature size',
        ticklen= 5,
        zeroline= False,
        gridwidth= 2,
    ),
    yaxis=dict(
        title= 'F1-score in percentage',
        ticklen= 5,
        range=[40,100],
        gridwidth= 2,
    )
)

data = [trace0,trace1,trace2,trace3]
fig = Figure(data=data, layout=layout)
iplot(fig)

### 2. Run Naïve Bayes, Linear SVM, Logistic Regression and MLP algorithm on using tfidf vectors with unigrams, stop words removed, punctuations removed (tfidfvectorizer default setting), setting the feature size to 5000 and varying the minimum document frequency threshold in 5,10,15 and 20.

In [35]:
def runClassifiers_df2C(showRes = False):
    df_array = [5,10,15,20]
    nbdict = {}
    svmdict = {}
    lrdict = {}
    mlpdict = {}
    
    for mdf in df_array:
        print("Running classifiers with min document frequency threshold : {}".format(mdf))
        tfidf_vect = TfidfVectorizer(stop_words="english", max_features = 5000, norm='l2', min_df=mdf)
        tokens = tfidf_vect.fit_transform(df_posneg["text"].tolist())
        labels = df_posneg["rating"].tolist()

        trainX = vstack([tokens[0:70000], tokens[100000:170000]])
        testX = vstack([tokens[70000:100000], tokens[170000:200000]])
        trainY = labels[0:70000] + labels[100000:170000] 
        testY = labels[70000:100000] + labels[170000:200000]

        # run naive bayes classifier
        start_time = timer()
        clf = MultinomialNB().fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYMNB = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYMNB),3)
        precision = round(sklearn.metrics.precision_score(testY, predYMNB, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYMNB, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYMNB, average='macro'),3)
        nbdict[str(mdf)] = [accuracy, precision, recall, f1score, train_time, test_time]
        

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYMNB, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)    
            print("*************Naive Bayes Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # run linear SVM classifier
        start_time = timer()
        clf = SGDClassifier().fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYSVC = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYSVC),3)
        precision = round(sklearn.metrics.precision_score(testY, predYSVC, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYSVC, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYSVC, average='macro'),3)
        svmdict[str(mdf)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYSVC, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            print("\n*************Linear SVM Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # run linear SVM classifier
        start_time = timer()
        clf = SGDClassifier('log').fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYLR = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYLR),3)
        precision = round(sklearn.metrics.precision_score(testY, predYLR, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYLR, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYLR, average='macro'),3)
        lrdict[str(mdf)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYLR, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            print("\n*************Logitic Regression Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # transform the labels into one hot vector form
        label_encoder = LabelEncoder()
        trainYI = label_encoder.fit_transform(trainY)
        onehot_encoder = OneHotEncoder(sparse=False)
        onehot_encoder = OneHotEncoder(sparse=False)
        trainYI = trainYI.reshape(len(trainYI), 1)
        trainYC = onehot_encoder.fit_transform(trainYI)

        testYI = label_encoder.fit_transform(testY)
        testYI = testYI.reshape(len(testYI), 1)
        testYC = onehot_encoder.fit_transform(testYI)

        # define the model
        x     = Input(shape = (trainX.shape[1], ), dtype = 'float32', sparse = True)     
        d1    = Dense(256, activation='relu')(x)
        b1    = BatchNormalization()(d1)
        d2    = Dropout(0.5)(b1)
        d3    = Dense(256, activation='relu')(d2)
        b2    = BatchNormalization()(d2)
        d4    = Dropout(0.5)(b2)
        out   = Dense(2, activation = 'softmax')(d4)
        model = Model(x,out)

        # train the model
        model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
        start_time = timer()
        model.fit(x=trainX, y=trainYC, batch_size=1000, epochs=20, verbose=0)
        train_time = round(timer()-start_time,2)

        # Predict on the test using the trained model
        start_time = timer()
        pred_probs = model.predict(x=testX, batch_size=1000, verbose=0)
        test_time = round(timer()-start_time,2)
        # process the predicted probailities to get the final labels encodedLabs
        encodedLabs = (label_encoder.inverse_transform(np.array([[int(p.argmax())] for p in pred_probs], dtype=np.int64))).tolist()
        # flatten the list
        predLabs = list(itertools.chain.from_iterable(encodedLabs))
        accuracy = round(sklearn.metrics.accuracy_score(testY, predLabs),3)
        precision = round(sklearn.metrics.precision_score(testY, predLabs, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predLabs, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predLabs, average='macro'),3)
        mlpdict[str(mdf)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            # Print the performance metrics
            print("\n*************MLP Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

            # print the confusion matrix
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predLabs, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            display(df_confusion)
            
    return nbdict, svmdict, lrdict, mlpdict

In [36]:
start_time = timer()
nbdict2, svmdict2, lrdict2, mlpdict2 = runClassifiers_df2C()
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Running classifiers with min document frequency threshold : 5
Running classifiers with min document frequency threshold : 10
Running classifiers with min document frequency threshold : 15
Running classifiers with min document frequency threshold : 20
Finished in : 409.09 seconds


In [48]:
## Save the results
# np.save("Results\\Two_class\\U_SWR_PR_DF\\nbdict.npy", nbdict2)
# np.save("Results\\Two_class\\U_SWR_PR_DF\\svmdict.npy", svmdict2)
# np.save("Results\\Two_class\\U_SWR_PR_DF\\lrdict.npy", lrdict2)
# np.save("Results\\Two_class\\U_SWR_PR_DF\\mlpdict.npy", mlpdict2)

## to load results from file
nbdict2 = np.load("Results\\Two_class\\U_SWR_PR_DF\\nbdict.npy").item()
svmdict2 = np.load("Results\\Two_class\\U_SWR_PR_DF\\svmdict.npy").item()
lrdict2 = np.load("Results\\Two_class\\U_SWR_PR_DF\\lrdict.npy").item()
mlpdict2 = np.load("Results\\Two_class\\U_SWR_PR_DF\\mlpdict.npy").item()

#### Plot the accuracy and f1-score.

In [49]:
## PLOT THE ACCURACY
# Create traces
trace0 = Scatter(
    y = [nbdict2['5'][0]*100, nbdict2['10'][0]*100, nbdict2['15'][0]*100, nbdict2['20'][0]*100],
    x = [5,10,15,20],
    mode = 'lines+markers',
    legendgroup="1",
    name = 'Naive Bayes',
    marker= {'color': 'rgb(0, 93, 171)'}
)
trace1 = Scatter(
    x = [5,10,15,20],
    y = [svmdict2['5'][0]*100, svmdict2['10'][0]*100, svmdict2['15'][0]*100, svmdict2['20'][0]*100],
    mode = 'lines+markers',
    legendgroup="2",
    marker= {'color': 'rgb(255, 127, 14)'},
    name = 'Linear SVM'
)
trace2 = Scatter(
    x = [5,10,15,20],
    y = [lrdict2['5'][0]*100, lrdict2['10'][0]*100, lrdict2['15'][0]*100, lrdict2['20'][0]*100],
    mode = 'lines+markers',
    name = 'Logistic Regression',
    legendgroup="3",
    marker= {'color': 'rgb(74, 174, 74)'}
)
trace3 = Scatter(
    x = [5,10,15,20],
    y = [mlpdict2['5'][0]*100, mlpdict2['10'][0]*100, mlpdict2['15'][0]*100, mlpdict2['20'][0]*100],
    mode = 'lines+markers',
    name = 'Neural net',
    legendgroup="4",
    marker= {'color': 'rgb(214, 39, 40)'}
)

layout = Layout(
    title= 'Comparison of classification accuracy for different algorithms (2C_U_SWR_PR_DF)',
    hovermode= 'closest',
    autosize=False,
    width=1000,
    height=700,
    xaxis= dict(
        title= 'Minimum document frequency',
        ticklen= 5,
        zeroline= False,
        gridwidth= 2,
    ),
    yaxis=dict(
        title= 'Accuracy in percentage',
        ticklen= 5,
        range=[40,100],
        gridwidth= 2
    )
)

data = [trace0, trace1, trace2, trace3]
fig = Figure(data=data, layout=layout)
iplot(fig)

In [50]:
## PLOT THE F1-score
# Create traces
trace0 = Scatter(
    y = [nbdict2['5'][3]*100, nbdict2['10'][3]*100, nbdict2['15'][3]*100, nbdict2['20'][3]*100],
    x = [5,10,15,20],
    mode = 'lines+markers',
    legendgroup="1",
    name = 'Naive Bayes',
    marker= {'color': 'rgb(0, 93, 171)'}
)
trace1 = Scatter(
    x = [5,10,15,20],
    y = [svmdict2['5'][3]*100, svmdict2['10'][3]*100, svmdict2['15'][3]*100, svmdict2['20'][3]*100],
    mode = 'lines+markers',
    legendgroup="2",
    marker= {'color': 'rgb(255, 127, 14)'},
    name = 'Linear SVM'
)
trace2 = Scatter(
    x = [5,10,15,20],
    y = [lrdict2['5'][3]*100, lrdict2['10'][3]*100, lrdict2['15'][3]*100, lrdict2['20'][3]*100],
    mode = 'lines+markers',
    name = 'Logistic Regression',
    legendgroup="3",
    marker= {'color': 'rgb(74, 174, 74)'}
)
trace3 = Scatter(
    x = [5,10,15,20],
    y = [mlpdict2['5'][3]*100, mlpdict2['10'][3]*100, mlpdict2['15'][3]*100, mlpdict2['20'][3]*100],
    mode = 'lines+markers',
    name = 'Neural net',
    legendgroup="4",
    marker= {'color': 'rgb(214, 39, 40)'}
)

layout = Layout(
    title= 'Comparison of classification F1-score for different algorithms (2C_U_SWR_PR_DF)',
    hovermode= 'closest',
    autosize=False,
    width=1000,
    height=700,
    xaxis= dict(
        title= 'Minimum document frequency',
        ticklen= 5,
        zeroline= False,
        gridwidth= 2,
    ),
    yaxis=dict(
        title= 'F1-score in percentage',
        ticklen= 5,
        range=[40,100],
        gridwidth= 2
    )
)

data = [trace0, trace1, trace2, trace3]
fig = Figure(data=data, layout=layout)
iplot(fig)

### 3. Run Naïve Bayes, Linear SVM, Logistic Regression and MLP algorithm on using tfidf vectors with unigrams (U), stop words removed(SWR) and vary the feature size in 500, 1000, 1500, 2500 and 5000, while also handling negations(HN).

**NOTE**: The mark negation function takes a about 10 mins to run for each feature size. For 5 feature sizes it takes around 50 mins to get the tfidf vectors with negations marked and another 10 mins to run the classifiers.

In [38]:
start_time = timer()
tfidf_vect = TfidfVectorizer(analyzer="word", tokenizer=lambda text: mark_negation(word_tokenize(text)), stop_words="english", max_features = 500, norm='l2')
tokens500 = tfidf_vect.fit_transform(df_posneg["text"].tolist())
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Finished in : 377.52 seconds


In [39]:
start_time = timer()
tfidf_vect = TfidfVectorizer(analyzer="word", tokenizer=lambda text: mark_negation(word_tokenize(text)), stop_words="english", max_features = 1000, norm='l2')
tokens1000 = tfidf_vect.fit_transform(df_posneg["text"].tolist())
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Finished in : 371.31 seconds


In [40]:
start_time = timer()
tfidf_vect = TfidfVectorizer(analyzer="word", tokenizer=lambda text: mark_negation(word_tokenize(text)), stop_words="english", max_features = 1500, norm='l2')
tokens1500 = tfidf_vect.fit_transform(df_posneg["text"].tolist())
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Finished in : 371.96 seconds


In [41]:
start_time = timer()
tfidf_vect = TfidfVectorizer(analyzer="word", tokenizer=lambda text: mark_negation(word_tokenize(text)), stop_words="english", max_features = 2500, norm='l2')
tokens2500 = tfidf_vect.fit_transform(df_posneg["text"].tolist())
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Finished in : 372.16 seconds


In [42]:
start_time = timer()
tfidf_vect = TfidfVectorizer(analyzer="word", tokenizer=lambda text: mark_negation(word_tokenize(text)), stop_words="english", max_features = 5000, norm='l2')
tokens5000 = tfidf_vect.fit_transform(df_posneg["text"].tolist())
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Finished in : 374.07 seconds


In [46]:
def runClassifiersUMN2C(showRes = False):
    
    mf_array = [500,1000,1500,2500,5000]
    nbdict = {}
    svmdict = {}
    lrdict = {}
    mlpdict = {}
    
    
    for MaxFeatures in mf_array:
        if MaxFeatures == 500:
            tokens = tokens500
        elif MaxFeatures == 1000:
            tokens = tokens1000
        elif MaxFeatures == 1500:
            tokens = tokens1500
        elif MaxFeatures == 2500:
            tokens = tokens2500
        elif MaxFeatures == 5000:
            tokens = tokens5000
        
        print("Running classifiers for feature size : {}".format(MaxFeatures))
                
        labels = df_posneg["rating"].tolist()

        trainX = vstack([tokens[0:70000], tokens[100000:170000]])
        testX = vstack([tokens[70000:100000], tokens[170000:200000]])
        trainY = labels[0:70000] + labels[100000:170000] 
        testY = labels[70000:100000] + labels[170000:200000]

        # run naive bayes classifier
        start_time = timer()
        clf = MultinomialNB().fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYMNB = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYMNB),3)
        precision = round(sklearn.metrics.precision_score(testY, predYMNB, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYMNB, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYMNB, average='macro'),3)
        nbdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]
        

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYMNB, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)    
            print("*************Naive Bayes Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # run linear SVM classifier
        start_time = timer()
        clf = SGDClassifier().fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYSVC = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYSVC),3)
        precision = round(sklearn.metrics.precision_score(testY, predYSVC, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYSVC, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYSVC, average='macro'),3)
        svmdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYSVC, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            print("\n*************Linear SVM Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # run linear SVM classifier
        start_time = timer()
        clf = SGDClassifier('log').fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYLR = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYLR),3)
        precision = round(sklearn.metrics.precision_score(testY, predYLR, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYLR, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYLR, average='macro'),3)
        lrdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYLR, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            print("\n*************Logitic Regression Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # transform the labels into one hot vector form
        label_encoder = LabelEncoder()
        trainYI = label_encoder.fit_transform(trainY)
        onehot_encoder = OneHotEncoder(sparse=False)
        onehot_encoder = OneHotEncoder(sparse=False)
        trainYI = trainYI.reshape(len(trainYI), 1)
        trainYC = onehot_encoder.fit_transform(trainYI)

        testYI = label_encoder.fit_transform(testY)
        testYI = testYI.reshape(len(testYI), 1)
        testYC = onehot_encoder.fit_transform(testYI)

        # define the model
        x     = Input(shape = (trainX.shape[1], ), dtype = 'float32', sparse = True)     
        d1    = Dense(256, activation='relu')(x)
        b1    = BatchNormalization()(d1)
        d2    = Dropout(0.5)(b1)
        d3    = Dense(256, activation='relu')(d2)
        b2    = BatchNormalization()(d2)
        d4    = Dropout(0.5)(b2)
        out   = Dense(2, activation = 'softmax')(d4)
        model = Model(x,out)

        # train the model
        model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
        start_time = timer()
        model.fit(x=trainX, y=trainYC, batch_size=1000, epochs=20, verbose=0)
        train_time = round(timer()-start_time,2)

        # Predict on the test using the trained model
        start_time = timer()
        pred_probs = model.predict(x=testX, batch_size=1000, verbose=0)
        test_time = round(timer()-start_time,2)
        # process the predicted probailities to get the final labels encodedLabs
        encodedLabs = (label_encoder.inverse_transform(np.array([[int(p.argmax())] for p in pred_probs], dtype=np.int64))).tolist()
        # flatten the list
        predLabs = list(itertools.chain.from_iterable(encodedLabs))
        accuracy = round(sklearn.metrics.accuracy_score(testY, predLabs),3)
        precision = round(sklearn.metrics.precision_score(testY, predLabs, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predLabs, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predLabs, average='macro'),3)
        mlpdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            # Print the performance metrics
            print("\n*************MLP Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

            # print the confusion matrix
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predLabs, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            display(df_confusion)
            
    return nbdict, svmdict, lrdict, mlpdict

In [47]:
start_time = timer()
nbdict3, svmdict3, lrdict3, mlpdict3 = runClassifiersUMN2C()
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Running classifiers for feature size : 500
Running classifiers for feature size : 1000
Running classifiers for feature size : 1500
Running classifiers for feature size : 2500
Running classifiers for feature size : 5000
Finished in : 379.38 seconds


In [52]:
### RUN THIS CELL TO LOAD THE SAVED RESULTS
### **This assumes the Results folder is in the current working directory

## Save the results
# np.save("Results\\Two_class\\U_SWR_HN\\nbdict.npy", nbdict3)
# np.save("Results\\Two_class\\U_SWR_HN\\svmdict.npy", svmdict3)
# np.save("Results\\Two_class\\U_SWR_HN\\lrdict.npy", lrdict3)
# np.save("Results\\Two_class\\U_SWR_HN\\mlpdict.npy", mlpdict3)

## to load results from file
nbdict3 = np.load("Results\\Two_class\\U_SWR_HN\\nbdict.npy").item()
svmdict3 = np.load("Results\\Two_class\\U_SWR_HN\\svmdict.npy").item()
lrdict3 = np.load("Results\\Two_class\\U_SWR_HN\\lrdict.npy").item()
mlpdict3 = np.load("Results\\Two_class\\U_SWR_HN\\mlpdict.npy").item()

#### Plot the accuracy and f1-score.

In [53]:
## PLOT THE ACCURACY
# Create traces
trace0 = Scatter(
    y = [nbdict3['500'][0]*100, nbdict3['1000'][0]*100, nbdict3['1500'][0]*100, nbdict3['2500'][0]*100, nbdict3['5000'][0]*100],
    x = [500,1000,1500,2500,5000],
    mode = 'lines+markers',
    legendgroup="1",
    name = 'Naive Bayes',
    marker= {'color': 'rgb(0, 93, 171)'}
)
trace1 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [svmdict3['500'][0]*100, svmdict3['1000'][0]*100, svmdict3['1500'][0]*100, svmdict3['2500'][0]*100, svmdict3['5000'][0]*100],
    mode = 'lines+markers',
    legendgroup="2",
    marker= {'color': 'rgb(255, 127, 14)'},
    name = 'Linear SVM'
)
trace2 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [lrdict3['500'][0]*100, lrdict3['1000'][0]*100, lrdict3['1500'][0]*100, lrdict3['2500'][0]*100, lrdict3['5000'][0]*100],
    mode = 'lines+markers',
    name = 'Logistic Regression',
    legendgroup="3",
    marker= {'color': 'rgb(74, 174, 74)'}
)
trace3 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [mlpdict3['500'][0]*100, mlpdict3['1000'][0]*100, mlpdict3['1500'][0]*100, mlpdict3['2500'][0]*100, mlpdict3['5000'][0]*100],
    mode = 'lines+markers',
    name = 'Neural net',
    legendgroup="4",
    marker= {'color': 'rgb(214, 39, 40)'}
)

layout = Layout(
    title= 'Comparison of classification accuracy for different algorithms (2C_U_SWR_HN)',
    hovermode= 'closest',
    autosize=False,
    width=1000,
    height=700,
    xaxis= dict(
        title= 'Feature size',
        ticklen= 5,
        zeroline= False,
        gridwidth= 2,
    ),
    yaxis=dict(
        title= 'Accuracy in percentage',
        ticklen= 5,
        range=[40,100],
        gridwidth= 2
    )
)

data = [trace0, trace1, trace2, trace3]
fig = Figure(data=data, layout=layout)
iplot(fig)

In [55]:
## PLOT THE F1-score
# Create traces
trace0 = Scatter(
    y = [nbdict3['500'][3]*100, nbdict3['1000'][3]*100, nbdict3['1500'][3]*100, nbdict3['2500'][3]*100, nbdict3['5000'][3]*100],
    x = [500,1000,1500,2500,5000],
    mode = 'lines+markers',
    legendgroup="1",
    name = 'Naive Bayes',
    marker= {'color': 'rgb(0, 93, 171)'}
)
trace1 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [svmdict3['500'][3]*100, svmdict3['1000'][3]*100, svmdict3['1500'][3]*100, svmdict3['2500'][3]*100, svmdict3['5000'][3]*100],
    mode = 'lines+markers',
    legendgroup="2",
    marker= {'color': 'rgb(255, 127, 14)'},
    name = 'Linear SVM'
)
trace2 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [lrdict3['500'][3]*100, lrdict3['1000'][3]*100, lrdict3['1500'][3]*100, lrdict3['2500'][3]*100, lrdict3['5000'][3]*100],
    mode = 'lines+markers',
    name = 'Logistic Regression',
    legendgroup="3",
    marker= {'color': 'rgb(74, 174, 74)'}
)
trace3 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [mlpdict3['500'][3]*100, mlpdict3['1000'][3]*100, mlpdict3['1500'][3]*100, mlpdict3['2500'][3]*100, mlpdict3['5000'][3]*100],
    mode = 'lines+markers',
    name = 'Neural net',
    legendgroup="4",
    marker= {'color': 'rgb(214, 39, 40)'}
)

layout = Layout(
    title= 'Comparison of classification F1-score for different algorithms (2C_U_SWR_HN)',
    hovermode= 'closest',
    autosize=False,
    width=1000,
    height=700,
    xaxis= dict(
        title= 'Feature size',
        ticklen= 5,
        zeroline= False,
        gridwidth= 2,
    ),
    yaxis=dict(
        title= 'F1-score in percentage',
        ticklen= 5,
        range=[40,100],
        gridwidth= 2
    )
)

data = [trace0, trace1, trace2, trace3]
fig = Figure(data=data, layout=layout)
iplot(fig)

### 4. Run Naïve Bayes, Linear SVM, Logistic Regression and MLP algorithm on using tfidf vectors with bigrams(only) (B), stop words removed (SWR), punctuations removed (tfidfvectorizer default setting) (PR) and vary the feature size in 500, 1000, 1500, 2500 and 5000.

In [49]:
def runClassifiersB2C(showRes = False):
    
    mf_array = [500,1000,1500,2500,5000]
    nbdict = {}
    svmdict = {}
    lrdict = {}
    mlpdict = {}
    
    for MaxFeatures in mf_array:
        print("Running classifiers for feature size : {}".format(MaxFeatures))
        tfidf_vect = TfidfVectorizer(stop_words="english", max_features = MaxFeatures, ngram_range=(2, 2), norm='l2')
        tokens = tfidf_vect.fit_transform(df_posneg["text"].tolist())
        labels = df_posneg["rating"].tolist()

        trainX = vstack([tokens[0:70000], tokens[100000:170000]])
        testX = vstack([tokens[70000:100000], tokens[170000:200000]])
        trainY = labels[0:70000] + labels[100000:170000] 
        testY = labels[70000:100000] + labels[170000:200000]

        # run naive bayes classifier
        start_time = timer()
        clf = MultinomialNB().fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYMNB = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYMNB),3)
        precision = round(sklearn.metrics.precision_score(testY, predYMNB, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYMNB, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYMNB, average='macro'),3)
        nbdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]
        

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYMNB, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)    
            print("*************Naive Bayes Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # run linear SVM classifier
        start_time = timer()
        clf = SGDClassifier().fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYSVC = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYSVC),3)
        precision = round(sklearn.metrics.precision_score(testY, predYSVC, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYSVC, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYSVC, average='macro'),3)
        svmdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYSVC, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            print("\n*************Linear SVM Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # run logistic regression classifier
        start_time = timer()
        clf = SGDClassifier('log').fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYLR = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYLR),3)
        precision = round(sklearn.metrics.precision_score(testY, predYLR, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYLR, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYLR, average='macro'),3)
        lrdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYLR, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            print("\n*************Logistic Regression Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # transform the labels into one hot vector form
        label_encoder = LabelEncoder()
        trainYI = label_encoder.fit_transform(trainY)
        onehot_encoder = OneHotEncoder(sparse=False)
        onehot_encoder = OneHotEncoder(sparse=False)
        trainYI = trainYI.reshape(len(trainYI), 1)
        trainYC = onehot_encoder.fit_transform(trainYI)

        testYI = label_encoder.fit_transform(testY)
        testYI = testYI.reshape(len(testYI), 1)
        testYC = onehot_encoder.fit_transform(testYI)

        # define the model
        x     = Input(shape = (trainX.shape[1], ), dtype = 'float32', sparse = True)     
        d1    = Dense(256, activation='relu')(x)
        b1    = BatchNormalization()(d1)
        d2    = Dropout(0.5)(b1)
        d3    = Dense(256, activation='relu')(d2)
        b2    = BatchNormalization()(d2)
        d4    = Dropout(0.5)(b2)
        out   = Dense(2, activation = 'softmax')(d4)
        model = Model(x,out)

        # train the model
        model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
        start_time = timer()
        model.fit(x=trainX, y=trainYC, batch_size=1000, epochs=20, verbose=0)
        train_time = round(timer()-start_time,2)

        # Predict on the test using the trained model
        start_time = timer()
        pred_probs = model.predict(x=testX, batch_size=1000, verbose=0)
        test_time = round(timer()-start_time,2)
        # process the predicted probailities to get the final labels encodedLabs
        encodedLabs = (label_encoder.inverse_transform(np.array([[int(p.argmax())] for p in pred_probs], dtype=np.int64))).tolist()
        # flatten the list
        predLabs = list(itertools.chain.from_iterable(encodedLabs))
        accuracy = round(sklearn.metrics.accuracy_score(testY, predLabs),3)
        precision = round(sklearn.metrics.precision_score(testY, predLabs, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predLabs, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predLabs, average='macro'),3)
        mlpdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            # Print the performance metrics
            print("\n*************MLP Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

            # print the confusion matrix
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predLabs, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            display(df_confusion)
            
    return nbdict, svmdict, lrdict, mlpdict

In [50]:
start_time = timer()
nbdict4, svmdict4, lrdict4, mlpdict4 = runClassifiersB2C()
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Running classifiers for feature size : 500
Running classifiers for feature size : 1000
Running classifiers for feature size : 1500
Running classifiers for feature size : 2500
Running classifiers for feature size : 5000
Finished in : 479.17 seconds


In [56]:
### RUN THIS CELL TO LOAD THE SAVED RESULTS
### **This assumes the Results folder is in the current working directory

## Save the results
# np.save("Results\\Two_class\\B_SWR_PR\\nbdict.npy", nbdict4)
# np.save("Results\\Two_class\\B_SWR_PR\\svmdict.npy", svmdict4)
# np.save("Results\\Two_class\\B_SWR_PR\\lrdict.npy", lrdict4)
# np.save("Results\\Two_class\\B_SWR_PR\\mlpdict.npy", mlpdict4)

## to load results from file
nbdict4 = np.load("Results\\Two_class\\B_SWR_PR\\nbdict.npy").item()
svmdict4 = np.load("Results\\Two_class\\B_SWR_PR\\svmdict.npy").item()
lrdict4 = np.load("Results\\Two_class\\B_SWR_PR\\lrdict.npy").item()
mlpdict4 = np.load("Results\\Two_class\\B_SWR_PR\\mlpdict.npy").item()

#### Plot the accuracy and f1-score.

In [57]:
## PLOT THE ACCURACY
# Create traces
trace0 = Scatter(
    y = [nbdict4['500'][0]*100, nbdict4['1000'][0]*100, nbdict4['1500'][0]*100, nbdict4['2500'][0]*100, nbdict4['5000'][0]*100],
    x = [500,1000,1500,2500,5000],
    mode = 'lines+markers',
    legendgroup="1",
    name = 'Naive Bayes',
    marker= {'color': 'rgb(0, 93, 171)'}
)
trace1 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [svmdict4['500'][0]*100, svmdict4['1000'][0]*100, svmdict4['1500'][0]*100, svmdict4['2500'][0]*100, svmdict4['5000'][0]*100],
    mode = 'lines+markers',
    legendgroup="2",
    marker= {'color': 'rgb(255, 127, 14)'},
    name = 'Linear SVM'
)
trace2 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [lrdict4['500'][0]*100, lrdict4['1000'][0]*100, lrdict4['1500'][0]*100, lrdict4['2500'][0]*100, lrdict4['5000'][0]*100],
    mode = 'lines+markers',
    name = 'Logistic Regression',
    legendgroup="3",
    marker= {'color': 'rgb(74, 174, 74)'}
)
trace3 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [mlpdict4['500'][0]*100, mlpdict4['1000'][0]*100, mlpdict4['1500'][0]*100, mlpdict4['2500'][0]*100, mlpdict4['5000'][0]*100],
    mode = 'lines+markers',
    name = 'Neural net',
    legendgroup="4",
    marker= {'color': 'rgb(214, 39, 40)'}
)

layout = Layout(
    title= 'Comparison of classification accuracy for different algorithms (2C_B_SWR_PR)',
    hovermode= 'closest',
    autosize=False,
    width=1000,
    height=700,
    xaxis= dict(
        title= 'Feature size',
        ticklen= 5,
        zeroline= False,
        gridwidth= 2,
    ),
    yaxis=dict(
        title= 'Accuracy in percentage',
        ticklen= 5,
        range=[40,100],
        gridwidth= 2
    )
)

data = [trace0, trace1, trace2, trace3]
fig = Figure(data=data, layout=layout)
iplot(fig)

In [58]:
## PLOT THE F1-score
# Create traces
trace0 = Scatter(
    y = [nbdict4['500'][3]*100, nbdict4['1000'][3]*100, nbdict4['1500'][3]*100, nbdict4['2500'][3]*100, nbdict4['5000'][3]*100],
    x = [500,1000,1500,2500,5000],
    mode = 'lines+markers',
    legendgroup="1",
    name = 'Naive Bayes',
    marker= {'color': 'rgb(0, 93, 171)'}
)
trace1 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [svmdict4['500'][3]*100, svmdict4['1000'][3]*100, svmdict4['1500'][3]*100, svmdict4['2500'][3]*100, svmdict4['5000'][3]*100],
    mode = 'lines+markers',
    legendgroup="2",
    marker= {'color': 'rgb(255, 127, 14)'},
    name = 'Linear SVM'
)
trace2 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [lrdict4['500'][3]*100, lrdict4['1000'][3]*100, lrdict4['1500'][3]*100, lrdict4['2500'][3]*100, lrdict4['5000'][3]*100],
    mode = 'lines+markers',
    name = 'Logistic Regression',
    legendgroup="3",
    marker= {'color': 'rgb(74, 174, 74)'}
)
trace3 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [mlpdict4['500'][3]*100, mlpdict4['1000'][3]*100, mlpdict4['1500'][3]*100, mlpdict4['2500'][3]*100, mlpdict4['5000'][3]*100],
    mode = 'lines+markers',
    name = 'Neural net',
    legendgroup="4",
    marker= {'color': 'rgb(214, 39, 40)'}
)

layout = Layout(
    title= 'Comparison of classification F1-score for different algorithms (2C_B_SWR_PR)',
    hovermode= 'closest',
    autosize=False,
    width=1000,
    height=700,
    xaxis= dict(
        title= 'Feature size',
        ticklen= 5,
        zeroline= False,
        gridwidth= 2,
    ),
    yaxis=dict(
        title= 'F1-score in percentage',
        ticklen= 5,
        range=[40,100],
        gridwidth= 2
    )
)

data = [trace0, trace1, trace2, trace3]
fig = Figure(data=data, layout=layout)
iplot(fig)

### 5. Run Naïve Bayes, Linear SVM, Logistic Regression and MLP algorithm on using tfidf vectors with bigrams(only) (B), stop words removed (SWR) and vary the feature size in 500, 1000, 1500, 2500 and 5000, while handling negations (HN).

**NOTE**: The mark negation function takes a about 10 mins to run for each feature size. For 5 feature sizes it takes around 50 mins to get the tfidf vectors with negations marked and another 10 mins to run the classifiers.

In [52]:
start_time = timer()
tfidf_vect = TfidfVectorizer(analyzer="word", tokenizer=lambda text: mark_negation(word_tokenize(text)), stop_words="english", max_features = 500, ngram_range=(2, 2), norm='l2')
tokens500B = tfidf_vect.fit_transform(df_posneg["text"].tolist())
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Finished in : 416.23 seconds


In [53]:
start_time = timer()
tfidf_vect = TfidfVectorizer(analyzer="word", tokenizer=lambda text: mark_negation(word_tokenize(text)), stop_words="english", max_features = 1000, ngram_range=(2, 2), norm='l2')
tokens1000B = tfidf_vect.fit_transform(df_posneg["text"].tolist())
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Finished in : 415.9 seconds


In [54]:
start_time = timer()
tfidf_vect = TfidfVectorizer(analyzer="word", tokenizer=lambda text: mark_negation(word_tokenize(text)), stop_words="english", max_features = 1500, ngram_range=(2, 2), norm='l2')
tokens1500B = tfidf_vect.fit_transform(df_posneg["text"].tolist())
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Finished in : 414.8 seconds


In [55]:
start_time = timer()
tfidf_vect = TfidfVectorizer(analyzer="word", tokenizer=lambda text: mark_negation(word_tokenize(text)), stop_words="english", max_features = 2500, ngram_range=(2, 2), norm='l2')
tokens2500B = tfidf_vect.fit_transform(df_posneg["text"].tolist())
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Finished in : 405.35 seconds


In [56]:
start_time = timer()
tfidf_vect = TfidfVectorizer(analyzer="word", tokenizer=lambda text: mark_negation(word_tokenize(text)), stop_words="english", max_features = 5000, ngram_range=(2, 2), norm='l2')
tokens5000B = tfidf_vect.fit_transform(df_posneg["text"].tolist())
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Finished in : 404.84 seconds


In [58]:
def runClassifiersBN2C(showRes = False):
    
    mf_array = [500,1000,1500,2500,5000]
    nbdict = {}
    svmdict = {}
    lrdict = {}
    mlpdict = {}
    
    for MaxFeatures in mf_array:
        if MaxFeatures == 500:
            tokens = tokens500B
        elif MaxFeatures == 1000:
            tokens = tokens1000B
        elif MaxFeatures == 1500:
            tokens = tokens1500B
        elif MaxFeatures == 2500:
            tokens = tokens2500B
        elif MaxFeatures == 5000:
            tokens = tokens5000B
            
        print("Running classifiers for feature size : {}".format(MaxFeatures))
                
        labels = df_posneg["rating"].tolist()

        trainX = vstack([tokens[0:70000], tokens[100000:170000]])
        testX = vstack([tokens[70000:100000], tokens[170000:200000]])
        trainY = labels[0:70000] + labels[100000:170000]
        testY = labels[70000:100000] + labels[170000:200000]

        # run naive bayes classifier
        start_time = timer()
        clf = MultinomialNB().fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYMNB = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYMNB),3)
        precision = round(sklearn.metrics.precision_score(testY, predYMNB, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYMNB, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYMNB, average='macro'),3)
        nbdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]
        

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYMNB, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)    
            print("*************Naive Bayes Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # run linear SVM classifier
        start_time = timer()
        clf = SGDClassifier().fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYSVC = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYSVC),3)
        precision = round(sklearn.metrics.precision_score(testY, predYSVC, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYSVC, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYSVC, average='macro'),3)
        svmdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYSVC, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            print("\n*************Linear SVM Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # run logistic regression classifier
        start_time = timer()
        clf = SGDClassifier('log').fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYLR = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYLR),3)
        precision = round(sklearn.metrics.precision_score(testY, predYLR, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYLR, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYLR, average='macro'),3)
        lrdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYLR, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            print("\n*************Logitic Regression Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # transform the labels into one hot vector form
        label_encoder = LabelEncoder()
        trainYI = label_encoder.fit_transform(trainY)
        onehot_encoder = OneHotEncoder(sparse=False)
        onehot_encoder = OneHotEncoder(sparse=False)
        trainYI = trainYI.reshape(len(trainYI), 1)
        trainYC = onehot_encoder.fit_transform(trainYI)

        testYI = label_encoder.fit_transform(testY)
        testYI = testYI.reshape(len(testYI), 1)
        testYC = onehot_encoder.fit_transform(testYI)

        # define the model
        x     = Input(shape = (trainX.shape[1], ), dtype = 'float32', sparse = True)     
        d1    = Dense(256, activation='relu')(x)
        b1    = BatchNormalization()(d1)
        d2    = Dropout(0.5)(b1)
        d3    = Dense(256, activation='relu')(d2)
        b2    = BatchNormalization()(d2)
        d4    = Dropout(0.5)(b2)
        out   = Dense(2, activation = 'softmax')(d4)
        model = Model(x,out)

        # train the model
        model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
        start_time = timer()
        model.fit(x=trainX, y=trainYC, batch_size=1000, epochs=20, verbose=0)
        train_time = round(timer()-start_time,2)

        # Predict on the test using the trained model
        start_time = timer()
        pred_probs = model.predict(x=testX, batch_size=1000, verbose=0)
        test_time = round(timer()-start_time,2)
        # process the predicted probailities to get the final labels encodedLabs
        encodedLabs = (label_encoder.inverse_transform(np.array([[int(p.argmax())] for p in pred_probs], dtype=np.int64))).tolist()
        # flatten the list
        predLabs = list(itertools.chain.from_iterable(encodedLabs))
        accuracy = round(sklearn.metrics.accuracy_score(testY, predLabs),3)
        precision = round(sklearn.metrics.precision_score(testY, predLabs, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predLabs, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predLabs, average='macro'),3)
        mlpdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            # Print the performance metrics
            print("\n*************MLP Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

            # print the confusion matrix
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predLabs, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            display(df_confusion)
            
    return nbdict, svmdict, lrdict, mlpdict

In [59]:
start_time = timer()
nbdict5, svmdict5, lrdict5, mlpdict5 = runClassifiersBN2C()
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Running classifiers for feature size : 500
Running classifiers for feature size : 1000
Running classifiers for feature size : 1500
Running classifiers for feature size : 2500
Running classifiers for feature size : 5000
Finished in : 274.84 seconds


In [59]:
### RUN THIS CELL TO LOAD THE SAVED RESULTS
### **This assumes the Results folder is in the current working directory

# Save the results
# np.save("Results\\Two_class\\B_SWR_HN\\nbdict.npy", nbdict5)
# np.save("Results\\Two_class\\B_SWR_HN\\svmdict.npy", svmdict5)
# np.save("Results\\Two_class\\B_SWR_HN\\lrdict.npy", lrdict5)
# np.save("Results\\Two_class\\B_SWR_HN\\mlpdict.npy", mlpdict5)

## to load results from file
nbdict5 = np.load("Results\\Two_class\\B_SWR_HN\\nbdict.npy").item()
svmdict5 = np.load("Results\\Two_class\\B_SWR_HN\\svmdict.npy").item()
lrdict5 = np.load("Results\\Two_class\\B_SWR_HN\\lrdict.npy").item()
mlpdict5 = np.load("Results\\Two_class\\B_SWR_HN\\mlpdict.npy").item()

#### Plot the accuracy and f1-score.

In [61]:
## PLOT THE ACCURACY
# Create traces
trace0 = Scatter(
    y = [nbdict5['500'][0]*100, nbdict5['1000'][0]*100, nbdict5['1500'][0]*100, nbdict5['2500'][0]*100, nbdict5['5000'][0]*100],
    x = [500,1000,1500,2500,5000],
    mode = 'lines+markers',
    legendgroup="1",
    name = 'Naive Bayes',
    marker= {'color': 'rgb(0, 93, 171)'}
)
trace1 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [svmdict5['500'][0]*100, svmdict5['1000'][0]*100, svmdict5['1500'][0]*100, svmdict5['2500'][0]*100, svmdict5['5000'][0]*100],
    mode = 'lines+markers',
    legendgroup="2",
    marker= {'color': 'rgb(255, 127, 14)'},
    name = 'Linear SVM'
)
trace2 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [lrdict5['500'][0]*100, lrdict5['1000'][0]*100, lrdict5['1500'][0]*100, lrdict5['2500'][0]*100, lrdict5['5000'][0]*100],
    mode = 'lines+markers',
    name = 'Logistic Regression',
    legendgroup="3",
    marker= {'color': 'rgb(74, 174, 74)'}
)
trace3 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [mlpdict5['500'][0]*100, mlpdict5['1000'][0]*100, mlpdict5['1500'][0]*100, mlpdict5['2500'][0]*100, mlpdict5['5000'][0]*100],
    mode = 'lines+markers',
    name = 'Neural net',
    legendgroup="4",
    marker= {'color': 'rgb(214, 39, 40)'}
)

layout = Layout(
    title= 'Comparison of classification accuracy for different algorithms (2C_B_SWR_HN)',
    hovermode= 'closest',
    autosize=False,
    width=1000,
    height=700,
    xaxis= dict(
        title= 'Feature size',
        ticklen= 5,
        zeroline= False,
        gridwidth= 2,
    ),
    yaxis=dict(
        title= 'Accuracy in percentage',
        ticklen= 5,
        range=[40,100],
        gridwidth= 2
    )
)

data = [trace0, trace1, trace2, trace3]
fig = Figure(data=data, layout=layout)
iplot(fig)

In [62]:
## PLOT THE F1-score
# Create traces
trace0 = Scatter(
    y = [nbdict5['500'][3]*100, nbdict5['1000'][3]*100, nbdict5['1500'][3]*100, nbdict5['2500'][3]*100, nbdict5['5000'][3]*100],
    x = [500,1000,1500,2500,5000],
    mode = 'lines+markers',
    legendgroup="1",
    name = 'Naive Bayes',
    marker= {'color': 'rgb(0, 93, 171)'}
)
trace1 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [svmdict5['500'][3]*100, svmdict5['1000'][3]*100, svmdict5['1500'][3]*100, svmdict5['2500'][3]*100, svmdict5['5000'][3]*100],
    mode = 'lines+markers',
    legendgroup="2",
    marker= {'color': 'rgb(255, 127, 14)'},
    name = 'Linear SVM'
)
trace2 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [lrdict5['500'][3]*100, lrdict5['1000'][3]*100, lrdict5['1500'][3]*100, lrdict5['2500'][3]*100, lrdict5['5000'][3]*100],
    mode = 'lines+markers',
    name = 'Logistic Regression',
    legendgroup="3",
    marker= {'color': 'rgb(74, 174, 74)'}
)
trace3 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [mlpdict5['500'][3]*100, mlpdict5['1000'][3]*100, mlpdict5['1500'][3]*100, mlpdict5['2500'][3]*100, mlpdict5['5000'][3]*100],
    mode = 'lines+markers',
    name = 'Neural net',
    legendgroup="4",
    marker= {'color': 'rgb(214, 39, 40)'}
)

layout = Layout(
    title= 'Comparison of classification F1-score for different algorithms (2C_B_SWR_HN)',
    hovermode= 'closest',
    autosize=False,
    width=1000,
    height=700,
    xaxis= dict(
        title= 'Feature size',
        ticklen= 5,
        zeroline= False,
        gridwidth= 2,
    ),
    yaxis=dict(
        title= 'F1-score in percentage',
        ticklen= 5,
        range=[40,100],
        gridwidth= 2
    )
)

data = [trace0, trace1, trace2, trace3]
fig = Figure(data=data, layout=layout)
iplot(fig)

### 6. Run Naïve Bayes, Linear SVM, Logistic Regression and MLP algorithm on using tfidf vectors with unigrams+bigrams (UB), stop words removed (SWR), punctuations removed (tfidfvectorizer default setting) (PR) and vary the feature size in 500, 1000, 1500, 2500 and 5000.

In [61]:
def runClassifiersUB2C(showRes = False):
    
    mf_array = [500,1000,1500,2500,5000]
    nbdict = {}
    svmdict = {}
    lrdict = {}
    mlpdict = {}
    
    for MaxFeatures in mf_array:
        print("Running classifiers for feature size : {}".format(MaxFeatures))
        tfidf_vect = TfidfVectorizer(stop_words="english", max_features = MaxFeatures, ngram_range=(1, 2), norm='l2')
        tokens = tfidf_vect.fit_transform(df_posneg["text"].tolist())
        labels = df_posneg["rating"].tolist()

        trainX = vstack([tokens[0:70000], tokens[100000:170000]])
        testX = vstack([tokens[70000:100000], tokens[170000:200000]])
        trainY = labels[0:70000] + labels[100000:170000] 
        testY = labels[70000:100000] + labels[170000:200000]

        # run naive bayes classifier
        start_time = timer()
        clf = MultinomialNB().fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYMNB = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYMNB),3)
        precision = round(sklearn.metrics.precision_score(testY, predYMNB, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYMNB, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYMNB, average='macro'),3)
        nbdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]
        

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYMNB, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)    
            print("*************Naive Bayes Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # run linear SVM classifier
        start_time = timer()
        clf = SGDClassifier().fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYSVC = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYSVC),3)
        precision = round(sklearn.metrics.precision_score(testY, predYSVC, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYSVC, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYSVC, average='macro'),3)
        svmdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYSVC, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            print("\n*************Linear SVM Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # run logistic regression classifier
        start_time = timer()
        clf = SGDClassifier('log').fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYLR = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYLR),3)
        precision = round(sklearn.metrics.precision_score(testY, predYLR, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYLR, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYLR, average='macro'),3)
        lrdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYLR, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            print("\n*************Logitic Regression Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # transform the labels into one hot vector form
        label_encoder = LabelEncoder()
        trainYI = label_encoder.fit_transform(trainY)
        onehot_encoder = OneHotEncoder(sparse=False)
        onehot_encoder = OneHotEncoder(sparse=False)
        trainYI = trainYI.reshape(len(trainYI), 1)
        trainYC = onehot_encoder.fit_transform(trainYI)

        testYI = label_encoder.fit_transform(testY)
        testYI = testYI.reshape(len(testYI), 1)
        testYC = onehot_encoder.fit_transform(testYI)

        # define the model
        x     = Input(shape = (trainX.shape[1], ), dtype = 'float32', sparse = True)     
        d1    = Dense(256, activation='relu')(x)
        b1    = BatchNormalization()(d1)
        d2    = Dropout(0.5)(b1)
        d3    = Dense(256, activation='relu')(d2)
        b2    = BatchNormalization()(d2)
        d4    = Dropout(0.5)(b2)
        out   = Dense(2, activation = 'softmax')(d4)
        model = Model(x,out)

        # train the model
        model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
        start_time = timer()
        model.fit(x=trainX, y=trainYC, batch_size=1000, epochs=20, verbose=0)
        train_time = round(timer()-start_time,2)

        # Predict on the test using the trained model
        start_time = timer()
        pred_probs = model.predict(x=testX, batch_size=1000, verbose=0)
        test_time = round(timer()-start_time,2)
        # process the predicted probailities to get the final labels encodedLabs
        encodedLabs = (label_encoder.inverse_transform(np.array([[int(p.argmax())] for p in pred_probs], dtype=np.int64))).tolist()
        # flatten the list
        predLabs = list(itertools.chain.from_iterable(encodedLabs))
        accuracy = round(sklearn.metrics.accuracy_score(testY, predLabs),3)
        precision = round(sklearn.metrics.precision_score(testY, predLabs, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predLabs, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predLabs, average='macro'),3)
        mlpdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            # Print the performance metrics
            print("\n*************MLP Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

            # print the confusion matrix
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predLabs, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            display(df_confusion)
            
    return nbdict, svmdict, lrdict, mlpdict

In [62]:
start_time = timer()
nbdict6, svmdict6, lrdict6, mlpdict6 = runClassifiersUB2C()
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Running classifiers for feature size : 500
Running classifiers for feature size : 1000
Running classifiers for feature size : 1500
Running classifiers for feature size : 2500
Running classifiers for feature size : 5000
Finished in : 671.42 seconds


In [63]:
### RUN THIS CELL TO LOAD THE SAVED RESULTS
### **This assumes the Results folder is in the current working directory

# Save the results
# np.save("Results\\Two_class\\UB_SWR_PR\\nbdict.npy", nbdict6)
# np.save("Results\\Two_class\\UB_SWR_PR\\svmdict.npy", svmdict6)
# np.save("Results\\Two_class\\UB_SWR_PR\\lrdict.npy", lrdict6)
# np.save("Results\\Two_class\\UB_SWR_PR\\mlpdict.npy", mlpdict6)

## to load results from file
nbdict6 = np.load("Results\\Two_class\\UB_SWR_PR\\nbdict.npy").item()
svmdict6 = np.load("Results\\Two_class\\UB_SWR_PR\\svmdict.npy").item()
lrdict6 = np.load("Results\\Two_class\\UB_SWR_PR\\lrdict.npy").item()
mlpdict6 = np.load("Results\\Two_class\\UB_SWR_PR\\mlpdict.npy").item()

#### Plot the accuracy and f1-score.

In [64]:
## PLOT THE ACCURACY
# Create traces
trace0 = Scatter(
    y = [nbdict6['500'][0]*100, nbdict6['1000'][0]*100, nbdict6['1500'][0]*100, nbdict6['2500'][0]*100, nbdict6['5000'][0]*100],
    x = [500,1000,1500,2500,5000],
    mode = 'lines+markers',
    legendgroup="1",
    name = 'Naive Bayes',
    marker= {'color': 'rgb(0, 93, 171)'}
)
trace1 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [svmdict6['500'][0]*100, svmdict6['1000'][0]*100, svmdict6['1500'][0]*100, svmdict6['2500'][0]*100, svmdict6['5000'][0]*100],
    mode = 'lines+markers',
    legendgroup="2",
    marker= {'color': 'rgb(255, 127, 14)'},
    name = 'Linear SVM'
)
trace2 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [lrdict6['500'][0]*100, lrdict6['1000'][0]*100, lrdict6['1500'][0]*100, lrdict6['2500'][0]*100, lrdict6['5000'][0]*100],
    mode = 'lines+markers',
    name = 'Logistic Regression',
    legendgroup="3",
    marker= {'color': 'rgb(74, 174, 74)'}
)
trace3 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [mlpdict6['500'][0]*100, mlpdict6['1000'][0]*100, mlpdict6['1500'][0]*100, mlpdict6['2500'][0]*100, mlpdict6['5000'][0]*100],
    mode = 'lines+markers',
    name = 'Neural net',
    legendgroup="4",
    marker= {'color': 'rgb(214, 39, 40)'}
)

layout = Layout(
    title= 'Comparison of classification accuracy for different algorithms (2C_UB_SWR_PR)',
    hovermode= 'closest',
    autosize=False,
    width=1000,
    height=700,
    xaxis= dict(
        title= 'Feature size',
        ticklen= 5,
        zeroline= False,
        gridwidth= 2,
    ),
    yaxis=dict(
        title= 'Accuracy in percentage',
        ticklen= 5,
        range=[40,100],
        gridwidth= 2
    )
)

data = [trace0, trace1, trace2, trace3]
fig = Figure(data=data, layout=layout)
iplot(fig)

In [65]:
## PLOT THE F1-score
# Create traces
trace0 = Scatter(
    y = [nbdict6['500'][3]*100, nbdict6['1000'][3]*100, nbdict6['1500'][3]*100, nbdict6['2500'][3]*100, nbdict6['5000'][3]*100],
    x = [500,1000,1500,2500,5000],
    mode = 'lines+markers',
    legendgroup="1",
    name = 'Naive Bayes',
    marker= {'color': 'rgb(0, 93, 171)'}
)
trace1 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [svmdict6['500'][3]*100, svmdict6['1000'][3]*100, svmdict6['1500'][3]*100, svmdict6['2500'][3]*100, svmdict6['5000'][3]*100],
    mode = 'lines+markers',
    legendgroup="2",
    marker= {'color': 'rgb(255, 127, 14)'},
    name = 'Linear SVM'
)
trace2 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [lrdict6['500'][3]*100, lrdict6['1000'][3]*100, lrdict6['1500'][3]*100, lrdict6['2500'][3]*100, lrdict6['5000'][3]*100],
    mode = 'lines+markers',
    name = 'Logistic Regression',
    legendgroup="3",
    marker= {'color': 'rgb(74, 174, 74)'}
)
trace3 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [mlpdict6['500'][3]*100, mlpdict6['1000'][3]*100, mlpdict6['1500'][3]*100, mlpdict6['2500'][3]*100, mlpdict6['5000'][3]*100],
    mode = 'lines+markers',
    name = 'Neural net',
    legendgroup="4",
    marker= {'color': 'rgb(214, 39, 40)'}
)

layout = Layout(
    title= 'Comparison of classification F1-score for different algorithms (2C_UB_SWR_PR)',
    hovermode= 'closest',
    autosize=False,
    width=1000,
    height=700,
    xaxis= dict(
        title= 'Feature size',
        ticklen= 5,
        zeroline= False,
        gridwidth= 2,
    ),
    yaxis=dict(
        title= 'F1-score in percentage',
        ticklen= 5,
        range=[40,100],
        gridwidth= 2
    )
)

data = [trace0, trace1, trace2, trace3]
fig = Figure(data=data, layout=layout)
iplot(fig)

### 7. Run Naïve Bayes, Linear SVM, Logistic Regression and MLP algorithm on using tfidf vectors with unigrams+bigrams (UB), stop words removed (SWR) and vary the feature size in 500, 1000, 1500, 2500 and 5000, while handling the negations (HN).

**NOTE**: The mark negation function takes a about 10 mins to run for each feature size. For 5 feature sizes it takes around 50 mins to get the tfidf vectors with negations marked and another 10 mins to run the classifiers.

In [64]:
start_time = timer()
tfidf_vect = TfidfVectorizer(analyzer="word", tokenizer=lambda text: mark_negation(word_tokenize(text)), stop_words="english", max_features = 500, ngram_range=(1, 2), norm='l2')
tokens500UB = tfidf_vect.fit_transform(df_posneg["text"].tolist())
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Finished in : 438.46 seconds


In [65]:
start_time = timer()
tfidf_vect = TfidfVectorizer(analyzer="word", tokenizer=lambda text: mark_negation(word_tokenize(text)), stop_words="english", max_features = 1000, ngram_range=(1, 2), norm='l2')
tokens1000UB = tfidf_vect.fit_transform(df_posneg["text"].tolist())
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Finished in : 431.92 seconds


In [66]:
start_time = timer()
tfidf_vect = TfidfVectorizer(analyzer="word", tokenizer=lambda text: mark_negation(word_tokenize(text)), stop_words="english", max_features = 1500, ngram_range=(1, 2), norm='l2')
tokens1500UB = tfidf_vect.fit_transform(df_posneg["text"].tolist())
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Finished in : 432.46 seconds


In [67]:
start_time = timer()
tfidf_vect = TfidfVectorizer(analyzer="word", tokenizer=lambda text: mark_negation(word_tokenize(text)), stop_words="english", max_features = 2500, ngram_range=(1, 2), norm='l2')
tokens2500UB = tfidf_vect.fit_transform(df_posneg["text"].tolist())
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Finished in : 433.65 seconds


In [68]:
start_time = timer()
tfidf_vect = TfidfVectorizer(analyzer="word", tokenizer=lambda text: mark_negation(word_tokenize(text)), stop_words="english", max_features = 5000, ngram_range=(1, 2), norm='l2')
tokens5000UB = tfidf_vect.fit_transform(df_posneg["text"].tolist())
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Finished in : 429.39 seconds


In [70]:
def runClassifiersUBN2C(showRes = False):
    
    mf_array = [500,1000,1500,2500,5000]
    nbdict = {}
    svmdict = {}
    lrdict = {}
    mlpdict = {}
    
    for MaxFeatures in mf_array:
        if MaxFeatures == 500:
            tokens = tokens500UB
        elif MaxFeatures == 1000:
            tokens = tokens1000UB
        elif MaxFeatures == 1500:
            tokens = tokens1500UB
        elif MaxFeatures == 2500:
            tokens = tokens2500UB
        elif MaxFeatures == 5000:
            tokens = tokens5000UB
            
        print("Running classifiers for feature size : {}".format(MaxFeatures))
                
        labels = df_posneg["rating"].tolist()

        trainX = vstack([tokens[0:70000], tokens[100000:170000]])
        testX = vstack([tokens[70000:100000], tokens[170000:200000]])
        trainY = labels[0:70000] + labels[100000:170000] 
        testY = labels[70000:100000] + labels[170000:200000]

        # run naive bayes classifier
        start_time = timer()
        clf = MultinomialNB().fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYMNB = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYMNB),3)
        precision = round(sklearn.metrics.precision_score(testY, predYMNB, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYMNB, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYMNB, average='macro'),3)
        nbdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]
        

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYMNB, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)    
            print("*************Naive Bayes Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # run linear SVM classifier
        start_time = timer()
        clf = SGDClassifier().fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYSVC = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYSVC),3)
        precision = round(sklearn.metrics.precision_score(testY, predYSVC, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYSVC, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYSVC, average='macro'),3)
        svmdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYSVC, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            print("\n*************Linear SVM Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # run logistic regression classifier
        start_time = timer()
        clf = SGDClassifier('log').fit(trainX, trainY)
        train_time = round(timer()-start_time,2)
        start_time = timer()
        predYLR = clf.predict(testX)
        test_time = round(timer()-start_time,2)
        accuracy = round(sklearn.metrics.accuracy_score(testY, predYLR),3)
        precision = round(sklearn.metrics.precision_score(testY, predYLR, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predYLR, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predYLR, average='macro'),3)
        lrdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predYLR, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            print("\n*************Logitic Regression Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

        # transform the labels into one hot vector form
        label_encoder = LabelEncoder()
        trainYI = label_encoder.fit_transform(trainY)
        onehot_encoder = OneHotEncoder(sparse=False)
        onehot_encoder = OneHotEncoder(sparse=False)
        trainYI = trainYI.reshape(len(trainYI), 1)
        trainYC = onehot_encoder.fit_transform(trainYI)

        testYI = label_encoder.fit_transform(testY)
        testYI = testYI.reshape(len(testYI), 1)
        testYC = onehot_encoder.fit_transform(testYI)

        # define the model
        x     = Input(shape = (trainX.shape[1], ), dtype = 'float32', sparse = True)     
        d1    = Dense(256, activation='relu')(x)
        b1    = BatchNormalization()(d1)
        d2    = Dropout(0.5)(b1)
        d3    = Dense(256, activation='relu')(d2)
        b2    = BatchNormalization()(d2)
        d4    = Dropout(0.5)(b2)
        out   = Dense(2, activation = 'softmax')(d4)
        model = Model(x,out)

        # train the model
        model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
        start_time = timer()
        model.fit(x=trainX, y=trainYC, batch_size=1000, epochs=20, verbose=0)
        train_time = round(timer()-start_time,2)

        # Predict on the test using the trained model
        start_time = timer()
        pred_probs = model.predict(x=testX, batch_size=1000, verbose=0)
        test_time = round(timer()-start_time,2)
        # process the predicted probailities to get the final labels encodedLabs
        encodedLabs = (label_encoder.inverse_transform(np.array([[int(p.argmax())] for p in pred_probs], dtype=np.int64))).tolist()
        # flatten the list
        predLabs = list(itertools.chain.from_iterable(encodedLabs))
        accuracy = round(sklearn.metrics.accuracy_score(testY, predLabs),3)
        precision = round(sklearn.metrics.precision_score(testY, predLabs, average='macro'),3)
        recall = round(sklearn.metrics.recall_score(testY, predLabs, average='macro'),3)
        f1score = round(sklearn.metrics.f1_score(testY, predLabs, average='macro'),3)
        mlpdict[str(MaxFeatures)] = [accuracy, precision, recall, f1score, train_time, test_time]

        if showRes:
            # Print the performance metrics
            print("\n*************MLP Classfier*************")
            print("Accuracy: {}".format(accuracy))
            print("Macro averaged precision score: {}".format(precision))
            print("Macro averaged recall score: {}".format(recall))
            print("Macro averaged f-1 score: {}".format(f1score))
            display(df_confusion)

            # print the confusion matrix
            test_y = pd.Series(testY, name='Actual')
            pred_y = pd.Series(predLabs, name='Predicted')
            df_confusion = pd.crosstab(test_y, pred_y)
            display(df_confusion)
            
    return nbdict, svmdict, lrdict, mlpdict

In [71]:
start_time = timer()
nbdict7, svmdict7, lrdict7, mlpdict7 = runClassifiersUBN2C()
print("Finished in : {} seconds".format(round(timer()-start_time,2)))

Running classifiers for feature size : 500
Running classifiers for feature size : 1000
Running classifiers for feature size : 1500
Running classifiers for feature size : 2500
Running classifiers for feature size : 5000
Finished in : 422.93 seconds


In [66]:
### RUN THIS CELL TO LOAD THE SAVED RESULTS
### **This assumes the Results folder is in the current working directory

# Save the results
# np.save("Results\\Two_class\\UB_SWR_HN\\nbdict.npy", nbdict7)
# np.save("Results\\Two_class\\UB_SWR_HN\\svmdict.npy", svmdict7)
# np.save("Results\\Two_class\\UB_SWR_HN\\lrdict.npy", lrdict7)
# np.save("Results\\Two_class\\UB_SWR_HN\\mlpdict.npy", mlpdict7)

## to load results from file
nbdict7 = np.load("Results\\Two_class\\UB_SWR_HN\\nbdict.npy").item()
svmdict7 = np.load("Results\\Two_class\\UB_SWR_HN\\svmdict.npy").item()
lrdict7 = np.load("Results\\Two_class\\UB_SWR_HN\\lrdict.npy").item()
mlpdict7 = np.load("Results\\Two_class\\UB_SWR_HN\\mlpdict.npy").item()

#### Plot the accuracy and f1-score.

In [67]:
## PLOT THE ACCURACY
# Create traces
trace0 = Scatter(
    y = [nbdict7['500'][0]*100, nbdict7['1000'][0]*100, nbdict7['1500'][0]*100, nbdict7['2500'][0]*100, nbdict7['5000'][0]*100],
    x = [500,1000,1500,2500,5000],
    mode = 'lines+markers',
    legendgroup="1",
    name = 'Naive Bayes',
    marker= {'color': 'rgb(0, 93, 171)'}
)
trace1 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [svmdict7['500'][0]*100, svmdict7['1000'][0]*100, svmdict7['1500'][0]*100, svmdict7['2500'][0]*100, svmdict7['5000'][0]*100],
    mode = 'lines+markers',
    legendgroup="2",
    marker= {'color': 'rgb(255, 127, 14)'},
    name = 'Linear SVM'
)
trace2 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [lrdict7['500'][0]*100, lrdict7['1000'][0]*100, lrdict7['1500'][0]*100, lrdict7['2500'][0]*100, lrdict7['5000'][0]*100],
    mode = 'lines+markers',
    name = 'Logistic Regression',
    legendgroup="3",
    marker= {'color': 'rgb(74, 174, 74)'}
)
trace3 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [mlpdict7['500'][0]*100, mlpdict7['1000'][0]*100, mlpdict7['1500'][0]*100, mlpdict7['2500'][0]*100, mlpdict7['5000'][0]*100],
    mode = 'lines+markers',
    name = 'Neural net',
    legendgroup="4",
    marker= {'color': 'rgb(214, 39, 40)'}
)

layout = Layout(
    title= 'Comparison of classification accuracy for different algorithms (2C_UB_SWR_HN)',
    hovermode= 'closest',
    autosize=False,
    width=1000,
    height=700,
    xaxis= dict(
        title= 'Feature size',
        ticklen= 5,
        zeroline= False,
        gridwidth= 2,
    ),
    yaxis=dict(
        title= 'Accuracy in percentage',
        ticklen= 5,
        range=[40,100],
        gridwidth= 2
    )
)

data = [trace0, trace1, trace2, trace3]
fig = Figure(data=data, layout=layout)
iplot(fig)

In [68]:
## PLOT THE F1-score
# Create traces
trace0 = Scatter(
    y = [nbdict7['500'][3]*100, nbdict7['1000'][3]*100, nbdict7['1500'][3]*100, nbdict7['2500'][3]*100, nbdict7['5000'][3]*100],
    x = [500,1000,1500,2500,5000],
    mode = 'lines+markers',
    legendgroup="1",
    name = 'Naive Bayes',
    marker= {'color': 'rgb(0, 93, 171)'}
)
trace1 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [svmdict7['500'][3]*100, svmdict7['1000'][3]*100, svmdict7['1500'][3]*100, svmdict7['2500'][3]*100, svmdict7['5000'][3]*100],
    mode = 'lines+markers',
    legendgroup="2",
    marker= {'color': 'rgb(255, 127, 14)'},
    name = 'Linear SVM'
)
trace2 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [lrdict7['500'][3]*100, lrdict7['1000'][3]*100, lrdict7['1500'][3]*100, lrdict7['2500'][3]*100, lrdict7['5000'][3]*100],
    mode = 'lines+markers',
    name = 'Logistic Regression',
    legendgroup="3",
    marker= {'color': 'rgb(74, 174, 74)'}
)
trace3 = Scatter(
    x = [500,1000,1500,2500,5000],
    y = [mlpdict7['500'][3]*100, mlpdict7['1000'][3]*100, mlpdict7['1500'][3]*100, mlpdict7['2500'][3]*100, mlpdict7['5000'][3]*100],
    mode = 'lines+markers',
    name = 'Neural net',
    legendgroup="4",
    marker= {'color': 'rgb(214, 39, 40)'}
)

layout = Layout(
    title= 'Comparison of classification F1-score for different algorithms (2C_UB_SWR_HN)',
    hovermode= 'closest',
    autosize=False,
    width=1000,
    height=700,
    xaxis= dict(
        title= 'Feature size',
        ticklen= 5,
        zeroline= False,
        gridwidth= 2,
    ),
    yaxis=dict(
        title= 'F1-score in percentage',
        ticklen= 5,
        range=[40,100],
        gridwidth= 2
    )
)

data = [trace0, trace1, trace2, trace3]
fig = Figure(data=data, layout=layout)
iplot(fig)

### 8. Run Naïve Bayes, Linear SVM, Logistic Regression and MLP algorithm by using a trained (on the same yelp data) word2vec vector model. The final resulting vector is the frequency weighted average of all word vectors.

In [73]:
# Tokenize the reviews
st = timer()
tokenized_corpus = [word_tokenize(w) for w in df_posneg["text"]]
timer() - st

308.916084148148

In [74]:
# Train a word2vec model on the tokenized corpus
st = timer()
model = gensim.models.Word2Vec(tokenized_corpus, size=100, window=10, min_count=5, workers=12)
timer() - st

84.42857916049434

In [75]:
# Generate word vectors for each review and
# get a weighted sum by frequency of words
stop_words = set(stopwords.words('english'))

start_time = timer()
word_vecs = []
for i in range(len(df_posneg["text"])):
    # Filter out stop words or words not in the trained model
    word = [w for w in tokenized_corpus[i] if w not in stop_words and w.isalpha() and w in model.wv.vocab]
    C = Counter(word)
    if len(word) == 0:
        # If none of the words in the review are in the model
        # the final vector is assigned as a zero vector
        word_vecs.append(np.zeros((100,)))
    else:
        word_vecs.append(np.average([model.wv.get_vector(w) * C[w] for w in word], axis=0, weights=[C[w] for w in word]))
        
print("Finished in : {} seconds\n".format(round(timer()-start_time,2)))

Finished in : 113.12 seconds



In [76]:
# We skip the naive bayes algorithm as it doesn't work with negative values
def runClassifiersW2V2C(showRes = False):
    
    labels = df_posneg["rating"].tolist()

    trainX = np.asarray(word_vecs[0:70000] + word_vecs[100000:170000])
    testX = np.asarray(word_vecs[70000:100000] + word_vecs[170000:200000])
    trainY = labels[0:70000] + labels[100000:170000] 
    testY = labels[70000:100000] + labels[170000:200000]

    # run linear SVM classifier
    start_time = timer()
    clf = SGDClassifier().fit(trainX, trainY)
    train_time = round(timer()-start_time,2)
    start_time = timer()
    predYSVC = clf.predict(testX)
    test_time = round(timer()-start_time,2)
    accuracy = round(sklearn.metrics.accuracy_score(testY, predYSVC),3)
    precision = round(sklearn.metrics.precision_score(testY, predYSVC, average='macro'),3)
    recall = round(sklearn.metrics.recall_score(testY, predYSVC, average='macro'),3)
    f1score = round(sklearn.metrics.f1_score(testY, predYSVC, average='macro'),3)
    svmlist = [accuracy, precision, recall, f1score, train_time, test_time]

    if showRes:
        test_y = pd.Series(testY, name='Actual')
        pred_y = pd.Series(predYSVC, name='Predicted')
        df_confusion = pd.crosstab(test_y, pred_y)
        print("\n*************Linear SVM Classfier*************")
        print("Accuracy: {}".format(accuracy))
        print("Macro averaged precision score: {}".format(precision))
        print("Macro averaged recall score: {}".format(recall))
        print("Macro averaged f-1 score: {}".format(f1score))
        display(df_confusion)

    # run logistic regression classifier
    start_time = timer()
    clf = SGDClassifier('log').fit(trainX, trainY)
    train_time = round(timer()-start_time,2)
    start_time = timer()
    predYLR = clf.predict(testX)
    test_time = round(timer()-start_time,2)
    accuracy = round(sklearn.metrics.accuracy_score(testY, predYLR),3)
    precision = round(sklearn.metrics.precision_score(testY, predYLR, average='macro'),3)
    recall = round(sklearn.metrics.recall_score(testY, predYLR, average='macro'),3)
    f1score = round(sklearn.metrics.f1_score(testY, predYLR, average='macro'),3)
    lrlist = [accuracy, precision, recall, f1score, train_time, test_time]

    if showRes:
        test_y = pd.Series(testY, name='Actual')
        pred_y = pd.Series(predYLR, name='Predicted')
        df_confusion = pd.crosstab(test_y, pred_y)
        print("\n*************Logistic Regression Classfier*************")
        print("Accuracy: {}".format(accuracy))
        print("Macro averaged precision score: {}".format(precision))
        print("Macro averaged recall score: {}".format(recall))
        print("Macro averaged f-1 score: {}".format(f1score))
        display(df_confusion)

    # transform the labels into one hot vector form
    label_encoder = LabelEncoder()
    trainYI = label_encoder.fit_transform(trainY)
    onehot_encoder = OneHotEncoder(sparse=False)
    onehot_encoder = OneHotEncoder(sparse=False)
    trainYI = trainYI.reshape(len(trainYI), 1)
    trainYC = onehot_encoder.fit_transform(trainYI)

    testYI = label_encoder.fit_transform(testY)
    testYI = testYI.reshape(len(testYI), 1)
    testYC = onehot_encoder.fit_transform(testYI)

    # define the model
    model = Sequential()
    model.add(Dense(256, input_dim=trainX.shape[1], activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(256, input_dim=50, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(2, activation = 'softmax'))

    # train the model
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    start_time = timer()
    model.fit(x=trainX, y=trainYC, batch_size=1000, epochs=20, verbose=0)
    train_time = round(timer()-start_time,2)

    # Predict on the test using the trained model
    start_time = timer()
    pred_probs = model.predict(x=testX, batch_size=1000, verbose=0)
    test_time = round(timer()-start_time,2)
    # process the predicted probailities to get the final labels encodedLabs
    encodedLabs = (label_encoder.inverse_transform(np.array([[int(p.argmax())] for p in pred_probs], dtype=np.int64))).tolist()
    # flatten the list
    predLabs = list(itertools.chain.from_iterable(encodedLabs))
    accuracy = round(sklearn.metrics.accuracy_score(testY, predLabs),3)
    precision = round(sklearn.metrics.precision_score(testY, predLabs, average='macro'),3)
    recall = round(sklearn.metrics.recall_score(testY, predLabs, average='macro'),3)
    f1score = round(sklearn.metrics.f1_score(testY, predLabs, average='macro'),3)
    mlplist = [accuracy, precision, recall, f1score, train_time, test_time]

    if showRes:
        # Print the performance metrics
        print("\n*************MLP Classfier*************")
        print("Accuracy: {}".format(accuracy))
        print("Macro averaged precision score: {}".format(precision))
        print("Macro averaged recall score: {}".format(recall))
        print("Macro averaged f-1 score: {}".format(f1score))

        # print the confusion matrix
        test_y = pd.Series(testY, name='Actual')
        pred_y = pd.Series(predLabs, name='Predicted')
        df_confusion = pd.crosstab(test_y, pred_y)
        display(df_confusion)
            
    return svmlist, lrlist, mlplist

In [77]:
svmlist, lrlist, mlplist = runClassifiersW2V2C(True)


*************Linear SVM Classfier*************
Accuracy: 0.802
Macro averaged precision score: 0.808
Macro averaged recall score: 0.802
Macro averaged f-1 score: 0.801


Predicted,negative,positive
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
negative,26139,3861
positive,8021,21979



*************Logistic Regression Classfier*************
Accuracy: 0.716
Macro averaged precision score: 0.771
Macro averaged recall score: 0.716
Macro averaged f-1 score: 0.701


Predicted,negative,positive
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
negative,14766,15234
positive,1786,28214



*************MLP Classfier*************
Accuracy: 0.843
Macro averaged precision score: 0.845
Macro averaged recall score: 0.843
Macro averaged f-1 score: 0.842


Predicted,negative,positive
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
negative,26586,3414
positive,6033,23967


In [69]:
# Save the results
# np.array(svmlist).dump(open('Results\\Two_class\\Word2Vec\\svmlist.npy', 'wb'))
# np.array(lrlist).dump(open('Results\\Two_class\\Word2Vec\\lrlist.npy', 'wb'))
# np.array(mlplist).dump(open('Results\\Two_class\\Word2Vec\\mlplist.npy', 'wb'))

# Load the results
svmlist = np.load(open('Results\\Two_class\\Word2Vec\\svmlist.npy', 'rb'))
lrlist = np.load(open('Results\\Two_class\\Word2Vec\\lrlist.npy', 'rb'))
mlplist = np.load(open('Results\\Two_class\\Word2Vec\\mlplist.npy', 'rb'))

#### Plot the results

In [70]:
## PLOT THE ACCURACY PRECISION RECALL AND F1-SCORE
# Create traces
trace0 = Bar(
    y = [svmlist[0]*100, svmlist[1]*100, svmlist[2]*100, svmlist[3]*100],
    x = ['Accuracy', 'Precision', 'Recall', 'F1-score'],
    text = [round(svmlist[0]*100,2), round(svmlist[1]*100,2), round(svmlist[2]*100,2), round(svmlist[3]*100,2)],
    textfont=dict(family='Calibri', size=14, color='black'),
    textposition = 'auto',
    name = 'Linear SVM',
    opacity=0.8
)

trace1 = Bar(
    y = [lrlist[0]*100, lrlist[1]*100, lrlist[2]*100, lrlist[3]*100],
    x = ['Accuracy', 'Precision', 'Recall', 'F1-score'],
    text = [round(lrlist[0]*100,2), round(lrlist[1]*100,2), round(lrlist[2]*100,2), round(lrlist[3]*100,2)],
    textposition = 'auto',
    textfont=dict(family='Calibri', size=14, color='black'),
    name = 'Logistic Regression',
    opacity=0.8
)

trace2 = Bar(
    x = ['Accuracy', 'Precision', 'Recall', 'F1-score'],
    y = [mlplist[0]*100, mlplist[1]*100, mlplist[2]*100, mlplist[3]*100],
    text = [round(mlplist[0]*100,2), round(mlplist[1]*100,2), round(mlplist[2]*100,2), round(mlplist[3]*100,2)],
    textfont=dict(family='Calibri', size=14, color='black'),
    textposition = 'auto',
    name = 'Neural Net',
    opacity=0.8
)

data = [trace0, trace1, trace2]
layout = Layout(title="Comparing the accuracy, precision, recall and accuracy of different classifiers (W2V_2C)", 
               yaxis=dict(title="Percent value"))
fig = Figure(data=data,layout=layout)
iplot(fig)