### The Sentence Transformer was run in Google Colab under GPU setting

In [None]:
%%bash
pip install -q transformers

In [None]:
!pip install -U sentence-transformers

In [None]:
## load transformers
import numpy as np
import pandas as pd
from sklearn import preprocessing
from timeit import default_timer
import tensorflow as tf
import tensorflow.keras as keras
import io
import pickle

In [None]:
## Upload the training data set from the local machine into the Colab session.
## The returned mapping is later indexed by file name ('train.tsv').
from google.colab import files
uploaded_train = files.upload()

In [None]:
## Upload the testing data set from the local machine into the Colab session.
## The returned mapping is later indexed by file name ('test.tsv').
from google.colab import files
uploaded_test = files.upload()

In [None]:
## Parse the uploaded tab-separated files into DataFrames
train_bytes = io.BytesIO(uploaded_train['train.tsv'])
train = pd.read_csv(train_bytes, sep='\t')

test_bytes = io.BytesIO(uploaded_test['test.tsv'])
test = pd.read_csv(test_bytes, sep='\t')

In [None]:
from sentence_transformers import SentenceTransformer
# there are about 26 pretrained models
# roberta-large-nli-stsb-mean-tokens - returns a 1024-dimensional vector
# distilbert-base-nli-stsb-mean-tokens - returns a 768-dimensional vector

pretrained_model = 'roberta-large-nli-stsb-mean-tokens' #STSb performance is highest
model = SentenceTransformer(pretrained_model)

In [None]:
TRANSFORMER_BATCH=128

def count_embedd (df):
    """Encode the 'Phrase' column of ``df`` batch by batch and stack the results.

    Rows are handed to the global ``model`` in consecutive slices of
    ``TRANSFORMER_BATCH`` rows; the per-batch outputs are concatenated in
    row order and returned as a single array.
    """
    phrase_col = df.columns.tolist().index('Phrase')
    n_rows = df.shape[0]
    batches = [
        model.encode(df.iloc[start:start + TRANSFORMER_BATCH, phrase_col].values,
                     show_progress_bar=False)
        for start in range(0, n_rows, TRANSFORMER_BATCH)
    ]
    return np.concatenate(batches)

In [None]:
# sentence embeddings for TRAIN dataset, 1024 dimensions each (timed)
start_time = default_timer()
train_embedd = count_embedd(train)
elapsed = default_timer() - start_time
print("Train embeddings: {}: in: {:5.2f}s".format(train_embedd.shape, elapsed))

In [None]:
# sentence embeddings for TEST dataset, 1024 dimensions each (timed)
start_time = default_timer()
test_embedd = count_embedd(test)
elapsed = default_timer() - start_time
print("Test embeddings: {}: in: {:5.2f}s".format(test_embedd.shape, elapsed))

In [None]:
# persist the train embeddings to a local pickle file
import pickle
with open('train_embedd.pickle', 'wb') as out_file:
    pickle.dump(train_embedd, out_file)

In [None]:
# persist the test embeddings to a local pickle file
import pickle
with open('test_embedd.pickle', 'wb') as out_file:
    pickle.dump(test_embedd, out_file)

### The following work was all run in a local Jupyter notebook

In [None]:
import numpy as np
import pandas as pd

In [None]:
# original train / test datasets (tab-separated files in the working directory)
train, test = (pd.read_csv(path, sep='\t') for path in ('train.tsv', 'test.tsv'))

In [None]:
# restore the sentence-transformed training dataset from its pickle file
import pickle
with open('train_embedd.pickle', 'rb') as in_file:
    train_embedd = pickle.load(in_file)

In [None]:
# restore the sentence-transformed testing dataset from its pickle file
import pickle
with open('test_embedd.pickle', 'rb') as in_file:
    test_embedd = pickle.load(in_file)

### sentence-transformed training dataset split into "train" and "test" datasets to balance the number of phrases among classes

In [None]:
# Partition the training embeddings by sentiment label: c0 .. c4 hold the
# rows whose label is 0 .. 4 respectively.
Xtr = train_embedd
ytr = train['Sentiment']
c0, c1, c2, c3, c4 = [Xtr[ytr == label] for label in range(5)]

In [None]:
from sklearn.model_selection import train_test_split

def _split_class(features, label, held_out):
    """Split one class's rows, holding out the fraction ``held_out`` (fixed seed 42)."""
    return train_test_split(features, ytr[ytr == label],
                            test_size=held_out, random_state=42)

## train and test split with a fixed held-out ratio for each class
Xtr_0, Xtst_0, ytr_0, ytst_0 = _split_class(c0, 0, 1/3)
Xtr_1, Xtst_1, ytr_1, ytst_1 = _split_class(c1, 1, 2/3)
Xtr_2, Xtst_2, ytr_2, ytst_2 = _split_class(c2, 2, 4/5)
Xtr_3, Xtst_3, ytr_3, ytst_3 = _split_class(c3, 3, 3/4)
Xtr_4, Xtst_4, ytr_4, ytst_4 = _split_class(c4, 4, 1/3)

In [None]:
# new training set: the per-class training parts stacked in class order
Xtr_new = np.concatenate([Xtr_0, Xtr_1, Xtr_2, Xtr_3, Xtr_4])
# matching training labels, in the same order
ytr_new = np.concatenate([ytr_0, ytr_1, ytr_2, ytr_3, ytr_4])

In [None]:
# new testing set: the per-class held-out parts stacked in class order
Xtst_new = np.concatenate([Xtst_0, Xtst_1, Xtst_2, Xtst_3, Xtst_4])
# matching testing labels, in the same order
ytst_new = np.concatenate([ytst_0, ytst_1, ytst_2, ytst_3, ytst_4])

### Implement Ordinal Classification Scenario 1

In [None]:
class OrdinalClassifierS1():
    """Ordinal classifier built from k-1 binary "is y > V_i" models (scenario 1).

    For the sorted class values V_1 < ... < V_k seen in ``fit``, one clone of
    ``clf`` is trained per threshold V_1 .. V_{k-1}.  Class scores are the
    differences of consecutive threshold probabilities:

        Pr(V_1) = 1 - Pr(y > V_1)
        Pr(V_i) = Pr(y > V_{i-1}) - Pr(y > V_i)
        Pr(V_k) = Pr(y > V_{k-1})

    The differences are neither clipped nor renormalised.
    """

    def __init__(self, clf):
        # Unfitted base estimator; it must provide fit() and predict_proba().
        self.clf = clf
        # threshold position i -> fitted binary model for Pr(y > V_i)
        self.clfs = {}

    def fit(self, X, y):
        """Fit one binary classifier per class threshold and return ``self``.

        Raises:
            ValueError: if ``y`` contains fewer than two distinct classes.
        """
        # Imported locally because the notebook's import cells never bring in `clone`.
        from sklearn.base import clone

        y = np.asarray(y)
        self.unique_class = np.sort(np.unique(y))
        n_classes = self.unique_class.shape[0]
        if n_classes < 2:
            raise ValueError("OrdinalClassifierS1 needs at least two distinct classes in y")

        # Start from scratch so a refit never keeps models from an earlier fit.
        self.clfs = {}
        for i in range(n_classes - 1):
            # for each of the k - 1 thresholds we fit a binary classification problem
            binary_y = (y > self.unique_class[i]).astype(np.uint8)
            clf = clone(self.clf)
            clf.fit(X, binary_y)
            self.clfs[i] = clf
        return self

    def _greater_than(self, X):
        # Pr(y > V_i) for every threshold position i, in threshold order.
        return [self.clfs[i].predict_proba(X)[:, 1] for i in range(len(self.clfs))]

    def predict_proba(self, X):
        """Return an (n_samples, n_classes) array of scores, columns in sorted class order."""
        greater = self._greater_than(X)
        last = len(self.unique_class) - 1
        predicted = []
        # Thresholds are addressed by position, not by class value, so the
        # class labels do not have to be 0..k-1.
        for i in range(last + 1):
            if i == 0:
                # V1 = 1 - Pr(y > V1)
                predicted.append(1 - greater[0])
            elif i < last:
                # Vi = Pr(y > Vi-1) - Pr(y > Vi)
                predicted.append(greater[i - 1] - greater[i])
            else:
                # Vk = Pr(y > Vk-1)
                predicted.append(greater[i - 1])
        return np.vstack(predicted).T

    def predict(self, X):
        """Return the class label with the highest score for each row of ``X``."""
        best = np.argmax(self.predict_proba(X), axis=1)
        # Map the column index back to the original label values.
        return self.unique_class[best]

### Implement Ordinal Classification Scenario 2

In [None]:
class OrdinalClassifierS2():
    """Ordinal classifier built from k-1 binary "is y > V_i" models (scenario 2).

    Same training scheme as scenario 1, but the score of a middle class is a
    product instead of a difference:

        Pr(V_1) = 1 - Pr(y > V_1)
        Pr(V_i) = (1 - Pr(y > V_i)) * Pr(y > V_{i-1})
        Pr(V_k) = Pr(y > V_{k-1})

    The scores are not renormalised.
    """

    def __init__(self, clf):
        # Unfitted base estimator; it must provide fit() and predict_proba().
        self.clf = clf
        # threshold position i -> fitted binary model for Pr(y > V_i)
        self.clfs = {}

    def fit(self, X, y):
        """Fit one binary classifier per class threshold and return ``self``.

        Raises:
            ValueError: if ``y`` contains fewer than two distinct classes.
        """
        # Imported locally because the notebook's import cells never bring in `clone`.
        from sklearn.base import clone

        y = np.asarray(y)
        self.unique_class = np.sort(np.unique(y))
        n_classes = self.unique_class.shape[0]
        if n_classes < 2:
            raise ValueError("OrdinalClassifierS2 needs at least two distinct classes in y")

        # Start from scratch so a refit never keeps models from an earlier fit.
        self.clfs = {}
        for i in range(n_classes - 1):
            # for each of the k - 1 thresholds we fit a binary classification problem
            binary_y = (y > self.unique_class[i]).astype(np.uint8)
            clf = clone(self.clf)
            clf.fit(X, binary_y)
            self.clfs[i] = clf
        return self

    def _greater_than(self, X):
        # Pr(y > V_i) for every threshold position i, in threshold order.
        return [self.clfs[i].predict_proba(X)[:, 1] for i in range(len(self.clfs))]

    def predict_proba(self, X):
        """Return an (n_samples, n_classes) array of scores, columns in sorted class order."""
        greater = self._greater_than(X)
        last = len(self.unique_class) - 1
        predicted = []
        # Thresholds are addressed by position, not by class value, so the
        # class labels do not have to be 0..k-1.
        for i in range(last + 1):
            if i == 0:
                # V1 = 1 - Pr(y > V1)
                predicted.append(1 - greater[0])
            elif i < last:
                # Vi = (1 - Pr(y > Vi)) * Pr(y > Vi-1)
                predicted.append((1 - greater[i]) * greater[i - 1])
            else:
                # Vk = Pr(y > Vk-1)
                predicted.append(greater[i - 1])
        return np.vstack(predicted).T

    def predict(self, X):
        """Return the class label with the highest score for each row of ``X``."""
        best = np.argmax(self.predict_proba(X), axis=1)
        # Map the column index back to the original label values.
        return self.unique_class[best]

### Implement Ordinal Classification Scenario 3

In [None]:
class OrdinalClassifierS3():
    """Ordinal classifier built from k-1 binary "is y > V_i" models (scenario 3).

    ``predict_proba`` returns cumulative scores Pr(y <= V_i) (with 1 for the
    largest class) and ``predict`` picks the first class whose cumulative
    score reaches 0.5.
    """

    def __init__(self, clf):
        # Unfitted base estimator; it must provide fit() and predict_proba().
        self.clf = clf
        # threshold position i -> fitted binary model for Pr(y > V_i)
        self.clfs = {}

    def fit(self, X, y):
        """Fit one binary classifier per class threshold and return ``self``.

        Raises:
            ValueError: if ``y`` contains fewer than two distinct classes.
        """
        # Imported locally because the notebook's import cells never bring in `clone`.
        from sklearn.base import clone

        y = np.asarray(y)
        self.unique_class = np.sort(np.unique(y))
        n_classes = self.unique_class.shape[0]
        if n_classes < 2:
            raise ValueError("OrdinalClassifierS3 needs at least two distinct classes in y")

        # Start from scratch so a refit never keeps models from an earlier fit.
        self.clfs = {}
        for i in range(n_classes - 1):
            # for each of the k - 1 thresholds we fit a binary classification problem
            binary_y = (y > self.unique_class[i]).astype(np.uint8)
            clf = clone(self.clf)
            clf.fit(X, binary_y)
            self.clfs[i] = clf
        return self

    def predict_proba(self, X):
        """Return an (n_samples, n_classes) array of cumulative scores Pr(y <= V_i)."""
        # Column 0 of each binary model is Pr(not y > V_i) = Pr(y <= V_i).
        # Thresholds are addressed by position, so labels need not be 0..k-1.
        predicted = [self.clfs[i].predict_proba(X)[:, 0] for i in range(len(self.clfs))]
        # Every sample is <= the largest class.
        predicted.append(np.ones(len(X)))
        return np.vstack(predicted).T

    def predict(self, X):
        """Return, per row, the first class whose cumulative score is >= 0.5."""
        reached = self.predict_proba(X) >= 0.5
        # argmax on booleans yields the first True column; the last column is
        # always True, so a match always exists.
        return self.unique_class[reached.argmax(axis=1)]

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix  
from timeit import default_timer

### Run classifiers LDA, QDA, GNB, Logistic Regression, Linear SVM, Random Forest, AdaBoost, Neural Networks
- standard classification
- ordinal classification scenario 1, 2, 3
- Ordinal Linear SVM was too time-consuming and we only present standard Linear SVM
- Ordinal Adaboost was too time-consuming and we only present standard Adaboost and Ordinal 1 scenario

In [None]:
#### LDA
clf_r = LinearDiscriminantAnalysis() # standard LDA
# ordinal scenario 1, 2 and 3 wrappers around the same base LDA
clf_o1, clf_o2, clf_o3 = [
    ordinal(clf_r)
    for ordinal in (OrdinalClassifierS1, OrdinalClassifierS2, OrdinalClassifierS3)
]

LDA = [clf_r, clf_o1, clf_o2, clf_o3]

# run time / accuracy, one entry per classifier in LDA
time_lda, accuracy_lda = [], []

for clf in LDA:
    start_time = default_timer()
    clf.fit(Xtr_new, ytr_new)
    predict = clf.predict(Xtst_new)
    accuracy = np.mean(predict == ytst_new)
    accuracy_lda.append(accuracy)
    time_lda.append(default_timer() - start_time)
    # show the results gathered so far
    print(accuracy_lda)
    print(time_lda)

In [None]:
#### QDA
clf_r = QuadraticDiscriminantAnalysis() # standard QDA
# ordinal scenario 1, 2 and 3 wrappers around the same base QDA
clf_o1, clf_o2, clf_o3 = [
    ordinal(clf_r)
    for ordinal in (OrdinalClassifierS1, OrdinalClassifierS2, OrdinalClassifierS3)
]

QDA = [clf_r, clf_o1, clf_o2, clf_o3]

# run time / accuracy, one entry per classifier in QDA
time_qda, accuracy_qda = [], []

for clf in QDA:
    start_time = default_timer()
    clf.fit(Xtr_new, ytr_new)
    predict = clf.predict(Xtst_new)
    accuracy = np.mean(predict == ytst_new)
    accuracy_qda.append(accuracy)
    time_qda.append(default_timer() - start_time)
    # show the results gathered so far
    print(accuracy_qda)
    print(time_qda)

In [None]:
#### Gaussian Naive Bayes
clf_r = GaussianNB() # standard GNB
# ordinal scenario 1, 2 and 3 wrappers around the same base GNB
clf_o1, clf_o2, clf_o3 = [
    ordinal(clf_r)
    for ordinal in (OrdinalClassifierS1, OrdinalClassifierS2, OrdinalClassifierS3)
]

GNB = [clf_r, clf_o1, clf_o2, clf_o3]

# run time / accuracy, one entry per classifier in GNB
time_gnb, accuracy_gnb = [], []

for clf in GNB:
    start_time = default_timer()
    clf.fit(Xtr_new, ytr_new)
    predict = clf.predict(Xtst_new)
    accuracy = np.mean(predict == ytst_new)
    accuracy_gnb.append(accuracy)
    time_gnb.append(default_timer() - start_time)
    # show the results gathered so far
    print(accuracy_gnb)
    print(time_gnb)

In [None]:
#### Gaussian Naive Bayes (this cell repeats the GNB experiment and rebuilds the same variables)
clf_r = GaussianNB() # standard GNB
ordinal_wrappers = (OrdinalClassifierS1, OrdinalClassifierS2, OrdinalClassifierS3)
clf_o1, clf_o2, clf_o3 = (wrap(clf_r) for wrap in ordinal_wrappers) # ordinal 1, 2, 3 GNB

GNB = [clf_r, clf_o1, clf_o2, clf_o3]

time_gnb = [] # run time
accuracy_gnb = [] # accuracy

for clf in GNB:
    start_time = default_timer()
    clf.fit(Xtr_new, ytr_new)
    predict = clf.predict(Xtst_new)
    accuracy = np.mean(predict == ytst_new)
    accuracy_gnb.append(accuracy)
    time_gnb.append(default_timer() - start_time)
    # show the results gathered so far
    print(accuracy_gnb)
    print(time_gnb)

In [None]:
#### Logistic Regression
# standard LR, C has been tuned and the optimal was used
clf_r = LogisticRegression(C = 2e-5, solver = 'lbfgs', max_iter=500)
# ordinal scenario 1, 2 and 3 wrappers around the same base LR
clf_o1, clf_o2, clf_o3 = [
    ordinal(clf_r)
    for ordinal in (OrdinalClassifierS1, OrdinalClassifierS2, OrdinalClassifierS3)
]

LR = [clf_r, clf_o1, clf_o2, clf_o3]

# run time / accuracy, one entry per classifier in LR
time_lr, accuracy_lr = [], []

for clf in LR:
    start_time = default_timer()
    clf.fit(Xtr_new, ytr_new)
    predict = clf.predict(Xtst_new)
    accuracy = np.mean(predict == ytst_new)
    accuracy_lr.append(accuracy)
    time_lr.append(default_timer() - start_time)
    # show the results gathered so far
    print(accuracy_lr)
    print(time_lr)

In [None]:
#### Linear SVM, standard version only (ordinal SVM was too time-consuming)
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.multiclass import OneVsOneClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

# candidate regularization strengths: 2e-8, 2e-7, ..., 2e5
C = np.logspace(-8, 5, 14) * 2
accuracy_svm, time_svm = [], []

for c_value in C:
    start_time = default_timer()
    clf = OneVsOneClassifier(LinearSVC(C = c_value))
    clf.fit(Xtr_new, ytr_new)
    predict = clf.predict(Xtst_new)
    accuracy_svm.append(np.mean(predict == ytst_new))
    time_svm.append(default_timer() - start_time)
    # show the results gathered so far
    print(accuracy_svm)
    print(time_svm)

In [None]:
### Neural Networks
## regular NN found the highest accuracy = 0.6614927079333042,
## learning_rate_init=4.64158883e-02, and alpha = 1.66810054e-01

# MLPClassifier is not imported by any of the notebook's import cells,
# so it is imported here to keep this cell runnable.
from sklearn.neural_network import MLPClassifier

clf_r = MLPClassifier(hidden_layer_sizes=(100, ), activation='relu', 
                        random_state=1, max_iter=1000,
                       learning_rate='constant',
                       learning_rate_init=4.64158883e-02,
                       alpha = 1.66810054e-01,
                       batch_size=200) # standard NN, parameters were tuned and the optimals were used
clf_o1 = OrdinalClassifierS1(clf_r) # ordinal 1 NN
clf_o2 = OrdinalClassifierS2(clf_r) # ordinal 2 NN
clf_o3 = OrdinalClassifierS3(clf_r) # ordinal 3 NN

NN = [clf_r, clf_o1, clf_o2, clf_o3]

time_nn = [] # run time (fit + predict), one entry per classifier in NN
accuracy_nn = [] # accuracy on the held-out split, same order

for clf in NN:
    start_time = default_timer()
    clf.fit(Xtr_new, ytr_new)
    predict = clf.predict(Xtst_new)
    accuracy = np.sum(predict == ytst_new)/len(ytst_new)
    accuracy_nn.append(accuracy)
    time_nn.append(default_timer() - start_time)
    # show the results gathered so far
    print(accuracy_nn)
    print(time_nn)

In [None]:
### Random Forest
##ordinal RF 3 with 400 number trees found the highest accuracy = 0.6743

estimators=[50, 100, 150, 200, 250, 300, 350, 400, 450, 500]

def _sweep_forest_sizes(wrap):
    """Fit ``wrap(RandomForestClassifier(n))`` for every n in ``estimators``.

    Trains on the balanced split (Xtr_new / ytr_new), scores on the held-out
    split (Xtst_new / ytst_new), prints the running accuracy list after every
    fit, and returns ``(accuracies, predictions_of_last_model)``.
    """
    scores = []
    last_pred = None
    for n in estimators:
        rf = wrap(RandomForestClassifier(n_estimators=n))
        rf.fit(Xtr_new, ytr_new)
        last_pred = rf.predict(Xtst_new)
        scores.append(np.sum(last_pred == ytst_new)/len(ytst_new))
        print(scores)
    return scores, last_pred

#standard RF
# The forest is used as-is here: there is no plain `OrdinalClassifier` class
# in this notebook, only the S1/S2/S3 variants used further down.
score_per_tree, pred = _sweep_forest_sizes(lambda forest: forest)

confusion_matrix(ytst_new, pred)


#ordinal RF 1
score_per_tree, pred = _sweep_forest_sizes(OrdinalClassifierS1)

#ordinal RF 2
score_per_tree, pred = _sweep_forest_sizes(OrdinalClassifierS2)

#ordinal RF 3
score_per_tree, pred = _sweep_forest_sizes(OrdinalClassifierS3)

# confusion matrix of the last ordinal RF 3 model (the largest forest)
confusion_matrix(ytst_new, pred)

In [None]:
#### Adaboost, only standard, and limited ordinal Adaboost
#### ordinal Adaboost was too time-consuming

estimators=list(range(100, 501,50))

def _boosted_trees(n):
    """Return an AdaBoost ensemble of depth-10 decision trees with ``n`` estimators."""
    # The base tree is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` in newer scikit-learn releases, and the
    # positional form works with both.
    return AdaBoostClassifier(DecisionTreeClassifier(min_samples_leaf=1, max_depth=10),
                              n_estimators=n)

#standard Adaboost
# Trains on the balanced split (Xtr_new / ytr_new) and scores on the held-out
# split (Xtst_new / ytst_new) built earlier in the notebook.
score_per_tree=[]
for n in estimators:
    ab = _boosted_trees(n)
    ab.fit(Xtr_new, ytr_new)
    pred=ab.predict(Xtst_new)
    score= np.sum(pred == ytst_new)/len(ytst_new)
    score_per_tree.append(score)
    print(score_per_tree)

#ordinal Adaboost 1
score_per_tree=[]
for n in estimators:
    ab = OrdinalClassifierS1(_boosted_trees(n))
    ab.fit(Xtr_new, ytr_new)
    pred=ab.predict(Xtst_new)
    score= np.sum(pred == ytst_new)/len(ytst_new)
    score_per_tree.append(score)
    print(score_per_tree)

### Exploration: here we merge the classes (0 with 1, and 3 with 4) and run ordinal-3 LR classification on the training data with the resulting 3 labels. We then apply a binary LR to the samples predicted as merged class 0/1 and another to those predicted as merged class 3/4, and evaluate the overall 5-class accuracy

In [None]:
#merge the negative and somewhat negative classes, and somewhat positive and positive classes
# Lookup table indexed by the original label: 0,1 -> 0 ; 2 -> 1 ; 3,4 -> 2
merged_label = np.array([0, 0, 1, 2, 2])

# The labels come from the balanced split built earlier (ytr_new / ytst_new);
# no y_train / y_test exists at this point of the notebook.
y_train2 = merged_label[np.asarray(ytr_new)]   # merged training labels
y_test2 = np.array(ytst_new, copy=True)        # buffer for the final 5-class predictions
y_test3 = merged_label[np.asarray(ytst_new)]   # merged held-out labels

In [None]:
##train and fit the ordinal3 LR for merged 3 classes data
##we tuned the C and C=2e-4 gave us the best accuracy
# Features are the balanced-split embeddings (Xtr_new / Xtst_new); no
# X_train / X_test exists at this point of the notebook.
lr = OrdinalClassifierS3(LogisticRegression(C= 2e-4, solver="lbfgs", max_iter=500))
lr.fit(Xtr_new, y_train2)
pred=lr.predict(Xtst_new)
score= np.sum(pred == y_test3)/len(y_test3)
print(score)  #0.7161140197789413, higher than ordinal3 LR on 5 labels

In [None]:
##train the binary LR model for class 0 and 1
##again, C=2e-4 gave us the best accuracy
lr01=LogisticRegression(C= 2e-4, solver="lbfgs", max_iter=500)
# rows whose merged label is 0, i.e. original classes 0 and 1
index01=y_train2==0
# fit on the balanced-split embeddings with the ORIGINAL labels of those rows
lr01.fit(Xtr_new[index01], ytr_new[index01])

In [None]:
##train the binary LR model for class 3 and 4
##again, C=2e-4 gave us the best accuracy
lr34=LogisticRegression(C= 2e-4, solver="lbfgs", max_iter=500)
# rows whose merged label is 2, i.e. original classes 3 and 4
index34=y_train2==2
# fit on the balanced-split embeddings with the ORIGINAL labels of those rows
lr34.fit(Xtr_new[index34], ytr_new[index34])

In [None]:
#get the index for corresponding class predictions for test data
pred2_index=pred==1    # predicted merged class 1 -> original class 2
pred01_index=pred==0   # predicted merged class 0 -> original class 0 or 1
pred34_index=pred==2   # predicted merged class 2 -> original class 3 or 4

#apply the binary LR models to the held-out samples routed to them
# (skipped when nothing was routed, because predict() on zero rows raises)
empty = np.array([], dtype=np.asarray(ytst_new).dtype)
pred01 = lr01.predict(Xtst_new[pred01_index]) if pred01_index.any() else empty
pred34 = lr34.predict(Xtst_new[pred34_index]) if pred34_index.any() else empty

#create the final predicted label
# start from a copy of the true labels; the three masks below overwrite the entries
y_test2 = np.array(ytst_new, copy=True)
y_test2[pred2_index]=2
y_test2[pred01_index]=pred01
y_test2[pred34_index]=pred34

#calculate the overall accuracy
score= np.sum(y_test2 == ytst_new)/len(ytst_new)
print(score) #0.6377142345728734, the accuracy is lower than standard LR and all evaluated ordinal LR

### RNN

In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense,LSTM
from keras.layers.embeddings import Embedding
from keras.layers import Embedding, LSTM, Dense, Dropout

In [None]:
#load the data
test_directory = 'sentiment-analysis-on-movie-reviews/test.tsv/test.tsv'
train_directory='sentiment-analysis-on-movie-reviews/train.tsv'
test_raw= pd.read_csv(test_directory, sep='\t')  
train_raw=pd.read_csv(train_directory, sep='\t')  
train_label=train_raw["Sentiment"]
#drop unnecessary columns
train_raw.drop(['PhraseId','SentenceId'],inplace = True,axis='columns')


#convert sentences to tokenized words
# A single column-wise apply replaces the per-row chained assignment
# train_raw['Phrase'][i] = ..., which may write to a temporary copy instead
# of the DataFrame and is slow on a large frame.
train_raw['Phrase'] = train_raw['Phrase'].apply(text_to_word_sequence)


#convert tokenized words to numeric form required for model building
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_raw['Phrase'])

train_raw['Phrase'] = tokenizer.texts_to_sequences(train_raw['Phrase'])

#convert each tokenized review into an input of the same length = 100 (pad_sequences defaults)
max_length = 100
train_copy = pad_sequences(train_raw['Phrase'],maxlen=max_length)
# +1 because index 0 is not assigned to any word in tokenizer.word_index
# NOTE(review): presumably reserved for padding - confirm against the Keras Tokenizer docs
vocab_size = len(tokenizer.word_index) + 1

X = train_copy
y =np.array(train_raw['Sentiment'])

In [None]:
#resample to downsize the training data
def _split_one_class(label, held_out):
    """Split the rows of X / y carrying ``label``, holding out the fraction ``held_out``."""
    mask = y == label
    return train_test_split(X[mask], y[mask], test_size=held_out, random_state=42)

X_train_0, X_test_0, y_train_0, y_test_0 = _split_one_class(0, 0.33)
X_train_1, X_test_1, y_train_1, y_test_1 = _split_one_class(1, 0.66)
X_train_2, X_test_2, y_train_2, y_test_2 = _split_one_class(2, 0.8)
X_train_3, X_test_3, y_train_3, y_test_3 = _split_one_class(3, 0.75)
X_train_4, X_test_4, y_train_4, y_test_4 = _split_one_class(4, 0.33)

# The per-class parts are joined without hard-coded row counts, so this cell
# no longer depends on the exact class sizes of one particular train.tsv.

#concatenate the new training set labels
y_train=np.concatenate([y_train_0, y_train_1, y_train_2, y_train_3, y_train_4])

#concatenate the new training set data
X_train=np.vstack([X_train_0, X_train_1, X_train_2, X_train_3, X_train_4])

#concatenate the new testing set labels
y_test=np.concatenate([y_test_0, y_test_1, y_test_2, y_test_3, y_test_4])

#concatenate the new testing set data
X_test=np.vstack([X_test_0, X_test_1, X_test_2, X_test_3, X_test_4])

#turn the labels into one-hot codes
# One shared width, taken from the full label vector, keeps the train and
# test encodings the same shape even if a split misses the highest class.
num_classes = int(y.max()) + 1
y_train_hot = np.eye(num_classes)[y_train]
y_test_hot = np.eye(num_classes)[y_test]

In [None]:
#below model design produced the best accuracy

# NOTE(review): embedding_vector_length was used but never defined anywhere in
# the notebook, so this cell failed with a NameError. 32 is a placeholder so
# the cell runs - confirm the value that produced the reported results.
embedding_vector_length = 32

model2 = Sequential()
# token ids -> dense vectors of size embedding_vector_length
model2.add(Embedding(input_dim=vocab_size, 
                    output_dim=embedding_vector_length, 
                    input_length=max_length))
model2.add(LSTM(100))
model2.add(Dropout(0.5))
# one softmax output per sentiment class
model2.add(Dense(5,activation = 'softmax'))
model2.compile(loss = 'categorical_crossentropy',
                        optimizer = 'adam',
                        metrics=['accuracy'])
model2.summary()

#fit the model; the held-out split is passed as validation data
train_history=model2.fit(x=X_train,y=y_train_hot,batch_size=64,epochs=10,
                         verbose=2,validation_data=(X_test,y_test_hot))

### CNN

In [None]:
import re
import string 
import nltk
from nltk import word_tokenize
import gensim
from keras.layers import Dense, Dropout, Reshape, Flatten, concatenate, Input, Conv1D, GlobalMaxPooling1D, Embedding
from keras.models import Model

In [None]:
#load the raw train / test tables (tab-separated)
test_directory = 'sentiment-analysis-on-movie-reviews/test.tsv/test.tsv'
train_directory='sentiment-analysis-on-movie-reviews/train.tsv'
test_raw, train_raw = (
    pd.read_csv(path, sep='\t') for path in (test_directory, train_directory)
)
train_label = train_raw["Sentiment"]

#remove punctuation
# Deletion table covering every character of string.punctuation.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

def remove_punct(text):
    """Return ``text`` with every character of ``string.punctuation`` removed.

    A translation table is used instead of the regex character class
    '[' + string.punctuation + ']': inside that class the sequence ``\\]`` is
    read as an escaped ``]``, so backslashes were silently left in the text.
    """
    return text.translate(_PUNCT_TABLE)
# punctuation-free copy of every phrase
train_raw['Text_Clean'] = train_raw['Phrase'].apply(remove_punct)

#Tokenize each cleaned phrase into a list of words
#(the NLTK 'punkt' data may have to be fetched first: nltk.download('punkt'))
tokens = list(map(word_tokenize, train_raw['Text_Clean']))

#lower case all tokens
def lower_token(tokens): 
    """Return a new list holding the lower-cased form of every token."""
    lowered = []
    for word in tokens:
        lowered.append(word.lower())
    return lowered
    
# lower-cased token lists, one per phrase
lower_tokens = list(map(lower_token, tokens))

# space-joined text form and the token-list form of every phrase
train_raw['Text_Final'] = [' '.join(words) for words in lower_tokens]
train_raw['tokens'] = lower_tokens

In [None]:
#we add five one hot encoded columns to our data frame, corresponding to the 5 classes
# Column names in Sentiment-code order: 0=neg, 1=som_neg, 2=neu, 3=som_pos, 4=pos
sentiment_columns = ['neg', 'som_neg', 'neu', 'som_pos', 'pos']

# One vectorized comparison per class replaces the row-by-row if/elif chain.
# A label outside 0..4 now gives an all-zero row instead of a column-length
# mismatch when the lists were assigned.
for code, column in enumerate(sentiment_columns):
    train_raw[column] = (train_raw['Sentiment'] == code).astype('int64')

# keep only the columns used from here on
train_raw = train_raw[['Text_Final', 'tokens', 'Sentiment', 'neg', 'som_neg','neu','som_pos','pos']]
train_raw.head()

In [None]:
#resample to downsize the training data
def _split_sentiment(label, held_out):
    """Split the rows with the given Sentiment, holding out the fraction ``held_out``."""
    subset = train_raw.loc[train_raw['Sentiment'] == label]
    return train_test_split(subset, test_size=held_out, random_state=42)

df_train0, df_test0 = _split_sentiment(0, 0.33)
df_train1, df_test1 = _split_sentiment(1, 0.66)
df_train2, df_test2 = _split_sentiment(2, 0.8)
df_train3, df_test3 = _split_sentiment(3, 0.75)
df_train4, df_test4 = _split_sentiment(4, 0.33)

#concatenating the per-class parts in class order
data_train = pd.concat([df_train0, df_train1, df_train2, df_train3, df_train4])
data_test = pd.concat([df_test0, df_test1, df_test2, df_test3, df_test4])

In [None]:
##word count, vocabulary size and maximum sentence length of the training split
training_sentence_lengths = list(map(len, data_train["tokens"]))
all_training_words = [word for sentence in data_train["tokens"] for word in sentence]
TRAINING_VOCAB = sorted(set(all_training_words))
print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))
print("Max sentence length is %s" % max(training_sentence_lengths))

##the same statistics for the testing split
test_sentence_lengths = list(map(len, data_test["tokens"]))
all_test_words = [word for sentence in data_test["tokens"] for word in sentence]
TEST_VOCAB = sorted(set(all_test_words))
print("%s words total, with a vocabulary size of %s" % (len(all_test_words), len(TEST_VOCAB)))
print("Max sentence length is %s" % max(test_sentence_lengths))

In [None]:
##load the pretrained word2vec vectors (binary word2vec format) from a local file
word2vec_path = 'GoogleNews-vectors-negative300.bin.gz'
word2vec = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    """Return the mean of the word vectors of ``tokens_list``.

    ``vector`` is looked up with ``word in vector`` / ``vector[word]``.  A word
    that is missing contributes a random vector (``np.random.rand(k)``) when
    ``generate_missing`` is true and a zero vector otherwise.  An empty token
    list yields ``np.zeros(k)``.
    """
    if len(tokens_list) == 0:
        return np.zeros(k)

    word_vectors = []
    for token in tokens_list:
        if token in vector:
            word_vectors.append(vector[token])
        elif generate_missing:
            word_vectors.append(np.random.rand(k))
        else:
            word_vectors.append(np.zeros(k))

    total = np.sum(word_vectors, axis=0)
    return np.divide(total, len(word_vectors))

def get_word2vec_embeddings(vectors, clean_comments, generate_missing=False):
    """Return a list with the averaged word vector of every row's 'tokens' entry."""
    return [
        get_average_word2vec(row_tokens, vectors, generate_missing=generate_missing)
        for row_tokens in clean_comments['tokens']
    ]

#get the averaged word2vec embedding of every training phrase;
#words missing from word2vec get a random vector (generate_missing=True)
training_embeddings = get_word2vec_embeddings(word2vec, data_train, generate_missing=True)

In [None]:
#Tokenize and Pad sequences
MAX_SEQUENCE_LENGTH=50
# Width of one embedding vector. It is set here because this cell uses it and
# the notebook only assigns it (to the same value, 300) in a later cell.
EMBEDDING_DIM = 300

tokenizer = Tokenizer(num_words=len(TRAINING_VOCAB), lower=True, char_level=False)
tokenizer.fit_on_texts(data_train["Text_Final"].tolist())
training_sequences = tokenizer.texts_to_sequences(data_train["Text_Final"].tolist())
train_word_index = tokenizer.word_index
print("Found %s unique tokens." % len(train_word_index))
train_cnn_data = pad_sequences(training_sequences, 
                               maxlen=MAX_SEQUENCE_LENGTH)

#get the initial embedding weights
# row `index` holds the vector of the word with that tokenizer index; words
# missing from word2vec get a random vector, and row 0 stays all zeros
train_embedding_weights = np.zeros((len(train_word_index)+1, EMBEDDING_DIM))
for word,index in train_word_index.items():
    train_embedding_weights[index,:] = word2vec[word] if word in word2vec else np.random.rand(EMBEDDING_DIM)
print(train_embedding_weights.shape)

#encode and pad the held-out phrases with the tokenizer fitted on the training split
test_sequences = tokenizer.texts_to_sequences(data_test["Text_Final"].tolist())
test_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
#Now we will get embeddings from Google News Word2Vec model and save them corresponding to the sequence number 
#we assigned to each word. If we could not get embeddings we save a random vector for that word.

EMBEDDING_DIM = 300
train_embedding_weights = np.zeros((len(train_word_index)+1, EMBEDDING_DIM))
for word,index in train_word_index.items():
    train_embedding_weights[index,:] = word2vec[word] if word in word2vec else np.random.rand(EMBEDDING_DIM)
# Printed once after the loop; it used to sit inside the loop and printed the
# same shape for every word in the vocabulary.
print(train_embedding_weights.shape)

In [None]:
#Text as a sequence is passed to a CNN. The embeddings matrix is passed to embedding_layer. 
#Five different filter sizes are applied to each comment, and GlobalMaxPooling1D layers are applied to each layer. 
#All the outputs are then concatenated. A Dropout layer then Dense then Dropout and then Final Dense layer is applied.
#model.summary() will print a brief summary of all the layers with there output shapes.

def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index):
    """Build and compile a multi-filter-size 1-D CNN text classifier.

    Args:
        embeddings: initial weight matrix of the frozen embedding layer
            (passed as ``weights=[embeddings]``).
        max_sequence_length: length of every padded input sequence.
        num_words: number of rows of the embedding layer.
        embedding_dim: width of one embedding vector.
        labels_index: number of output units (one per class).

    Returns:
        The compiled Keras ``Model``; its summary is printed as a side effect.
    """
    # Pretrained vectors are used as-is (trainable=False).
    embedding_layer = Embedding(num_words,
                            embedding_dim,
                            weights=[embeddings],
                            input_length=max_sequence_length,
                            trainable=False)

    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    # One convolution + global max-pool branch per filter size; the pooled
    # outputs are concatenated into a single feature vector.
    convs = []
    filter_sizes = [2,3,4,5,6]
    for filter_size in filter_sizes:
        l_conv = Conv1D(filters=200, 
                        kernel_size=filter_size, 
                        activation='relu')(embedded_sequences)
        l_pool = GlobalMaxPooling1D()(l_conv)
        convs.append(l_pool)
    l_merge = concatenate(convs, axis=1)

    x = Dropout(0.1)(l_merge)  
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.2)(x)

    # The classes are mutually exclusive (the targets are one-hot rows), so
    # the head is a softmax trained with categorical cross-entropy rather than
    # independent sigmoids with binary cross-entropy.
    preds = Dense(labels_index, activation='softmax')(x)
    cnn = Model(sequence_input, preds)
    cnn.compile(loss='categorical_crossentropy',
                optimizer='adam',
                metrics=['acc'])
    cnn.summary()
    return cnn

# order of the output units / target columns used for the CNN
label_names = ['pos', 'som_pos','neu','som_neg','neg']
model = ConvNet(embeddings=train_embedding_weights,
                max_sequence_length=MAX_SEQUENCE_LENGTH,
                num_words=len(train_word_index)+1,
                embedding_dim=EMBEDDING_DIM,
                labels_index=len(label_names))

In [None]:
x_train = train_cnn_data
# one-hot targets; the columns follow the order of label_names
y_train = data_train[label_names].values
y_tr=y_train

#train CNN
num_epochs = 3
batch_size = 32
hist = model.fit(x_train, 
                 y_tr, 
                 epochs=num_epochs, 
                 validation_split=0.1, 
                 shuffle=True, 
                 batch_size=batch_size)

#test CNN
predictions = model.predict(test_cnn_data, batch_size=1024, verbose=1)

# Output unit j corresponds to label_names[j], so each unit is translated back
# to its Sentiment code through the column name. A fixed [0,1,2,3,4] list
# would reverse the classes, because label_names starts with 'pos'.
sentiment_code = {'neg': 0, 'som_neg': 1, 'neu': 2, 'som_pos': 3, 'pos': 4}
labels = [sentiment_code[name] for name in label_names]
prediction_labels = [labels[np.argmax(p)] for p in predictions]

# accuracy on the held-out split
sum(data_test.Sentiment==prediction_labels)/len(prediction_labels)