In [132]:
import numpy as np
import matplotlib.pyplot as plt
import pandas 
from numpy.linalg import inv
import random
import csv
import math
import sklearn
from sklearn.svm import LinearSVC
from collections import Counter
import string
import operator
from scipy.sparse import csr_matrix
from scipy.sparse import vstack
from scipy.sparse import hstack
from sklearn.naive_bayes import BernoulliNB
import scipy.sparse as sps
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score 
from sklearn.dummy import DummyClassifier

In [133]:
yelp_train_x = []
yelp_train_y = []
yelp_valid_x = []
yelp_valid_y = []
yelp_test_x = []
yelp_test_y = []

translation = str.maketrans("", "", string.punctuation)


with open('./yelp-train.txt') as f:
    for line in f:
        yelp_train_x.append(line[:-3].translate(translation).lower())
        yelp_train_y.append(int(line[-2]))
        
with open('./yelp-valid.txt') as f:
    for line in f:
        yelp_valid_x.append(line[:-3].translate(translation).lower())
        yelp_valid_y.append(int(line[-2]))
        
with open('./yelp-test.txt') as f:
    for line in f:
        yelp_test_x.append(line[:-3].translate(translation).lower())
        yelp_test_y.append(int(line[-2]))
        
wordfreq = {}

for row in yelp_train_x:
    words = row.split(" ")
    j = 0
    for word in words:
        if word == ' ' or word == '':
            continue
        if word not in wordfreq:
            wordfreq[word] = (j,1)
            j += 1
        else:
            wordfreq[word] = (j, wordfreq[word][1]+ 1)
    
    

sorted_wordfreq = sorted(wordfreq.items(), key=lambda x:x[1], reverse = True)[:10000]  
sorted_wordfreq_dict = dict(sorted_wordfreq)


In [134]:

def bag_of_words_represntation(data, sorted_wordfreq_dict):
    DM_binary = []
    DM_frequency = []
    for x in data:
        vector_binary = np.zeros(len(sorted_wordfreq_dict))
        vector_frequency = np.zeros(len(sorted_wordfreq_dict))
        for y in x.split(' '):
            if y in sorted_wordfreq_dict:
                index = sorted_wordfreq_dict[y][0]
                if (vector_binary[index] == 0):
                    vector_binary[index] = 1
                vector_frequency[index] += 1                
        DM_binary.append(vector_binary)
        if (sum(vector_frequency) != 0):
            vector_frequency = vector_frequency/sum(vector_frequency)
        
        DM_frequency.append(vector_frequency)   
    return DM_frequency, DM_binary

train_frequency_design, train_binary_design = bag_of_words_represntation(yelp_train_x, sorted_wordfreq_dict)
valid_frequency_design, valid_binary_design = bag_of_words_represntation(yelp_valid_x, sorted_wordfreq_dict)
test_frequency_design, test_binary_design = bag_of_words_represntation(yelp_test_x, sorted_wordfreq_dict)

In [135]:
yelp_vocab = []
for i in range(0,len(sorted_wordfreq)):
    yelp_vocab.append([i, sorted_wordfreq[i][0], sorted_wordfreq[i][1]])
yelp_train = []
for i in range(0,len(yelp_train_x)):
    toadd = []
    for j in yelp_train_x[i].split(' '):
        if j in sorted_wordfreq_dict:
            toadd.append(sorted_wordfreq_dict[j][0])
    toadd.append(yelp_train_y[i])
    yelp_train.append(toadd)        
yelp_valid = []
for i in range(0,len(yelp_valid_x)):
    toadd = []
    for j in yelp_valid_x[i].split(' '):
        if j in sorted_wordfreq_dict:
            toadd.append(sorted_wordfreq_dict[j][0])
    toadd.append(yelp_valid_y[i])
    yelp_valid.append(toadd)
yelp_test = []
for i in range(0,len(yelp_test_x)):
    toadd = []
    for j in yelp_test_x[i].split(' '):
        if j in sorted_wordfreq_dict:
            toadd.append(sorted_wordfreq_dict[j][0])
    toadd.append(yelp_test_y[i])
    yelp_test.append(toadd)


In [136]:


########hyper parameter tuning######

##BernoulliNB alpha tuning##

clf = BernoulliNB()
clf.fit(train_binary_design, yelp_train_y)


parameters = []
for i in range(0,100):
    parameters.append(math.pow(10,(i-90)))
best_f1 = -10
k = 0
for i in parameters:
    clf.alpha = i
    f1_measure = f1_score(yelp_valid_y,clf.predict(valid_binary_design), average='micro')
    if (best_f1 < f1_measure):
        k = i
        best_f1 = f1_measure
clf.alpha = k   

print("The f1_measure on test set for bernoulli naive bayes using binary bag of words is :" + str(f1_score(y_true = yelp_test_y, y_pred = clf.predict(test_binary_design), average = 'micro')))
print("Corresponding alpha is " + str(clf.alpha))

####Decision tree max leaf, min sample max depth training



decision_tree = tree.DecisionTreeClassifier()
decision_tree.fit(train_binary_design, yelp_train_y)


max_features = [i for i in range(1,20)]
min_samples_leaf = [i for i in range(1,20)]
max_depth = [i for i in range(1,20)]
best_f1_2 = -10
best_i = 0
best_j = 0
best_k = 0 
for i in max_features:
    for j in min_samples_leaf:
        for k in max_depth:
            decision_tree.max_features = i
            decision_tree.min_samples_leaf = j
            decision_tree.max_depth = k
            f1_measure = f1_score(yelp_valid_y,decision_tree.predict(valid_binary_design), average='micro')
            if (best_f1_2 < f1_measure):
                best_i = i
                best_j = j
                best_k = k 
                best_f1_2 = f1_measure
decision_tree.max_features = best_i
decision_tree.min_samples_leaf = best_j
decision_tree.max_depth = best_k
print("The f1_measure on test set for classification tree using binary bag of words is :" + str(f1_score(y_true = yelp_test_y, y_pred = decision_tree.predict(test_binary_design), average = 'micro')))
print("Corresponding max_features, min_samples_leaf, max_depth are " + str(best_i) +
      ", " + str(best_j)+ ", " +str(best_k))

####Decision tree max leaf, min sample max depth training 
      
lin_clf = LinearSVC()
lin_clf.fit(train_binary_design, yelp_train_y)


C = []
for i in range(0,100):
    C.append(math.pow(10,(i-90)))

best_f1_3 = -10
k=0
for i in C:
    lin_clf.C = i
    f1_measure = f1_score(yelp_valid_y,lin_clf.predict(valid_binary_design), average='micro')
    if (best_f1_3 < f1_measure):
        k = i
        best_f1_3 = f1_measure
lin_clf.C = k    


print("The f1_measure on test set for LinearSVC using binary bag of words is :" + str(f1_score(yelp_test_y, lin_clf.predict(test_binary_design), average = 'micro')))
print("The corresponding vlaue of C is " + str(lin_clf.C))





uniform_classifier = DummyClassifier(strategy='uniform')
uniform_classifier.fit(train_binary_design, yelp_train_y)
print("The f1_measure on test set for Random classifier is :" + str(f1_score(yelp_test_y,uniform_classifier.predict(test_binary_design), average = 'micro')))


majority_classifier = DummyClassifier(strategy='most_frequent') 
majority_classifier.fit(train_binary_design, yelp_train_y)

print("The f1_measure on test set for majority-class classifier is :" + str(f1_score(yelp_test_y, majority_classifier.predict(test_binary_design), average = 'micro')))





The f1_measure on test set for bernoulli naive bayes using binary bag of words is :0.37
Corresponding alpha is 1e-90
The f1_measure on test set for classification tree using binary bag of words is :0.2975
Corresponding max_features, min_samples_leaf, max_depth are 1, 1, 1
The f1_measure on test set for LinearSVC using binary bag of words is :0.36250000000000004
The corresponding vlaue of C is1e-90
The f1_measure on test set for Random classifier is :0.2025
The f1_measure on test set for majority-class classifier is :0.351


In [138]:
 

########hyper parameter tuning######

##BernoulliNB alpha tuning##

clf = BernoulliNB()
clf.fit(train_frequency_design, yelp_train_y)


parameters = []
for i in range(0,100):
    parameters.append(math.pow(10,(i-90)))
best_f1 = -10
k = 0
for i in parameters:
    clf.alpha = i
    f1_measure = f1_score(yelp_valid_y,clf.predict(valid_frequency_design), average='micro')
    if (best_f1 < f1_measure):
        k = i
        best_f1 = f1_measure
clf.alpha = k   


print("The f1_measure on test set for bernoulli naive bayes using frequency bag of words is :"
      + str(f1_score(y_true = yelp_test_y, y_pred = clf.predict(test_frequency_design), average = 'micro')))
print("Corresponding alpha is " + str(clf.alpha))


####Decision tree max leaf, min sample max depth training



decision_tree = tree.DecisionTreeClassifier()
decision_tree.fit(train_frequency_design, yelp_train_y)


max_features = [i for i in range(1,20)]
min_samples_leaf = [i for i in range(1,20)]
max_depth = [i for i in range(1,20)]
best_f1_2 = -10
best_i = 0
best_j = 0
best_k = 0 
for i in max_features:
    for j in min_samples_leaf:
        for k in max_depth:
            decision_tree.max_features = i
            decision_tree.min_samples_leaf = j
            decision_tree.max_depth = k
            f1_measure = f1_score(yelp_valid_y,decision_tree.predict(valid_frequency_design), average='micro')
            if (best_f1_2 < f1_measure):
                best_i = i
                best_j = j
                best_k = k 
                best_f1_2 = f1_measure
decision_tree.max_features = best_i
decision_tree.max_features = best_j
decision_tree.max_features = best_k 
print("The f1_measure on test set for classification tree using frequency bag of words is :"
      + str(f1_score(y_true = yelp_test_y, y_pred = decision_tree.predict(test_frequency_design), average = 'micro')))
print("Corresponding max_features, min_samples_leaf, max_depth are " 
      + str(best_i) + ", " + str(best_j)+ ", " +str(best_k))


####Decision tree max leaf, min sample max depth training 

lin_clf = LinearSVC()
lin_clf.fit(train_frequency_design, yelp_train_y)


C = []
for i in range(0,100):
    C.append(math.pow(10,(i-90)))

best_f1_3 = -10
k=0
for i in C:
    lin_clf.C = i
    f1_measure = f1_score(yelp_valid_y,lin_clf.predict(valid_frequency_design), average='micro')
    if (best_f1_3 < f1_measure):
        k = i
        best_f1_3 = f1_measure
lin_clf.C = k    



print("The f1_measure on test set for LinearSVC using frequency bag of words is :"
      + str(f1_score(yelp_test_y, lin_clf.predict(test_frequency_design), average = 'micro')))
print("The corresponding vlaue of C is " + str(lin_clf.C))



uniform_classifier = DummyClassifier(strategy='uniform')
uniform_classifier.fit(train_binary_design, yelp_train_y)
print("The f1_measure on test set for Random classifier is :" 
      + str(f1_score(yelp_test_y,uniform_classifier.predict(test_frequency_design), average = 'micro')))


majority_classifier = DummyClassifier(strategy='most_frequent') 
majority_classifier.fit(train_frequency_design, yelp_train_y)

print("The f1_measure on test set for majority-class classifier is : " + 
      str(f1_score(yelp_test_y, majority_classifier.predict(test_frequency_design), average = 'micro')))







The f1_measure on test set for bernoulli naive bayes using frequency bag of words is :0.37
Corresponding alpha is 1e-90
The f1_measure on test set for classification tree using frequency bag of words is :0.275
Corresponding max_features, min_samples_leaf, max_depth are 1, 1, 1
The f1_measure on test set for LinearSVC using frequency bag of words is :0.37
The corresponding vlaue of C is1e-90
The f1_measure on test set for Random classifier is :0.20000000000000004
The f1_measure on test set for majority-class classifier is :0.351


In [139]:
IMDB_train_x = []
IMDB_train_y = []
IMDB_valid_x = []
IMDB_valid_y = []
IMDB_test_x = []
IMDB_test_y = []

translation = str.maketrans("", "", string.punctuation)


with open('./IMDB-train.txt') as f:
    for line in f:
        IMDB_train_x.append(line[:-3].translate(translation).lower())
        IMDB_train_y.append(int(line[-2]))
        
with open('./IMDB-valid.txt') as f:
    for line in f:
        IMDB_valid_x.append(line[:-3].translate(translation).lower())
        IMDB_valid_y.append(int(line[-2]))
        
with open('./IMDB-test.txt') as f:
    for line in f:
        IMDB_test_x.append(line[:-3].translate(translation).lower())
        IMDB_test_y.append(int(line[-2]))

        
        

In [140]:

wordfreq = {}
for row in IMDB_train_x:
    words = row.split(" ")
    j = 0
    for word in words:
        if word == ' ' or word == '':
            continue
        if word not in wordfreq:
            wordfreq[word] = (j,1)
            j += 1
        else:
            wordfreq[word] = (j, wordfreq[word][1]+ 1)
    
sorted_wordfreq = sorted(wordfreq.items(), key=lambda x:x[1], reverse = True)[:10000]  
sorted_wordfreq_dict = dict(sorted_wordfreq)

            



In [141]:

def bag_of_words_represntation(data, sorted_wordfreq_dict):
    DM_binary = []
    DM_frequency = []
    for x in data:
        vector_binary = np.zeros(len(sorted_wordfreq_dict))
        vector_frequency = np.zeros(len(sorted_wordfreq_dict))
        for y in x.split(' '):
            if y in sorted_wordfreq_dict:
                index = sorted_wordfreq_dict[y][0]
                if (vector_binary[index] == 0):
                    vector_binary[index] = 1
                vector_frequency[index] += 1                
        DM_binary.append(vector_binary)
        if (sum(vector_frequency) != 0):
            vector_frequency = vector_frequency/sum(vector_frequency)
        
        DM_frequency.append(vector_frequency)   
    return csr_matrix(DM_frequency), csr_matrix(DM_binary)

train_frequency_design, train_binary_design = bag_of_words_represntation(IMDB_train_x, sorted_wordfreq_dict)
valid_frequency_design, valid_binary_design = bag_of_words_represntation(IMDB_valid_x, sorted_wordfreq_dict)
test_frequency_design, test_binary_design = bag_of_words_represntation(IMDB_test_x, sorted_wordfreq_dict)

In [142]:
IMDB_vocab = []
for i in range(0,len(sorted_wordfreq)):
    IMDB_vocab.append([i, sorted_wordfreq[i][0], sorted_wordfreq[i][1]])
IMDB_train = []
for i in range(0,len(IMDB_train_x)):
    toadd = []
    for j in IMDB_train_x[i].split(' '):
        if j in sorted_wordfreq_dict:
            toadd.append(sorted_wordfreq_dict[j][0])
    toadd.append(IMDB_train_y[i])
    IMDB_train.append(toadd)        
IMDB_valid = []
for i in range(0,len(IMDB_valid_x)):
    toadd = []
    for j in IMDB_valid_x[i].split(' '):
        if j in sorted_wordfreq_dict:
            toadd.append(sorted_wordfreq_dict[j][0])
    toadd.append(IMDB_valid_y[i])
    IMDB_valid.append(toadd)
IMDB_test = []
for i in range(0,len(IMDB_test_x)):
    toadd = []
    for j in IMDB_test_x[i].split(' '):
        if j in sorted_wordfreq_dict:
            toadd.append(sorted_wordfreq_dict[j][0])
    toadd.append(IMDB_test_y[i])
    IMDB_test.append(toadd)

In [145]:



########hyper parameter tuning######

##BernoulliNB alpha tuning##

clf = BernoulliNB()
clf.fit(train_binary_design, IMDB_train_y)


parameters = []
for i in range(0,100):
    parameters.append(math.pow(10,(i-90)))
best_f1 = -10
k = 0
for i in parameters:
    clf.alpha = i
    f1_measure = f1_score(IMDB_valid_y,clf.predict(valid_binary_design), average='micro')
    if (best_f1 < f1_measure):
        k = i
        best_f1 = f1_measure
clf.alpha = k   

print("The f1_measure on test set for bernoulli naive bayes using binary bag of words is :" 
      + str(f1_score(y_true =IMDB_test_y, y_pred = clf.predict(test_binary_design), average = 'micro')))
print("Corresponding alpha is " + str(clf.alpha))


####Decision tree max leaf, min sample max depth training



decision_tree = tree.DecisionTreeClassifier()
decision_tree.fit(train_binary_design, IMDB_train_y)


max_features = [i for i in range(1,20)]
min_samples_leaf = [i for i in range(1,20)]
max_depth = [i for i in range(1,20)]
best_f1_2 = -10
best_i = 0
best_j = 0
best_k = 0 
for i in max_features:
    for j in min_samples_leaf:
        for k in max_depth:
            decision_tree.max_features = i
            decision_tree.min_samples_leaf = j
            decision_tree.max_depth = k
            f1_measure = f1_score(IMDB_valid_y,decision_tree.predict(valid_binary_design), average='micro')
            if (best_f1_2 < f1_measure):
                best_i = 0
                best_j = 0
                best_k = 0 
                best_f1_2 = f1_measure
decision_tree.max_features = best_i
decision_tree.max_features = best_j
decision_tree.max_features = best_k 
print("The f1_measure on test set for classification tree using binary bag of words is :" 
      + str(f1_score(y_true = IMDB_test_y, y_pred = decision_tree.predict(test_binary_design), average = 'micro')))
print("Corresponding max_features, min_samples_leaf, max_depth are " 
      + str(best_i) + ", " + str(best_j)+ ", " +str(best_k))


####Decision tree max leaf, min sample max depth training 

lin_clf = LinearSVC()
lin_clf.fit(train_binary_design, IMDB_train_y)


C = []
for i in range(0,100):
    C.append(math.pow(10,(i-90)))

best_f1_3 = -10
k=0
for i in C:
    lin_clf.C = i
    f1_measure = f1_score(IMDB_valid_y,lin_clf.predict(valid_binary_design), average='micro')
    if (best_f1_3 < f1_measure):
        k = i
        best_f1_3 = f1_measure
lin_clf.C = k    


print("The f1_measure on test set for LinearSVC using binary bag of words is :" 
      + str(f1_score(IMDB_test_y, lin_clf.predict(test_binary_design), average = 'micro')))
print("The corresponding vlaue of C is " + str(lin_clf.C))






uniform_classifier = DummyClassifier(strategy='uniform')
uniform_classifier.fit(train_binary_design, IMDB_train_y)
print("The f1_measure on test set for Random classifier is :" 
      + str(f1_score(IMDB_test_y,uniform_classifier.predict(test_binary_design), average = 'micro')))

uniform_classifier.predict(test_binary_design)

majority_classifier = DummyClassifier(strategy='most_frequent') 
majority_classifier.fit(train_binary_design, IMDB_train_y)

print("The f1_measure on test set for majority-class classifier is :" 
      + str(f1_score(IMDB_test_y, majority_classifier.predict(test_binary_design), average = 'micro')))





The f1_measure on test set for bernoulli naive bayes using binary bag of words is :0.52208
Corresponding alpha is 1e-90
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
The f1_measure on test set for classification tree using binary bag of words is :0.51912
Corresponding max_features, min_samples_leaf, max_depth are 0, 0, 0
The f1_measure on test set for LinearSVC using binary bag of words is :0.5226
The corresponding vlaue of C is 1e-90
The f1_measure on test set for Random classifier is :0.50624
The f1_measure on test set for majority-class classifier is :0.5


In [146]:



########hyper parameter tuning######

##BernoulliNB alpha tuning##

clf = BernoulliNB()
clf.fit(train_frequency_design, IMDB_train_y)


parameters = []
for i in range(0,100):
    parameters.append(math.pow(10,(i-90)))
best_f1 = -10
k = 0
for i in parameters:
    clf.alpha = i
    f1_measure = f1_score(IMDB_valid_y,clf.predict(valid_frequency_design), average='micro')
    if (best_f1 < f1_measure):
        k = i
        best_f1 = f1_measure
clf.alpha = k   

print("The f1_measure on test set for bernoulli naive bayes using frequency bag of words is :" + 
      str(f1_score(y_true =IMDB_test_y, y_pred = clf.predict(test_frequency_design), average = 'micro')))
print("Corresponding alpha is " + str(clf.alpha))


####Decision tree max leaf, min sample max depth training



decision_tree = tree.DecisionTreeClassifier()
decision_tree.fit(train_frequency_design, IMDB_train_y)


max_features = [i for i in range(1,20)]
min_samples_leaf = [i for i in range(1,20)]
max_depth = [i for i in range(1,20)]
best_f1_2 = -10
q = [] 
for i in max_features:
    for j in min_samples_leaf:
        for k in max_depth:
            decision_tree.max_features = i
            decision_tree.min_samples_leaf = j
            decision_tree.max_depth = k
            f1_measure = f1_score(IMDB_valid_y,decision_tree.predict(valid_frequency_design), average='micro')
            if (best_f1_2 < f1_measure):
                best_i = 0
                best_j = 0
                best_k = 0 
                best_f1_2 = f1_measure
decision_tree.max_features = best_i
decision_tree.max_features = best_j
decision_tree.max_features = best_k 
print("The f1_measure on test set for classification tree using frequency bag of words is :" 
      + str(f1_score(y_true = IMDB_test_y, y_pred = decision_tree.predict(test_frequency_design), average = 'micro')))
print("Corresponding max_features, min_samples_leaf, max_depth are " 
      + str(best_i) + ", " + str(best_j)+ ", " +str(best_k))


####Decision tree max leaf, min sample max depth training 

lin_clf = LinearSVC()
lin_clf.fit(train_binary_design, IMDB_train_y)


C = []
for i in range(0,100):
    C.append(math.pow(10,(i-90)))

best_f1_3 = -10
k=0
for i in C:
    lin_clf.C = i
    f1_measure = f1_score(IMDB_valid_y,lin_clf.predict(valid_frequency_design), average='micro')
    if (best_f1_3 < f1_measure):
        k = i
        best_f1_3 = f1_measure
lin_clf.C = k    


print("The f1_measure on test set for LinearSVC using frequency bag of words is :" + 
      str(f1_score(IMDB_test_y, lin_clf.predict(test_frequency_design), average = 'micro')))
print("The corresponding vlaue of C is " + str(lin_clf.C))






uniform_classifier = DummyClassifier(strategy='uniform')
uniform_classifier.fit(train_frequency_design, IMDB_train_y)
print("The f1_measure on test set for Random classifier is :" + 
      str(f1_score(IMDB_test_y,uniform_classifier.predict(test_frequency_design), average = 'micro')))


majority_classifier = DummyClassifier(strategy='most_frequent') 
majority_classifier.fit(train_frequency_design, IMDB_train_y)

print("The f1_measure on test set for majority-class classifier is :" + 
      str(f1_score(IMDB_test_y, majority_classifier.predict(test_frequency_design), average = 'micro')))
















The f1_measure on test set for bernoulli naive bayes using binary bag of words is :0.52208
Corresponding alpha is 1e-90
The f1_measure on test set for classification tree using binary bag of words is :0.5198
Corresponding max_features, min_samples_leaf, max_depth are 0, 0, 0
The f1_measure on test set for LinearSVC using binary bag of words is :0.50492
The corresponding vlaue of C is 1e-90
The f1_measure on test set for Random classifier is :0.50224
The f1_measure on test set for majority-class classifier is :0.5
