In [0]:

## load all the given datasets
from string import punctuation

with open("./hwk3_datasets/yelp-train.txt","r") as file:
    ye_train = file.readlines()
with open("./hwk3_datasets/yelp-valid.txt","r") as file:
    ye_valid = file.readlines()
with open("./hwk3_datasets/yelp-test.txt","r") as file:
    ye_test = file.readlines()
with open("./hwk3_datasets/IMDB-train.txt","r") as file:  ## here we will focus on the IMDB dataset. It is similar for yelp dataset
    im_train = file.readlines()
with open("./hwk3_datasets/IMDB-valid.txt","r") as file:
    im_valid = file.readlines()
with open("./hwk3_datasets/IMDB-test.txt","r") as file:
    im_test = file.readlines()

## lists to store the features and target variables
im_train_f = []
im_valid_f = []
im_test_f = []
im_train_t = []
im_valid_t = []
im_test_t = []

for i in im_train:                   
    s = i.rsplit(None,1)[0].lower()     ## convert the string to lowercase
    im_train_f.append(''.join([l for l in s if l not in punctuation]))  ## append in the feature list, without punctuation
    im_train_t.append(int(i.rsplit(None,1)[1]))      ## append in the target var list, in integer form
for i in im_valid:
    s = i.rsplit(None,1)[0].lower()
    im_valid_f.append(''.join([l for l in s if l not in punctuation]))
    im_valid_t.append(int(i.rsplit(None,1)[1]))
for i in im_test:
    s = i.rsplit(None,1)[0].lower()
    im_test_f.append(''.join([l for l in s if l not in punctuation]))
    im_test_t.append(int(i.rsplit(None,1)[1]))

In [0]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

## function that return a list of words in descending frequency, each tuple = (word, # of occurrence)
def top_n_words(corpus,n):
    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0,idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key = lambda x:x[1], reverse=True)
    return words_freq[:n]

## the two feature sets for yelp and IMDB
im_feature = top_n_words(im_train_f,10000)   ## top 10,000 words in IMDB, with frequency in each tuple

vocab_im = []
for t in im_feature:
    vocab_im.append(t[0])

In [0]:
## the vec_XX_XXX_bin arrays are the desired binary representation
vec_im = CountVectorizer()
vec_im_bin = CountVectorizer(binary=True)
Y = vec_im.fit_transform(vocab_im)
Y_bin = vec_im_bin.fit_transform(vocab_im)
vec_im_train = vec_im.transform(im_train_f).toarray()
vec_im_train_bin = vec_im_bin.transform(im_train_f).toarray()
vec_im_valid = vec_im.transform(im_valid_f).toarray()
vec_im_valid_bin = vec_im_bin.transform(im_valid_f).toarray()
vec_im_test = vec_im.transform(im_test_f).toarray()
vec_im_test_bin = vec_im_bin.transform(im_test_f).toarray()

In [0]:
## function to calculate frequency bag of words
def freq(arr):
    array = []
    for i in arr:
        s = i.sum()
        if s == 0:
            array.append(np.zeros(arr.shape[1]))
        else:
            array.append(i.astype(float)/s)
    return np.asarray(array)

## now creaet the Frequency BoW representation, as np arrays.
freq_im_train = freq(vec_im_train)
freq_im_valid = freq(vec_im_valid)
freq_im_test = freq(vec_im_test)


In [0]:
words_im = vec_im.get_feature_names()
id_im = np.arange(10000)
fre_im = np.sum(vec_im_train,axis=0)

In [0]:
voc_im = np.c_[words_im,id_im,fre_im]

## save the vocab.txt files
np.savetxt('IMDB-vocab.txt',voc_im,fmt="%s",delimiter='\t')

In [0]:
## function that converts a review to ids
def conv(strr,voc):
    arr = []
    for s in strr.split():
        if s in voc:
            arr.append(voc.index(s))
    str1 = ' '.join(str(e) for e in arr)
    return str1

## create the id encoded datapoints  - This chunk takes around 40 minutes.
im_train_f_num = []
for i in im_train_f:
    im_train_f_num.append(conv(i,words_im))
im_valid_f_num = []
for i in im_valid_f:
    im_valid_f_num.append(conv(i,words_im))
im_test_f_num = []
for i in im_test_f:
    im_test_f_num.append(conv(i,words_im))


In [0]:
num_im_train = np.c_[im_train_f_num,im_train_t]
num_im_valid = np.c_[im_valid_f_num,im_valid_t]
num_im_test = np.c_[im_test_f_num,im_test_t]
## now save the required train/valid/test files.
np.savetxt('IMDB-train.txt',num_im_train,fmt="%s",delimiter='\t')
np.savetxt('IMDB-valid.txt',num_im_valid,fmt="%s",delimiter='\t')
np.savetxt('IMDB-test.txt',num_im_test,fmt="%s",delimiter='\t')

In [0]:
from sklearn import metrics
from sklearn.model_selection import ParameterGrid
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB


In [0]:

## Naive Bayes:

## First do the hyperparameter tuning:
param_grid3 = ParameterGrid({'alpha':[0.001,0.01,0.05,0.1,0.2,0.3,0.5,0.7,0.8,1]})
scores3 = []
for p in param_grid3:
    clf = BernoulliNB(**p)
    clf.fit(vec_im_train_bin,im_train_t)
    pred = clf.predict(vec_im_valid_bin)
    scores3.append(metrics.f1_score(im_valid_t,pred,average='micro'))
print(scores3)

[0.8422999999999999, 0.8425, 0.843, 0.8431000000000001, 0.843, 0.8434, 0.8433, 0.8433, 0.843, 0.843]


In [0]:
## Then train the Naive Bayes model using best hyperparameter:
nb3 = BernoulliNB(alpha = 0.3)
nb3.fit(vec_im_train_bin,im_train_t)
y_nb_im_train = nb3.predict(vec_im_train_bin)
y_nb_im_valid = nb3.predict(vec_im_valid_bin)
y_nb_im_test = nb3.predict(vec_im_test_bin)
f1_nb_im_train = metrics.f1_score(im_train_t,y_nb_im_train,average='micro')
f1_nb_im_valid = metrics.f1_score(im_valid_t,y_nb_im_valid,average='micro')
f1_nb_im_test = metrics.f1_score(im_test_t,y_nb_im_test,average='micro')

## f1-measure for train/valid/test for naive bayes:
print(f1_nb_im_train)
print(f1_nb_im_valid)
print(f1_nb_im_test)

0.8727999999999999
0.8434
0.83552


In [0]:
## Decision Trees:
## First do the hyperparameter tuning:  - this chunk takes several hours
max_depth3 = [int(x) for x in np.linspace(2,32,num=6)]
min_samples_split3 = [0.1,0.3,0.5,0.7,0.9,1.0]
min_samples_leaf3 = [0.1,0.3,0.5]
max_features3 = [1,100,1000,2000,5000,10000]
param_grid_dt3 = ParameterGrid({'max_depth':max_depth3, 'min_samples_split':min_samples_split3,'min_samples_leaf':min_samples_leaf3,'max_features':max_features3})
scores_dt3 = []
for p in param_grid_dt3:
    clf = DecisionTreeClassifier(**p)
    clf.fit(vec_im_train_bin,im_train_t)
    pred = clf.predict(vec_im_valid_bin)
    scores_dt3.append(metrics.f1_score(im_valid_t,pred,average='micro'))
print(scores_dt3)

[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5346, 0.5847, 0.5, 0.5635, 0.5, 0.529, 0.5, 0.5, 0.5, 0.534, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.611, 0.5482, 0.5728, 0.5721, 0.5497, 0.5441, 0.5551, 0.5273, 0.5635, 0.5721, 0.5635, 0.5426, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5544, 0.5912, 0.6081, 0.5847, 0.5847, 0.5407, 0.5465, 0.5635, 0.5441, 0.5441, 0.5635, 0.5349, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.611, 0.5847, 0.611, 0.5721, 0.5847, 0.611, 0.5635, 0.5721, 0.5635, 0.5551, 0.5485, 0.5721, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.611, 0.611, 0.611, 0.611, 0.611, 0.611, 0.5721, 0.5721, 0.5721, 0.5721, 0.5721, 0.5721, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5485, 0.5273, 0.5269, 0.5211, 0.5355, 0.5635, 0.5, 0.5, 0.5, 0.5251, 0.524, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5923, 0.569, 0.6144, 0.5502, 0.54, 0.5551, 0.5485, 0.5043, 0.5442, 0.5389, 0.5426, 0.5465, 0.5, 0.5, 0.5, 0

In [0]:
## print the best hyperparameters found
print(param_grid_dt3[np.argmax(scores_dt3)])
print(scores_dt3[np.argmax(scores_dt3)])

{'min_samples_split': 0.1, 'min_samples_leaf': 0.1, 'max_features': 5000, 'max_depth': 8}
0.6567


In [0]:
## Then train the DecisionTree model using the best hyperparameters:
dt3 = DecisionTreeClassifier(max_depth=8, min_samples_split = 0.1, min_samples_leaf = 0.1, max_features=5000)
dt3.fit(vec_im_train_bin,im_train_t)
y_dt_im_train = dt3.predict(vec_im_train_bin)
y_dt_im_valid = dt3.predict(vec_im_valid_bin)
y_dt_im_test = dt3.predict(vec_im_test_bin)
f1_dt_im_train = metrics.f1_score(im_train_t,y_dt_im_train,average='micro')
f1_dt_im_valid = metrics.f1_score(im_valid_t,y_dt_im_valid,average='micro')
f1_dt_im_test = metrics.f1_score(im_test_t,y_dt_im_test,average='micro')

## f1-measure for train/valid/test for decision trees:
print(f1_dt_im_train)
print(f1_dt_im_valid)
print(f1_dt_im_test)

0.633
0.6312
0.62764


In [0]:
## Linear SVM:
## First do the hyperparameter tuning:
param_grid_svm3 = ParameterGrid({'C':[0.001,0.01,0.1,1,10,25,50,100,1000]})
scores_svm3 = []
for p in param_grid_svm3:
    clf = LinearSVC(**p)
    clf.fit(vec_im_train_bin,im_train_t)
    pred = clf.predict(vec_im_valid_bin)
    scores_svm3.append(metrics.f1_score(im_valid_t,pred,average='micro'))
print(scores_svm3)  ## print f1-scores for different values of C

[0.8688, 0.8759, 0.8582, 0.8442, 0.8407, 0.8413000000000002, 0.8411, 0.8404, 0.8406]


In [0]:
## Then train the Linear SVM model using best hyperparameter:
svm3 = LinearSVC(C=0.01)
svm3.fit(vec_im_train_bin,im_train_t)
y_svm_im_train = svm3.predict(vec_im_train_bin)
y_svm_im_valid = svm3.predict(vec_im_valid_bin)
y_svm_im_test = svm3.predict(vec_im_test_bin)
f1_svm_im_train = metrics.f1_score(im_train_t,y_svm_im_train,average='micro')
f1_svm_im_valid = metrics.f1_score(im_valid_t,y_svm_im_valid,average='micro')
f1_svm_im_test = metrics.f1_score(im_test_t,y_svm_im_test,average='micro')

## f1-measure for train/valid/test for linear SVM:
print(f1_svm_im_train)
print(f1_svm_im_valid)
print(f1_svm_im_test)

0.9636666666666667
0.8759
0.87116
