# Assigment 3

In [1]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import BernoulliNB
import sklearn.naive_bayes
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm, metrics
from sklearn.dummy import DummyClassifier
import string
from scipy import stats
import random
import os
from tqdm import tqdm_notebook as tqdm

In [2]:
path_to_data = "../data/"
path_to_gen = "../generated/"

# paht to imbd
imbd_train = "IMDB-train.txt"
imbd_val = "IMDB-valid.txt"
imbd_test = "IMDB-test.txt"

# paht to yelp
yelp_train = "yelp-train.txt"
yelp_val = "yelp-valid.txt"
yelp_test = "yelp-test.txt"

In [19]:
#hyperparams setup
n_fine_tune = 50
bnb_alphas = [0, 1]
dtc_max_depths = [1, 32]
dtc_max_min_samples_split = [0.1, 1.0]
dtc_max_min_samples_leaf =[0.1, 0.5]
lSVM_cs = [1e-1,10]
gnb_vs = [1e-10, 1e-1]

In [None]:
# get the data as df
df_imbd_train = pd.read_csv(path_to_data + imbd_train, sep="\t",
                            header=None).rename(columns={0: "review", 1: "class"})
df_imbd_test = pd.read_csv(path_to_data + imbd_test, sep="\t",
                            header=None).rename(columns={0: "review", 1: "class"})
df_imbd_val = pd.read_csv(path_to_data + imbd_val, sep="\t",
                            header=None).rename(columns={0: "review", 1: "class"})

df_yelp_train = pd.read_csv(path_to_data + yelp_train, sep="\t",
                            header=None).rename(columns={0: "review", 1: "class"})
df_yelp_test = pd.read_csv(path_to_data + yelp_test, sep="\t",
                            header=None).rename(columns={0: "review", 1: "class"})
df_yelp_val = pd.read_csv(path_to_data + yelp_val, sep="\t",
                            header=None).rename(columns={0: "review", 1: "class"})

## Question 1

In [None]:
# count frequency of normalized x
def normalize_df(e):
    translator = str.maketrans('', '', string.punctuation)
    normalizer = lambda x: x.lower().translate(translator) # lower and remove punctuation
    return list(filter(None, ' '.join(list(map(normalizer, np.array(e.review)))).split(' ')))

def normalize_str(e):
    translator = str.maketrans('', '', string.punctuation)
    normalizer = lambda x: x.lower().translate(translator) # lower and remove punctuation
    return list(filter(None, list(map(normalizer, e.split(' ')))))
    
def count_word_frequency(words):
    occurence = {}
    for w in words:
        if w in occurence:
            occurence[w] += 1
        else:
            occurence[w] = 1
    return occurence

def get_most_n_frequent(data, n = 10000):
    normed = normalize_df(data)
    count_dict = count_word_frequency(normed)
    n_sorted_words = (sorted(count_dict.items(), key=lambda kv: kv[1], reverse = True)[:n])
    return [k for k,v in n_sorted_words]
    
def gen_vec(data, data_train, most_freq):
    all_vector_b = []
    all_vector_f = []
    for review, class_id in tqdm(zip(data.review, data['class']), total = len(data.review)):
        vector_b, vector_f = list(np.zeros(10000)), list(np.zeros(10000))
        sum_ = 0
        norm_rev = normalize_str(review)
        for word in norm_rev:
            try:
                i = most_freq.index(word)
                vector_b[i] = 1
                vector_f[i] += 1
                sum_ += 1
            except:
                pass
        # if no word is recognized
        if sum_ == 0:
            sum_ = 1
        vector_f = [float(e/sum_) for e in vector_f]
        vector_b.append(int(class_id))
        vector_f.append(int(class_id))
        all_vector_b.append(np.array(vector_b))
        all_vector_f.append(np.array(vector_f))
    
    return np.array(all_vector_b).astype(int), np.array(all_vector_f)

In [None]:
imbd_most_freq = get_most_n_frequent(df_imbd_train)
yelp_most_freq = get_most_n_frequent(df_yelp_train)

In [None]:
# binary bag-of-words representation
df_imbd_train_BBoW, df_imbd_train_FBoW = gen_vec(df_imbd_train, df_imbd_train, imbd_most_freq)
df_yelp_train_BBoW, df_yelp_train_FBoW = gen_vec(df_yelp_train, df_yelp_train, yelp_most_freq)
df_imbd_test_BBoW, df_imbd_test_FBoW = gen_vec(df_imbd_test, df_imbd_train, imbd_most_freq)
df_yelp_test_BBoW, df_yelp_test_FBoW = gen_vec(df_yelp_test, df_yelp_train, yelp_most_freq)
df_imbd_val_BBoW, df_imbd_val_FBoW = gen_vec(df_imbd_val, df_imbd_train, imbd_most_freq)
df_yelp_val_BBoW, df_yelp_val_FBoW = gen_vec(df_yelp_val, df_yelp_train, yelp_most_freq)

In [None]:
# save the dataframes just created
def save_gen(arrays, type_, dataset_name):
    # dataframes in order [train, val, test]
    suffixes = ["-train-", "-valid-", "-test-"]
    for a, s in zip(arrays, suffixes):
        np.savetxt(path_to_gen + dataset_name + s + type_ + '.txt', a, fmt='%s')
        
save_gen([df_imbd_train_BBoW, df_imbd_val_BBoW, df_imbd_test_BBoW], "BBoW", "IMBD")
save_gen([df_yelp_train_BBoW, df_yelp_val_BBoW, df_yelp_test_BBoW], "BBoW", "yelp")
save_gen([df_imbd_train_FBoW, df_imbd_val_FBoW, df_imbd_test_FBoW], "FBoW", "IMBD")
save_gen([df_yelp_train_FBoW, df_yelp_val_FBoW, df_yelp_test_FBoW], "FBoW", "yelp")

## Question 2

Using data from **yelp** created with **BBoW** only.

In [4]:
df_yelp_train_BBoW = np.loadtxt(path_to_gen + "yelp-train-BBoW.txt")

In [5]:
df_yelp_val_BBoW = np.loadtxt(path_to_gen + "yelp-valid-BBoW.txt")

In [6]:
df_yelp_test_BBoW = np.loadtxt(path_to_gen + "yelp-test-BBoW.txt")

In [28]:
x_tr = np.nan_to_num(df_yelp_train_BBoW[:,:-1])
y_tr = df_yelp_train_BBoW[:,-1]

x_val = np.nan_to_num(df_yelp_val_BBoW[:,:-1])
y_val = df_yelp_val_BBoW[:,-1]

x_test = np.nan_to_num(df_yelp_test_BBoW[:,:-1])
y_test = df_yelp_test_BBoW[:,-1]

### Part A

In [29]:
def performance_random_clf(x_tr, y_tr, x_test, y_test, r = 1234):
    clf = DummyClassifier(strategy = 'uniform', random_state = r)
    clf.fit(x_tr, y_tr)
    preds = clf.predict(x_test)
    f1 = metrics.f1_score(y_test, preds, average = 'micro')
    return f1
    
def performance_majority_classifier(x_tr, y_tr, x_test, y_test, r = 1234):
    most_common_val = stats.mode(y_tr).mode[0]
    preds = np.full((y_test.shape), most_common_val)
    f1 = metrics.f1_score(y_test, preds, average = 'micro')
    return f1

In [30]:
baseline_random = performance_random_clf(x_tr, y_tr, x_test, y_test)
baseline_majority = performance_majority_classifier(x_tr, y_tr, x_test, y_test)

In [31]:
print("The performance for the random classifier on the Yelp " + 
      "reviews dataset created with BBoW is %s." % baseline_random)
print("The performance for the majority classifier on the Yelp " +
      "reviews dataset created with BBoW is %s." % baseline_majority)

The performance for the random classifier on the Yelp reviews dataset created with BBoW is 0.1975.
The performance for the majority classifier on the Yelp reviews dataset created with BBoW is 0.351.


### Part B

In [32]:
def fine_tune_bernoulli_naive_bayes(x_tr, y_tr, x_val, y_val, number_of_model, alphas):
    all_as = random.choices(list(np.linspace(alphas[0],
                                             alphas[1],
                                             number_of_model*100, endpoint=True)),
                            k=number_of_model)
    
    f1_scores = []
    best_model = None
    for a in tqdm(all_as):
        model = BernoulliNB(alpha = a)
        model.fit(x_tr, y_tr)
        preds = model.predict(x_val)
        cur_f1 = metrics.f1_score(y_val, preds, average = 'micro')
        f1_scores.append(cur_f1)
        if cur_f1 == max(f1_scores):
            best_model = model
    print("The values of alphas considered are:", all_as)
    print("The best f1 score is:", max(f1_scores))
    print("The alpha is set to:", best_model.get_params()['alpha'])
    return best_model

In [33]:
def fine_tune_decision_tree_classifier(x_tr, y_tr, x_val, y_val,
                                       number_of_model, max_depths,
                                       min_samples_split, min_samples_leaf):
    
    all_md = random.choices(list(np.linspace(max_depths[0], max_depths[1],
                                             number_of_model*100, endpoint=True)),
                            k=number_of_model)
    
    all_mss = random.choices(list(np.linspace(min_samples_split[0], min_samples_split[1],
                                              number_of_model*100, endpoint=True)),
                             k=number_of_model)

    all_msl = random.choices(list(np.linspace(min_samples_leaf[0], min_samples_leaf[1],
                                              number_of_model*100, endpoint=True)),
                             k=number_of_model)

    f1_scores = []
    best_model = None
    for md, mss, msl in tqdm(zip(all_md, all_mss, all_msl), total=number_of_model):
        model = DecisionTreeClassifier(max_depth=md, min_samples_split=mss, min_samples_leaf=msl)
        model.fit(x_tr, y_tr)
        preds = model.predict(x_val)
        cur_f1 = metrics.f1_score(y_val, preds, average = 'micro')
        f1_scores.append(cur_f1)
        if cur_f1 == max(f1_scores):
            best_model = model
    print("The values of max_depth considered are:", all_md)
    print("The values of min_samples_split considered are:", all_mss)
    print("The values of min_samples_leaf considered are:", all_msl)
    print("The best f1 score is:", max(f1_scores))
    print("The max depth is set to:", best_model.get_params()['max_depth'])
    print("The min samples split is set to:", best_model.get_params()['min_samples_split'])
    print("The min samples leaf is set to:", best_model.get_params()['min_samples_leaf'])
    return best_model

In [34]:
def fine_tune_linear_SVM(x_tr, y_tr, x_val, y_val,
                         number_of_model, cs):
    all_c = random.choices(list(np.linspace(cs[0], cs[1],
                                            number_of_model*100, endpoint=True)), 
                           k=number_of_model)

    f1_scores = []
    best_model = None
    for c in tqdm(all_c):
        model = svm.LinearSVC(C=c, max_iter=1000000) # max_iter is a very large number so there is convergence
        model.fit(x_tr, y_tr)
        preds = model.predict(x_val)
        cur_f1 = metrics.f1_score(y_val, preds, average = 'micro')
        f1_scores.append(cur_f1)
        if cur_f1 == max(f1_scores):
            best_model = model
    print("The values of Cs considered are:", all_c)
    print("The best f1 score is:", max(f1_scores))
    print("The c is set to:", best_model.get_params()['C'])
    return best_model

In [35]:
model_bnb_q2 = fine_tune_bernoulli_naive_bayes(x_tr, y_tr, x_val, y_val, n_fine_tune,
                                            bnb_alphas)



The values of alphas considered are: [0.97979595919183837, 0.48429685937187439, 0.97159431886377279, 0.94258851770354068, 0.54770954190838173, 0.74954990998199644, 0.1940388077615523, 0.55411082216443286, 0.48429685937187439, 0.44628925785157031, 0.66453290658131625, 0.12642528505701139, 0.9613922784556912, 0.55791158231646332, 0.088817763552710538, 0.04160832166433287, 0.0040008001600320064, 0.21784356871374275, 0.56911382276455291, 0.21664332866573316, 0.60612122424484893, 0.61252250450090018, 0.50370074014802957, 0.70414082816563317, 0.90598119623924789, 0.55091018203640729, 0.93318663732746554, 0.48009601920384076, 0.53670734146829369, 0.96419283856771354, 0.92078415683136627, 0.70514102820564117, 0.77475495099019809, 0.2488497699539908, 0.084416883376675342, 0.78835767153430691, 0.011802360472094419, 0.45129025805161033, 0.55811162232446487, 0.30486097219443886, 0.81756351270254046, 0.33886777355471093, 0.9155831166233247, 0.76555311062212439, 0.78115623124624922, 0.745349069813

In [36]:
model_dtc_q2 = fine_tune_decision_tree_classifier(x_tr, y_tr, x_val, y_val, n_fine_tune,
                                               dtc_max_depths, dtc_max_min_samples_split, dtc_max_min_samples_leaf)


The values of max_depth considered are: [4.8385677135427088, 30.418683736747347, 19.082816563312662, 7.0276055211042205, 5.0866173234646928, 27.535107021404279, 9.5143028605721138, 1.5209041808361672, 18.772754550910182, 22.592718543708742, 22.555511102220443, 11.895579115823164, 5.0246049209841965, 22.753950790158029, 19.058011602320462, 31.981396279255851, 8.385677135427084, 11.095619123824765, 15.380676135227045, 11.120424084816962, 2.4200840168033606, 18.264252850570113, 11.49249849969994, 4.1378275655131027, 31.125625125025003, 29.017203440688135, 1.3286657331466294, 18.952590518103619, 23.956991398279655, 11.418083616723344, 28.366073214642928, 11.628925785157032, 28.18623724744949, 23.72134426885377, 4.6463292658531703, 28.793958791758349, 18.053410682136427, 8.1314262852570511, 11.883176635327064, 16.019403880776153, 17.972794558911783, 12.844368873774755, 16.552710542108422, 24.831366273254652, 7.5237047409481894, 24.756951390278054, 29.972194438887776, 8.5035007001400267, 28

In [37]:
model_lsvm_q2 = fine_tune_linear_SVM(x_tr, y_tr, x_val, y_val, n_fine_tune,
                                  lSVM_cs)


The values of Cs considered are: [6.8650330066013199, 9.6752150430086026, 2.175455091018204, 6.1382276455291054, 3.741948389677936, 8.8058211642328459, 9.5227245449089821, 1.2288257651530308, 7.623524704940988, 0.69609921984396883, 5.9322664532906577, 7.3640928185637122, 0.78125625125025, 9.6791758351670332, 8.6473894778955795, 5.4906381276255249, 5.5500500100019998, 2.5180636127225449, 1.163472694538908, 1.010982196439288, 3.4151830366073215, 2.4329065813162636, 5.1183236647329462, 9.5940188037607523, 5.1915983196639326, 3.4607321464292862, 5.3322064412882577, 6.5065813162632526, 9.5524304860972187, 0.24060812162432488, 6.2709141828365675, 0.56539307861572319, 5.9976195239047811, 3.741948389677936, 2.9359271854370874, 9.2830966193238655, 1.0882176435287059, 4.6846169233846764, 1.2486297259451891, 4.942068413682736, 0.97335467093418682, 5.1321864372874577, 9.9346469293858775, 7.8572114422884578, 4.385577115423084, 6.9957391478295659, 3.7696739347869577, 2.7992798559711942, 3.025045009

## Question 3

In [38]:
df_yelp_train_FBoW = np.loadtxt(path_to_gen + "yelp-train-FBoW.txt")

In [39]:
df_yelp_val_FBoW = np.loadtxt(path_to_gen + "yelp-valid-FBoW.txt")

In [40]:
df_yelp_test_FBoW = np.loadtxt(path_to_gen + "yelp-test-FBoW.txt")

In [41]:
x_tr = np.nan_to_num(df_yelp_train_FBoW[:,:-1])
y_tr = df_yelp_train_FBoW[:,-1]

x_val = np.nan_to_num(df_yelp_val_FBoW[:,:-1])
y_val = df_yelp_val_FBoW[:,-1]

x_test = np.nan_to_num(df_yelp_test_FBoW[:,:-1])
y_test = df_yelp_test_FBoW[:,-1]

In [42]:
def fine_tune_gaussian_naive_bayes(x_tr, y_tr, x_val, y_val, number_of_model, vars_smoothing):
    all_vs = random.choices(list(np.linspace(vars_smoothing[0],
                                             vars_smoothing[1],
                                             number_of_model*100, endpoint=True)),
                            k=number_of_model)
    f1_scores = []
    best_model = None
    for vs in tqdm(all_vs):
        model = sklearn.naive_bayes.GaussianNB(priors=None, var_smoothing=vs)
        model.fit(x_tr, y_tr)
        preds = model.predict(x_val)
        cur_f1 = metrics.f1_score(y_val, preds, average = 'micro')
        f1_scores.append(cur_f1)
        if cur_f1 == max(f1_scores):
            best_model = model
    print("The values of var_smoothing considered are:", all_vs)
    print("The best f1 score is:", max(f1_scores))
    print("The var_smoothing is set to:", best_model.get_params()['var_smoothing'])
    return best_model

In [43]:
model_gnb_q3 = fine_tune_gaussian_naive_bayes(x_tr, y_tr, x_val, y_val, n_fine_tune,
                               gnb_vs)


The values of var_smoothing considered are: [0.043808761808541714, 0.045509101874854976, 0.029245849240588123, 0.015903180720224046, 0.040248049669673942, 0.046749349923224652, 0.067213442721324262, 0.052170434134646934, 0.02678535714462893, 0.089617923595099028, 0.020384076894978998, 0.067573514735367074, 0.055291058256351278, 0.063832766589477896, 0.014502900665613125, 0.017843568795899184, 0.061872374513022611, 0.068173634758771751, 0.02306461299951991, 0.091638327673894779, 0.036967393541728351, 0.079395879196439298, 0.0084416884292258451, 0.035187037472294465, 0.037187437550310065, 0.031466293327185445, 0.099699939988297667, 0.029505901250730149, 0.057971594360892185, 0.011422284545469095, 0.043568713799179842, 0.067113422717423488, 0.060652130465433093, 0.038847769615063019, 0.032866573381796362, 0.037867573576835373, 0.035167033471514307, 0.013562712628945791, 0.033266653397399489, 0.084236847385237054, 0.0781156231465093, 0.078655731167573517, 0.00068013612652530523, 0.0661932

In [44]:
model_dtc_q3 = fine_tune_decision_tree_classifier(x_tr, y_tr, x_val, y_val, n_fine_tune,
                                               dtc_max_depths, dtc_max_min_samples_split, dtc_max_min_samples_leaf)


The values of max_depth considered are: [13.774554910982197, 3.0712142428485696, 11.356071214242847, 6.1036207241448288, 2.1286257251450289, 10.605721144228845, 26.24524904980996, 18.723144628925784, 28.96759351870374, 9.3716743348669738, 12.428885777155431, 24.242248449689939, 11.263052610522104, 1.0372074414882977, 2.0046009201840365, 1.2046409281856372, 21.352470494098821, 11.796359271854371, 3.4122824564912984, 15.324864972994598, 8.0012002400480107, 1.3782756551310262, 6.835367073414683, 30.827965593118623, 9.2352470494098817, 13.452090418083616, 16.044208841768352, 5.5641128225645131, 27.479295859171835, 1.8495699139827966, 1.700740148029606, 3.5425085017003402, 15.269053810762152, 12.360672134426885, 4.4850970194038808, 7.3438687737547506, 7.2384476895379075, 29.686937387477496, 17.482896579315863, 24.68873774754951, 12.478495699139827, 28.911782356471292, 19.585117023404681, 22.865573114622922, 21.451690338067614, 15.690738147629526, 18.462692538507699, 29.637327465493097, 24.

In [45]:
model_lsvm_q3 = fine_tune_linear_SVM(x_tr, y_tr, x_val, y_val, n_fine_tune,
                                  lSVM_cs)


The values of Cs considered are: [1.1555511102220446, 3.080496099219844, 5.7302660532106415, 3.3023004600920185, 6.4293458691738348, 0.88819763952790554, 8.0235647129425889, 3.8409681936387279, 1.9516703340668136, 6.0015803160632126, 9.7504700940188034, 5.4411282256451292, 4.3063612722544509, 7.936427285457091, 1.7160032006401282, 7.9661332266453293, 0.91988397679535905, 2.9775155031006202, 6.8214642928585718, 4.3835967193438687, 0.12178435687137429, 8.2433886777355472, 7.2353670734146824, 0.79511902380476096, 9.3227045409081821, 0.50202040408081616, 1.8110622124424887, 5.5599519903980799, 3.6884776955391079, 4.3954790958191641, 8.4137027405481088, 2.9497899579915985, 9.592038407681537, 5.3678535707141428, 2.0982196439287857, 0.94562912582516501, 5.2767553510702134, 5.7758151630326067, 3.6746149229845972, 4.7578915783156628, 4.9757351470294058, 6.2431886377275454, 8.6850170034006808, 0.18317663532706541, 3.120104020804161, 3.2904180836167236, 4.6628325665133028, 2.8586917383476695, 9.

## Question 4

In [46]:
df_imbd_train_BBoW = np.loadtxt(path_to_gen + "IMBD-train-BBoW.txt")

In [47]:
df_imbd_val_BBoW = np.loadtxt(path_to_gen + "IMBD-valid-BBoW.txt")

In [48]:
df_imbd_test_BBoW = np.loadtxt(path_to_gen + "IMBD-test-BBoW.txt")

In [52]:
x_tr = np.nan_to_num(df_imbd_train_BBoW[:,:-1])
y_tr = df_imbd_train_BBoW[:,-1]

x_val = np.nan_to_num(df_imbd_val_BBoW[:,:-1])
y_val = df_imbd_val_BBoW[:,-1]

x_test = np.nan_to_num(df_imbd_test_BBoW[:,:-1])
y_test = df_imbd_test_BBoW[:,-1]

In [53]:
baseline_random = performance_random_clf(x_tr, y_tr, x_test, y_test)
baseline_majority = performance_majority_classifier(x_tr, y_tr, x_test, y_test)

In [54]:
print("The performance for the random classifier on the Yelp " + 
      "reviews dataset created with BBoW is %s." % baseline_random)
print("The performance for the majority classifier on the Yelp " +
      "reviews dataset created with BBoW is %s." % baseline_majority)

The performance for the random classifier on the Yelp reviews dataset created with BBoW is 0.50344.
The performance for the majority classifier on the Yelp reviews dataset created with BBoW is 0.5.


In [55]:
model_bnb_q4 = fine_tune_bernoulli_naive_bayes(x_tr, y_tr, x_val, y_val, n_fine_tune,
                                            bnb_alphas)


The values of alphas considered are: [0.2464492898579716, 0.50890178035607125, 0.52270454090818164, 0.77135427085417085, 0.74814962992598522, 0.20784156831366274, 0.8097619523904781, 0.57411482296459293, 0.98559711942388473, 0.54310862172434482, 0.53690738147629524, 0.85277055411082214, 0.91158231646329269, 0.083816763352670534, 0.19103820764152832, 0.80016003200640129, 0.81876375275055013, 0.80276055211042208, 0.63152630526105225, 0.93878775755151034, 0.23724744948989798, 0.93818763752750556, 0.43088617723544709, 0.24024804960992199, 0.31426285257051412, 0.375875175035007, 0.75615123024604924, 0.9541908381676335, 0.39647929585917185, 0.75135027005401078, 0.31086217243448688, 0.40108021604320865, 0.16483296659331867, 0.21684336867373474, 0.91118223644728946, 0.21364272854570915, 0.74494898979795965, 0.30786157231446287, 0.56591318263652735, 0.99739947989597921, 0.45689137827565512, 0.23704740948189637, 0.34946989397879574, 0.23764752950590118, 0.75315063012602523, 0.11762352470494099,

In [56]:
model_dtc_q4 = fine_tune_decision_tree_classifier(x_tr, y_tr, x_val, y_val, n_fine_tune,
                                               dtc_max_depths, dtc_max_min_samples_split, dtc_max_min_samples_leaf)


The values of max_depth considered are: [18.295259051810362, 1.0744148829765954, 23.640728145629126, 4.6835367073414682, 26.592518503700738, 4.8881776355271054, 8.1314262852570511, 30.982996599319861, 2.2712542508501699, 15.740348069613923, 24.942988597719545, 19.628525705141026, 5.0680136027205442, 8.5531106221244251, 24.583316663332667, 25.45769153830766, 23.119823964792957, 5.5207041408281654, 2.3332666533306661, 26.666933386677336, 11.858371674334867, 14.748149629925985, 6.4508901780356069, 24.366273254650931, 19.678135627125425, 22.195839167833565, 23.330666133226643, 9.6259251850370067, 6.5935187037407479, 21.457891578315664, 28.235847169433885, 9.7685537107421485, 5.8307661532306456, 15.417883576715342, 26.580116023204639, 7.3500700140028004, 30.803160632126424, 22.853170634126826, 1.6015203040608121, 22.301260252050408, 2.0790158031606323, 11.616523304660932, 11.740548109621924, 1.6759351870374075, 17.613122624524905, 21.340068013602721, 25.091818363672733, 22.772554510902179,

In [57]:
model_lsvm_q4 = fine_tune_linear_SVM(x_tr, y_tr, x_val, y_val, n_fine_tune,
                                  lSVM_cs)


The values of Cs considered are: [7.5086617323464688, 8.4592518503700731, 9.9960392078415676, 5.5579715943188637, 1.0446489297859574, 6.3659731946389275, 1.5615323064612925, 5.12624524904981, 3.9855371074214845, 4.4251850370074015, 3.955831166233247, 5.6055011002200441, 9.0216843368673736, 4.141988397679536, 6.4313262652530501, 9.4890578115623132, 8.928605721144228, 6.3481496299259854, 5.8669133826765352, 6.9957391478295659, 8.6176835367073412, 6.057031406281256, 8.4691538307661531, 5.0707941588317658, 0.44062812562512499, 7.8651330266053208, 6.8689937987597522, 0.75749149829965989, 7.6730346069213846, 5.0252450490098015, 6.813542708541708, 6.6768953790758152, 8.4354870974194842, 6.718483696739348, 0.80700140028005607, 7.1284256851370271, 9.0791158231646332, 6.7580916183236646, 5.8015603120624126, 5.6550110022004398, 9.4870774154830961, 3.7043208641728347, 7.3383476695339063, 1.7496699339867976, 0.98127625525105022, 8.4117223444688936, 1.3852770554110823, 7.9304860972194442, 4.0627725

## Question 5

In [58]:
df_imbd_train_FBoW = np.loadtxt(path_to_gen + "IMBD-train-FBoW.txt")

In [59]:
df_imbd_val_FBoW = np.loadtxt(path_to_gen + "IMBD-valid-FBoW.txt")

In [60]:
df_imbd_test_FBoW = np.loadtxt(path_to_gen + "IMBD-test-FBoW.txt")

In [61]:
x_tr = np.nan_to_num(df_imbd_train_FBoW[:,:-1])
y_tr = df_imbd_train_FBoW[:,-1]

x_val = np.nan_to_num(df_imbd_val_FBoW[:,:-1])
y_val = df_imbd_val_FBoW[:,-1]

x_test = np.nan_to_num(df_imbd_test_FBoW[:,:-1])
y_test = df_imbd_test_FBoW[:,-1]

In [62]:
model_gnb_q5 = fine_tune_gaussian_naive_bayes(x_tr, y_tr, x_val, y_val, n_fine_tune,
                               gnb_vs)


The values of var_smoothing considered are: [0.092298459699639929, 0.098559711943828773, 0.020924184916043212, 0.088977795570134036, 0.04438887783116624, 0.010742148518943789, 0.058671734388197651, 0.071294258880476094, 0.098279655932906593, 0.011002200529085819, 0.04928985802230447, 0.0022004401858171639, 0.053470694185357076, 0.06469293862302461, 0.079695939208141636, 0.05103020609017804, 0.041308261711022212, 0.060452090457631537, 0.064732946624584925, 0.069833966823524704, 0.059891978435787169, 0.035167033471514307, 0.011442288546249251, 0.080736147248709741, 0.060812162471674341, 0.030106021274134833, 0.081076215261972395, 0.076055211066153233, 0.062072414520824175, 0.011382276543908783, 0.083276655347789566, 0.066093218677635526, 0.099199839968793765, 0.035327065477755555, 0.053070614169753956, 0.013802760638307663, 0.020344068893418686, 0.0067013403613522711, 0.028485697210942195, 0.08879775956311263, 0.009661932476815363, 0.060632126464652943, 0.069733946819623929, 0.052990598

In [63]:
model_dtc_q5 = fine_tune_decision_tree_classifier(x_tr, y_tr, x_val, y_val, n_fine_tune,
                                              dtc_max_depths, dtc_max_min_samples_split, dtc_max_min_samples_leaf)


The values of max_depth considered are: [7.3128625725145024, 23.318263652730547, 18.506101220244048, 22.865573114622922, 17.19143828765753, 1.4030806161232245, 19.975795159031804, 10.326665333066613, 31.832566513302659, 2.8665733146629329, 22.400480096019201, 21.389677935587116, 1.2170434086817363, 19.485897179435888, 8.4414882976595322, 21.780356071214243, 20.447089417883575, 7.1702340468093615, 4.9873974794958986, 1.5519103820764153, 8.3546709341868368, 1.0372074414882977, 10.983996799359872, 10.642928585717144, 9.1050210042008395, 12.354470894178835, 2.0790158031606323, 16.11862372474495, 10.332866573314663, 10.605721144228845, 22.425285057011401, 29.25905181036207, 29.705541108221645, 4.7827565513102623, 7.5175035007001396, 20.558711742348468, 20.416083216643329, 18.586717343468692, 8.6461292258451685, 18.673534706941389, 16.323264652930586, 9.5205041008201636, 18.084416883376676, 3.5859171834366874, 31.937987597519502, 25.234446889377875, 9.3282656531306252, 6.1222244448889773, 1

In [64]:
model_lsvm_q5 = fine_tune_linear_SVM(x_tr, y_tr, x_val, y_val, n_fine_tune,
                                  lSVM_cs)


The values of Cs considered are: [6.1976395279055811, 7.823544708941788, 7.2413082616523301, 1.959591918383677, 6.7085817163432688, 6.5838167633526705, 7.5601520304060807, 7.0333666733346671, 6.1916983396679335, 6.9838567713542705, 1.4446889377875576, 5.969893978795759, 9.843548709741949, 7.7661132226445284, 0.1594118823764753, 5.5896579315863173, 0.20100020004000801, 1.3575515103020606, 4.5855971194238849, 9.0177235447089412, 8.9484096819363881, 5.5183636727345471, 7.4650930186037208, 7.6552110422084416, 6.7303660732146424, 9.5979795959191847, 5.3935987197439488, 0.37527505501100222, 7.9839567913582714, 6.6155031006201241, 7.0610922184436884, 2.6903580716143232, 0.9416683336667333, 1.3674534906981397, 9.7405681136227251, 8.5840168033606723, 5.1302060412082415, 1.1139627925585118, 6.0114822964592918, 1.9378075615123027, 8.364192838567714, 2.4447889577915585, 6.3105221044208841, 2.967613522704541, 3.4547909581916385, 9.0236647329465889, 8.3582516503300663, 1.7437287457491499, 3.5934186