# Assignment 3

In [31]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn import tree, svm, metrics
from sklearn.dummy import DummyClassifier
import string
from scipy import stats
import random
from tqdm import tqdm_notebook as tqdm

In [1]:
# Directories: input review files and generated bag-of-words files.
path_to_data, path_to_gen = "../data/", "../generated/"

# IMDB file names (train / validation / test).
imbd_train, imbd_val, imbd_test = (
    "IMDB-train.txt",
    "IMDB-valid.txt",
    "IMDB-test.txt",
)

# Yelp file names (train / validation / test).
yelp_train, yelp_val, yelp_test = (
    "yelp-train.txt",
    "yelp-valid.txt",
    "yelp-test.txt",
)

In [3]:
# Load every split as a DataFrame with the columns "review" and "class".
def _read_reviews(filename):
    """Read one tab-separated, header-less file from ``path_to_data``.

    Columns 0 and 1 of the parsed file are renamed to "review" and "class".
    """
    raw = pd.read_csv(path_to_data + filename, sep="\t", header=None)
    return raw.rename(columns={0: "review", 1: "class"})


df_imbd_train = _read_reviews(imbd_train)
df_imbd_test = _read_reviews(imbd_test)
df_imbd_val = _read_reviews(imbd_val)

df_yelp_train = _read_reviews(yelp_train)
df_yelp_test = _read_reviews(yelp_test)
df_yelp_val = _read_reviews(yelp_val)

## Question 1

In [5]:
# count frequency of normalized x
def normalize_df(e):
    """Return the flat list of normalized tokens of every review in ``e``.

    Each entry of ``e.review`` is lower-cased and stripped of the characters
    in ``string.punctuation``. The cleaned reviews are joined with single
    spaces, split on the space character, and empty tokens are dropped.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    cleaned = [text.lower().translate(strip_punct) for text in np.array(e.review)]
    corpus = ' '.join(cleaned)
    # Only ' ' separates tokens; other whitespace stays inside a token.
    return [token for token in corpus.split(' ') if token]

def normalize_str(e):
    """Split ``e`` on the space character and return its normalized tokens.

    Every piece is lower-cased and stripped of the characters in
    ``string.punctuation``; pieces that end up empty are discarded.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    tokens = []
    for piece in e.split(' '):
        cleaned = piece.lower().translate(strip_punct)
        if cleaned:
            tokens.append(cleaned)
    return tokens
    
    
def count_word_frequency(words):
    """Map each distinct item of ``words`` to its number of occurrences.

    Keys appear in the order in which the items are first encountered.
    """
    tally = {}
    for word in words:
        tally[word] = tally.get(word, 0) + 1
    return tally

def get_most_n_frequent(data, n):
    """Return up to ``n`` normalized words of ``data``, most frequent first.

    ``data`` is tokenized with ``normalize_df`` and counted with
    ``count_word_frequency``; the words are then ranked by descending count.
    """
    frequencies = count_word_frequency(normalize_df(data))
    ranked = sorted(frequencies.items(), key=lambda pair: pair[1], reverse=True)
    return [word for word, _ in ranked[:n]]
    
def gen_vec(data, data_train, n=10000):
    """Build binary and frequency bag-of-words vectors for each review of ``data``.

    The vocabulary is the list returned by ``get_most_n_frequent(data_train, n)``;
    position ``i`` of a vector corresponds to the ``i``-th word of that list.

    Parameters
    ----------
    data : DataFrame
        Frame with ``review`` and ``class`` columns to vectorize.
    data_train : DataFrame
        Frame whose reviews define the vocabulary.
    n : int, optional
        Vocabulary size and length of every vector (default 10000).

    Returns
    -------
    (DataFrame, DataFrame)
        Binary and frequency frames, each with the columns ``review`` (one
        length-``n`` float array per row) and ``class``. A binary vector holds
        1 where the word occurs. A frequency vector holds the word count
        divided by the number of vocabulary tokens in the review; it stays
        all-zero when the review has no vocabulary word.
    """
    most_freq = get_most_n_frequent(data_train, n)
    # Word -> vector position; a dict lookup replaces two O(n) list scans per token.
    word_index = {word: i for i, word in enumerate(most_freq)}

    vectors_b, vectors_f, classes = [], [], []
    for review, class_id in tqdm(zip(data.review, data['class']), total = len(data.review)):
        vector_b, vector_f = np.zeros(n), np.zeros(n)
        for word in normalize_str(review):
            i = word_index.get(word)
            if i is not None:
                vector_b[i] = 1
                vector_f[i] += 1
        total = vector_f.sum()
        # Guard against reviews without any vocabulary word: dividing by zero
        # would turn the whole vector into NaN.
        if total > 0:
            vector_f = vector_f / total
        vectors_b.append(vector_b)
        vectors_f.append(vector_f)
        classes.append(class_id)

    # Build each frame once instead of appending row by row (quadratic, and
    # DataFrame.append no longer exists in pandas 2.x). dtype=object keeps one
    # array per cell.
    df_vector_b = pd.DataFrame({'review': pd.Series(vectors_b, dtype=object), 'class': classes})
    df_vector_f = pd.DataFrame({'review': pd.Series(vectors_f, dtype=object), 'class': classes})
    return df_vector_b, df_vector_f

In [6]:
# Binary (BBoW) and frequency (FBoW) bag-of-words frames for every split.
# The vocabulary always comes from the training split of the same dataset.
df_imbd_train_BBoW, df_imbd_train_FBoW = gen_vec(data=df_imbd_train, data_train=df_imbd_train)
df_yelp_train_BBoW, df_yelp_train_FBoW = gen_vec(data=df_yelp_train, data_train=df_yelp_train)
df_imbd_test_BBoW, df_imbd_test_FBoW = gen_vec(data=df_imbd_test, data_train=df_imbd_train)
df_yelp_test_BBoW, df_yelp_test_FBoW = gen_vec(data=df_yelp_test, data_train=df_yelp_train)
df_imbd_val_BBoW, df_imbd_val_FBoW = gen_vec(data=df_imbd_val, data_train=df_imbd_train)
df_yelp_val_BBoW, df_yelp_val_FBoW = gen_vec(data=df_yelp_val, data_train=df_yelp_train)





















In [11]:
# save the dataframes just created
def save_gen(dataframes, type_, dataset_name):
    """Write the train/validation/test frames of one representation to disk.

    Parameters
    ----------
    dataframes : sequence of DataFrame
        Frames in the order [train, val, test].
    type_ : str
        Representation tag appended to the file name (e.g. "BBoW").
    dataset_name : str
        Dataset tag used as the file-name prefix (e.g. "yelp").

    Each frame is written to ``path_to_gen + dataset_name + suffix + type_``
    as a space-separated file without header or index.
    """
    # dataframes in order [train, val, test]
    suffixes = ["-train-", "-valid-", "-test-"]
    for df, s in zip(dataframes, suffixes):
        out = df.copy()
        if 'review' in out.columns:
            # Spell out every vector element. Letting to_csv stringify the
            # arrays goes through NumPy's summarized repr ("[0. 0. ... 0.]")
            # for arrays over the print threshold, which loses the data.
            out['review'] = [' '.join(str(v) for v in np.asarray(vec).ravel().tolist())
                             for vec in out['review']]
        # Overwrite instead of append so re-running the cell does not
        # duplicate rows in the generated files.
        out.to_csv(path_to_gen + dataset_name + s + type_, header=None, index=None, sep=' ', mode='w')

save_gen([df_imbd_train_BBoW, df_imbd_val_BBoW, df_imbd_test_BBoW], "BBoW", "IMBD")
save_gen([df_yelp_train_BBoW, df_yelp_val_BBoW, df_yelp_test_BBoW], "BBoW", "yelp")
save_gen([df_imbd_train_FBoW, df_imbd_val_FBoW, df_imbd_test_FBoW], "FBoW", "IMBD")
save_gen([df_yelp_train_FBoW, df_yelp_val_FBoW, df_yelp_test_FBoW], "FBoW", "yelp")

## Question 2

Using only the **Yelp** data created with **BBoW**.

In [16]:
# Inputs (x_*) and labels (y_*) from the Yelp BBoW frames, as NumPy arrays.
x_tr = np.array(list(map(np.array, df_yelp_train_BBoW['review'].values)))
y_tr = np.array(list(map(np.array, df_yelp_train_BBoW['class'].values)))

x_val = np.array(list(map(np.array, df_yelp_val_BBoW['review'].values)))
y_val = np.array(list(map(np.array, df_yelp_val_BBoW['class'].values)))

x_test = np.array(list(map(np.array, df_yelp_test_BBoW['review'].values)))
y_test = np.array(list(map(np.array, df_yelp_test_BBoW['class'].values)))

### Part A

In [17]:
def performance_random_clf(x_tr, y_tr, x_test, y_test, r = 1234):
    """Micro-averaged F1 of a uniform-random ``DummyClassifier`` on the test set.

    The classifier is created with ``random_state=r``, fitted on
    ``(x_tr, y_tr)`` and scored against ``y_test``.
    """
    uniform_guesser = DummyClassifier(strategy='uniform', random_state=r)
    uniform_guesser.fit(x_tr, y_tr)
    predictions = uniform_guesser.predict(x_test)
    return metrics.f1_score(y_test, predictions, average='micro')
    
def performance_majority_classifier(x_tr, y_tr, x_test, y_test, r = 1234):
    """Micro-averaged F1 obtained by always predicting the most common training label.

    Parameters
    ----------
    x_tr, x_test, r
        Unused; kept so the signature matches ``performance_random_clf``.
    y_tr : array-like
        Training labels; the most frequent one becomes the prediction
        (the smallest label wins a tie).
    y_test : array-like
        True labels the constant prediction is scored against.

    Raises
    ------
    ValueError
        If ``y_tr`` is empty.
    """
    labels, counts = np.unique(y_tr, return_counts=True)
    if counts.size == 0:
        raise ValueError("y_tr must contain at least one label")
    # np.unique replaces stats.mode(...).mode[0]: recent SciPy releases return
    # a scalar mode for 1-D input, so indexing it with [0] fails.
    most_common_val = labels[np.argmax(counts)]
    preds = np.full(np.shape(y_test), most_common_val)
    f1 = metrics.f1_score(y_test, preds, average = 'micro')
    return f1

In [18]:
# Baseline scores on the test split.
baseline_random = performance_random_clf(x_tr=x_tr, y_tr=y_tr, x_test=x_test, y_test=y_test)
baseline_majority = performance_majority_classifier(x_tr=x_tr, y_tr=y_tr, x_test=x_test, y_test=y_test)

In [19]:
# Adjacent literals are joined first, then the score is substituted for %s.
print(("The performance for the random classifier on the Yelp "
       "reviews dataset created with BBoW is %s.") % baseline_random)
print(("The performance for the majority classifier on the Yelp "
       "reviews dataset created with BBoW is %s.") % baseline_majority)

The performance for the random classifier on the Yelp reviews dataset created with BBoW is 0.1975.
The performance for the majority classifier on the Yelp reviews dataset created with BBoW is 0.351.


### Part B

In [None]:
def train(clf, x_tr, y_tr, x_val, y_val):
    """Fit ``clf`` on the training data.

    ``clf.fit(x_tr, y_tr)`` is called and nothing is returned.
    NOTE(review): ``x_val`` and ``y_val`` are accepted but never used, which
    looks like an unfinished stub; confirm before relying on it.
    """
    clf.fit(x_tr, y_tr)
    

In [63]:
def fine_tune_bernoulli_naive_bayes(x_tr, y_tr, x_val, y_val, number_of_model = 20, alphas = [0, 100000],
                                    random_state = None, verbose = True):
    """Random search over the ``alpha`` smoothing parameter of ``BernoulliNB``.

    Parameters
    ----------
    x_tr, y_tr
        Training inputs and labels each candidate model is fitted on.
    x_val, y_val
        Validation inputs and labels used to score each candidate.
    number_of_model : int
        Number of distinct alpha values to try.
    alphas : [int, int]
        Integer bounds ``[low, high)``; sampled integers are divided by
        100000 to obtain alpha. The list is only read, never modified.
    random_state : int or None
        Seed for a private sampler so the search is reproducible. With
        ``None`` the global ``random`` module state is used.
    verbose : bool
        Print each alpha and its score when True.

    Returns
    -------
    The fitted model with the highest micro-averaged F1 on the validation
    data (the last one tried in case of a tie), or ``None`` when
    ``number_of_model`` is 0.
    """
    alpha_scale = 100000  # sampled integers are mapped to alpha = value / alpha_scale
    sampler = random if random_state is None else random.Random(random_state)

    best_model, best_f1 = None, float('-inf')
    for a in sampler.sample(range(alphas[0], alphas[1]), number_of_model):
        # NOTE(review): a == 0 yields alpha = 0 (no smoothing) - confirm this is intended.
        a = a / alpha_scale
        if verbose:
            print("a:", a)
        model = BernoulliNB(alpha = a)
        model.fit(x_tr, y_tr)
        preds = model.predict(x_val)
        cur_f1 = metrics.f1_score(y_val, preds, average = 'micro')
        if verbose:
            print("f1:", cur_f1)
        # Running best instead of max() over a growing list; >= keeps the
        # latest model on ties.
        if cur_f1 >= best_f1:
            best_model, best_f1 = model, cur_f1
    return best_model



In [64]:
# Try 1000 random alpha values; the best fitted model is the cell's output.
fine_tune_bernoulli_naive_bayes(x_tr, y_tr, x_val, y_val, number_of_model=1000)

a: 0.48597
f1: 0.391
a: 0.27659
f1: 0.395
a: 0.99723
f1: 0.379
a: 0.9182
f1: 0.38
a: 0.09643
f1: 0.407
a: 0.40802
f1: 0.391
a: 0.97611
f1: 0.381
a: 0.55243
f1: 0.39
a: 0.54527
f1: 0.39
a: 0.76266
f1: 0.386
a: 0.6911
f1: 0.386
a: 0.12929
f1: 0.399
a: 0.62972
f1: 0.387
a: 0.6013
f1: 0.389
a: 0.56508
f1: 0.389
a: 0.69945
f1: 0.385
a: 0.59073
f1: 0.388
a: 0.88069
f1: 0.378
a: 0.78495
f1: 0.386
a: 0.06126
f1: 0.411
a: 0.331
f1: 0.396
a: 0.20637
f1: 0.395
a: 0.31039
f1: 0.396
a: 0.12352
f1: 0.403
a: 0.28756
f1: 0.395
a: 0.3511
f1: 0.394
a: 0.29777
f1: 0.394
a: 0.99849
f1: 0.379
a: 0.44925
f1: 0.392
a: 0.03178
f1: 0.418
a: 0.0071
f1: 0.421
a: 0.4399
f1: 0.392
a: 0.5727
f1: 0.388
a: 0.77926
f1: 0.386
a: 0.99245
f1: 0.379
a: 0.0539
f1: 0.411
a: 0.79057
f1: 0.384
a: 0.08291
f1: 0.409
a: 0.36006
f1: 0.394
a: 0.46815
f1: 0.391
a: 0.23445
f1: 0.394
a: 0.37278
f1: 0.393
a: 0.25533
f1: 0.393
a: 0.46883
f1: 0.391
a: 0.725
f1: 0.387
a: 0.09813
f1: 0.407
a: 0.98245
f1: 0.38
a: 0.67965
f1: 0.387
a: 0.204

f1: 0.394
a: 0.30905
f1: 0.395
a: 0.58101
f1: 0.388
a: 0.91607
f1: 0.381
a: 0.23775
f1: 0.394
a: 0.64681
f1: 0.386
a: 0.26073
f1: 0.393
a: 0.57013
f1: 0.388
a: 0.87712
f1: 0.379
a: 0.75294
f1: 0.386
a: 0.80652
f1: 0.383
a: 0.55421
f1: 0.39
a: 0.3657
f1: 0.394
a: 0.63127
f1: 0.387
a: 0.40529
f1: 0.392
a: 0.51455
f1: 0.39
a: 0.44234
f1: 0.392
a: 0.90057
f1: 0.381
a: 0.98945
f1: 0.379
a: 0.19693
f1: 0.395
a: 0.50608
f1: 0.39
a: 0.25676
f1: 0.393
a: 0.10073
f1: 0.406
a: 0.31028
f1: 0.396
a: 0.29628
f1: 0.394
a: 0.29853
f1: 0.394
a: 0.75529
f1: 0.387
a: 0.56221
f1: 0.389
a: 0.2099
f1: 0.395
a: 0.67082
f1: 0.388
a: 0.27869
f1: 0.395
a: 0.67085
f1: 0.388
a: 0.30167
f1: 0.394
a: 0.55861
f1: 0.389
a: 0.80644
f1: 0.383
a: 0.81968
f1: 0.383
a: 0.24091
f1: 0.394
a: 0.95585
f1: 0.38
a: 0.22441
f1: 0.394
a: 0.48547
f1: 0.391
a: 0.80518
f1: 0.383
a: 0.06011
f1: 0.412
a: 0.0046
f1: 0.423
a: 0.60059
f1: 0.389
a: 0.82778
f1: 0.384
a: 0.37127
f1: 0.394
a: 0.59097
f1: 0.388
a: 0.49684
f1: 0.391
a: 0.91146

f1: 0.387
a: 0.74262
f1: 0.386
a: 0.4888
f1: 0.391
a: 0.46905
f1: 0.391
a: 0.47264
f1: 0.391
a: 0.61586
f1: 0.389
a: 0.0305
f1: 0.418
a: 0.75357
f1: 0.386
a: 0.15674
f1: 0.397
a: 0.88653
f1: 0.379
a: 0.61933
f1: 0.389
a: 0.86444
f1: 0.381
a: 0.14912
f1: 0.397
a: 0.73978
f1: 0.387
a: 0.79046
f1: 0.384
a: 0.53913
f1: 0.39
a: 0.80135
f1: 0.383
a: 0.3782
f1: 0.393
a: 0.48357
f1: 0.391
a: 0.63094
f1: 0.387
a: 0.87182
f1: 0.38
a: 0.73008
f1: 0.387
a: 0.03423
f1: 0.417
a: 0.11771
f1: 0.402
a: 0.33508
f1: 0.395
a: 0.35782
f1: 0.394
a: 0.30524
f1: 0.396
a: 0.05491
f1: 0.412
a: 0.31352
f1: 0.395
a: 0.02964
f1: 0.419
a: 0.89434
f1: 0.38
a: 0.92843
f1: 0.38
a: 0.3079
f1: 0.396
a: 0.25519
f1: 0.393
a: 0.03107
f1: 0.418
a: 0.29646
f1: 0.394
a: 0.23759
f1: 0.394
a: 0.43274
f1: 0.392
a: 0.51171
f1: 0.39
a: 0.47773
f1: 0.391
a: 0.29784
f1: 0.394
a: 0.85857
f1: 0.382
a: 0.43179
f1: 0.392
a: 0.3804
f1: 0.392
a: 0.66253
f1: 0.387
a: 0.61478
f1: 0.389
a: 0.6703
f1: 0.388
a: 0.92828
f1: 0.38
a: 0.30974
f1: 

BernoulliNB(alpha=0.01263, binarize=0.0, class_prior=None, fit_prior=True)