# Assignment 3

In [1]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn import tree, svm, metrics
from sklearn.dummy import DummyClassifier
import string
from scipy import stats
import random
import os
from tqdm import tqdm_notebook as tqdm

In [2]:
# Directories: raw review files are read from path_to_data; generated
# bag-of-words matrices are written to and reloaded from path_to_gen.
path_to_data, path_to_gen = "../data/", "../generated/"

# File names of the IMDB splits (train, validation, test).
imbd_train, imbd_val, imbd_test = "IMDB-train.txt", "IMDB-valid.txt", "IMDB-test.txt"

# File names of the Yelp splits (train, validation, test).
yelp_train, yelp_val, yelp_test = "yelp-train.txt", "yelp-valid.txt", "yelp-test.txt"

In [3]:
# Load every raw split as a DataFrame with the columns "review" and "class".
def _read_reviews(file_name):
    """Read one tab-separated, header-less review file located in path_to_data.

    Columns 0 and 1 of the parsed frame are renamed to "review" and "class".
    """
    raw = pd.read_csv(path_to_data + file_name, sep="\t", header=None)
    return raw.rename(columns={0: "review", 1: "class"})

df_imbd_train = _read_reviews(imbd_train)
df_imbd_test = _read_reviews(imbd_test)
df_imbd_val = _read_reviews(imbd_val)

df_yelp_train = _read_reviews(yelp_train)
df_yelp_test = _read_reviews(yelp_test)
df_yelp_val = _read_reviews(yelp_val)

## Question 1

In [4]:
# count frequency of normalized x
def normalize_df(e):
    """Flatten all reviews of a frame into one list of normalized words.

    Every entry of ``e.review`` is lower-cased and stripped of the characters
    in ``string.punctuation``. The cleaned reviews are joined with single
    spaces, split on the space character, and empty tokens are discarded.

    Args:
        e: object exposing a ``review`` attribute holding strings (the
            notebook passes pandas DataFrames).

    Returns:
        list of str: the normalized words of all reviews, in review order.
    """
    punctuation_remover = str.maketrans('', '', string.punctuation)
    cleaned_reviews = [
        text.lower().translate(punctuation_remover) for text in np.array(e.review)
    ]
    tokens = ' '.join(cleaned_reviews).split(' ')
    # Repeated spaces and punctuation-only words leave empty strings behind.
    return [token for token in tokens if token]

def normalize_str(e):
    """Split one review into normalized words.

    The string is split on the space character; each piece is lower-cased and
    stripped of the characters in ``string.punctuation``. Pieces that end up
    empty are dropped.

    Args:
        e: the review text.

    Returns:
        list of str: the normalized words in their original order.
    """
    punctuation_remover = str.maketrans('', '', string.punctuation)
    normalized_words = []
    for raw_word in e.split(' '):
        word = raw_word.lower().translate(punctuation_remover)
        if word:
            normalized_words.append(word)
    return normalized_words
    
def count_word_frequency(words):
    """Count how many times each word occurs.

    Args:
        words: iterable of hashable items (the notebook passes word lists).

    Returns:
        dict mapping each distinct word to its number of occurrences; keys
        appear in order of first occurrence.
    """
    occurence = {}
    for word in words:
        # Words not seen yet start from zero.
        occurence[word] = occurence.get(word, 0) + 1
    return occurence

def get_most_n_frequent(data, n = 10000):
    """Return the ``n`` most frequent normalized words of ``data``.

    The reviews are tokenized with ``normalize_df`` and counted with
    ``count_word_frequency``. Words are ordered by decreasing count; the sort
    is stable, so equally frequent words keep their first-occurrence order.

    Args:
        data: frame accepted by ``normalize_df``.
        n: maximum number of words to return.

    Returns:
        list of str with at most ``n`` words.
    """
    frequencies = count_word_frequency(normalize_df(data))
    ranked_words = sorted(frequencies, key=frequencies.get, reverse = True)
    return ranked_words[:n]
    
def gen_vec(data, data_train, most_freq):
    """Build binary and frequency bag-of-words matrices for a set of reviews.

    Args:
        data: frame with a ``review`` column (strings) and a ``class`` column
            (values convertible with ``int``).
        data_train: unused; kept so existing calls keep working.
        most_freq: sequence of vocabulary words; the position of a word in the
            sequence is its feature column.

    Returns:
        tuple ``(bbow, fbow)`` of 2-D arrays with one row per review and
        ``len(most_freq) + 1`` columns, the last column holding the class.
        ``bbow`` (integer dtype) contains 1 where a vocabulary word occurs in
        the review. ``fbow`` (float dtype) contains each word's count divided
        by the number of vocabulary-word occurrences in the review. A review
        without any vocabulary word keeps an all-zero feature row instead of
        being filled with NaN by a division by zero.
    """
    n_features = len(most_freq)
    # A dictionary lookup replaces list.index (a linear scan for every word).
    # setdefault keeps the first position of a duplicated word, which is the
    # position list.index would have returned.
    column_of = {}
    for column, word in enumerate(most_freq):
        column_of.setdefault(word, column)

    n_reviews = len(data.review)
    all_vector_b = np.zeros((n_reviews, n_features + 1), dtype=int)
    all_vector_f = np.zeros((n_reviews, n_features + 1))
    reviews_with_labels = zip(data.review, data['class'])
    for row, (review, class_id) in enumerate(tqdm(reviews_with_labels, total = n_reviews)):
        counts = all_vector_f[row, :n_features]  # view: updates land in the matrix
        for word in normalize_str(review):
            column = column_of.get(word)
            if column is not None:  # words outside the vocabulary are ignored
                counts[column] += 1
        all_vector_b[row, :n_features] = counts > 0
        total = counts.sum()
        if total > 0:
            counts /= total
        all_vector_b[row, -1] = all_vector_f[row, -1] = int(class_id)

    return all_vector_b, all_vector_f

In [5]:
# Vocabulary of each dataset: the most frequent words of its training split
# (get_most_n_frequent keeps the top 10000 by default).
imbd_most_freq = get_most_n_frequent(df_imbd_train)
yelp_most_freq = get_most_n_frequent(df_yelp_train)

In [6]:
# Binary (BBoW) and frequency (FBoW) bag-of-words matrices for every split.
# The test and validation splits of a dataset are encoded with the vocabulary
# computed from that dataset's training split.
df_imbd_train_BBoW, df_imbd_train_FBoW = gen_vec(df_imbd_train, df_imbd_train, imbd_most_freq)
df_yelp_train_BBoW, df_yelp_train_FBoW = gen_vec(df_yelp_train, df_yelp_train, yelp_most_freq)
df_imbd_test_BBoW, df_imbd_test_FBoW = gen_vec(df_imbd_test, df_imbd_train, imbd_most_freq)
df_yelp_test_BBoW, df_yelp_test_FBoW = gen_vec(df_yelp_test, df_yelp_train, yelp_most_freq)
df_imbd_val_BBoW, df_imbd_val_FBoW = gen_vec(df_imbd_val, df_imbd_train, imbd_most_freq)
df_yelp_val_BBoW, df_yelp_val_FBoW = gen_vec(df_yelp_val, df_yelp_train, yelp_most_freq)





















In [7]:
# save the dataframes just created
def save_gen(arrays, type_, dataset_name):
    """Write generated matrices to text files under ``path_to_gen``.

    Each file is named ``<dataset_name><split part><type_>.txt`` and written
    with ``np.savetxt(..., fmt='%s')``. The output directory is created when
    it does not exist yet.

    Args:
        arrays: arrays ordered as [train, valid, test]; they are paired, in
            that order, with the "-train-", "-valid-" and "-test-" name parts.
        type_: representation tag used in the file name (the notebook passes
            "BBoW" or "FBoW").
        dataset_name: file-name prefix (the notebook passes "IMBD" or "yelp").
    """
    # dataframes in order [train, val, test]
    suffixes = ["-train-", "-valid-", "-test-"]
    for a, s in zip(arrays, suffixes):
        target = path_to_gen + dataset_name + s + type_ + '.txt'
        # np.savetxt does not create missing folders, so make sure the
        # destination directory exists before writing.
        os.makedirs(os.path.dirname(target) or '.', exist_ok=True)
        np.savetxt(target, a, fmt='%s')
        
# Persist both representations of both datasets; save_gen names each file
# <dataset>-<split>-<type>.txt inside path_to_gen.
save_gen([df_imbd_train_BBoW, df_imbd_val_BBoW, df_imbd_test_BBoW], "BBoW", "IMBD")
save_gen([df_yelp_train_BBoW, df_yelp_val_BBoW, df_yelp_test_BBoW], "BBoW", "yelp")
save_gen([df_imbd_train_FBoW, df_imbd_val_FBoW, df_imbd_test_FBoW], "FBoW", "IMBD")
save_gen([df_yelp_train_FBoW, df_yelp_val_FBoW, df_yelp_test_FBoW], "FBoW", "yelp")

In [3]:
# Reload the IMDB training BBoW matrix from the text file written by save_gen.
df_imbd_train_BBoW = np.loadtxt(path_to_gen + "IMBD-train-BBoW.txt")

In [4]:
# Reload the IMDB training FBoW matrix from the text file written by save_gen.
df_imbd_train_FBoW = np.loadtxt(path_to_gen + "IMBD-train-FBoW.txt")

In [5]:
# Reload the Yelp training BBoW matrix from the text file written by save_gen.
df_yelp_train_BBoW = np.loadtxt(path_to_gen + "yelp-train-BBoW.txt")

In [6]:
# Reload the Yelp training FBoW matrix from the text file written by save_gen.
df_yelp_train_FBoW = np.loadtxt(path_to_gen + "yelp-train-FBoW.txt")

In [7]:
# Reload the IMDB test BBoW matrix from the text file written by save_gen.
df_imbd_test_BBoW = np.loadtxt(path_to_gen + "IMBD-test-BBoW.txt")

In [8]:
# Reload the IMDB test FBoW matrix from the text file written by save_gen.
df_imbd_test_FBoW = np.loadtxt(path_to_gen + "IMBD-test-FBoW.txt")

In [9]:
# Reload the Yelp test BBoW matrix from the text file written by save_gen.
df_yelp_test_BBoW = np.loadtxt(path_to_gen + "yelp-test-BBoW.txt")

In [10]:
# Reload the Yelp test FBoW matrix from the text file written by save_gen.
df_yelp_test_FBoW = np.loadtxt(path_to_gen + "yelp-test-FBoW.txt")

In [11]:
# Reload the IMDB validation BBoW matrix from the text file written by save_gen.
df_imbd_val_BBoW = np.loadtxt(path_to_gen + "IMBD-valid-BBoW.txt")

In [12]:
# Reload the IMDB validation FBoW matrix from the text file written by save_gen.
df_imbd_val_FBoW = np.loadtxt(path_to_gen + "IMBD-valid-FBoW.txt")

In [13]:
# Reload the Yelp validation BBoW matrix from the text file written by save_gen.
df_yelp_val_BBoW = np.loadtxt(path_to_gen + "yelp-valid-BBoW.txt")

In [14]:
# Reload the Yelp validation FBoW matrix from the text file written by save_gen.
df_yelp_val_FBoW = np.loadtxt(path_to_gen + "yelp-valid-FBoW.txt")

## Question 2

This question uses only the **Yelp** data in its **BBoW** representation.

In [37]:
# Split each Yelp BBoW matrix into features (every column but the last) and
# labels (the last column, where gen_vec appended the class).
x_tr = df_yelp_train_BBoW[:,:-1]
y_tr = df_yelp_train_BBoW[:,-1]

x_val = df_yelp_val_BBoW[:,:-1]
y_val = df_yelp_val_BBoW[:,-1]

x_test = df_yelp_test_BBoW[:,:-1]
y_test = df_yelp_test_BBoW[:,-1]

### Part A

In [40]:
def performance_random_clf(x_tr, y_tr, x_test, y_test, r = 1234):
    """Score a uniformly random baseline on the test split.

    A DummyClassifier with strategy 'uniform', seeded with ``r``, is fitted on
    the training data and asked to predict ``x_test``.

    Returns:
        Micro-averaged F1 score of those predictions against ``y_test``.
    """
    random_guesser = DummyClassifier(strategy = 'uniform', random_state = r)
    random_guesser.fit(x_tr, y_tr)
    return metrics.f1_score(y_test, random_guesser.predict(x_test), average = 'micro')
    
def performance_majority_classifier(x_tr, y_tr, x_test, y_test, r = 1234):
    """Score the baseline that always predicts the most common training label.

    Args:
        x_tr, x_test, r: unused; kept so the signature mirrors
            performance_random_clf.
        y_tr: 1-D array of training labels.
        y_test: array of true test labels.

    Returns:
        Micro-averaged F1 score of the constant prediction against ``y_test``.
    """
    # np.unique replaces stats.mode(y_tr).mode[0]: recent SciPy releases return
    # a scalar mode for 1-D input, so indexing the result with [0] fails there.
    labels, counts = np.unique(y_tr, return_counts = True)
    # labels come back sorted and argmax returns the first maximum, so a tie
    # resolves to the smallest label.
    most_common_val = labels[np.argmax(counts)]
    preds = np.full((y_test.shape), most_common_val)
    f1 = metrics.f1_score(y_test, preds, average = 'micro')
    return f1

In [41]:
# Baseline scores on the Yelp BBoW test split: uniformly random guesses and
# the constant "most common training label" prediction.
baseline_random = performance_random_clf(x_tr, y_tr, x_test, y_test)
baseline_majority = performance_majority_classifier(x_tr, y_tr, x_test, y_test)

In [42]:
# Report both baselines. `%` binds tighter than `+`, so the score is formatted
# into the second string literal (the one holding %s) before concatenation.
print("The performance for the random classifier on the Yelp " + 
      "reviews dataset created with BBoW is %s." % baseline_random)
print("The performance for the majority classifier on the Yelp " +
      "reviews dataset created with BBoW is %s." % baseline_majority)

The performance for the random classifier on the Yelp reviews dataset created with BBoW is 0.1975.
The performance for the majority classifier on the Yelp reviews dataset created with BBoW is 0.351.


### Part B

In [43]:
def fine_tune_bernoulli_naive_bayes(x_tr, y_tr, x_val, y_val, number_of_model = 20, alphas = [0, 100000], r = 1234):
    """Random search over the ``alpha`` smoothing parameter of BernoulliNB.

    ``number_of_model`` distinct integers are drawn from
    ``range(alphas[0], alphas[1])`` and divided by 100000 to obtain the
    candidate alphas, so the default bounds cover [0, 1) in steps of 1e-5.
    One model per candidate is fitted on the training data and scored with
    the micro-averaged F1 on the validation data. The best score and the
    parameters of the best model are printed.

    Args:
        x_tr, y_tr: training features and labels.
        x_val, y_val: validation features and labels.
        number_of_model: number of candidate alphas to evaluate.
        alphas: [low, high) integer bounds, in units of 1e-5.
        r: seed of the private random generator drawing the candidates (same
            default as the baseline helpers), which makes the search
            repeatable; pass ``None`` for a different draw on every call.

    Returns:
        The fitted BernoulliNB with the best validation score; when several
        models tie, the one evaluated last is kept.

    Raises:
        ValueError: if ``number_of_model`` is below 1 or larger than the
            number of integers in the requested range.
    """
    if number_of_model < 1:
        raise ValueError("number_of_model must be at least 1")
    # A dedicated generator keeps the draw reproducible without touching the
    # global state of the random module.
    rng = random.Random(r)
    best_model, best_f1 = None, None
    for a in rng.sample(range(alphas[0], alphas[1]), number_of_model):
        a = a/100000
        model = BernoulliNB(alpha = a)
        model.fit(x_tr, y_tr)
        preds = model.predict(x_val)
        cur_f1 = metrics.f1_score(y_val, preds, average = 'micro')
        # >= so that, among equally good models, the latest one wins.
        if best_f1 is None or cur_f1 >= best_f1:
            best_model, best_f1 = model, cur_f1
    print("The best f1 score is %s with parameters set to: %s." % (best_f1, best_model.get_params()))
    return best_model

In [44]:
# Run the alpha search on the Yelp BBoW train/validation split; the returned
# best model is the value displayed by this cell.
fine_tune_bernoulli_naive_bayes(x_tr, y_tr, x_val, y_val)

The best f1 score is 0.421 with parameters set to: {'alpha': 0.0163, 'binarize': 0.0, 'class_prior': None, 'fit_prior': True}.


BernoulliNB(alpha=0.0163, binarize=0.0, class_prior=None, fit_prior=True)