# Assignment 3

In [2]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn import tree, svm, metrics
from sklearn.dummy import DummyClassifier
import string
from scipy import stats
from tqdm import tqdm_notebook as tqdm

In [1]:
# Relative directories: raw input files are read from path_to_data,
# generated bag-of-words files are written under path_to_gen.
path_to_data = "../data/"
path_to_gen = "../generated/"

# file names of the IMDB train / validation / test splits
imbd_train = "IMDB-train.txt"
imbd_val = "IMDB-valid.txt"
imbd_test = "IMDB-test.txt"

# file names of the Yelp train / validation / test splits
yelp_train = "yelp-train.txt"
yelp_val = "yelp-valid.txt"
yelp_test = "yelp-test.txt"

In [3]:
# Load every split as a DataFrame with columns "review" and "class".
def load_reviews(filename):
    """Read one tab-separated, header-less file from `path_to_data`.

    The first two columns (positions 0 and 1) are renamed to "review" and
    "class"; any further columns keep their integer labels.
    """
    raw = pd.read_csv(path_to_data + filename, sep="\t", header=None)
    return raw.rename(columns={0: "review", 1: "class"})


df_imbd_train = load_reviews(imbd_train)
df_imbd_test = load_reviews(imbd_test)
df_imbd_val = load_reviews(imbd_val)

df_yelp_train = load_reviews(yelp_train)
df_yelp_test = load_reviews(yelp_test)
df_yelp_val = load_reviews(yelp_val)

## Question 1

In [5]:
# count frequency of normalized x
def normalize_df(e):
    """Return all words of `e.review` as one flat list of normalized tokens.

    Every review is lower-cased and stripped of `string.punctuation`; the
    reviews are then joined with single spaces and split on single spaces.
    Empty tokens (from consecutive spaces or punctuation-only words) are
    dropped.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    cleaned = [text.lower().translate(strip_punct) for text in np.array(e.review)]
    return [token for token in ' '.join(cleaned).split(' ') if token]

def normalize_str(e):
    """Split the string `e` on single spaces and normalize each token.

    Tokens are lower-cased and stripped of `string.punctuation`; tokens that
    end up empty are dropped.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    tokens = (raw.lower().translate(strip_punct) for raw in e.split(' '))
    return [token for token in tokens if token]
    
    
def count_word_frequency(words):
    """Map every distinct item of `words` to its number of occurrences.

    Keys appear in the order in which each word is first seen.
    """
    occurence = {}
    for w in words:
        occurence[w] = occurence.get(w, 0) + 1
    return occurence

def get_most_n_frequent(data, n):
    """Return the `n` most frequent normalized words of `data.review`.

    Words are ordered by decreasing count; words with equal counts keep the
    order in which they first occur (the sort is stable). Fewer than `n`
    words are returned when the vocabulary is smaller than `n`.
    """
    counts = count_word_frequency(normalize_df(data))
    ranked = sorted(counts, key=counts.get, reverse=True)
    return ranked[:n]
    
def gen_vec(data, data_train, n=10000):
    """Build binary and frequency bag-of-words vectors for each review of `data`.

    The vocabulary is the `n` most frequent normalized words of `data_train`
    (see `get_most_n_frequent`); position i of every vector corresponds to
    the i-th most frequent word.

    Parameters
    ----------
    data : DataFrame
        Must expose `review` (text) and `class` columns; one output row is
        produced per input row.
    data_train : DataFrame
        Frame whose `review` column defines the vocabulary. Pass the training
        split here, also when vectorizing validation / test data.
    n : int, default 10000
        Vocabulary size and length of every vector. Vectors always have
        length `n`, even if the training vocabulary is smaller.

    Returns
    -------
    (df_vector_b, df_vector_f) : tuple of DataFrame
        Both have columns `review` (a length-`n` float numpy array) and
        `class`. In `df_vector_b` an entry is 1 if the word occurs in the
        review, else 0. In `df_vector_f` an entry is the word's count divided
        by the number of in-vocabulary tokens of the review; a review without
        any in-vocabulary token gets an all-zero vector instead of NaNs.
    """
    most_freq = get_most_n_frequent(data_train, n)
    # word -> vector position; a dict lookup replaces scanning the list twice
    # (`in` + `.index`) for every token of every review.
    word_index = {word: i for i, word in enumerate(most_freq)}

    # Collect plain records and build each DataFrame once at the end:
    # DataFrame.append was removed in pandas 2.0 and is quadratic per row.
    rows_b, rows_f = [], []
    for review, class_id in tqdm(zip(data.review, data['class']), total=len(data.review)):
        vector_b, vector_f = np.zeros(n), np.zeros(n)
        sum_ = 0
        for word in normalize_str(review):
            i = word_index.get(word)
            if i is not None:
                vector_b[i] = 1
                vector_f[i] += 1
                sum_ += 1
        if sum_:
            # Guarded: dividing by zero would fill the vector with NaN.
            vector_f = vector_f / sum_
        rows_b.append({'review': vector_b, 'class': class_id})
        rows_f.append({'review': vector_f, 'class': class_id})

    columns = ['review', 'class']
    df_vector_b = pd.DataFrame(rows_b, columns=columns)
    df_vector_f = pd.DataFrame(rows_f, columns=columns)
    return df_vector_b, df_vector_f

In [None]:
# Binary (BBoW) and frequency (FBoW) bag-of-words representations of every split.
# The second argument is always the matching *training* split, so validation
# and test vectors use the vocabulary built from the training reviews.
df_imbd_train_BBoW, df_imbd_train_FBoW = gen_vec(df_imbd_train, df_imbd_train)
df_yelp_train_BBoW, df_yelp_train_FBoW = gen_vec(df_yelp_train, df_yelp_train)
df_imbd_test_BBoW, df_imbd_test_FBoW = gen_vec(df_imbd_test, df_imbd_train)
df_yelp_test_BBoW, df_yelp_test_FBoW = gen_vec(df_yelp_test, df_yelp_train)
df_imbd_val_BBoW, df_imbd_val_FBoW = gen_vec(df_imbd_val, df_imbd_train)
df_yelp_val_BBoW, df_yelp_val_FBoW = gen_vec(df_yelp_val, df_yelp_train)









In [None]:
# save the dataframes just created
def save_gen(dataframes, type_, dataset_name):
    """Write the train / validation / test frames of one dataset to disk.

    Each frame is written to
    `path_to_gen + dataset_name + "-<split>-" + type_`
    (for example "../generated/yelp-train-BBoW") as space-separated text
    without header or index.

    Parameters
    ----------
    dataframes : sequence of DataFrame
        Frames in the order [train, val, test]; extra entries are ignored
        because they are zipped against the three split suffixes.
    type_ : str
        Representation tag appended to the file name (e.g. "BBoW", "FBoW").
    dataset_name : str
        Dataset prefix of the file name (e.g. "IMBD", "yelp").
    """
    # "-test-" carries the same trailing dash as the other two suffixes so
    # all three file names follow one pattern.
    suffixes = ["-train-", "-valid-", "-test-"]
    for df, s in zip(dataframes, suffixes):
        # mode='w' overwrites: re-running the cell must not append duplicate
        # rows to a file left over from an earlier run.
        # NOTE(review): the `review` column holds numpy arrays (see gen_vec);
        # to_csv writes their string form, which numpy abbreviates with "..."
        # for long arrays - verify the files can be read back, or switch to a
        # binary format such as np.save.
        df.to_csv(path_to_gen + dataset_name + s + type_, header=False, index=False, sep=' ', mode='w')
        
# Persist every generated split: one save_gen call per (representation, dataset).
for frames, type_, dataset_name in (
    ([df_imbd_train_BBoW, df_imbd_val_BBoW, df_imbd_test_BBoW], "BBoW", "IMBD"),
    ([df_yelp_train_BBoW, df_yelp_val_BBoW, df_yelp_test_BBoW], "BBoW", "yelp"),
    ([df_imbd_train_FBoW, df_imbd_val_FBoW, df_imbd_test_FBoW], "FBoW", "IMBD"),
    ([df_yelp_train_FBoW, df_yelp_val_FBoW, df_yelp_test_FBoW], "FBoW", "yelp"),
):
    save_gen(frames, type_, dataset_name)

In [7]:
def performance_random_clf(x_tr, y_tr, x_test, y_test, r=1234):
    """Micro-averaged F1 of a classifier that predicts classes uniformly at random.

    A `DummyClassifier(strategy='uniform')` seeded with `r` is fitted on
    (`x_tr`, `y_tr`) and scored on (`x_test`, `y_test`).
    """
    uniform_clf = DummyClassifier(strategy='uniform', random_state=r)
    uniform_clf.fit(x_tr, y_tr)
    return metrics.f1_score(y_test, uniform_clf.predict(x_test), average='micro')
    
def performance_majority_classifier(x_tr, y_tr, x_test, y_test, r = 1234):
    """Micro-averaged F1 of always predicting the most frequent training label.

    Parameters
    ----------
    x_tr, x_test, r :
        Unused; kept so the signature mirrors `performance_random_clf`.
    y_tr : array-like
        Training labels; the most frequent one becomes the prediction for
        every test sample. On ties the smallest label wins.
    y_test : array-like
        True test labels.

    Returns
    -------
    The micro-averaged F1 score of the constant prediction on `y_test`.

    Raises
    ------
    ValueError
        If `y_tr` is empty.
    """
    # np.unique is used instead of `stats.mode(y_tr).mode[0]`: recent SciPy
    # returns a scalar mode for 1-D input, so indexing it with [0] fails.
    labels, counts = np.unique(y_tr, return_counts=True)
    if labels.size == 0:
        raise ValueError("y_tr must contain at least one label")
    most_common_val = labels[np.argmax(counts)]  # first maximum -> smallest label on ties
    preds = np.full(np.shape(y_test), most_common_val)
    f1 = metrics.f1_score(y_test, preds, average = 'micro')
    return f1

## Question 2

Using data from **yelp** created with **BBoW** only.

In [14]:
# Convert the `review` / `class` columns into plain numpy arrays.
# Yelp training split
x_tr = np.array(list(map(np.array, df_yelp_train_BBoW['review'].values)))
y_tr = np.array(list(map(np.array, df_yelp_train_BBoW['class'].values)))

# IMDB training split
x_tr__ = np.array(list(map(np.array, df_imbd_train_BBoW['review'].values)))
y_tr__ = np.array(list(map(np.array, df_imbd_train_BBoW['class'].values)))

# Yelp test split
x_test = np.array(list(map(np.array, df_yelp_test_BBoW['review'].values)))
y_test = np.array(list(map(np.array, df_yelp_test_BBoW['class'].values)))

### Part A

In [30]:
# Baselines on the Yelp BBoW data. Both calls receive the Yelp training
# features `x_tr`, so features and labels come from the same dataset
# (the majority baseline was previously handed the IMDB features `x_tr__`).
baseline_random = performance_random_clf(x_tr, y_tr, x_test, y_test)
baseline_majority = performance_majority_classifier(x_tr, y_tr, x_test, y_test)

In [33]:
# Report both baseline scores; `%` binds tighter than `+`, so the score is
# interpolated into the second half of each message before concatenation.
random_msg = ("The performance for the random classifier on the Yelp " +
              "reviews dataset created with BBoW is %s." % baseline_random)
majority_msg = ("The performance for the majority classifier on the Yelp " +
                "reviews dataset created with BBoW is %s." % baseline_majority)
print(random_msg)
print(majority_msg)

The performance for the random classifier on the Yelp reviews dataset created with BBoW is 0.1975.
The performance for the majority classifier on the Yelp reviews dataset created with BBoW is 0.351.
