In [1]:
# Notebook setup: imports, working directory, and progress-bar / pretty-printer configuration.
import re
import os
import random
# Switch the working directory to a hard-coded absolute path before importing `utils`.
# NOTE(review): this looks like a Colab-mounted Drive folder and presumably is where
# utils.py lives, which would make the order of these two statements matter — confirm.
os.chdir('/content/drive/MyDrive/ML/hw2')

# Star import: load_data, UnigramModel and Perceptron used by later cells are not imported
# by name anywhere in this notebook, so they presumably come from here — verify against utils.
from utils import *

import numpy as np
import pandas as pd

import scipy
from scipy import sparse

from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.preprocessing import normalize

from tqdm import tqdm
import progressbar
from itertools import chain
from collections import Counter
# Registers tqdm's progress_apply / progress_map helpers on pandas objects.
tqdm.pandas()

import pprint as pp
# Rebinds `pp` from the pprint module to a PrettyPrinter instance (indent=4);
# after this line the module itself is no longer reachable through the name `pp`.
pp = pp.PrettyPrinter(indent=4)

In [4]:
# Load the two data splits through the project helper `load_data`.
# NOTE(review): merge=False presumably keeps train and test as separate objects
# (the call is unpacked into two names) — confirm the flag's meaning in utils.
train, test = load_data(merge=False)

# Report the number of rows in each split (first entry of `.shape`).
print(f'Number of training samples: {train.shape[0]}')
print(f'Number of test samples: {test.shape[0]}')

Number of training samples: 1000000
Number of test samples: 320122


In [5]:
# Build the unigram vocabulary from the training split.
# NOTE(review): unigram_threshold=3000 is presumably a frequency cutoff for keeping a
# word in the vocabulary — confirm its exact meaning in UnigramModel.build_vocab.
unigram_model = UnigramModel()
unigram_model.build_vocab(data=train, unigram_threshold=3000)

# Report the vocabulary size, then peek at five entries (positions 300-304 in
# iteration order) as a sanity check on what the features look like.
print(f'\n\nNumber of features in unigram vocabulary: {len(unigram_model.unigram_vocab)}')
print(f'Few sample features in unigram vocabulary:: {list(unigram_model.unigram_vocab)[300:305]}')

100%|██████████| 1000000/1000000 [00:46<00:00, 21574.53it/s]




Number of features in unigram vocabulary: 2440
Few sample features in unigram vocabulary:: ['bruschetta', 'btw', 'buck', 'bucks', 'buddy']


# Term Frequency - Unigram Model

In [6]:
# Build the term-frequency matrix for the training split. With type='train' the call's
# result is unpacked into two values: the TF matrix and `idf`, which the TF-IDF cells
# further down pass back in as `idf_mat`.
train_uni_tf, idf = unigram_model.tf(data=train, type='train')
# "no lift" in the label: NOTE(review) presumably means no extra bias column has been
# added to the feature matrix at this stage — confirm in utils.
print(f'\n\nTerm Frequency unigram matrix shape (no lift) for train data: {train_uni_tf.shape}')

100%|██████████| 1000000/1000000 [04:16<00:00, 3902.09it/s]




Term Frequency unigram matrix shape (no lift) for train data: (1000000, 2440)


In [7]:
# train
# Fit a Perceptron on the TF features. The constructor receives the vocabulary size and
# fit() receives the labels as a plain Python list followed by the feature matrix.
clf_uni_tf = Perceptron(unigram_model.unigram_vocab_size)
clf_uni_tf.fit(train.label.tolist(), train_uni_tf)

# `w_avg` is the attribute read by the weight-inspection cell below.
# NOTE(review): the label calls the shape "lifted"; presumably the weight vector carries
# one extra entry for a bias term compared to the vocabulary size — confirm in utils.
print(f'\nWeight vector shape (lifted): {clf_uni_tf.w_avg.shape}')

100%|██████████| 1000000/1000000 [09:13<00:00, 1805.79it/s]
100%|██████████| 1000000/1000000 [10:00<00:00, 1665.42it/s]


Weight vector shape (lifted): (2441, 1)





In [8]:
# Inspect the TF perceptron: show the ten most negative and the ten most positive
# averaged weights next to the vocabulary words they belong to.

# Skip row 0 of the averaged weight vector and flatten the remainder into a plain list.
# NOTE(review): row 0 is presumably the bias of the "lifted" weight vector — confirm.
tf_unigram_w = clf_uni_tf.w_avg[1:, :].reshape(-1,).tolist()
# Reverse lookup table: feature index -> word.
inv_vocab = {idx: word for word, idx in unigram_model.unigram_vocab.items()}

# Ten smallest weights, kept as [weight, feature_index] pairs in ascending order.
lowest_weights = sorted(
    ([weight, idx] for idx, weight in enumerate(tf_unigram_w)),
    key=lambda pair: pair[0],
)[:10]
lowest_weights_idx = [idx for _, idx in lowest_weights]
lowest_weight_words = [inv_vocab[idx] for idx in lowest_weights_idx]

print(f'Words with lowest weights: \n{lowest_weight_words}')
# Column 0 of the pair array holds the weights; they are rounded to 2 decimals for display.
print(f'\n\n{list(zip(lowest_weight_words, np.array(lowest_weights)[:, 0].round(2).tolist()))}')

# Ten largest weights, kept as [weight, feature_index] pairs in descending order.
highest_weights = sorted(
    ([weight, idx] for idx, weight in enumerate(tf_unigram_w)),
    key=lambda pair: pair[0],
    reverse=True,
)[:10]
highest_weights_idx = [idx for _, idx in highest_weights]
highest_weights_words = [inv_vocab[idx] for idx in highest_weights_idx]

print(f'\n\n\nWords with highest weights: \n{highest_weights_words}')
print(f'\n\n{list(zip(highest_weights_words, np.array(highest_weights)[:, 0].round(2).tolist()))}')

Words with lowest weights: 
['worst', 'lacked', 'flavorless', 'mediocre', 'tasteless', 'disgusting', 'meh', 'hopes', 'disappointing', 'ruined']


[('worst', -195.13), ('lacked', -172.04), ('flavorless', -171.44), ('mediocre', -165.33), ('tasteless', -156.61), ('disgusting', -153.9), ('meh', -148.58), ('hopes', -140.19), ('disappointing', -138.8), ('ruined', -134.33)]



Words with highest weights: 
['perfection', 'heaven', 'gem', 'disappoint', 'heavenly', 'phenomenal', 'incredible', 'perfect', 'superb', 'perfectly']


[('perfection', 149.69), ('heaven', 131.65), ('gem', 129.63), ('disappoint', 124.89), ('heavenly', 124.74), ('phenomenal', 117.8), ('incredible', 117.78), ('perfect', 107.1), ('superb', 103.77), ('perfectly', 102.82)]


In [9]:
# Build the term-frequency matrix for the test split. Unlike the type='train' call,
# the result here is bound to a single name (no idf is unpacked).
test_uni_tf = unigram_model.tf(data=test, type='test')
print(f'\n\nTerm Frequency unigram matrix shape (no lift) for test data: {test_uni_tf.shape}')

100%|██████████| 320122/320122 [00:56<00:00, 5637.48it/s]



Term Frequency unigram matrix shape (no lift) for test data: (320122, 2440)





In [10]:
# evaluate
# Score the TF perceptron on both splits. evaluate() takes the labels as a plain list
# followed by the matching feature matrix.
train_scores = clf_uni_tf.evaluate(train.label.tolist(), train_uni_tf)
test_scores = clf_uni_tf.evaluate(test.label.tolist(), test_uni_tf)

# Entries 0, 1 and 2 of each result are printed as accuracy, precision and recall,
# formatted to three decimals.
print(f'TF unigram train scores:\t accuracy:{train_scores[0]:.3f}, precision:{train_scores[1]:.3f}, recall:{train_scores[2]:.3f}')
print(f'TF unigram test scores:\t accuracy:{test_scores[0]:.3f}, precision:{test_scores[1]:.3f}, recall:{test_scores[2]:.3f}')


TF unigram train scores:	 accuracy:0.889, precision:0.906, recall:0.929
TF unigram test scores:	 accuracy:0.888, precision:0.905, recall:0.928


# Term Frequency-Inverse Document Frequency (TF-IDF) - Unigram Model

In [11]:
# Build the TF-IDF matrix for the training split from the already-computed TF matrix
# and the `idf` value returned by the type='train' tf() call.
# NOTE(review): the role of the `data` argument alongside the precomputed matrices is
# not visible here — check UnigramModel.tfidf.
train_uni_tfidf = unigram_model.tfidf(data=train, term_frequency_mat=train_uni_tf, idf_mat=idf)
print(f'\n\nTF-IDF unigram matrix shape (w/o lift) for train data: {train_uni_tfidf.shape}')




TF-IDF unigram matrix shape (w/o lift) for train data: (1000000, 2440)


In [12]:
# train
# Fit a second, separate Perceptron on the TF-IDF features. The constructor receives the
# vocabulary size and fit() receives the labels as a plain list plus the feature matrix.
clf_uni_tfidf = Perceptron(unigram_model.unigram_vocab_size)
clf_uni_tfidf.fit(train.label.tolist(), train_uni_tfidf)

# `w_avg` is the attribute read by the weight-inspection cell below.
# NOTE(review): the label calls the shape "lifted"; presumably the weight vector carries
# one extra entry for a bias term compared to the vocabulary size — confirm in utils.
print(f'\nWeight vector shape (lifted): {clf_uni_tfidf.w_avg.shape}')

100%|██████████| 1000000/1000000 [08:45<00:00, 1901.34it/s]
100%|██████████| 1000000/1000000 [09:15<00:00, 1798.75it/s]


Weight vector shape (lifted): (2441, 1)





In [13]:
# Inspect the TF-IDF perceptron: show the ten most negative and the ten most positive
# averaged weights next to the vocabulary words they belong to.

# Skip row 0 of the averaged weight vector and flatten the remainder into a plain list.
# NOTE(review): row 0 is presumably the bias of the "lifted" weight vector — confirm.
tfidf_unigram_w = clf_uni_tfidf.w_avg[1:, :].reshape(-1,).tolist()
# Reverse lookup table: feature index -> word.
inv_vocab = {idx: word for word, idx in unigram_model.unigram_vocab.items()}

# Ten smallest weights, kept as [weight, feature_index] pairs in ascending order.
lowest_weights = sorted(
    ([weight, idx] for idx, weight in enumerate(tfidf_unigram_w)),
    key=lambda pair: pair[0],
)[:10]
lowest_weights_idx = [idx for _, idx in lowest_weights]
lowest_weight_words = [inv_vocab[idx] for idx in lowest_weights_idx]

print(f'Words with lowest weights: \n{lowest_weight_words}')
# Column 0 of the pair array holds the weights; they are rounded to 2 decimals for display.
print(f'\n\n{list(zip(lowest_weight_words, np.array(lowest_weights)[:, 0].round(2).tolist()))}')

# Ten largest weights, kept as [weight, feature_index] pairs in descending order.
highest_weights = sorted(
    ([weight, idx] for idx, weight in enumerate(tfidf_unigram_w)),
    key=lambda pair: pair[0],
    reverse=True,
)[:10]
highest_weights_idx = [idx for _, idx in highest_weights]
highest_weights_words = [inv_vocab[idx] for idx in highest_weights_idx]

print(f'\n\n\nWords with highest weights: \n{highest_weights_words}')
print(f'\n\n{list(zip(highest_weights_words, np.array(highest_weights)[:, 0].round(2).tolist()))}')

Words with lowest weights: 
['worst', 'mediocre', 'bland', 'flavorless', 'awful', 'horrible', 'tasteless', 'disappointing', 'meh', 'lacked']


[('worst', -7.69), ('mediocre', -7.03), ('bland', -6.06), ('flavorless', -5.69), ('awful', -5.61), ('horrible', -5.52), ('tasteless', -5.42), ('disappointing', -5.37), ('meh', -5.36), ('lacked', -5.29)]



Words with highest weights: 
['delicious', 'perfection', 'amazing', 'perfect', 'great', 'excellent', 'awesome', 'heavenly', 'gem', 'fantastic']


[('delicious', 6.92), ('perfection', 6.32), ('amazing', 6.01), ('perfect', 5.84), ('great', 5.39), ('excellent', 5.38), ('awesome', 5.24), ('heavenly', 5.13), ('gem', 5.05), ('fantastic', 4.86)]


In [14]:
# Build the TF-IDF matrix for the test split. The `idf` passed in is the one returned by
# the type='train' tf() call, so the test features reuse the training-derived idf rather
# than computing a new one from the test data.
test_uni_tfidf = unigram_model.tfidf(data=test, term_frequency_mat=test_uni_tf, idf_mat=idf)
print(f'\n\nTF-IDF unigram matrix shape (w/o lift) for test data: {test_uni_tfidf.shape}')



TF-IDF unigram matrix shape (w/o lift) for test data: (320122, 2440)


In [15]:
# evaluate
# Score the TF-IDF perceptron on both splits. evaluate() takes the labels as a plain list
# followed by the matching feature matrix. Note that this rebinds `train_scores` and
# `test_scores`, replacing the values produced by the TF evaluation cell.
train_scores = clf_uni_tfidf.evaluate(train.label.tolist(), train_uni_tfidf)
test_scores = clf_uni_tfidf.evaluate(test.label.tolist(), test_uni_tfidf)

# Entries 0, 1 and 2 of each result are printed as accuracy, precision and recall,
# formatted to three decimals.
print(f'TF-IDF unigram train scores:\t accuracy:{train_scores[0]:.3f}, precision:{train_scores[1]:.3f}, recall:{train_scores[2]:.3f}')
print(f'TF-IDF unigram test scores:\t accuracy:{test_scores[0]:.3f}, precision:{test_scores[1]:.3f}, recall:{test_scores[2]:.3f}')


TF-IDF unigram train scores:	 accuracy:0.890, precision:0.905, recall:0.932
TF-IDF unigram test scores:	 accuracy:0.889, precision:0.904, recall:0.931
