In [1]:
import re
import os
import random
os.chdir('/content/drive/MyDrive/ML/hw2')

from utils import *

import numpy as np
import pandas as pd

import scipy
from scipy import sparse

from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.preprocessing import normalize

from tqdm import tqdm
import progressbar
from itertools import chain
from collections import Counter
tqdm.pandas()

import pprint as pp
pp = pp.PrettyPrinter(indent=4)

In [2]:
train, test = load_data(merge=False)

print(f'Number of training samples: {train.shape[0]}')
print(f'Number of test samples: {test.shape[0]}')

Number of training samples: 1000000
Number of test samples: 320122


In [3]:
# Build the combined unigram + bigram vocabulary from the training split only.
# NOTE(review): both thresholds are 3000 — presumably minimum-count cut-offs;
# confirm their exact meaning in utils.BigramModel.build_vocab.
bigram_model = BigramModel()
bigram_model.build_vocab(
    data=train,
    unigram_threshold=3000,
    bigram_threshold=3000,
)

# Report how many features survived, then show entries 300-304 of the
# vocabulary (in its iteration order) as a quick sanity check.
print(
    '\n\nNumber of features in unigram bigram vocabulary: '
    f'{len(bigram_model.unigram_bigram_vocab)}'
)
print(
    'Few sample features in unigram bigram vocabulary:: '
    f'{list(bigram_model.unigram_bigram_vocab)[300:305]}'
)

100%|██████████| 1000000/1000000 [00:46<00:00, 21614.79it/s]
100%|██████████| 1000000/1000000 [00:42<00:00, 23260.21it/s]




Number of features in unigram bigram vocabulary: 6549
Few sample features in unigram bigram vocabulary:: ['and my', 'and never', 'and nice', 'and no', 'and not']


# Term Frequency - Unigram and Bigram Model

In [4]:
# Term-frequency features for the training split. With type='train' the call
# yields two values; the second is kept as `idf` and is later passed as
# `idf_mat` when building the TF-IDF matrices.
(train_unibi_tf, idf) = bigram_model.tf(
    data=train,
    type='train',
)
print(
    '\n\nTerm Frequency unigram bigram matrix shape (no lift) for train data: '
    f'{train_unibi_tf.shape}'
)

100%|██████████| 1000000/1000000 [08:16<00:00, 2014.47it/s]




Term Frequency unigram bigram matrix shape (no lift) for train data: (1000000, 6549)


In [5]:
# Train a perceptron on the TF features. The constructor receives the
# vocabulary size; labels are handed over as a plain Python list.
clf_unibi_tf = Perceptron(
    bigram_model.unigram_bigram_vocab_size
)
clf_unibi_tf.fit(
    train.label.tolist(),
    train_unibi_tf,
)

# `w_avg` is the weight attribute exposed by the classifier; the label below
# calls it "lifted" (the recorded output shows one more row than the
# vocabulary has features).
print(
    '\nWeight vector shape (lifted): '
    f'{clf_unibi_tf.w_avg.shape}'
)

100%|██████████| 1000000/1000000 [08:21<00:00, 1993.91it/s]
100%|██████████| 1000000/1000000 [09:14<00:00, 1802.48it/s]


Weight vector shape (lifted): (6550, 1)





In [6]:
# Term-frequency features for the test split, using the vocabulary already
# built from the training data. With type='test' a single value is returned.
test_unibi_tf = bigram_model.tf(
    data=test,
    type='test',
)
print(
    '\n\nTerm Frequency unigram bigram matrix shape (no lift) for test data: '
    f'{test_unibi_tf.shape}'
)

100%|██████████| 320122/320122 [02:00<00:00, 2653.58it/s]




Term Frequency unigram bigram matrix shape (no lift) for test data: (320122, 6549)


In [7]:
# Score the TF perceptron on both splits (train first, then test).
# Each result is indexed below as [0]=accuracy, [1]=precision, [2]=recall.
train_scores, test_scores = (
    clf_unibi_tf.evaluate(train.label.tolist(), train_unibi_tf),
    clf_unibi_tf.evaluate(test.label.tolist(), test_unibi_tf),
)

print(
    'TF unigram bigram train scores:\t '
    f'accuracy:{train_scores[0]:.3f}, '
    f'precision:{train_scores[1]:.3f}, '
    f'recall:{train_scores[2]:.3f}'
)
print(
    'TF unigram bigram test scores:\t '
    f'accuracy:{test_scores[0]:.3f}, '
    f'precision:{test_scores[1]:.3f}, '
    f'recall:{test_scores[2]:.3f}'
)


TF unigram bigram train scores:	 accuracy:0.904, precision:0.919, recall:0.938
TF unigram bigram test scores:	 accuracy:0.902, precision:0.917, recall:0.936


# Term Frequency Inverse Document Frequency (TF-IDF) - Unigram & Bigram Model

In [9]:
# TF-IDF features for the training split, derived from the training TF matrix
# and the `idf` value returned by the training TF pass.
train_unibi_tfidf = bigram_model.tfidf(
    data=train,
    term_frequency_mat=train_unibi_tf,
    idf_mat=idf,
)
print(
    '\n\nTF-IDF unigram bigram matrix shape (w/o lift) for train data: '
    f'{train_unibi_tfidf.shape}'
)




TF-IDF unigram bigram matrix shape (w/o lift) for train data: (1000000, 6549)


In [10]:
# Train a second, independent perceptron on the TF-IDF features, using the
# same constructor argument and training labels as the TF classifier.
clf_unibi_tfidf = Perceptron(
    bigram_model.unigram_bigram_vocab_size
)
clf_unibi_tfidf.fit(
    train.label.tolist(),
    train_unibi_tfidf,
)

# Report the shape of the classifier's `w_avg` weight attribute.
print(
    '\nWeight vector shape (lifted): '
    f'{clf_unibi_tfidf.w_avg.shape}'
)

100%|██████████| 1000000/1000000 [08:47<00:00, 1896.30it/s]
100%|██████████| 1000000/1000000 [09:24<00:00, 1770.95it/s]


Weight vector shape (lifted): (6550, 1)





In [11]:
# TF-IDF features for the test split. The `idf` passed here is the one
# obtained from the training TF pass, so no IDF is re-estimated in this cell.
test_unibi_tfidf = bigram_model.tfidf(
    data=test,
    term_frequency_mat=test_unibi_tf,
    idf_mat=idf,
)
print(
    '\n\nTF-IDF unigram bigram matrix shape (w/o lift) for test data: '
    f'{test_unibi_tfidf.shape}'
)



TF-IDF unigram bigram matrix shape (w/o lift) for test data: (320122, 6549)


In [13]:
# Score the TF-IDF perceptron on both splits (train first, then test).
# NOTE: this rebinds `train_scores` / `test_scores` from the TF evaluation.
# Each result is indexed below as [0]=accuracy, [1]=precision, [2]=recall.
train_scores, test_scores = (
    clf_unibi_tfidf.evaluate(train.label.tolist(), train_unibi_tfidf),
    clf_unibi_tfidf.evaluate(test.label.tolist(), test_unibi_tfidf),
)

print(
    'TF-IDF unigram bigram train scores:\t '
    f'accuracy:{train_scores[0]:.3f}, '
    f'precision:{train_scores[1]:.3f}, '
    f'recall:{train_scores[2]:.3f}'
)
print(
    'TF-IDF unigram bigram test scores:\t '
    f'accuracy:{test_scores[0]:.3f}, '
    f'precision:{test_scores[1]:.3f}, '
    f'recall:{test_scores[2]:.3f}'
)


TF-IDF unigram bigram train scores:	 accuracy:0.905, precision:0.918, recall:0.941
TF-IDF unigram bigram test scores:	 accuracy:0.902, precision:0.916, recall:0.939
