# **HW2 - Text Classification**

## 1. IMDB Dataset

In [1]:
from keras.datasets import imdb
INDEX_FROM = 3
(x_train, y_train), (x_test, y_test) = imdb.load_data(index_from=INDEX_FROM, num_words=5000)

word_index = imdb.get_word_index()
# Reverse the word index to obtain a dict mapping indices to words
word_index = {k:(v+INDEX_FROM) for k,v in word_index.items()}
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2
word_index["<UNUSED>"] = 3
inverted_word_index = dict((i, word) for (word, i) in word_index.items())

# Decode the first sequence in the dataset
print("Example of dataset: ")
print("Text: ", " ".join(inverted_word_index[i] for i in x_train[0]))
print("Sentiment: ", y_train.astype(bool)[0])

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
Example of dataset: 
Text:  <START> this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert <UNK> is an amazing actor and now the same being director <UNK> father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for <UNK> and would recommend it to everyone to watch and the fly <UNK> was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also <UNK> to the two little <UNK> that played the <UNK> of norman and paul they we

## 2. Pre-Processing

### 2.1. Any data cleaning

In [2]:
import nltk
nltk.download('punkt')

import numpy as np
import string

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
train_text = []

# remove punctuation
for x in x_train:
  text = " ".join(inverted_word_index[i] for i in x[1:])
  text = text.translate(str.maketrans('', '', string.punctuation))
  train_text.append(text)

test_text = []

# remove punctuation
for x in x_test:
  text = " ".join(inverted_word_index[i] for i in x[1:])
  text = text.translate(str.maketrans('', '', string.punctuation))
  test_text.append(text)

In [4]:
# other cleaning is done after tokenization

from nltk.corpus import stopwords
nltk.download('stopwords')
english_stopwords = list(set(stopwords.words('english')))

from nltk.stem.snowball import SnowballStemmer
stemmer=SnowballStemmer('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### 2.2. Tokenization

In [5]:
from nltk.tokenize import word_tokenize

train_tokens = []
for text in train_text:
  tokens = word_tokenize(text)
  # remove stop words
  stemmed_tokens=[(stemmer.stem(i)).lower() for i in tokens]
  filtered_tokens = [token for token in stemmed_tokens if token not in english_stopwords]
  train_tokens.append(filtered_tokens)


test_tokens = []
for text in test_text:
  tokens = word_tokenize(text)
  # remove stop words
  stemmed_tokens=[(stemmer.stem(i)).lower() for i in tokens]
  filtered_tokens = [token for token in stemmed_tokens if token not in english_stopwords]
  test_tokens.append(filtered_tokens)


### 2.3. Stemming

In [None]:
# done in previus part

## 3. Build Models

### 3.1. Uni-Gram

In [11]:
class classifier:
  def __init__(self, train_tokens, y, classes_n=2):
    self.classes_n = classes_n
    self.train_tokens = train_tokens
    self.y = y
    # self.p_ci[0] = neg
    # self.p_ci[1] = pos
    
    # creating vocab
    self.vocab, self.vocab_pos, self.vocab_neg = self.build_vocab()
    self.vocab["unk"]=0
    self.vocab_pos["unk"]=0
    self.vocab_neg["unk"]=0

    # number of words in each class
    self.ns = np.zeros((2,1))
    self.ns[1] = sum(self.vocab_pos.values())
    self.ns[0] = sum(self.vocab_neg.values())

    # index to word and word to index arrays
    self.itow = list(self.vocab.keys())
    self.wtoi = {w:idx for idx,w in enumerate(self.itow)}

    # calculating probabilities
    self.p_ci = np.zeros((classes_n,1))
    self.p_wk_cj = np.zeros((len(self.vocab), classes_n))

    self.find_p_ci()
    self.find_p_wk_cj()


  def build_vocab(self):
    vocab = dict()
    vocab_pos = dict()
    vocab_neg = dict()

    for i in range(len(self.train_tokens)):
      tokens = self.train_tokens[i]
      for token in tokens:
        # add token to general vocab
        n = vocab.get(token, 0)
        vocab[token] = n + 1
        # add token to its class
        if (self.y[i] == 0):
          n = vocab_neg.get(token, 0)
          vocab_neg[token] = n + 1
        else:
          n = vocab_pos.get(token, 0)
          vocab_pos[token] = n + 1

    return vocab, vocab_pos, vocab_neg

  def find_p_ci(self):
    pos_n = np.count_nonzero(self.y)
    neg_n = len(self.y) - pos_n
    self.p_ci[1] = pos_n / len(self.y)
    self.p_ci[0] = 1 - self.p_ci[1]

  def find_p_wk_cj(self):
    words = self.itow

    # negative, j = 0
    for k in range(len(words)):
      w = words[k]
      self.p_wk_cj[k, 0] = ((self.vocab_neg.get(w, 0)+1) / (self.ns[0] + len(self.itow)+1))

    # positive, j = 1
    for k in range(len(words)):
      w = words[k]
      self.p_wk_cj[k, 1] = ((self.vocab_pos.get(w, 0)+1) / (self.ns[1] + len(self.itow)+1))


  def predict(self, test_tokens):
    ps = []
    pred = []
    for test in test_tokens:
      p = np.ones((self.classes_n, 1))
      # add probability of class
      p[0] = self.p_ci[0]
      p[1] = self.p_ci[1]
      for token in test:
        for c in range(self.classes_n):
          if token in self.wtoi:
            token_idx = self.wtoi[token]
            # add (instead of multiply because of log) probabiliy of word belonging to class c
            p[c] += (np.log(self.p_wk_cj[token_idx, c]))
          else:
            # add 1/dominator if token doesnt exist in vocab (no occurence in train set)
            p[c] += (np.log(1/(self.ns[c] + len(self.itow)+1)))
            
      ps.append(p)
      pred.append(np.argmax(p))
    return np.array(ps), np.array(pred)


# use classifier on tokens (unigrams)
unigram_classifier = classifier(train_tokens, y_train)
uni_ps, uni_pred = unigram_classifier.predict(test_tokens)

### 3.2. Bi-Gram


In [12]:
# create bigrams
train_bigrams = []
for tokens in train_tokens:
  bigrams = []
  for i in range(len(tokens)-1):
    bigrams.append(f"{tokens[i]} {tokens[i+1]}")
  train_bigrams.append(bigrams)

test_bigrams = []
for tokens in test_tokens:
  bigrams = []
  for i in range(len(tokens)-1):
    bigrams.append(f"{tokens[i]} {tokens[i+1]}")
  test_bigrams.append(bigrams)

# use classifier on bigrams
bigram_classifier = classifier(train_bigrams, y_train)
bi_ps, bi_pred = bigram_classifier.predict(test_bigrams)

### 3.3. Tri-Gram

In [13]:
# create trigrams
train_trigrams = []
for tokens in train_tokens:
  trigrams = []
  for i in range(len(tokens)-2):
    trigrams.append(f"{tokens[i]} {tokens[i+1]} {tokens[i+2]}")
  train_trigrams.append(trigrams)

test_trigrams = []
for tokens in test_tokens:
  trigrams = []
  for i in range(len(tokens)-2):
    trigrams.append(f"{tokens[i]} {tokens[i+1]} {tokens[i+2]}")
  test_trigrams.append(trigrams)

# use classifier on trigrams
trigram_classifier = classifier(train_trigrams, y_train)
tri_ps, tri_pred = trigram_classifier.predict(test_trigrams)

## 4. Evaluate Model

In [14]:
def evaluate(truth, pred):
    # true positive, truth = 1 and pred = 1
    TP = np.sum(np.logical_and(pred == 1, truth == 1))

    # condition positive
    P = np.sum(truth)

    recall = TP/P

    # false positive, truth = 0 but pred = 0
    FP = np.sum(np.logical_and(pred == 1, truth == 0))

    precision = TP / (TP + FP)

    f1score = 2*((precision*recall)/(precision+recall))
    
    # true negative, truth = 0 and pred = 0
    TN = np.sum(np.logical_and(pred == 0, truth == 0))

    
    accuracy = (TP + TN) / len(truth)

    return recall, precision, f1score, accuracy


from tabulate import tabulate

recall1, precision1, f1score1, accuracy1 = evaluate(y_test, np.array(uni_pred))
recall2, precision2, f1score2, accuracy2 = evaluate(y_test, np.array(bi_pred))
recall3, precision3, f1score3, accuracy3 = evaluate(y_test, np.array(tri_pred))
print(tabulate([["unigram", recall1, precision1, f1score1, accuracy1],
                ["bigram", recall2, precision2, f1score2, accuracy2],
                ["trigram", recall3, precision3, f1score3, accuracy3]], 
               headers=["model", "recall", "precision", "f1score", "accuracy"]))

model      recall    precision    f1score    accuracy
-------  --------  -----------  ---------  ----------
unigram   0.81504     0.851555   0.832897     0.83648
bigram    0.8516      0.866363   0.858918     0.86012
trigram   0.79064     0.812479   0.801411     0.80408


In [15]:
uni_incorrect_pred = np.arange(len(test_tokens))[uni_pred != y_test]
bi_incorrect_pred = np.arange(len(test_tokens))[uni_pred != y_test]
tri_incorrect_pred = np.arange(len(test_tokens))[uni_pred != y_test]

def compare_tokens(tokens, classifier):
  tokens = set(tokens)
  for token in tokens:
    if (token not in classifier.vocab_pos or token not in classifier.vocab_neg):
      continue
    if (classifier.vocab_pos[token] == classifier.vocab_neg[token]):
      continue
    print(f"token {token}, pos count: {classifier.vocab_pos[token]}, neg count: {classifier.vocab_neg[token]}")

print("-----------------------------------------------------")
print("unigram incorrect prediction | sentiment: ", y_test[uni_incorrect_pred[0]], "; text: ")
print(" ".join(test_tokens[uni_incorrect_pred[0]]))
compare_tokens(test_tokens[uni_incorrect_pred[0]], unigram_classifier)
print("-----------------------------------------------------")
print("bigram incorrect prediction | sentiment: ", y_test[bi_incorrect_pred[0]], "; text: ")
print(" ".join(test_tokens[bi_incorrect_pred[0]]))
compare_tokens(test_bigrams[bi_incorrect_pred[0]], bigram_classifier)
print("-----------------------------------------------------")
print("trigram incorrect prediction | sentiment: ", y_test[tri_incorrect_pred[0]], "; text: ")
print(" ".join(test_tokens[tri_incorrect_pred[0]]))
compare_tokens(test_trigrams[tri_incorrect_pred[0]], trigram_classifier)

-----------------------------------------------------
unigram incorrect prediction | sentiment:  1 ; text: 
im absolut unk movi isnt sold love movi unk disney unk demand theyd eventu sell id buy copi everybodi know everyth everybodi movi good job havent figur whi disney hasnt put movi dvd vhs rental unk least havent seen ani copi wick good movi seen kid new generat dont get see think least put back channel movi doesnt deserv cheap unk deserv real thing im movi dvd
token job, pos count: 1565, neg count: 894
token theyd, pos count: 49, neg count: 84
token put, pos count: 1529, neg count: 1599
token hasnt, pos count: 208, neg count: 163
token love, pos count: 6142, neg count: 2817
token get, pos count: 6459, neg count: 7638
token havent, pos count: 436, neg count: 363
token cheap, pos count: 201, neg count: 689
token isnt, pos count: 1328, neg count: 1847
token think, pos count: 4290, neg count: 4619
token buy, pos count: 476, neg count: 467
token absolut, pos count: 797, neg count: 1040


## Good Luck!