# Assignment-2 Solutions for CS565
This notebook contains the solutions for the assignment on N-gram language models, smoothing, and vector semantics.

In [None]:
# Downloading corpora
!wget -c 'https://www.dropbox.com/s/1agrh5hdnkqd24c/en_wiki.txt?dl=0' -O en_wiki.txt

--2020-11-05 14:49:07--  https://www.dropbox.com/s/1agrh5hdnkqd24c/en_wiki.txt?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.5.1, 2620:100:601d:1::a27d:501
Connecting to www.dropbox.com (www.dropbox.com)|162.125.5.1|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/1agrh5hdnkqd24c/en_wiki.txt [following]
--2020-11-05 14:49:07--  https://www.dropbox.com/s/raw/1agrh5hdnkqd24c/en_wiki.txt
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc12347d7ffc7302099da0212883.dl.dropboxusercontent.com/cd/0/inline/BCoc0kKRN2WLRiiFCGY5s0vvwNwV9jjElGnIL4gg_eGwTQQZEJ9HzchQh4wnGvQCjJ4dI_5NBowovAJjEEk03ix6UMipsiVxdC24Gd88QUOdHe5Qkz4fUDo_qKMEO1_yjgs/file# [following]
--2020-11-05 14:49:08--  https://uc12347d7ffc7302099da0212883.dl.dropboxusercontent.com/cd/0/inline/BCoc0kKRN2WLRiiFCGY5s0vvwNwV9jjElGnIL4gg_eGwTQQZEJ9HzchQh4wnGvQCjJ4dI_5NBowovAJjEEk03ix6UMipsiVxdC24Gd88QUOdHe5Q

In [None]:
# importing all required libraries

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import *
from nltk.util import ngrams
from collections import Counter, defaultdict

import random
from math import floor
import numpy as np
import copy

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Dividing training and test set

In [None]:
# Reading text from given corpus

# Use a context manager so the file handle is closed deterministically
# (the previous bare open(...).read() left it to the garbage collector),
# and pin the encoding so decoding does not depend on the locale.
with open('/content/en_wiki.txt', encoding='utf-8') as corpus_file:
  text = corpus_file.read()

# tokenizing corpora and shuffling the list
# Only the first 1500 sentences are kept; the fixed seed makes the shuffle
# (and therefore the train/test split below) reproducible.
sent_tokenize_list = sent_tokenize(text)
sent_tokenize_list = sent_tokenize_list[0:1500]
random.seed(4)
random.shuffle(sent_tokenize_list)

# Each sentence becomes ["START", "START", <alphanumeric tokens...>, "STOP"].
# Sentences with no alphanumeric token are dropped entirely.
data = []
for sent in sent_tokenize_list:
  tokenized_sentence = ["START", "START"]
  words = word_tokenize(sent)
  for word in words:
    if word.isalnum():
      tokenized_sentence.append(word)
  if len(tokenized_sentence) > 2:
    tokenized_sentence.append("STOP")
    data.append(tokenized_sentence)

# dividing corpora into training and test sets
# 90% of the sentences go to train+dev, the remaining 10% to test.
division_index = floor(0.9*len(data))
train_dev_set = data[0:division_index]
test_set = data[division_index:]

# print(len(data))
# print(data[:10])
# print(len(train_dev_set))
# print(train_dev_set[:10])
# print(len(test_set))
# print(test_set[:10])

## Trigram language model using interpolation smoothing

In [None]:
# Steps:
# 1. generate vocabulary
# 2. unknown word mapping
# 3. get lambda from training set
# 4. calculate perplexity

In [None]:
def get_vocabulary(data, freq_threshold):
  """Build the vocabulary from tokenized sentences.

  Words that occur more than `freq_threshold` times are kept as they are.
  Every other word is represented by one shared "UNKNOWN" entry.

  Args:
    data: iterable of sentences, each an iterable of word tokens.
    freq_threshold: words whose count is <= this value are treated as rare.

  Returns:
    List of vocabulary tokens in first-seen order. "UNKNOWN" appears (once)
    at the position of the first rare word, and only if a rare word exists.
  """
  frequencies = {}
  for sentence in data:
    for token in sentence:
      frequencies[token] = frequencies.get(token, 0) + 1

  kept = {}
  for token, freq in frequencies.items():
    label = token if freq > freq_threshold else "UNKNOWN"
    # setdefault keeps the position of the first insertion, like a dict
    # assignment on an already-present key would.
    kept.setdefault(label, freq)

  return list(kept)

vocab = get_vocabulary(train_dev_set, 6)
# print(vocab)
# print(len(vocab))

In [None]:
def map_unknown_words(data, vocabulary):
  """Replace, in place, every out-of-vocabulary token with "UNKNOWN".

  Args:
    data: list of sentences (each a mutable list of tokens); modified in place.
    vocabulary: iterable of known (hashable) tokens.

  Returns:
    None. The sentences inside `data` are mutated.
  """
  # Build the lookup set once: testing membership against the vocabulary
  # list for every token made this pass O(tokens * vocabulary size).
  known_words = set(vocabulary)
  for sentence in data:
    for position, token in enumerate(sentence):
      if token not in known_words:
        sentence[position] = "UNKNOWN"

# tdata = [["START", "START", "my", "name", "is", "Umang", "STOP"], ["START", "START", "name", "am", "21", "years", "is", "old", "STOP"]]
# map_unknown_words(tdata, vocab)
# print(tdata)

In [None]:
def calculate_perplexity(lambda1, lambda2, lambda3, test_set, count_trigram_train, count_bigram_train, count_unigram_train, no_of_words_train):
  """Perplexity of the linearly interpolated trigram model on `test_set`.

  The probability of each trigram (u, v, w) is
    lambda1 * c(u,v,w)/c(u,v) + lambda2 * c(v,w)/c(v) + lambda3 * c(w)/N
  where the counts come from the training tables and N is
  `no_of_words_train`. A maximum-likelihood term whose denominator is zero
  contributes 0.

  Args:
    lambda1, lambda2, lambda3: interpolation weights for the trigram,
      bigram and unigram estimates respectively.
    test_set: list of tokenized sentences to evaluate.
    count_trigram_train, count_bigram_train, count_unigram_train: mappings
      from n-gram (tuple, or plain token for unigrams) to training count.
    no_of_words_train: normaliser used for the unigram estimate.

  Returns:
    (perplexity, L) where L is the per-word base-2 log-likelihood and
    perplexity is 2 ** (-L).

  Raises:
    ValueError: if `test_set` yields no words to normalise by.
  """
  # Collect consecutive token triples; this yields the same tuples as
  # nltk.trigrams(sent) without needing nltk inside this function.
  trigram_list_test = []
  for sent in test_set:
    trigram_list_test.extend(zip(sent, sent[1:], sent[2:]))

  count_trigram_test = Counter(trigram_list_test)

  # Each sentence contributes len(sent)-3 words (its length minus the two
  # START markers and the STOP marker).
  no_of_words_test = sum(len(sent)-3 for sent in test_set)
  if no_of_words_test <= 0:
    raise ValueError("test_set contains no words to compute perplexity over")

  test_trigram_count_array = []
  trigram_prob_array = []
  bigram_prob_array = []
  unigram_prob_array = []

  for trigram, count in count_trigram_test.items():
    u, v, w = trigram
    test_trigram_count_array.append(count)

    # .get() works for both dict and defaultdict and does not insert
    # zero-count keys into the caller's training tables.
    context_count = count_bigram_train.get((u, v), 0)
    if context_count != 0:
      trigram_prob_array.append(count_trigram_train.get(trigram, 0)/context_count)
    else:
      trigram_prob_array.append(0)

    history_count = count_unigram_train.get(v, 0)
    if history_count != 0:
      bigram_prob_array.append(count_bigram_train.get((v, w), 0)/history_count)
    else:
      bigram_prob_array.append(0)

    unigram_prob_array.append(count_unigram_train.get(w, 0)/no_of_words_train)

  # Fix: everything below used to sit inside the loop above, so the function
  # returned after the first distinct trigram and reported a perplexity
  # close to 1. The likelihood must be computed over all test trigrams.
  prob = lambda1*np.array(trigram_prob_array) + lambda2*np.array(bigram_prob_array) + lambda3*np.array(unigram_prob_array)
  # Masked log: trigrams whose interpolated probability is 0 are masked and
  # then filled with 0, i.e. they contribute nothing to the log-likelihood.
  log_term = np.ma.log2(prob).filled(0)

  L = np.dot(np.array(test_trigram_count_array), log_term)/no_of_words_test

  return pow(2,-1*L), L

In [None]:
def get_trained_lambdas(train_set, dev_set, test_set):
  """Grid-search interpolation weights on the dev set, then score dev and test.

  N-gram counts are taken from `train_set`. For every (l1, l2, l3) on a 0.1
  grid with l1 >= 0.1 and l3 = 1 - l1 - l2, the dev-set log-likelihood of the
  interpolated trigram model is computed, and the best triple is kept.

  Args:
    train_set: tokenized sentences used for the n-gram counts.
    dev_set: tokenized sentences used to select the lambdas.
    test_set: tokenized sentences scored with the selected lambdas.

  Returns:
    (lambda1, lambda2, lambda3, perplexity_dev, perplexity_test,
     log_likelihood_dev, log_likelihood_test). The lambdas stay at -1 if no
    grid point improves on -inf.
  """
  # --- training counts -----------------------------------------------------
  train_unigrams, train_bigrams, train_trigrams = [], [], []
  for sentence in train_set:
    train_unigrams.extend(sentence)
    train_bigrams.extend(list(nltk.bigrams(sentence)))
    train_trigrams.extend(list(nltk.trigrams(sentence)))

  count_trigram_train = defaultdict(int,Counter(train_trigrams))
  count_bigram_train = defaultdict(int,Counter(train_bigrams))
  count_unigram_train = defaultdict(int,Counter(train_unigrams))

  # --- dev trigram counts ----------------------------------------------------
  dev_trigrams = []
  for sentence in dev_set:
    dev_trigrams.extend(list(nltk.trigrams(sentence)))
  count_trigram_dev = defaultdict(int,Counter(dev_trigrams))

  # Each sentence contributes its length minus three tokens to the total.
  no_of_words_train = sum(len(sentence)-3 for sentence in train_set)

  # --- per-trigram maximum-likelihood estimates on the dev set --------------
  dev_counts, tri_estimates, bi_estimates, uni_estimates = [], [], [], []
  for trigram, occurrences in count_trigram_dev.items():
    u, v, w = trigram
    dev_counts.append(occurrences)

    # A zero denominator means the context was never seen: estimate is 0.
    if count_bigram_train[(u,v)] == 0:
      tri_estimates.append(0)
    else:
      tri_estimates.append(count_trigram_train[trigram]/count_bigram_train[(u,v)])

    if count_unigram_train[v] == 0:
      bi_estimates.append(0)
    else:
      bi_estimates.append(count_bigram_train[(v, w)]/count_unigram_train[v])

    uni_estimates.append(count_unigram_train[w]/no_of_words_train)

  # The arrays do not depend on the lambdas, so build them once.
  dev_counts = np.array(dev_counts)
  tri_estimates = np.array(tri_estimates)
  bi_estimates = np.array(bi_estimates)
  uni_estimates = np.array(uni_estimates)

  # --- grid search -----------------------------------------------------------
  best_likelihood = float('-inf')
  lambda1, lambda2, lambda3 = -1, -1, -1
  step = 0.1
  for l1 in np.arange(0.1,1,step):
    for l2 in np.arange(0,1-l1,step):
      l3 = 1-l1-l2

      mixed = l1*tri_estimates + l2*bi_estimates + l3*uni_estimates
      # Zero probabilities are masked by the log and then counted as 0.
      log_mixed = np.ma.log2(mixed).filled(0)
      likelihood = np.dot(dev_counts, log_mixed)

      if likelihood > best_likelihood:
        best_likelihood = likelihood
        lambda1, lambda2, lambda3 = l1, l2, l3

  # --- evaluation with the selected weights ---------------------------------
  perplexity_dev, log_likelihood_dev = calculate_perplexity(lambda1, lambda2, lambda3, dev_set, count_trigram_train, count_bigram_train, count_unigram_train, no_of_words_train)
  perplexity_test, log_likelihood_test = calculate_perplexity(lambda1, lambda2, lambda3, test_set, count_trigram_train, count_bigram_train, count_unigram_train, no_of_words_train)
  return lambda1, lambda2, lambda3, perplexity_dev, perplexity_test, log_likelihood_dev, log_likelihood_test

In [None]:
map_unknown_words(train_dev_set, vocab)
map_unknown_words(test_set, vocab)

In [None]:
# print(len(train_dev_set))
# print(train_dev_set[:100])
# print(len(test_set))
# print(test_set[:100])

In [None]:
# Repeat the experiment five times, each with a fresh random 90/10
# train/dev split of train_dev_set; the held-out test_set never changes.
for i in range(5):

  # Shuffle a deep copy so train_dev_set itself keeps its order.
  data = copy.deepcopy(train_dev_set)
  random.shuffle(data)

  # First 90% of the shuffled sentences train the counts, the rest tune lambdas.
  division_index = floor(0.9*len(data))
  train_set = data[:division_index]
  dev_set = data[division_index:]
  test_set_copy = copy.deepcopy(test_set)

  # Grid-search the interpolation weights and score both dev and test.
  lambda1, lambda2, lambda3, perplexity_dev, perplexity_test, log_likelihood_dev, log_likelihood_test = get_trained_lambdas(train_set, dev_set, test_set_copy)

  print('Iteration Number: ', i+1)
  print('(lambda1, lambda2, lambda3) = (',round(lambda1,2),',',round(lambda2,2),',',round(lambda3,2),')')
  print('Log-likelihood for validation set =', log_likelihood_dev)
  print('Perplexity for validation set =', perplexity_dev)
  print('Log-likelihood for test set =', log_likelihood_test)
  print('Perplexity for test set =', perplexity_test)
  print('')

Iteration Number:  1
(lambda1, lambda2, lambda3) = ( 0.1 , 0.5 , 0.4 )
Log-likelihood for validation set = -0.005868830617309913
Perplexity for validation set = 1.0040762487897499
Log-likelihood for test set = -0.03490225060198852
Perplexity for test set = 1.0244874068241732

Iteration Number:  2
(lambda1, lambda2, lambda3) = ( 0.1 , 0.5 , 0.4 )
Log-likelihood for validation set = -0.030006145581610702
Perplexity for validation set = 1.02101647501632
Log-likelihood for test set = -0.03503212004305289
Perplexity for test set = 1.0245796339350888

Iteration Number:  3
(lambda1, lambda2, lambda3) = ( 0.1 , 0.5 , 0.4 )
Log-likelihood for validation set = -0.028970345760187248
Perplexity for validation set = 1.0202836873540269
Log-likelihood for test set = -0.0354895794367385
Perplexity for test set = 1.0249045660118785

Iteration Number:  4
(lambda1, lambda2, lambda3) = ( 0.1 , 0.5 , 0.4 )
Log-likelihood for validation set = -0.023277940534675953
Perplexity for validation set = 1.016265911

## Trigram language model using discounting smoothing

In [None]:
# Steps:
# 1. calculate unigram probabilities
# for each beta
#   2. estimate bigram discounted prob
#   3. trigram discount prob

In [None]:
# Reading text from given corpus

# Use a context manager so the file handle is closed deterministically
# (the previous bare open(...).read() left it to the garbage collector),
# and pin the encoding so decoding does not depend on the locale.
with open('/content/en_wiki.txt', encoding='utf-8') as corpus_file:
  text = corpus_file.read()
# Keep only the first 1/700th of the characters.
# NOTE(review): presumably to keep the vocabulary small enough for the
# V^2 / V^3 probability tables built later in this section; confirm.
text = text[:len(text)//700]
# tokenizing corpora and shuffling the list
# The fixed seed makes the shuffle (and the split below) reproducible.
sent_tokenize_list = sent_tokenize(text)
random.seed(4)
random.shuffle(sent_tokenize_list)

# Each sentence becomes ["START", "START", <alphanumeric tokens...>, "STOP"].
# Sentences with no alphanumeric token are dropped entirely.
data = []
for sent in sent_tokenize_list:
  tokenized_sentence = ["START", "START"]
  words = word_tokenize(sent)
  for word in words:
    if word.isalnum():
      tokenized_sentence.append(word)
  if len(tokenized_sentence) > 2:
    tokenized_sentence.append("STOP")
    data.append(tokenized_sentence)

# dividing corpora into training and test sets
# 90% of the sentences go to train+dev, the remaining 10% to test.
division_index = floor(0.9*len(data))
train_dev_set = data[0:division_index]
test_set = data[division_index:]

# print(len(data))
# print(data[:10])
# print(len(train_dev_set))
# print(train_dev_set[:10])
# print(len(test_set))
# print(test_set[:10])

In [None]:
def get_vocabulary(data, freq_threshold):
  """Build the vocabulary from tokenized sentences.

  Words that occur more than `freq_threshold` times are kept as they are.
  Every other word is represented by one shared "UNKNOWN" entry.

  Args:
    data: iterable of sentences, each an iterable of word tokens.
    freq_threshold: words whose count is <= this value are treated as rare.

  Returns:
    List of vocabulary tokens in first-seen order. "UNKNOWN" appears (once)
    at the position of the first rare word, and only if a rare word exists.
  """
  frequencies = {}
  for sentence in data:
    for token in sentence:
      frequencies[token] = frequencies.get(token, 0) + 1

  kept = {}
  for token, freq in frequencies.items():
    label = token if freq > freq_threshold else "UNKNOWN"
    # setdefault keeps the position of the first insertion, like a dict
    # assignment on an already-present key would.
    kept.setdefault(label, freq)

  return list(kept)

vocab = get_vocabulary(train_dev_set, 9)
# print(vocab)
# print(len(vocab))

In [None]:
def map_unknown_words(data, vocabulary):
  """Replace, in place, every out-of-vocabulary token with "UNKNOWN".

  Args:
    data: list of sentences (each a mutable list of tokens); modified in place.
    vocabulary: iterable of known (hashable) tokens.

  Returns:
    None. The sentences inside `data` are mutated.
  """
  # Build the lookup set once: testing membership against the vocabulary
  # list for every token made this pass O(tokens * vocabulary size).
  known_words = set(vocabulary)
  for sentence in data:
    for position, token in enumerate(sentence):
      if token not in known_words:
        sentence[position] = "UNKNOWN"

# tdata = [["START", "START", "my", "name", "is", "A", "STOP"], ["START", "START", "name", "am", "years", "is", "old", "STOP"]]
# map_unknown_words(tdata, vocab)
# print(tdata)

In [None]:
map_unknown_words(train_dev_set, vocab)
map_unknown_words(test_set, vocab)

In [None]:
def get_unigram_prob(count_unigram_train, no_of_words_train):
  """Maximum-likelihood unigram probabilities.

  Args:
    count_unigram_train: mapping token -> training count.
    no_of_words_train: denominator applied to every count.

  Returns:
    defaultdict(int) mapping token -> count / no_of_words_train; tokens
    that were never counted read as 0.
  """
  probabilities = defaultdict(int)
  for token, occurrences in count_unigram_train.items():
    probabilities[token] = occurrences/no_of_words_train
  return probabilities

In [None]:
def get_bigram_discounted_prob_beta(count_unigram_train, count_bigram_train, no_of_words_train, beta, unigram_prob):
  """Discounted bigram probabilities for one fixed discount `beta`.

  For every history v and word w taken from the keys of
  `count_unigram_train`:
    * seen bigram (count > 0):  (c(v,w) - beta) / c(v)
    * unseen bigram (count == 0): alpha(v) * q(w) / sum of q over unseen w,
      where alpha(v) = 1 - (sum of the seen probabilities for v) and q is
      `unigram_prob`.

  Args:
    count_unigram_train: mapping token -> count; its keys define the vocabulary.
    count_bigram_train: mapping (v, w) -> count, indexed for every pair.
    no_of_words_train: not used by this function.
    beta: discount subtracted from every positive bigram count.
    unigram_prob: mapping token -> unigram probability.

  Returns:
    dict mapping every (v, w) pair over the vocabulary to its probability.
  """
  vocabulary = list(count_unigram_train.keys())

  discounted = {}
  for history in vocabulary:
    seen_mass = 0
    unseen_unigram_mass = 0
    unseen_words = []

    # Pass 1: discount the seen bigrams and collect the unseen continuations.
    for word in vocabulary:
      pair = (history, word)
      pair_count = count_bigram_train[pair]
      if pair_count > 0:
        discounted[pair] = (pair_count - beta)/count_unigram_train[history]
        seen_mass += discounted[pair]
      elif pair_count == 0:
        unseen_unigram_mass += unigram_prob[word]
        unseen_words.append(word)

    # Pass 2: spread the left-over mass over the unseen continuations in
    # proportion to their unigram probability.
    left_over = 1 - seen_mass
    for word in unseen_words:
      discounted[(history, word)] = left_over*(unigram_prob[word]/unseen_unigram_mass)

  return discounted


In [None]:
def get_trigram_discounted_prob_beta(count_bigram_train, count_trigram_train, no_of_words_train, beta, bigram_prob, count_unigram_train):
  """Discounted trigram probabilities for one fixed discount `beta`.

  For every context (u, v) and word w taken from the keys of
  `count_unigram_train`:
    * seen trigram (count > 0):  (c(u,v,w) - beta) / c(u,v)
    * unseen trigram (count == 0): alpha(u,v) * q(v,w) / sum of q(v,.) over
      unseen w, where alpha(u,v) = 1 - (sum of the seen probabilities) and q
      is `bigram_prob`.

  Args:
    count_bigram_train: mapping (u, v) -> count.
    count_trigram_train: mapping (u, v, w) -> count, indexed for every triple.
    no_of_words_train: not used by this function.
    beta: discount subtracted from every positive trigram count.
    bigram_prob: mapping (v, w) -> bigram probability to back off to.
    count_unigram_train: mapping token -> count; its keys define the vocabulary.

  Returns:
    dict mapping every (u, v, w) triple over the vocabulary to its probability.
  """
  vocabulary = list(count_unigram_train.keys())

  discounted = {}
  for u in vocabulary:
    for v in vocabulary:
      seen_mass = 0
      backoff_mass = 0
      unseen_words = []

      # Pass 1: discount the seen trigrams, collect the unseen continuations.
      for w in vocabulary:
        triple = (u, v, w)
        triple_count = count_trigram_train[triple]
        if triple_count > 0:
          discounted[triple] = (triple_count-beta)/count_bigram_train[(u,v)]
          seen_mass += discounted[triple]
        elif triple_count == 0:
          backoff_mass += bigram_prob[(v,w)]
          unseen_words.append(w)

      # Pass 2: share the left-over mass among the unseen continuations in
      # proportion to their bigram probability.
      left_over = 1-seen_mass
      for w in unseen_words:
        discounted[(u,v,w)] = left_over*(bigram_prob[(v,w)]/backoff_mass)

  return discounted

In [None]:
def get_bigram_discounted_prob(count_unigram_train, count_bigram_train, no_of_words_train, unigram_prob, count_bigram_dev):
  """Pick the bigram discount that maximises the dev-set log-likelihood.

  Every beta in 0.1, 0.2, ..., 0.9 is tried: the discounted bigram table is
  built with get_bigram_discounted_prob_beta and scored on the dev bigrams
  (bigrams with probability 0 are skipped).

  Args:
    count_unigram_train: mapping token -> training count.
    count_bigram_train: mapping (v, w) -> training count.
    no_of_words_train: forwarded to get_bigram_discounted_prob_beta.
    unigram_prob: mapping token -> unigram probability.
    count_bigram_dev: mapping (v, w) -> dev-set count.

  Returns:
    (best_beta, bigram_discounted_prob): the selected discount and the
    probability table built with it.
  """
  best_beta = 0
  maxL = float('-inf')
  bigram_discounted_prob = {}
  step = 0.1
  # The dev bigrams do not depend on beta, so list them once.
  bigram_list = list(count_bigram_dev.keys())
  for beta in np.arange(0.1,1,step):
    L = 0
    cur_bigram_prob = get_bigram_discounted_prob_beta(count_unigram_train, count_bigram_train, no_of_words_train, beta, unigram_prob)
    for bigram in bigram_list:
      count = count_bigram_dev[bigram]
      if cur_bigram_prob[bigram] != 0:
        L += count*np.log2(cur_bigram_prob[bigram])
    if L > maxL:
      maxL = L
      best_beta = beta
      bigram_discounted_prob = cur_bigram_prob

  # Fix: return the selected discount. The loop variable `beta` always holds
  # the last grid value (0.9) here, regardless of which beta actually won.
  return best_beta, bigram_discounted_prob

In [None]:
def get_trigram_discounted_prob(count_bigram_train, count_trigram_train, no_of_words_train, bigram_prob, count_trigram_dev, count_unigram_train, no_of_words_dev):
  """Pick the trigram discount that maximises the dev-set log-likelihood.

  Every beta in 0.1, 0.2, ..., 0.9 is tried: the discounted trigram table is
  built with get_trigram_discounted_prob_beta and scored on the dev trigrams
  (trigrams with probability 0 are skipped).

  Args:
    count_bigram_train: mapping (u, v) -> training count.
    count_trigram_train: mapping (u, v, w) -> training count.
    no_of_words_train: forwarded to get_trigram_discounted_prob_beta.
    bigram_prob: mapping (v, w) -> discounted bigram probability.
    count_trigram_dev: mapping (u, v, w) -> dev-set count.
    count_unigram_train: mapping token -> training count (defines the vocabulary).
    no_of_words_dev: normaliser for the returned dev log-likelihood.

  Returns:
    (best_beta, trigram_discounted_prob, maxL / no_of_words_dev): the selected
    discount, the table built with it, and the normalised dev log-likelihood.
  """
  best_beta = 0
  maxL = float('-inf')
  trigram_discounted_prob = {}
  step = 0.1
  # The dev trigrams do not depend on beta, so list them once.
  trigram_list = list(count_trigram_dev.keys())
  for beta in np.arange(0.1,1,step):
    L = 0
    cur_trigram_prob = get_trigram_discounted_prob_beta(count_bigram_train, count_trigram_train, no_of_words_train, beta, bigram_prob, count_unigram_train)
    for trigram in trigram_list:
      count = count_trigram_dev[trigram]
      if cur_trigram_prob[trigram] != 0:
        L += count*np.log2(cur_trigram_prob[trigram])
    if L > maxL:
      maxL = L
      best_beta = beta
      trigram_discounted_prob = cur_trigram_prob

  # Fix: return the selected discount. The loop variable `beta` always holds
  # the last grid value (0.9) here, regardless of which beta actually won.
  return best_beta, trigram_discounted_prob, maxL/no_of_words_dev

In [None]:
def calculate_perplexity_discounting(count_trigram_test, trigram_prob, no_of_words_test):
  """Perplexity of a trigram probability table on counted test trigrams.

  Args:
    count_trigram_test: mapping trigram -> number of occurrences in the test set.
    trigram_prob: mapping trigram -> model probability (indexed for every
      test trigram).
    no_of_words_test: normaliser for the log-likelihood.

  Returns:
    (perplexity, L) where L is the normalised base-2 log-likelihood and
    perplexity is 2 ** (-L). Trigrams with probability 0 are skipped.
  """
  log_likelihood = 0
  for trigram, occurrences in count_trigram_test.items():
    if trigram_prob[trigram] == 0:
      continue
    log_likelihood += occurrences*np.log2(trigram_prob[trigram])

  log_likelihood = log_likelihood/no_of_words_test
  return pow(2,-1*log_likelihood), log_likelihood

In [None]:
# Repeat the discounting experiment five times, each with a fresh random
# 90/10 train/dev split of train_dev_set; the held-out test_set is fixed.
for i in range(5):

  # Shuffle a deep copy so train_dev_set itself keeps its order.
  data = copy.deepcopy(train_dev_set)
  random.shuffle(data)

  division_index = floor(0.9*len(data))
  train_set = data[:division_index]
  dev_set = data[division_index:]
  test_set_copy = copy.deepcopy(test_set)

  # Flatten the training sentences into unigram / bigram / trigram lists.
  trigram_list_train = []
  bigram_list_train = []
  unigram_list_train = []

  for sent in train_set:
    unigram_list_train.extend(sent)
    bigram_list_train.extend(list(nltk.bigrams(sent)))
    trigram_list_train.extend(list(nltk.trigrams(sent)))

  # defaultdict(int, ...) so that unseen n-grams read as count 0.
  count_trigram_train = defaultdict(int,Counter(trigram_list_train))
  count_bigram_train = defaultdict(int,Counter(bigram_list_train))
  count_unigram_train = defaultdict(int,Counter(unigram_list_train))

  # Dev-set bigram and trigram counts (used to tune the two discounts).
  trigram_list_dev = []
  bigram_list_dev = []

  for sent in dev_set:
    bigram_list_dev.extend(list(nltk.bigrams(sent)))
    trigram_list_dev.extend(list(nltk.trigrams(sent)))

  count_trigram_dev = defaultdict(int,Counter(trigram_list_dev))
  count_bigram_dev = defaultdict(int,Counter(bigram_list_dev))

  # Test-set trigram counts (used only for the final evaluation).
  trigram_list_test = []

  for sent in test_set_copy:
    trigram_list_test.extend(list(nltk.trigrams(sent)))

  count_trigram_test = defaultdict(int,Counter(trigram_list_test))

  # Token totals used as normalisers. len(sent) counts every token of the
  # sentence, i.e. it includes the START/STOP markers added during tokenization.
  # NOTE(review): the interpolation section normalises by len(sent)-3 instead;
  # confirm which convention is intended before comparing perplexities.
  no_of_words_train = 0
  for sent in train_set:
    no_of_words_train += len(sent)

  no_of_words_dev = 0
  for sent in dev_set:
    no_of_words_dev += len(sent)

  no_of_words_test = 0
  for sent in test_set:
    no_of_words_test += len(sent)

  # Build the model bottom-up: unigram MLE -> discounted bigram -> discounted
  # trigram, tuning each discount on the dev set, then score the test set.
  unigram_prob = get_unigram_prob(count_unigram_train, no_of_words_train)
  beta_bigram, bigram_prob = get_bigram_discounted_prob(count_unigram_train, count_bigram_train, no_of_words_train, unigram_prob, count_bigram_dev)
  # print(sum(list(bigram_prob.values())))
  beta_trigram, trigram_prob, log_likelihood_dev = get_trigram_discounted_prob(count_bigram_train, count_trigram_train, no_of_words_train, bigram_prob, count_trigram_dev, count_unigram_train, no_of_words_dev)
  # print(sum(list(trigram_prob.values())))
  perplexity_test, log_likelihood_test = calculate_perplexity_discounting(count_trigram_test, trigram_prob, no_of_words_test)
  print('Iteration Number: ', i+1)
  print('(beta(bigram), beta(trigram)) = (',round(beta_bigram,2),',',round(beta_trigram,2),')')
  print('Log-likelihood for validation set =', log_likelihood_dev)
  # Dev perplexity is derived from the normalised dev log-likelihood.
  print('Perplexity for validation set =', pow(2,-1*log_likelihood_dev))
  print('Log-likelihood for test set =', log_likelihood_test)
  print('Perplexity for test set =', perplexity_test)
  print('')

  # Drop this iteration's names (and the references they hold) before the
  # next iteration rebuilds them.
  del data, train_set, dev_set, test_set_copy, trigram_list_train, bigram_list_train, unigram_list_train, count_trigram_train, count_bigram_train, count_unigram_train
  del trigram_list_dev, bigram_list_dev, count_trigram_dev, count_bigram_dev, trigram_list_test, count_trigram_test, no_of_words_train, no_of_words_dev, no_of_words_test
  del unigram_prob, beta_bigram, bigram_prob, beta_trigram, trigram_prob, log_likelihood_dev, perplexity_test, log_likelihood_test

Iteration Number:  1
(beta(bigram), beta(trigram)) = ( 0.9 , 0.9 )
Log-likelihood for validation set = -3.92736746456683
Perplexity for validation set = 15.214420345992108
Log-likelihood for test set = -3.748243127829662
Perplexity for test set = 13.437968307997425

Iteration Number:  2
(beta(bigram), beta(trigram)) = ( 0.9 , 0.9 )
Log-likelihood for validation set = -3.8622204405850886
Perplexity for validation set = 14.542671781238463
Log-likelihood for test set = -3.75942235417423
Perplexity for test set = 13.542501581493012

Iteration Number:  3
(beta(bigram), beta(trigram)) = ( 0.9 , 0.9 )
Log-likelihood for validation set = -3.8761368011263206
Perplexity for validation set = 14.683630401481283
Log-likelihood for test set = -3.7526452469163925
Perplexity for test set = 13.479034422520657

Iteration Number:  4
(beta(bigram), beta(trigram)) = ( 0.9 , 0.9 )
Log-likelihood for validation set = -3.9186065207751137
Perplexity for validation set = 15.122308861883775
Log-likelihood for te

## Trigram language model using laplace smoothing

In [None]:
# Steps:
# 1. calculate counts on train set
# 2. calculate probabilities
# 3. calculate likelihood

In [None]:
# Reading text from given corpus

# Use a context manager so the file handle is closed deterministically
# (the previous bare open(...).read() left it to the garbage collector),
# and pin the encoding so decoding does not depend on the locale.
with open('/content/en_wiki.txt', encoding='utf-8') as corpus_file:
  text = corpus_file.read()

# tokenizing corpora and shuffling the list
# Only the first 1200 sentences are kept; the fixed seed makes the shuffle
# (and therefore the train/test split below) reproducible.
sent_tokenize_list = sent_tokenize(text)
sent_tokenize_list = sent_tokenize_list[0:1200]
random.seed(4)
random.shuffle(sent_tokenize_list)

# Each sentence becomes ["START", "START", <alphanumeric tokens...>, "STOP"].
# Sentences with no alphanumeric token are dropped entirely.
data = []
for sent in sent_tokenize_list:
  tokenized_sentence = ["START", "START"]
  words = word_tokenize(sent)
  for word in words:
    if word.isalnum():
      tokenized_sentence.append(word)
  if len(tokenized_sentence) > 2:
    tokenized_sentence.append("STOP")
    data.append(tokenized_sentence)

# dividing corpora into training and test sets
# 90% of the sentences go to train+dev, the remaining 10% to test.
division_index = floor(0.9*len(data))
train_dev_set = data[0:division_index]
test_set = data[division_index:]

# print(len(data))
# print(data[:10])
# print(len(train_dev_set))
# print(train_dev_set[:10])
# print(len(test_set))
# print(test_set[:10])

In [None]:
def get_vocabulary(data, freq_threshold):
  """Build the vocabulary from tokenized sentences.

  Words that occur more than `freq_threshold` times are kept as they are.
  Every other word is represented by one shared "UNKNOWN" entry.

  Args:
    data: iterable of sentences, each an iterable of word tokens.
    freq_threshold: words whose count is <= this value are treated as rare.

  Returns:
    List of vocabulary tokens in first-seen order. "UNKNOWN" appears (once)
    at the position of the first rare word, and only if a rare word exists.
  """
  frequencies = {}
  for sentence in data:
    for token in sentence:
      frequencies[token] = frequencies.get(token, 0) + 1

  kept = {}
  for token, freq in frequencies.items():
    label = token if freq > freq_threshold else "UNKNOWN"
    # setdefault keeps the position of the first insertion, like a dict
    # assignment on an already-present key would.
    kept.setdefault(label, freq)

  return list(kept)

vocab = get_vocabulary(train_dev_set, 9)
# print(vocab)
# print(len(vocab))

In [None]:
def map_unknown_words(data, vocabulary):
  """Replace, in place, every out-of-vocabulary token with "UNKNOWN".

  Args:
    data: list of sentences (each a mutable list of tokens); modified in place.
    vocabulary: iterable of known (hashable) tokens.

  Returns:
    None. The sentences inside `data` are mutated.
  """
  # Build the lookup set once: testing membership against the vocabulary
  # list for every token made this pass O(tokens * vocabulary size).
  known_words = set(vocabulary)
  for sentence in data:
    for position, token in enumerate(sentence):
      if token not in known_words:
        sentence[position] = "UNKNOWN"

map_unknown_words(train_dev_set, vocab)
map_unknown_words(test_set, vocab)

In [None]:
def get_trigram_probabilities(count_unigram_train, count_bigram_train, count_trigram_train, V):
  """Add-one (Laplace) smoothed trigram probabilities.

  For every triple (u, v, w) over the keys of `count_unigram_train`:
    P(w | u, v) = (c(u,v,w) + 1) / (c(u,v) + V)

  Args:
    count_unigram_train: mapping token -> count; its keys define the vocabulary.
    count_bigram_train: mapping (u, v) -> count, indexed for every pair.
    count_trigram_train: mapping (u, v, w) -> count, indexed for every triple.
    V: value added to the denominator (the caller passes the vocabulary size).

  Returns:
    defaultdict(int) mapping every triple to its smoothed probability;
    triples outside the vocabulary read as 0.
  """
  smoothed = defaultdict(int)
  vocabulary = list(count_unigram_train.keys())

  for u in vocabulary:
    for v in vocabulary:
      # The denominator only depends on the context (u, v).
      denominator = count_bigram_train[(u,v)] + V
      for w in vocabulary:
        smoothed[(u,v,w)] = (count_trigram_train[(u,v,w)] + 1)/denominator
  return smoothed

In [None]:
def compute_perplexity(count_trigram, prob_trigram, M):
  """Perplexity of a trigram probability table on counted trigrams.

  Args:
    count_trigram: mapping trigram -> number of occurrences in the evaluated set.
    prob_trigram: mapping trigram -> model probability.
    M: normaliser for the log-likelihood.

  Returns:
    (perplexity, L) where L is the normalised base-2 log-likelihood and
    perplexity is 2 ** (-L). Trigrams with probability 0 are skipped.
  """
  log_likelihood = 0
  for trigram, occurrences in count_trigram.items():
    if prob_trigram[trigram] == 0:
      continue
    log_likelihood += occurrences*np.log2(prob_trigram[trigram])

  log_likelihood = log_likelihood/M
  return pow(2,-1*log_likelihood), log_likelihood

In [None]:
# Repeat the Laplace-smoothing experiment five times, each with a fresh
# random 90/10 train/dev split of train_dev_set; the test_set is fixed.
for i in range(5):

  # Shuffle a deep copy so train_dev_set itself keeps its order.
  data = copy.deepcopy(train_dev_set)
  random.shuffle(data)

  division_index = floor(0.9*len(data))
  train_set = data[:division_index]
  dev_set = data[division_index:]
  test_set_copy = copy.deepcopy(test_set)

  # Flatten the training sentences into unigram / bigram / trigram lists.
  trigram_list_train = []
  bigram_list_train = []
  unigram_list_train = []

  for sent in train_set:
    unigram_list_train.extend(sent)
    bigram_list_train.extend(list(nltk.bigrams(sent)))
    trigram_list_train.extend(list(nltk.trigrams(sent)))

  # defaultdict(int, ...) so that unseen n-grams read as count 0.
  count_trigram_train = defaultdict(int,Counter(trigram_list_train))
  count_bigram_train = defaultdict(int,Counter(bigram_list_train))
  count_unigram_train = defaultdict(int,Counter(unigram_list_train))

  # Trigram counts of the dev and test sets, used only for evaluation.
  trigram_list_dev = []
  for sent in dev_set:
    trigram_list_dev.extend(list(nltk.trigrams(sent)))
  count_trigram_dev = defaultdict(int,Counter(trigram_list_dev))

  trigram_list_test = []
  for sent in test_set_copy:
    trigram_list_test.extend(list(nltk.trigrams(sent)))
  count_trigram_test = defaultdict(int,Counter(trigram_list_test))

  # Token totals used as normalisers. len(sent) counts every token of the
  # sentence, i.e. it includes the START/STOP markers added during tokenization.
  no_of_words_dev = 0
  for sent in dev_set:
    no_of_words_dev += len(sent)

  no_of_words_test = 0
  for sent in test_set:
    no_of_words_test += len(sent)

  # V is len(vocab) (built from the whole train_dev_set), while the table is
  # enumerated over the tokens that occur in this iteration's train split.
  trigram_prob_laplace = get_trigram_probabilities(count_unigram_train, count_bigram_train, count_trigram_train,  len(vocab))
  perplexity_dev, log_likelihood_dev = compute_perplexity(count_trigram_dev, trigram_prob_laplace, no_of_words_dev)
  perplexity_test, log_likelihood_test = compute_perplexity(count_trigram_test, trigram_prob_laplace, no_of_words_test)

  print('Iteration Number: ', i+1)
  print('Log-likelihood for validation set =', log_likelihood_dev)
  print('Perplexity for validation set =', perplexity_dev)
  print('Log-likelihood for test set =', log_likelihood_test)
  print('Perplexity for test set =', perplexity_test)
  print('')

  # Drop this iteration's names (and the references they hold) before the
  # next iteration rebuilds them.
  del data, train_set, dev_set, test_set_copy, trigram_list_train, bigram_list_train, unigram_list_train, count_trigram_train, count_bigram_train, count_unigram_train
  del trigram_list_dev, count_trigram_dev, trigram_list_test, count_trigram_test, no_of_words_dev, no_of_words_test, trigram_prob_laplace, perplexity_dev, log_likelihood_dev, perplexity_test, log_likelihood_test

Iteration Number:  1
Log-likelihood for validation set = -5.301196849844386
Perplexity for validation set = 39.42931795259473
Log-likelihood for test set = -5.241714391812473
Perplexity for test set = 37.83670079104829

Iteration Number:  2
Log-likelihood for validation set = -5.500050513809676
Perplexity for validation set = 45.25641855402469
Log-likelihood for test set = -5.247508919888655
Perplexity for test set = 37.9889760155538

Iteration Number:  3
Log-likelihood for validation set = -5.2845101565410095
Perplexity for validation set = 38.975892565117846
Log-likelihood for test set = -5.247515208829299
Perplexity for test set = 37.9891416159955

Iteration Number:  4
Log-likelihood for validation set = -5.33363936343917
Perplexity for validation set = 40.326026803713866
Log-likelihood for test set = -5.249774147794518
Perplexity for test set = 38.048670739337226

Iteration Number:  5
Log-likelihood for validation set = -5.301248277034215
Perplexity for validation set = 39.43072349