## Imports

In [None]:
import nltk
import math
import random
import copy
from math import log
from pprint import pprint
from nltk.corpus import gutenberg, stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter
from nltk.util import ngrams

nltk.download('gutenberg')
nltk.download('stopwords')
nltk.download('punkt')

print(gutenberg.fileids())
moby_dick = gutenberg.raw('melville-moby_dick.txt')
print('Moby Dick sample')
print('====================')
print(len(moby_dick))

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']
Moby Dick sample
1242990


**We split the dataset 80% training, 10% validation and 10% test(based on the total number of sentences in the corpus).**

In [None]:
sentences = sent_tokenize(moby_dick)
print("There are", len(sentences), "sentences:")
i=0
for sent in sentences:
  if i==2: break
  print(sent)
  print("_________________")
  i += 1

There are 9852 sentences:
[Moby Dick by Herman Melville 1851]


ETYMOLOGY.
_________________
(Supplied by a Late Consumptive Usher to a Grammar School)

The pale Usher--threadbare in coat, heart, body, and brain; I see him
now.
_________________


In [None]:
split=int(len(sentences)*10/100)

train_sentences = sentences[:len(sentences)-2*split]
validation_sentences = sentences[len(sentences)-2*split:len(sentences)-split]
test_sentences = sentences[len(sentences)-split:]

## Vocabulary

In [None]:
tokens = []
for sent in train_sentences:
  tokens += word_tokenize(sent)
  
count = nltk.FreqDist(tokens)
vocabulary = [w for w in count if count[w]>=10]

## Tokenization

In [None]:
train_sentences_tokenized = []
validation_sentences_tokenized = []
test_sentences_tokenized = []

for sent in train_sentences:
  sent_tok = word_tokenize(sent)
  train_sentences_tokenized.append(sent_tok)

for sent in validation_sentences:
  sent_tok = word_tokenize(sent)
  validation_sentences_tokenized.append(sent_tok)

for sent in test_sentences:
  sent_tok = word_tokenize(sent)
  test_sentences_tokenized.append(sent_tok)

## Replace OOV words from the training, development and test subsets

In [None]:
for sent in range(len(train_sentences_tokenized)):
  for word in range(len(train_sentences_tokenized[sent])):
    if train_sentences_tokenized[sent][word] not in vocabulary:
       train_sentences_tokenized[sent][word] ='UNK'
    else:
       train_sentences_tokenized[sent][word]=train_sentences_tokenized[sent][word].lower()

for sent in range(len(validation_sentences_tokenized)):
  for word in range(len(validation_sentences_tokenized[sent])):
    if validation_sentences_tokenized[sent][word] not in vocabulary: 
      validation_sentences_tokenized[sent][word] ='UNK'
    else:
      validation_sentences_tokenized[sent][word]=validation_sentences_tokenized[sent][word].lower()

for sent in range(len(test_sentences_tokenized)):
  for word in range(len(test_sentences_tokenized[sent])):
    if test_sentences_tokenized[sent][word] not in vocabulary: 
      test_sentences_tokenized[sent][word] ='UNK'
    else:
      test_sentences_tokenized[sent][word]=test_sentences_tokenized[sent][word].lower()


## Create and count n-grams frequency

In [None]:
unigram_counter = Counter()
bigram_counter = Counter()
trigram_counter = Counter()

for sent in train_sentences_tokenized:
    
    unigram_counter.update([gram for gram in ngrams(sent, 1, pad_left=True, pad_right=True,
                                                   left_pad_symbol='<start>',right_pad_symbol='<end>') ])
    bigram_counter.update([gram for gram in ngrams(sent, 2, pad_left=True, pad_right=True,
                                                   left_pad_symbol='<start>',right_pad_symbol='<end>') ])
    trigram_counter.update([gram for gram in ngrams(sent, 3, pad_left=True, pad_right=True,
                                                   left_pad_symbol='<start>',right_pad_symbol='<end>') ])


unigram_counter[('<start>',)]=len(train_sentences_tokenized)
bigram_counter[('<start>','<start>')]=len(train_sentences_tokenized)

#pprint(unigram_counter)
#pprint(bigram_counter)
#pprint(trigram_counter)

## bi-gram language model

In [None]:
test_sentences_tokenized_bigram = copy.deepcopy(test_sentences_tokenized)
for sent in test_sentences_tokenized_bigram:
  sent.append('<end>')
  sent.insert(0,'<start>')

validation_sentences_tokenized_bigram = copy.deepcopy(validation_sentences_tokenized)
for sent in validation_sentences_tokenized_bigram:
  sent.append('<end>')
  sent.insert(0,'<start>')

In [None]:
alpha = 0.01
vocab_size = len(vocabulary) 

def bigram_prediction(sent,set='test'):
  bigram_log_prob=0.0
  bigram_log_prob_random=0.0
  if set=='test':
    count = len(test_sentences_tokenized_bigram[sent])
    w=[]
    w += [random.choice(vocabulary) for _ in range(count-2)]
    w.append('<end>')
    w.insert(0,'<start>')
    for word in range(count):

      if word==count-1:
        break
      bigram_prob = (bigram_counter[(test_sentences_tokenized_bigram[sent][word], test_sentences_tokenized_bigram[sent][word+1])] +alpha) / (unigram_counter[(test_sentences_tokenized_bigram[sent][word],)] + alpha*vocab_size)
      bigram_prob_random = (bigram_counter[(w[word], w[word+1])] + alpha) / (unigram_counter[(w[word],)] + alpha*vocab_size)
      bigram_log_prob += math.log2(bigram_prob)
      bigram_log_prob_random += math.log2(bigram_prob_random)
    
    return(bigram_log_prob,bigram_log_prob_random)
  elif set =='validation':
    count = len(validation_sentences_tokenized_bigram[sent])
    w=[]
    w += [random.choice(vocabulary) for _ in range(count-2)]
    w.append('<end>')
    w.insert(0,'<start>')
    for word in range(count):

      if word==count-1:
        break
      bigram_prob = (bigram_counter[(validation_sentences_tokenized_bigram[sent][word], validation_sentences_tokenized_bigram[sent][word+1])] +alpha) / (unigram_counter[(validation_sentences_tokenized_bigram[sent][word],)] + alpha*vocab_size)
      bigram_prob_random = (bigram_counter[(w[word], w[word+1])] + alpha) / (unigram_counter[(w[word],)] + alpha*vocab_size)
      bigram_log_prob += math.log2(bigram_prob)
      bigram_log_prob_random += math.log2(bigram_prob_random)
    
    return(bigram_log_prob,bigram_log_prob_random)

##  tri-gram language model

In [None]:
test_sentences_tokenized_trigram = copy.deepcopy(test_sentences_tokenized_bigram)
for sent in test_sentences_tokenized_trigram:
  sent.append('<end>')
  sent.insert(0,'<start>')

validation_sentences_tokenized_trigram = copy.deepcopy(validation_sentences_tokenized_bigram)
for sent in validation_sentences_tokenized_trigram:
  sent.append('<end>')
  sent.insert(0,'<start>')

In [None]:
def trigram_prediction(sent,set='test'):
  trigram_log_prob=0.0
  trigram_log_prob_random=0.0
  if set=='test':
    count = len(test_sentences_tokenized_trigram[sent])
    w=[]
    w += [random.choice(vocabulary) for _ in range(count-4)]
    w.append('<end>')
    w.append('<end>')
    w.insert(0,'<start>')
    w.insert(0,'<start>')
    for word in range(count):

      if word==count-2:
        break
      trigram_prob = (trigram_counter[(test_sentences_tokenized_trigram[sent][word], test_sentences_tokenized_trigram[sent][word+1],test_sentences_tokenized_trigram[sent][word+2])] +alpha) / (bigram_counter[(test_sentences_tokenized_trigram[sent][word],test_sentences_tokenized_trigram[sent][word+1])] + alpha*vocab_size)
      trigram_prob_random = (trigram_counter[(w[word], w[word+1],w[word+2])] + alpha) / (bigram_counter[(w[word],w[word+1])] + alpha*vocab_size)
      trigram_log_prob += math.log2(trigram_prob)
      trigram_log_prob_random += math.log2(trigram_prob_random)
    
    return(trigram_log_prob,trigram_log_prob_random)
  elif set=='validation':
    count = len(validation_sentences_tokenized_trigram[sent])
    w=[]
    w += [random.choice(vocabulary) for _ in range(count-4)]
    w.append('<end>')
    w.append('<end>')
    w.insert(0,'<start>')
    w.insert(0,'<start>')
    for word in range(count):

      if word==count-2:
        break
      trigram_prob = (trigram_counter[(validation_sentences_tokenized_trigram[sent][word], validation_sentences_tokenized_trigram[sent][word+1],validation_sentences_tokenized_trigram[sent][word+2])] +alpha) / (bigram_counter[(validation_sentences_tokenized_trigram[sent][word],validation_sentences_tokenized_trigram[sent][word+1])] + alpha*vocab_size)
      trigram_prob_random = (trigram_counter[(w[word], w[word+1],w[word+2])] + alpha) / (bigram_counter[(w[word],w[word+1])] + alpha*vocab_size)
      trigram_log_prob += math.log2(trigram_prob)
      trigram_log_prob_random += math.log2(trigram_prob_random)
    
    return(trigram_log_prob,trigram_log_prob_random)

In [None]:
print('--------Bigram Language Model----------')
print(bigram_prediction(40,'test'))
print('--------Trigram Language Model---------')
print(trigram_prediction(40,'test'))

--------Bigram Language Model----------
(-40.37967070917181, -120.95731863202468)
--------Trigram Language Model---------
(-38.05871889223146, -127.9520478108962)


## CROSS-ENTROPY

In [None]:
def cross_entropy(LanguageModel='bigram',set='test'):
  sum = 0
  N = 0
  if set=='test':
    if LanguageModel=='bigram':
      count = len(test_sentences_tokenized_bigram)
      for sent in range(count):
        a,b = bigram_prediction(sent)
        N += len(test_sentences_tokenized_bigram[sent])
        sum+=a
      return -sum/(N-count)
    elif LanguageModel=='trigram':
      count = len(test_sentences_tokenized_trigram)
      for sent in range(count):
        a,b = trigram_prediction(sent)
        N += len(test_sentences_tokenized_trigram[sent])
        sum+=a
      return -sum/(N-2*count)
  elif set=='validation':
      if LanguageModel=='bigram':
        count = len(validation_sentences_tokenized_bigram)
        for sent in range(count):
          a,b = bigram_prediction(sent,'validation')
          N += len(validation_sentences_tokenized_bigram[sent])
          sum+=a
        return -sum/(N-count)
      elif LanguageModel=='trigram':
        count = len(validation_sentences_tokenized_trigram)
        for sent in range(count):
          a,b = trigram_prediction(sent,'validation')
          N += len(validation_sentences_tokenized_trigram[sent])
          sum+=a
        return -sum/(N-2*count)

In [None]:
print('------Cross-Entropy for bigram Language Model ----------------')
print(cross_entropy('bigram','test'))
print('------Cross-Entropy for trigram Language Model ---------------')
print(cross_entropy('trigram','test'))


------Cross-Entropy for bigram Language Model ----------------
6.252162155091329
------Cross-Entropy for trigram Language Model ---------------
7.295378398784865


## PERPLEXITY

In [None]:
def perplexity(LanguageModel='bigram',set='test'):
  if LanguageModel=='bigram':
    return 2**cross_entropy('bigram',set)
  elif LanguageModel=='trigram':
    return 2**cross_entropy('trigram',set)

In [None]:
print('------Perplexity for bigram Language Model ----------------')
print(perplexity())
print('------Perplexity for trigram Language Model ----------------')
print(perplexity('trigram'))

------Perplexity for bigram Language Model ----------------
76.2234051862263
------Perplexity for trigram Language Model ----------------
157.08247222709485


## LINEAR INTERPOLATION

In [None]:
def LinearInterpolation(Estimation='cross-entropy',l1=0.5,set='validation'):
  score=0.0
  if Estimation=='cross-entropy':
    score = l1*cross_entropy('bigram',set)+(1-l1)*cross_entropy('trigram',set)
  elif Estimation=='perplexity':
    score = l1*perplexity('bigram',set)+(1-l1)*perplexity('trigram',set)
  return score

## Training to find best L1 parameter

In [None]:
min = 100
L1 = 0.0
for _ in range(500):

  random_value= random.uniform(0, 1)
  score=LinearInterpolation('cross-entropy',random_value)

  if score<min:
    min = score
    L1=random_value
    print('------')

print('---------- Scores on validation set---------')
print('Linear Interpolation: {}\nL1 value: {}'.format(score,L1))

------
------
------
------
---------- Scores on validation set---------
Linear Interpolation: 6.818206021462835
L1 value: 0.998039095341101


In [None]:
print('------Linear Interpolation in terms of Cross-Entropy on the test set ----------------')
print(LinearInterpolation('cross-entropy',L1,'test'))
print('------Linear Interpolation in terms of Perplexity on the test set -------------------')
print(LinearInterpolation('perplexity',L1,'test'))

------Linear Interpolation in terms of Cross-Entropy on the test set ----------------
6.254207802683827
------Linear Interpolation in terms of Perplexity on the test set -------------------
76.38196210750097


In [None]:
#exit()