In [141]:
#### Import packages and set working directory

import pandas as pd
import os
from google.colab import drive
import random
import re
import math
import numpy as np

# Mount Google Drive inside the Colab runtime so the notebook can reach files stored there.
drive.mount('/content/drive/')
# The return value is discarded (this is not the cell's last expression).
os.getcwd()
# Relative file names used by later cells resolve against this directory.
os.chdir('/content/drive/My Drive/cse517/hw1')


Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [0]:
#### Load the train / dev / test splits

# Shared parser settings: tab delimiter, UTF-8 text, no header row.
_READ_OPTIONS = dict(delimiter='\t', encoding='utf-8', header=None)

train = pd.read_csv('brown.train.txt', **_READ_OPTIONS)
valid = pd.read_csv('brown.dev.txt', **_READ_OPTIONS)
test = pd.read_csv('brown.test.txt', **_READ_OPTIONS)

## Optional: keep only 50% of the training rows while exploring
#random.seed(517)
#train = train.sample(frac=0.5, replace =False)

# Each frame is expected to hold exactly one column; name it 'text'.
for _frame in (train, valid, test):
    _frame.columns = ['text']

#### Wrap every sentence in two start symbols and one end symbol
SENTENCE_START_1 = "<s>"
SENTENCE_START_2 = "<ss>"
SENTENCE_END = "</s>"

# Two start symbols give the trigram model a full two-word context for the
# first real word of each sentence.
_SENTENCE_PREFIX = SENTENCE_START_1 + ' ' + SENTENCE_START_2 + ' '
_SENTENCE_SUFFIX = ' ' + SENTENCE_END

# Assign the whole column at once. The previous per-row form
# `frame['text'][i] = ...` is chained assignment: it raises
# SettingWithCopyWarning and is not guaranteed to write back to the frame
# (it never does under pandas copy-on-write).
for _frame in (train, valid, test):
    _frame['text'] = _SENTENCE_PREFIX + _frame['text'] + _SENTENCE_SUFFIX

In [0]:
#### Replace rare training tokens with '<unk>'
train = pd.DataFrame({'text' : train['text']})
# Glue all sentences into one string. The ' \n ' separator turns '\n' into a
# stand-alone token, so the text can be split back into sentences afterwards.
# That separator token occurs once per sentence boundary, so it survives the
# frequency filter only while there are more than low_freq + 1 sentences.
train_cat = train['text'].str.cat(sep=' \n ')
#### low_freq can be altered: tokens seen at most this many times become '<unk>'
low_freq = 1
from collections import Counter
train_tokens = train_cat.split(' ')
counts = Counter(train_tokens)
train_cat_replaced = ' '.join(
    [token if counts[token] > low_freq else '<unk>' for token in train_tokens]
)
# Back to one string per sentence.
train_text = train_cat_replaced.split(' \n ')

In [144]:
##############################################################
######################## Unigram #############################
##############################################################
#### Count unigrams in the (already '<unk>'-mapped) training text
unigram_frequencies = {}
corpus_length = 0
for sentence in train_text:
    for token in sentence.split():
        if token in unigram_frequencies:
            unigram_frequencies[token] += 1
        else:
            unigram_frequencies[token] = 1
        # Start symbols are counted in the dictionary but not in the corpus length.
        if token not in (SENTENCE_START_1, SENTENCE_START_2):
            corpus_length += 1

#### Map out-of-vocabulary tokens in the dev and test sets to '<unk>'
# Temporarily register the '\n' sentence separator as a known token so the
# membership test below does not turn it into '<unk>'; it is removed again at
# the end of this cell.
unigram_frequencies['\n'] = 0

valid = pd.DataFrame({'text' : valid['text']})
valid_cat = valid['text'].str.cat(sep=' \n ')
valid_cat_replaced = ' '.join(
    [token if token in unigram_frequencies else '<unk>' for token in valid_cat.split(' ')]
)
valid_text = valid_cat_replaced.split(' \n ')

test = pd.DataFrame({'text' : test['text']})
test_cat = test['text'].str.cat(sep=' \n ')
test_cat_replaced = ' '.join(
    [token if token in unigram_frequencies else '<unk>' for token in test_cat.split(' ')]
)
test_text = test_cat_replaced.split(' \n ')

unigram_frequencies.pop('\n', None)

0

In [0]:
def calculate_unigram_probability(word):
    """Return the maximum-likelihood unigram probability of ``word``.

    The count is read from the module-level ``unigram_frequencies`` (0 when the
    word is absent) and divided by the module-level ``corpus_length``.
    """
    occurrences = unigram_frequencies.get(word, 0)
    return occurrences / float(corpus_length)
  
def calculate_sentence_probability(sentence):
    """Return the base-2 log probability of ``sentence`` under the unigram model.

    The sentence is split on whitespace; the two start symbols are skipped and
    the log probabilities of all remaining words are summed.

    Returns ``-math.inf`` as soon as a word has probability zero. The bigram and
    trigram sentence scorers do the same; previously ``math.log(0, 2)`` raised
    ``ValueError`` here.
    """
    log_probability = 0
    for word in sentence.split():
        if word in (SENTENCE_START_1, SENTENCE_START_2):
            continue
        word_probability = calculate_unigram_probability(word)
        if word_probability == 0.0:
            return -math.inf
        log_probability += math.log(word_probability, 2)
    return log_probability
  
def calculate_corpus(data):
    """Return the number of whitespace-separated tokens in ``data``.

    ``data`` is an iterable of sentence strings. The two start symbols are not
    counted; every other token (including the end symbol) is.
    """
    start_symbols = (SENTENCE_START_1, SENTENCE_START_2)
    return sum(
        1
        for sentence in data
        for word in sentence.split()
        if word not in start_symbols
    )
# sum(unigram_frequencies.values()) - unigram_frequencies['<s>'] - unigram_frequencies['<ss>']

def calculate_perplexity(data):
    """Return the unigram-model perplexity of ``data`` (an iterable of sentences).

    Computed as ``2 ** (-L / N)``, where ``L`` is the summed base-2 sentence log
    probability and ``N`` is the token count from ``calculate_corpus``.
    """
    token_count = calculate_corpus(data)
    total_log_probability = 0
    for sentence in data:
        total_log_probability = total_log_probability + calculate_sentence_probability(sentence)
    average_log_probability = total_log_probability / token_count
    return math.pow(2, -average_log_probability)


In [146]:
print("Unigram Model")
# One (message template, data split) pair per reported line, in output order.
unigram_reports = [
    ("The perplexity for the training set is %f", train_text),
    ("The perplexity for the dev set is %f", valid_text),
    ("The perplexity for the test set is %f", test_text),
]
for report_template, report_sentences in unigram_reports:
    print(report_template % calculate_perplexity(report_sentences))


Unigram Model
The perplexity for the training set is 868.099627
The perplexity for the dev set is 765.903877
The perplexity for the test set is 769.370394


In [0]:
##############################################################
######################## Bigram #############################
##############################################################
#### Build the bigram dictionary for the training text
# NOTE: `corpus_length` is deliberately NOT reset here. It is the denominator
# used by calculate_unigram_probability, and zeroing it in this cell made any
# later unigram call fail with ZeroDivisionError.
bigram_frequencies = dict()
# Distinct bigrams whose first word is not the second start symbol and whose
# second word is not the end symbol.
unique_bigrams = set()
for sentence in train_text:
  previous_word = None
  for word in sentence.split():
    if previous_word is not None and previous_word != SENTENCE_END:
      bigram = (previous_word, word)
      bigram_frequencies[bigram] = bigram_frequencies.get(bigram, 0) + 1
      if previous_word != SENTENCE_START_2 and word != SENTENCE_END:
        unique_bigrams.add(bigram)
    previous_word = word
    
    

In [0]:
def calculate_bigram_probability(previous_word, word):
    """Return the maximum-likelihood estimate of P(word | previous_word).

    Computed as count(previous_word, word) / count(previous_word) from the
    module-level ``bigram_frequencies`` and ``unigram_frequencies``.

    Returns 0.0 when ``previous_word`` was never seen. Previously the missing
    count came back as ``None`` and the division raised ``TypeError``.
    """
    context_count = unigram_frequencies.get(previous_word, 0)
    if context_count == 0:
        return 0.0
    return bigram_frequencies.get((previous_word, word), 0) / context_count

#calculate_bigram_probability('an', '<unk>')

def calculate_bigram_sentence_probability(sentence):
    """Return the base-2 log probability of ``sentence`` under the bigram model.

    Every adjacent pair of whitespace-separated tokens is scored with
    ``calculate_bigram_probability``. Returns ``-math.inf`` as soon as a pair
    has probability zero.
    """
    tokens = sentence.split()
    log_probability = 0
    for previous_word, word in zip(tokens, tokens[1:]):
        pair_probability = calculate_bigram_probability(previous_word, word)
        if pair_probability == 0.0:
            return -math.inf
        log_probability += math.log(pair_probability, 2)
    return log_probability

def bigram_calculate_perplexity(data):
    """Return the bigram-model perplexity of ``data`` (an iterable of sentences).

    Computed as ``2 ** (-L / N)``, where ``L`` is the summed base-2 sentence log
    probability and ``N`` is the token count from ``calculate_corpus``. A
    sentence scored ``-inf`` makes the result ``inf``.
    """
    token_count = calculate_corpus(data)
    total_log_probability = 0
    for sentence in data:
        total_log_probability = total_log_probability + calculate_bigram_sentence_probability(sentence)
    average_log_probability = total_log_probability / token_count
    return math.pow(2, -average_log_probability)

In [151]:
print("Bigram Model")
# One (message template, data split) pair per reported line, in output order.
bigram_reports = [
    ("The perplexity for the training set is %f", train_text),
    ("The perplexity for the dev set is %f", valid_text),
    ("The perplexity for the test set is %f", test_text),
]
for report_template, report_sentences in bigram_reports:
    print(report_template % bigram_calculate_perplexity(report_sentences))

Bigram Model
The perplexity for the training set is 65.018176
The perplexity for the dev set is inf
The perplexity for the test set is inf


In [0]:
######################################################################
######################## Trigram(Section II) #########################
######################################################################
#### Build the trigram dictionary for the training set
trigram_frequencies = {}
# Distinct trigrams that touch neither start symbol (in its usual slot) nor the end symbol.
unique_trigrams = set()
for sentence in train_text:
    tokens = sentence.split()
    # Slide a window of three consecutive tokens over the sentence.
    for word_1, word_2, word in zip(tokens, tokens[1:], tokens[2:]):
        if word_2 == "</s>":
            continue
        trigram = (word_1, word_2, word)
        trigram_frequencies[trigram] = trigram_frequencies.get(trigram, 0) + 1
        if word_1 != SENTENCE_START_1 and word_2 != SENTENCE_START_2 and word != SENTENCE_END:
            unique_trigrams.add(trigram)


In [0]:
def calculate_trigram_probability(word_1, word_2, word):
    """Return the maximum-likelihood estimate of P(word | word_1, word_2).

    Computed as count(word_1, word_2, word) / count(word_1, word_2) from the
    module-level ``trigram_frequencies`` and ``bigram_frequencies``.

    Returns 0.0 when the context bigram was never seen. Previously the zero
    denominator raised ``ZeroDivisionError``.
    """
    context_count = bigram_frequencies.get((word_1, word_2), 0)
    if context_count == 0:
        return 0.0
    return trigram_frequencies.get((word_1, word_2, word), 0) / context_count

def calculate_trigram_sentence_probability(sentence):
    """Return the base-2 log probability of ``sentence`` under the trigram model.

    Every window of three consecutive whitespace-separated tokens is scored with
    ``calculate_trigram_probability``. Returns ``-math.inf`` as soon as a window
    has probability zero.
    """
    tokens = sentence.split()
    log_probability = 0
    for word_1, word_2, word in zip(tokens, tokens[1:], tokens[2:]):
        window_probability = calculate_trigram_probability(word_1, word_2, word)
        if window_probability == 0.0:
            return -math.inf
        log_probability += math.log(window_probability, 2)
    return log_probability

def trigram_calculate_perplexity(data):
    """Return the trigram-model perplexity of ``data`` (an iterable of sentences).

    Computed as ``2 ** (-L / N)``, where ``L`` is the summed base-2 sentence log
    probability and ``N`` is the token count from ``calculate_corpus``. A
    sentence scored ``-inf`` makes the result ``inf``.
    """
    token_count = calculate_corpus(data)
    total_log_probability = 0
    for sentence in data:
        total_log_probability = total_log_probability + calculate_trigram_sentence_probability(sentence)
    average_log_probability = total_log_probability / token_count
    return math.pow(2, -average_log_probability)

In [155]:
print("Trigram Model")
# One (message template, data split) pair per reported line, in output order.
trigram_reports = [
    ("The perplexity for the training set is %f", train_text),
    ("The perplexity for the dev set is %f", valid_text),
    ("The perplexity for the test set is %f", test_text),
]
for report_template, report_sentences in trigram_reports:
    print(report_template % trigram_calculate_perplexity(report_sentences))

Trigram Model
The perplexity for the training set is 7.027090
The perplexity for the dev set is inf
The perplexity for the test set is inf


In [0]:
#######################################################################################
######################## Trigram(Section III) add K smoothing #########################
#######################################################################################
# Number of distinct n-gram types seen in training.
# The "- 2" presumably discounts the two start symbols, which are never
# predicted -- TODO confirm.
unique_unigram_length = len(unigram_frequencies) - 2
# The "- 1" presumably discounts the ('<s>', '<ss>') bigram -- TODO confirm.
unique_bigram_length = len(bigram_frequencies) - 1
unique_trigram_length = len(trigram_frequencies)

def calculate_trigram_probability_addK(word_1, word_2, word, K):
    """Return the add-K smoothed estimate of P(word | word_1, word_2).

    Computed as (count(word_1, word_2, word) + K) /
    (count(word_1, word_2) + K * unique_unigram_length), with missing counts
    treated as 0.
    """
    context = (word_1, word_2)
    smoothed_count = K + trigram_frequencies.get(context + (word,), 0)
    smoothed_total = bigram_frequencies.get(context, 0) + K*unique_unigram_length
    return smoothed_count / smoothed_total

def calculate_trigram_sentence_probability_addK(sentence, K):
    """Return the base-2 log probability of ``sentence`` with add-K smoothing.

    Every window of three consecutive whitespace-separated tokens is scored with
    ``calculate_trigram_probability_addK`` and the log probabilities are summed.
    """
    tokens = sentence.split()
    log_probability = 0
    for word_1, word_2, word in zip(tokens, tokens[1:], tokens[2:]):
        window_probability = calculate_trigram_probability_addK(word_1, word_2, word, K)
        log_probability += math.log(window_probability, 2)
    return log_probability

def trigram_calculate_perplexity_addK(data, K):
    """Return the add-K smoothed trigram perplexity of ``data``.

    Computed as ``2 ** (-L / N)``, where ``L`` is the summed base-2 sentence log
    probability and ``N`` is the token count from ``calculate_corpus``.
    """
    token_count = calculate_corpus(data)
    total_log_probability = 0
    for sentence in data:
        total_log_probability = total_log_probability + calculate_trigram_sentence_probability_addK(sentence, K)
    average_log_probability = total_log_probability / token_count
    return math.pow(2, -average_log_probability)

In [158]:
# Candidate smoothing constants for the add-K trigram model.
K_list = [0.001, 0.01, 0.1, 1]
# One row per K; columns are train, dev and test perplexity.
res_k = np.zeros((len(K_list), 3))
for k, K in enumerate(K_list):
  print("Add %f smoothing trigram model" %(K))
  # Each perplexity is a full pass over the data set, so compute it once and
  # reuse the stored value for printing (it was previously computed twice).
  res_k[k, 0] = trigram_calculate_perplexity_addK(train_text, K)
  print("training set, Perplexity:%f" %(res_k[k, 0]))
  res_k[k, 1] = trigram_calculate_perplexity_addK(valid_text, K)
  print("develop set, Perplexity:%f" %(res_k[k, 1]))
  # The test-set perplexity is stored but intentionally not printed.
  res_k[k, 2] = trigram_calculate_perplexity_addK(test_text, K)


Add 0.001000 smoothing trigram model
training set, Perplexity:35.974900
develop set, Perplexity:2832.742572
Add 0.010000 smoothing trigram model
training set, Perplexity:183.351504
develop set, Perplexity:3421.921512
Add 0.100000 smoothing trigram model
training set, Perplexity:1224.627900
develop set, Perplexity:5654.727267
Add 1.000000 smoothing trigram model
training set, Perplexity:6769.136077
develop set, Perplexity:10578.830463


In [164]:
# Total number of training tokens, start symbols included. No executed cell
# assigned this name, so on a fresh kernel this cell (and every call to
# calculate_unigram_probability_LP, which divides by it) raised NameError.
unigram_corpus = sum(unigram_frequencies.values())
unigram_corpus

1064606

In [0]:
############################################################################################
######################## Trigram(Section III) Linear Interpolation #########################
############################################################################################

def calculate_trigram_probability_LP(word_1, word_2, word):
    """Return the trigram component used by the linear interpolation model.

    When the context bigram ``(word_1, word_2)`` has no training count, the
    uniform value ``1 / unique_unigram_length`` is returned. Otherwise the
    result is count(word_1, word_2, word) / count(word_1, word_2).
    """
    context_count = bigram_frequencies.get((word_1, word_2), 0)
    if context_count == 0:
        return 1/unique_unigram_length
    return trigram_frequencies.get((word_1, word_2, word), 0) / context_count

def calculate_bigram_probability_LP(word_2, word):
    """Return the bigram component used by the linear interpolation model.

    When the context word ``word_2`` has no training count, the uniform value
    ``1 / unique_unigram_length`` is returned. Otherwise the result is
    count(word_2, word) / count(word_2).
    """
    context_count = unigram_frequencies.get(word_2, 0)
    if context_count == 0:
        return 1/unique_unigram_length
    return bigram_frequencies.get((word_2, word), 0) / context_count

#unigram_corpus = sum(unigram_frequencies.values())
def calculate_unigram_probability_LP(word):
  """Return the unigram component used by the linear interpolation model.

  Computed as count(word) / unigram_corpus, both read from module-level state.
  A word with no training count yields 0.0, matching
  calculate_unigram_probability. Previously the missing count came back as
  ``None`` and the division raised ``TypeError``.
  """
  unigram_word_numerator = unigram_frequencies.get(word, 0)
  unigram_word_denominator = unigram_corpus
  return(unigram_word_numerator/unigram_word_denominator)



def calculate_interpolation_probability(word_1, word_2, word):
    """Return the linearly interpolated probability of ``word`` after ``word_1 word_2``.

    The trigram, bigram and unigram components are weighted by the module-level
    ``lambda_1``, ``lambda_2`` and ``lambda_3`` and added together.
    """
    trigram_term = lambda_1 * calculate_trigram_probability_LP(word_1, word_2, word)
    bigram_term = lambda_2 * calculate_bigram_probability_LP(word_2, word)
    unigram_term = lambda_3 * calculate_unigram_probability_LP(word)
    return trigram_term + bigram_term + unigram_term

def calculate_interpolation_sentence_probability(sentence):
    """Return the base-2 log probability of ``sentence`` under the interpolated model.

    Every window of three consecutive whitespace-separated tokens is scored with
    ``calculate_interpolation_probability``, except windows whose middle token
    is the end symbol "</s>", which are skipped.
    """
    tokens = sentence.split()
    log_probability = 0
    for word_1, word_2, word in zip(tokens, tokens[1:], tokens[2:]):
        if word_2 == "</s>":
            continue
        window_probability = calculate_interpolation_probability(word_1, word_2, word)
        log_probability += math.log(window_probability, 2)
    return log_probability


def interpolation_calculate_perplexity(data):
    """Return the interpolated-model perplexity of ``data`` (an iterable of sentences).

    Computed as ``2 ** (-L / N)``, where ``L`` is the summed base-2 sentence log
    probability and ``N`` is the token count from ``calculate_corpus``.
    """
    token_count = calculate_corpus(data)
    total_log_probability = 0
    for sentence in data:
        total_log_probability = total_log_probability + calculate_interpolation_sentence_probability(sentence)
    average_log_probability = total_log_probability / token_count
    return math.pow(2, -average_log_probability)

In [0]:
# Candidate (trigram, bigram, unigram) interpolation weights.
lambda_list = [[0.1, 0.6, 0.3], [0.1, 0.3, 0.6], [0.3, 0.3, 0.4], [0.5, 0.3, 0.2], [0.7, 0.2,0.1]]
# One row per weight triple; columns are train, dev and test perplexity.
res_lambda = np.zeros((len(lambda_list), 3))
# The loop target rebinds the module-level lambda_1/2/3 that
# calculate_interpolation_probability reads.
for i, (lambda_1, lambda_2, lambda_3) in enumerate(lambda_list):
  for split_column, split_sentences in enumerate((train_text, valid_text, test_text)):
    res_lambda[i][split_column] = interpolation_calculate_perplexity(split_sentences)


In [167]:
# Rows follow the order of lambda_list; columns are train, dev and test perplexity.
res_lambda

array([[ 27.27246554, 266.29232132, 267.70988131],
       [ 34.12926476, 296.94173709, 299.10077405],
       [ 16.86568304, 281.69135512, 283.84840571],
       [ 11.46721632, 305.1541135 , 307.47101882],
       [  9.03785219, 386.60163337, 389.79324727]])

In [0]:
# Vocabulary size used as the add-K denominator factor and as the uniform fallback in interpolation.
unique_unigram_length

27025

In [0]:
# Token count of the training split (start symbols are not counted).
calculate_corpus(train_text)

972962

In [0]:
# Token count of the dev split (start symbols are not counted).
calculate_corpus(valid_text)

123380

In [0]:
# Token count of the test split (start symbols are not counted).
calculate_corpus(test_text)

122190