In [1]:
import math
import random
import numpy as np
import pandas as pd
import nltk
# Let NLTK also search the current working directory for its data packages.
nltk.data.path.append('.')
# Fetch the 'punkt' package (presumably what nltk.word_tokenize relies on
# further down — confirm against the installed NLTK version).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Read the whole Twitter corpus into memory as one string.
# The encoding is stated explicitly so decoding does not depend on the
# platform's locale default.
with open('en_US.twitter.txt', 'r', encoding='utf-8') as f:
  data = f.read()

In [3]:
len(data) # total characters in the corpus string (not only letters)

3335477

In [4]:
def split_data(data):
  """Split raw text into trimmed, non-empty lines.

  Args:
    data: A string whose lines are separated by newline characters.

  Returns:
    A list holding each line with surrounding whitespace removed, in the
    original order; lines that are empty after trimming are dropped.
  """
  trimmed_lines = (line.strip() for line in data.split('\n'))
  return [line for line in trimmed_lines if line]

In [5]:
def tokenize(sentences):
  """Lower-case every sentence and tokenize it with nltk.word_tokenize.

  Args:
    sentences: An iterable of strings.

  Returns:
    A list with one entry per input sentence, each entry being whatever
    nltk.word_tokenize returns for the lower-cased sentence.
  """
  return [nltk.word_tokenize(text.lower()) for text in sentences]

In [6]:
# Break the raw corpus into one trimmed string per non-empty line.
sentences = split_data(data)
len(sentences) # number of non-empty lines kept

47961

In [7]:
# Lower-case and tokenize every line; the result is a list of token lists.
tokenized_data = tokenize(sentences)
tokenized_data[0] # tokens for first sentence

['how',
 'are',
 'you',
 '?',
 'btw',
 'thanks',
 'for',
 'the',
 'rt',
 '.',
 'you',
 'gon',
 'na',
 'be',
 'in',
 'dc',
 'anytime',
 'soon',
 '?',
 'love',
 'to',
 'see',
 'you',
 '.',
 'been',
 'way',
 ',',
 'way',
 'too',
 'long',
 '.']

In [8]:
# Shuffle with a dedicated generator seeded with an arbitrary fixed value so
# the 80/20 split (and every number derived from it) is identical on each
# run. The module-level random state is left untouched.
random.Random(87).shuffle(tokenized_data)
train_size = int(len(tokenized_data) * 0.8)
train_data = tokenized_data[:train_size]
test_data = tokenized_data[train_size:]

In [9]:
def count_words(sentences):
  """Tally how many times each token occurs across all sentences.

  Args:
    sentences: An iterable of token sequences.

  Returns:
    A dict mapping each token to its total number of occurrences, with keys
    in order of first appearance.
  """
  tally = {}
  for tokens in sentences:
    for token in tokens:
      tally[token] = tally.get(token, 0) + 1
  return tally

In [10]:
def get_word_frequency(tokenized_sentences,threshold):
  """Build the closed vocabulary from word counts.

  Args:
    tokenized_sentences: An iterable of token sequences, passed to count_words.
    threshold: Minimum number of occurrences for a word to be kept.

  Returns:
    A list of the words whose count is at least `threshold`, in the order
    count_words yields them.
  """
  occurrences = count_words(tokenized_sentences)
  return [token for token, total in occurrences.items() if total >= threshold]

In [11]:
def replace_oov_with_unk(tokenized_sentences,vocab,unknown_token='<unk>'):
  """Replace every out-of-vocabulary token with `unknown_token`.

  Args:
    tokenized_sentences: An iterable of token sequences.
    vocab: A collection of the words to keep as they are.
    unknown_token: The replacement for any token not found in `vocab`.

  Returns:
    A new list of lists with the same shape as the input; the input
    sentences are not modified.
  """
  # Membership is tested once per token, so build a set up front instead of
  # scanning the vocabulary list linearly for every token.
  known_words = set(vocab)
  return [
      [word if word in known_words else unknown_token for word in sentence]
      for sentence in tokenized_sentences
  ]

In [12]:
def preprocess_data(train_data,test_data,threshold):
  """Derive the vocabulary from the training split and apply it to both splits.

  Args:
    train_data: Tokenized training sentences; the vocabulary is built from these.
    test_data: Tokenized test sentences.
    threshold: Minimum training-set count for a word to enter the vocabulary.

  Returns:
    A tuple (vocab, train_replaced, test_replaced) where both replaced data
    sets have out-of-vocabulary tokens substituted by replace_oov_with_unk.
  """
  closed_vocab = get_word_frequency(train_data, threshold)
  return (
      closed_vocab,
      replace_oov_with_unk(train_data, closed_vocab),
      replace_oov_with_unk(test_data, closed_vocab),
  )

In [13]:
# Threshold 2: a word must occur at least twice in the training split to
# enter the vocabulary; all other tokens are replaced by '<unk>' in both splits.
vocab,train_data_processed, test_data_processed = preprocess_data(train_data, test_data,2)

In [14]:
# Number of words kept in the closed vocabulary.
len(vocab)

14884

In [15]:
def count_n_grams(data,n,start_token='<s>',end_token='<e>'):
  """Count every n-gram of length `n` in a collection of tokenized sentences.

  Each sentence is padded with `n` copies of `start_token` in front and one
  `end_token` at the back before a window of width `n` is slid over it.

  Args:
    data: An iterable of sentences, each a list of tokens.
    n: Length of the n-grams to count (expected to be at least 1).
    start_token: Token used for the left padding.
    end_token: Token appended to every sentence.

  Returns:
    A dict mapping each n-gram (a tuple of exactly `n` tokens) to the number
    of times it occurs.
  """
  n_grams = {}
  for sentence in data:
    padded = tuple([start_token] * n + sentence + [end_token])
    # A window of width n fits at exactly len(padded) - n + 1 positions;
    # going any further would yield truncated tuples shorter than n.
    for i in range(len(padded) - n + 1):
      n_gram = padded[i:i+n]
      n_grams[n_gram] = n_grams.get(n_gram, 0) + 1
  return n_grams

In [16]:
def prob(word,previous_n_gram,n_gram_count,nplus1_gram_count,vocab_size,k=1):
  """Estimate the k-smoothed probability of `word` following `previous_n_gram`.

  The result is (count(previous_n_gram + word) + k) divided by
  (count(previous_n_gram) + k * vocab_size), where a missing count is
  treated as zero.

  Args:
    word: The candidate next token.
    previous_n_gram: Sequence of the preceding tokens.
    n_gram_count: Dict mapping n-gram tuples to counts.
    nplus1_gram_count: Dict mapping (n+1)-gram tuples to counts.
    vocab_size: Number of words in the vocabulary.
    k: Smoothing constant added to the counts.

  Returns:
    The smoothed probability as the quotient described above.
  """
  context = tuple(previous_n_gram)
  numerator = nplus1_gram_count.get(context + (word,), 0) + k
  denominator = n_gram_count.get(context, 0) + k * vocab_size
  return numerator / denominator

In [17]:
def estimate_probabilities(previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary, k=1.0):
    """Compute the smoothed next-word probability for every vocabulary word.

    The vocabulary is extended with "<e>" and "<unk>" before scoring, and the
    extended length is the vocabulary size handed to `prob`.

    Args:
        previous_n_gram: Sequence of the preceding tokens.
        n_gram_counts: Dict mapping n-gram tuples to counts.
        n_plus1_gram_counts: Dict mapping (n+1)-gram tuples to counts.
        vocabulary: List of known words (not modified).
        k: Smoothing constant forwarded to `prob`.

    Returns:
        A dict mapping every candidate word to its estimated probability.
    """
    context = tuple(previous_n_gram)
    candidates = vocabulary + ["<e>", "<unk>"]
    candidate_total = len(candidates)
    return {
        candidate: prob(candidate, context, n_gram_counts, n_plus1_gram_counts, candidate_total, k=k)
        for candidate in candidates
    }

In [18]:
def make_count_matrix(n_plus1_gram_counts, vocabulary):
    """Arrange (n+1)-gram counts as a table of n-gram rows by next-word columns.

    Args:
        n_plus1_gram_counts: Dict mapping (n+1)-gram tuples to counts.
        vocabulary: List of known words; "<e>" and "<unk>" are appended to
            form the columns (the caller's list is not modified).

    Returns:
        A pandas DataFrame of floats with one row per distinct n-gram prefix,
        in order of first appearance, and one column per vocabulary word.
        Counts whose final word is not a column are skipped.
    """
    vocabulary = vocabulary + ["<e>", "<unk>"]

    # dict.fromkeys drops duplicate prefixes while keeping first-seen order,
    # so the row order is the same on every run (a set's order is not).
    n_grams = list(dict.fromkeys(n_plus1_gram[:-1] for n_plus1_gram in n_plus1_gram_counts))

    row_index = {n_gram: i for i, n_gram in enumerate(n_grams)}
    col_index = {word: j for j, word in enumerate(vocabulary)}

    count_matrix = np.zeros((len(n_grams), len(vocabulary)))
    for n_plus1_gram, count in n_plus1_gram_counts.items():
        # Look the word up in the column map instead of scanning the list.
        j = col_index.get(n_plus1_gram[-1])
        if j is None:
            continue
        count_matrix[row_index[n_plus1_gram[:-1]], j] = count

    return pd.DataFrame(count_matrix, index=n_grams, columns=vocabulary)

In [19]:
def make_probability_matrix(n_plus1_gram_counts, vocabulary, k):
    """Turn (n+1)-gram counts into a table of smoothed conditional probabilities.

    Args:
        n_plus1_gram_counts: Dict mapping (n+1)-gram tuples to counts.
        vocabulary: List of known words, forwarded to make_count_matrix.
        k: Smoothing constant added to every cell of the count table.

    Returns:
        The smoothed count table with every row divided by its own sum.
    """
    smoothed = make_count_matrix(n_plus1_gram_counts, vocabulary) + k
    row_totals = smoothed.sum(axis=1)
    return smoothed.div(row_totals, axis=0)

In [20]:
def calculate_perplexity(sentence, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1.0):
    """Compute the perplexity of one tokenized sentence under the n-gram model.

    The sentence is padded with n "<s>" tokens and one "<e>" token, where n is
    the length of the keys in `n_gram_counts`. The perplexity is the product
    of the inverse smoothed probabilities of every token after the padding,
    raised to the power 1/N, with N the length of the padded sentence.

    Args:
        sentence: List of tokens.
        n_gram_counts: Non-empty dict mapping n-gram tuples to counts.
        n_plus1_gram_counts: Dict mapping (n+1)-gram tuples to counts.
        vocabulary_size: Number of words in the vocabulary.
        k: Smoothing constant forwarded to `prob`.

    Returns:
        The perplexity as a float.

    Raises:
        IndexError: If `n_gram_counts` is empty, so n cannot be determined.
    """
    if not n_gram_counts:
        raise IndexError("n_gram_counts must contain at least one n-gram")
    # Read n off a single key without materialising the whole key list.
    n = len(next(iter(n_gram_counts)))

    sentence = tuple(["<s>"] * n + sentence + ["<e>"])
    N = len(sentence)

    product_pi = 1.0
    for t in range(n, N):
        n_gram = sentence[t-n:t]
        word = sentence[t]
        # Forward the caller's smoothing constant so that k actually
        # influences the result.
        probability = prob(word, n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=k)
        product_pi *= 1 / probability

    return product_pi ** (1 / float(N))

In [21]:
# Toy corpus for a quick sanity check of calculate_perplexity.
# NOTE: this rebinds `sentences`, which earlier held the corpus lines.
sentences = [['i', 'like', 'a', 'cat'],
                 ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))

# Unigram counts act as the n-gram counts, bigram counts as the (n+1)-gram counts.
unigram_counts = count_n_grams(sentences, 1)
bigram_counts = count_n_grams(sentences, 2)


# Perplexity of a sentence that is part of the toy corpus.
perplexity_train1 = calculate_perplexity(sentences[0],
                                         unigram_counts, bigram_counts,
                                         len(unique_words), k=1.0)
print(f"Perplexity for first train sample: {perplexity_train1:.4f}")

# Perplexity of a sentence that is not part of the toy corpus.
test_sentence = ['i', 'like', 'a', 'dog']
perplexity_test = calculate_perplexity(test_sentence,
                                       unigram_counts, bigram_counts,
                                       len(unique_words), k=1.0)
print(f"Perplexity for test sample: {perplexity_test:.4f}")

Perplexity for first train sample: 2.8040
Perplexity for test sample: 3.9654


In [22]:
def suggest_a_word(previous_tokens, n_gram_counts, n_plus1_gram_counts, vocabulary, k=1.0, start_with=None):
    """Suggest the most probable next word given the preceding tokens.

    Args:
        previous_tokens: Sequence of the tokens typed so far. If it holds
            fewer than n tokens it is left-padded with "<s>".
        n_gram_counts: Non-empty dict mapping n-gram tuples to counts; the
            length of its keys defines n.
        n_plus1_gram_counts: Dict mapping (n+1)-gram tuples to counts.
        vocabulary: List of known words.
        k: Smoothing constant forwarded to estimate_probabilities.
        start_with: Optional prefix; when given, only words starting with it
            are considered.

    Returns:
        A tuple (suggestion, probability). `suggestion` is None and the
        probability 0 when no candidate has a probability above zero or
        matches `start_with`.

    Raises:
        IndexError: If `n_gram_counts` is empty, so n cannot be determined.
    """
    if not n_gram_counts:
        raise IndexError("n_gram_counts must contain at least one n-gram")
    n = len(next(iter(n_gram_counts)))

    # Left-pad with start tokens so a context shorter than n still forms a
    # full n-gram, the same way sentences are padded with "<s>" when counted.
    # For contexts of n or more tokens this is just the last n tokens.
    previous_n_gram = (["<s>"] * n + list(previous_tokens))[-n:]
    probabilities = estimate_probabilities(previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary, k=k)

    suggestion = None
    max_prob = 0
    for word, word_prob in probabilities.items():
        if start_with is not None and not word.startswith(start_with):
            continue
        # Strict comparison: on ties the earliest candidate wins.
        if word_prob > max_prob:
            suggestion, max_prob = word, word_prob

    return suggestion, max_prob

In [23]:
# Toy corpus for a quick sanity check of suggest_a_word.
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))

unigram_counts = count_n_grams(sentences, 1)
bigram_counts = count_n_grams(sentences, 2)

# Unconstrained suggestion.
previous_tokens = ["i", "like"]
tmp_suggest1 = suggest_a_word(previous_tokens, unigram_counts, bigram_counts, unique_words, k=1.0)
print(f"The previous words are 'i like',\n\tand the suggested word is `{tmp_suggest1[0]}` with a probability of {tmp_suggest1[1]:.4f}\n")
# Suggestion restricted to words beginning with a given prefix; the message
# reports the probability just like the unconstrained one above.
tmp_starts_with = 'c'
tmp_suggest2 = suggest_a_word(previous_tokens, unigram_counts, bigram_counts, unique_words, k=1.0, start_with=tmp_starts_with)
print(f"The previous words are 'i like', the suggestion must start with `{tmp_starts_with}`\n\tand the suggested word is `{tmp_suggest2[0]}` with a probability of {tmp_suggest2[1]:.4f}")

The previous words are 'i like',
	and the suggested word is `a` with a probability of 0.2727

The previous words are 'i like', the suggestion must start with `c`
	and the suggested word is `cat` with a probability
