In [1]:
# Manipulate n_gram count dictionary

n_gram_counts = {
    ('i','am','happy') : 2,
    ('am','happy','because') : 1
}

# Get count for an n-gram tuple
print(f"count of n-gram {('i', 'am', 'happy')} : {n_gram_counts[('i', 'am', 'happy')]}")

# Check if n-gram is present in the dictionary
if ('i','am','learning') in n_gram_counts:
  print(f"n-gram {('i', 'am', 'learning')} found")
else:
  print(f"n-gram {('i', 'am', 'learning')} missing")

# Update the count in the word count dictionary
n_gram_counts[('i', 'am', 'learning')] = 1
if ('i', 'am', 'learning') in n_gram_counts:
    print(f"n-gram {('i', 'am', 'learning')} found")
else:
    print(f"n-gram {('i', 'am', 'learning')} missing")

count of n-gram ('i', 'am', 'happy') : 2
n-gram ('i', 'am', 'learning') missing
n-gram ('i', 'am', 'learning') found


In [2]:
# Concatenate tuple for prefix and tuple with the last word to create the n_gram
prefix = ('i', 'am', 'happy')
word = 'because'

# Note here the syntax for creating a tuple for a single word
n_gram = prefix + (word,)
print(n_gram)

('i', 'am', 'happy', 'because')


In [3]:
import numpy as np
import pandas as pd
from collections import defaultdict

def single_pass_trigram_count_matrix(corpus):

  bigrams = []
  vocabulary = []
  count_matrix_dict = defaultdict(dict)

  # Go through the corpus once with a sliding window
  for i in range(len(corpus) - 3 + 1):
    # Sliding window starts at position i and contains 3 words
    trigram = tuple(corpus[i : i+3])

    # "if not bigram in bigrams" is same as "if bigram not in bigrams"
    bigram = trigram[0 : -1]
    if not bigram in bigrams:
      bigrams.append(bigram)

    last_word = trigram[-1]
    if not last_word in vocabulary:
      vocabulary.append(last_word)

    if (bigram,last_word) not in count_matrix_dict:
      count_matrix_dict[bigram,last_word] = 0

    count_matrix_dict[bigram,last_word] += 1

  # Convert the count_matrix to np.array to fill in the blanks
  count_matrix = np.zeros((len(bigrams),len(vocabulary)))

  for trigram_key, trigram_count in count_matrix_dict.items():
    count_matrix[bigrams.index(trigram_key[0]),
                 vocabulary.index(trigram_key[1])] = trigram_count

  # np.array to pandas dataframe conversion
  count_matrix = pd.DataFrame(count_matrix,index=bigrams,columns = vocabulary)

  return bigrams,vocabulary,count_matrix

corpus = ['i', 'am', 'happy', 'because', 'i', 'am', 'learning', '.']

bigrams,vocabulary,count_matrix = single_pass_trigram_count_matrix(corpus)

print(count_matrix)

                  happy  because    i   am  learning    .
(i, am)             1.0      0.0  0.0  0.0       1.0  0.0
(am, happy)         0.0      1.0  0.0  0.0       0.0  0.0
(happy, because)    0.0      0.0  1.0  0.0       0.0  0.0
(because, i)        0.0      0.0  0.0  1.0       0.0  0.0
(am, learning)      0.0      0.0  0.0  0.0       0.0  1.0


### Probability Matrix

In [7]:
# Probability matrix
# Create probability matrix from the count matrix
row_sums = count_matrix.sum(axis = 1)

# Divide each row by its sum
prob_matrix = count_matrix.div(row_sums,axis = 0)

print(prob_matrix)

                  happy  because    i   am  learning    .
(i, am)             0.5      0.0  0.0  0.0       0.5  0.0
(am, happy)         0.0      1.0  0.0  0.0       0.0  0.0
(happy, because)    0.0      0.0  1.0  0.0       0.0  0.0
(because, i)        0.0      0.0  0.0  1.0       0.0  0.0
(am, learning)      0.0      0.0  0.0  0.0       0.0  1.0


In [9]:
# Find the probablity of a trigram in the probability matrix
trigram = ('i', 'am', 'happy')

# Find the prefix bigram
bigram = trigram[0 : -1]
print("bigram : ", bigram)

# Find the last word of the trigram
word = trigram[-1]
print("Word : ",word)

trigram_prob = prob_matrix[word][bigram]
print(f"Trigram probability : {trigram_prob}")

bigram :  ('i', 'am')
Word :  happy
Trigram probability : 0.5


In [10]:
# lists all words in vocabulary starting with a given prefix
vocabulary = ['i', 'am', 'happy', 'because', 'learning', '.', 'have', 'you', 'seen','it', '?']
starts_with = 'ha'

print(f"Words in vocabulary starting with prefix : {starts_with}\n")
for word in vocabulary:
  if word.startswith(starts_with):
    print(word)

Words in vocabulary starting with prefix : ha

happy
have


### Language Model Evaluation

In [12]:
# Train/Validation/Test Split
import random
def train_validation_test_split(data,train_percent,validation_percent):

  # Fixed seed here for reproducibility
  random.seed(87)

  # Reshuffle all input sentences
  random.shuffle(data)

  # Need to /100 because train_percent is between 0 - 100
  train_size = int(len(data) * train_percent / 100)
  train_data = data[0: train_size]

  validation_size = int(len(data) * validation_percent / 100)
  validation_data = data[train_size : train_size + validation_size]

  test_data = data[train_size + validation_size : ]

  return train_data,validation_data,test_data

data = [x for x in range (0, 100)]

train_data, validation_data, test_data = train_validation_test_split(data, 80, 10)
print("split 80/10/10:\n",f"train data:{train_data}\n", f"validation data:{validation_data}\n",
      f"test data:{test_data}\n")

train_data, validation_data, test_data = train_validation_test_split(data, 98, 1)
print("split 98/1/1:\n",f"train data:{train_data}\n", f"validation data:{validation_data}\n",
      f"test data:{test_data}\n")

split 80/10/10:
 train data:[28, 76, 5, 0, 62, 29, 54, 95, 88, 58, 4, 22, 92, 14, 50, 77, 47, 33, 75, 68, 56, 74, 43, 80, 83, 84, 73, 93, 66, 87, 9, 91, 64, 79, 20, 51, 17, 27, 12, 31, 67, 81, 7, 34, 45, 72, 38, 30, 16, 60, 40, 86, 48, 21, 70, 59, 6, 19, 2, 99, 37, 36, 52, 61, 97, 44, 26, 57, 89, 55, 53, 85, 3, 39, 10, 71, 23, 32, 25, 8]
 validation data:[78, 65, 63, 11, 49, 98, 1, 46, 15, 41]
 test data:[90, 96, 82, 42, 35, 13, 69, 24, 94, 18]

split 98/1/1:
 train data:[66, 23, 29, 28, 52, 87, 70, 13, 15, 2, 62, 43, 82, 50, 40, 32, 30, 79, 71, 89, 6, 10, 34, 78, 11, 49, 39, 42, 26, 46, 58, 96, 97, 8, 56, 86, 33, 93, 92, 91, 57, 65, 95, 20, 72, 3, 12, 9, 47, 37, 67, 1, 16, 74, 53, 99, 54, 68, 5, 18, 27, 17, 48, 36, 24, 45, 73, 19, 41, 59, 21, 98, 0, 31, 4, 85, 80, 64, 84, 88, 25, 44, 61, 22, 60, 94, 76, 38, 77, 81, 90, 69, 63, 7, 51, 14, 55, 83]
 validation data:[35]
 test data:[75]



### Perplexity

Perplexity formula:

\begin{equation*}
PP(W)=\sqrt[M]{\prod_{i=1}^{m}{\frac{1}{P(w_i|w_{i-1})}}}
\end{equation*}

Remember from calculus:

\begin{equation*}
\sqrt[M]{\frac{1}{x}} = x^{-\frac{1}{M}}
\end{equation*}

In [None]:
p = 10 ** (-250)
M = 100
perplexity = p ** (-1/M)
print(perplexity)