In [3]:
# Working with a dictionary that maps n-gram tuples to their counts

happy_trigram = ('i', 'am', 'happy')
learning_trigram = ('i', 'am', 'learning')

n_gram_counts = {
    happy_trigram: 2,
    ('am', 'happy', 'because'): 1,
}

# Look up the count stored under an n-gram tuple key
print(f"count of n-gram {happy_trigram} : {n_gram_counts[happy_trigram]}")

# Membership test before insertion: this n-gram is not a key yet
learning_status = "found" if learning_trigram in n_gram_counts else "missing"
print(f"n-gram {learning_trigram} {learning_status}")

# Insert the n-gram with a count of 1, then repeat the membership test
n_gram_counts[learning_trigram] = 1
learning_status = "found" if learning_trigram in n_gram_counts else "missing"
print(f"n-gram {learning_trigram} {learning_status}")

count of n-gram ('i', 'am', 'happy') : 2
n-gram ('i', 'am', 'learning') missing
n-gram ('i', 'am', 'learning') found


In [6]:
# Build an n-gram by joining the prefix tuple with a tuple holding the last word
prefix = ('i', 'am', 'happy')
word = 'because'

# A single-element tuple needs the trailing comma: (word,) is a tuple,
# whereas (word) is just the string itself
last_word_tuple = (word,)
n_gram = prefix + last_word_tuple
print(n_gram)

('i', 'am', 'happy', 'because')


In [7]:
import numpy as np
import pandas as pd
from collections import defaultdict

def single_pass_trigram_count_matrix(corpus):
  """Build a trigram count matrix from a tokenized corpus in one pass.

  Every window of 3 consecutive tokens is split into its leading bigram
  (a 2-tuple) and its last word. The matrix has one row per distinct bigram
  and one column per distinct last word, both in order of first appearance.

  Args:
    corpus: sequence of hashable tokens (e.g. a list of strings).

  Returns:
    A tuple (bigrams, vocabulary, count_matrix) where
      bigrams      -- list of distinct bigram tuples (row labels),
      vocabulary   -- list of distinct last words (column labels),
      count_matrix -- pandas DataFrame of float counts; entry
                      [bigram, word] is how many times `word` followed
                      `bigram` in the corpus.
    A corpus with fewer than 3 tokens yields two empty lists and an empty
    DataFrame.
  """
  bigrams = []
  vocabulary = []

  # Label -> position maps give O(1) membership tests and index lookups,
  # instead of scanning the lists with `in` / `.index()` on every step.
  bigram_row = {}
  word_col = {}

  # int factory: a missing (bigram, word) key starts at 0
  trigram_counts = defaultdict(int)

  # Go through the corpus once with a sliding window of 3 words
  for i in range(len(corpus) - 3 + 1):
    trigram = tuple(corpus[i : i+3])
    bigram = trigram[:-1]
    last_word = trigram[-1]

    if bigram not in bigram_row:
      bigram_row[bigram] = len(bigrams)
      bigrams.append(bigram)

    if last_word not in word_col:
      word_col[last_word] = len(vocabulary)
      vocabulary.append(last_word)

    trigram_counts[bigram, last_word] += 1

  # Dense matrix: combinations never seen in the corpus stay at 0
  count_matrix = np.zeros((len(bigrams), len(vocabulary)))

  for (bigram, last_word), trigram_count in trigram_counts.items():
    count_matrix[bigram_row[bigram], word_col[last_word]] = trigram_count

  # Label rows/columns; plain lists are passed so the labels are used as given
  count_matrix = pd.DataFrame(count_matrix, index=bigrams, columns=vocabulary)

  return bigrams, vocabulary, count_matrix

# Small tokenized example corpus; ('i', 'am') occurs twice with different next words
corpus = [
    'i', 'am', 'happy', 'because',
    'i', 'am', 'learning', '.',
]

# Row labels, column labels and the labelled count matrix, all from one pass
(bigrams,
 vocabulary,
 count_matrix) = single_pass_trigram_count_matrix(corpus)

print(count_matrix)

                  happy  because    i   am  learning    .
(i, am)             1.0      0.0  0.0  0.0       1.0  0.0
(am, happy)         0.0      1.0  0.0  0.0       0.0  0.0
(happy, because)    0.0      0.0  1.0  0.0       0.0  0.0
(because, i)        0.0      0.0  0.0  1.0       0.0  0.0
(am, learning)      0.0      0.0  0.0  0.0       0.0  1.0
