#Exercises:

#3.8 Write a program to compute unsmoothed unigrams and bigrams
#3.10 Add an option to your program to generate random sentences
#3.11 Add an option to your program to compute the perplexity of a test set

In [6]:
from collections import defaultdict, Counter
import math
import random

class Unigrams():
  """Unsmoothed unigram language model over pre-tokenized sentences.

  Attributes:
    unigrams: dict mapping each token to its relative frequency in the
      training data (the maximum-likelihood unigram probability).
    size: total number of training tokens seen so far.
  """

  # Tokens that end a generated sentence. A set is used (rather than a
  # substring test against ".!?;") so that only these exact tokens match.
  _TERMINATORS = frozenset({".", "!", "?", ";"})

  def __init__(self):
    self.unigrams = {}
    self.size = 0
    # Raw counts are kept separately from the probabilities so that fit()
    # can be called more than once without corrupting the estimates.
    self._counts = Counter()

  def fit(self, sentences):
    """Accumulate token counts from `sentences` and refresh probabilities.

    Args:
      sentences: iterable of sentences, each an iterable of tokens.

    Calling fit() again adds the new data to what was already seen.
    """
    for sentence in sentences:
      self._counts.update(sentence)
    self.size = sum(self._counts.values())
    # With no tokens the comprehension is empty, so no division by zero.
    self.unigrams = {
      token: count / self.size for token, count in self._counts.items()
    }

  def generate(self, size):
    """Sample up to `size` tokens independently from the unigram distribution.

    Generation stops early when a sentence terminator is drawn; the
    terminator itself is not included.

    Returns:
      The sampled tokens joined by single spaces.
    """
    # Hoisted out of the loop: the distribution does not change while sampling.
    vocabulary = list(self.unigrams)
    weights = list(self.unigrams.values())

    tokens = []
    while len(tokens) < size:
      word = random.choices(vocabulary, weights=weights)[0]
      if word in self._TERMINATORS:
        break
      tokens.append(word)
    return " ".join(tokens)

  def perplexity(self, text):
    """Return the perplexity of a token sequence under this model.

    Args:
      text: iterable of tokens.

    Returns:
      exp of the negative mean log-probability per token.

    Raises:
      ValueError: if `text` contains no tokens.
    """
    tokens = list(text)
    if not tokens:
      raise ValueError("perplexity is undefined for an empty token sequence")

    epsilon = 1e-8
    # Unknown tokens get probability 0 (+ epsilon) instead of being skipped;
    # skipping them while still dividing by len(tokens) made unseen text look
    # *more* likely than seen text.
    log_prob = sum(
      math.log(self.unigrams.get(token, 0.0) + epsilon) for token in tokens
    )
    return math.exp(-log_prob / len(tokens))



class N_Grams():
  """Unsmoothed N-gram language model (N >= 2) over pre-tokenized sentences.

  Tokens and contexts (the N-1 preceding tokens joined by single spaces) are
  mapped to integer indices.

  Attributes:
    N: order of the model.
    n_grams: context index -> {token index -> P(token | context)}. Only
      observed continuations are stored.
    n_minus_1_grams: context index -> number of times the context was seen.
    indexMapping / wordMapping: token <-> index.
    seq_to_indexMapping / index_to_seqMapping: context string <-> index.
  """

  def __init__(self, N = 2):
    if N < 2:
      raise ValueError("N must be greater than 1")

    self.N = N
    self.n_grams = defaultdict(lambda: defaultdict(int))
    self.n_minus_1_grams = defaultdict(int)
    self.indexMapping = {"<s>": 0, "</s>": 1}
    self.wordMapping = {0: "<s>", 1: "</s>"}
    # The start context must be built the same way as every other context
    # key (space-joined); "<s>" * (N - 1) only matches that for N == 2.
    start_context = " ".join(["<s>"] * (N - 1))
    self.seq_to_indexMapping = {start_context: 0}
    self.index_to_seqMapping = {0: start_context}
    # Raw n-gram counts, kept apart from the probabilities in n_grams so
    # that fit() can be called repeatedly.
    self._counts = defaultdict(lambda: defaultdict(int))

  def _pad(self, sentence):
    """Return `sentence` as a list with N-1 "<s>" in front and "</s>" at the end."""
    return ["<s>"] * (self.N - 1) + list(sentence) + ["</s>"]

  def fit(self, sentences):
    """Count n-grams in `sentences` and refresh the conditional probabilities.

    Args:
      sentences: iterable of sentences, each an iterable of string tokens.
        It is traversed once, so a generator is accepted.

    Calling fit() again adds the new data to what was already seen.
    """
    context_len = self.N - 1

    for sentence in sentences:
      tokens = self._pad(sentence)
      for i in range(context_len, len(tokens)):
        token = tokens[i]
        context = " ".join(tokens[i - context_len: i])

        # Indices are dense (0..len-1), so the next free index is the
        # current mapping size; this also stays correct across refits.
        token_index = self.indexMapping.get(token)
        if token_index is None:
          token_index = len(self.indexMapping)
          self.indexMapping[token] = token_index
          self.wordMapping[token_index] = token

        context_index = self.seq_to_indexMapping.get(context)
        if context_index is None:
          context_index = len(self.seq_to_indexMapping)
          self.seq_to_indexMapping[context] = context_index
          self.index_to_seqMapping[context_index] = context

        self._counts[context_index][token_index] += 1
        self.n_minus_1_grams[context_index] += 1

    # Normalize only the observed n-grams. Every context in _counts has a
    # positive total, so there is no division by zero, and the table stays
    # sparse instead of contexts x vocabulary.
    self.n_grams.clear()
    for context_index, followers in self._counts.items():
      total = self.n_minus_1_grams[context_index]
      probabilities = self.n_grams[context_index]
      for token_index, count in followers.items():
        probabilities[token_index] = count / total

  def generate(self, size):
    """Sample up to `size` tokens, each conditioned on the previous N-1.

    Generation stops early when "</s>" is drawn or when the current context
    has no known continuation (e.g. the model has not been fitted).

    Returns:
      The sampled tokens joined by single spaces.
    """
    prev_seq = ["<s>"] * (self.N - 1)
    tokens = []

    while len(tokens) < size:
      prev_seq_index = self.seq_to_indexMapping.get(" ".join(prev_seq))
      # .get avoids inserting empty entries into the defaultdict.
      followers = self.n_grams.get(prev_seq_index)
      if not followers:
        break

      next_token_index = random.choices(
        list(followers), weights=list(followers.values()), k = 1
      )[0]
      next_token = self.wordMapping[next_token_index]
      if next_token == "</s>":
        break

      tokens.append(next_token)
      prev_seq = prev_seq[1:] + [next_token]

    return " ".join(tokens)

  def perplexity(self, sentence):
    """Return the perplexity of one tokenized sentence under this model.

    Unseen tokens, contexts and n-grams are given probability 0 (+ epsilon)
    rather than raising. The log-probability is averaged over the predicted
    tokens, i.e. the sentence tokens plus "</s>"; the "<s>" padding is not
    counted because it is never predicted.

    Args:
      sentence: iterable of string tokens (without boundary markers).
    """
    context_len = self.N - 1
    tokens = self._pad(sentence)
    epsilon = 1e-8

    log_probs = 0.0
    for i in range(context_len, len(tokens)):
      context = " ".join(tokens[i - context_len: i])
      context_index = self.seq_to_indexMapping.get(context)
      token_index = self.indexMapping.get(tokens[i])
      # Read-only lookups: scoring must not add entries to the model.
      followers = self.n_grams.get(context_index)
      probability = followers.get(token_index, 0.0) if followers else 0.0
      log_probs += math.log(probability + epsilon)

    # Always >= 1 because "</s>" is predicted even for an empty sentence.
    predicted = len(tokens) - context_len
    return math.exp(-log_probs / predicted)

In [7]:
# Fetch the Brown and Treebank corpora via nltk's downloader, then expose
# their sentences for the training cells below.
import nltk
nltk.download('brown')
nltk.download('treebank')
from nltk.corpus import treebank
from nltk.corpus import brown

# NOTE(review): the models below concatenate each sentence with a list of
# boundary markers, so sents() is presumably yielding lists of token strings
# - confirm against the nltk corpus reader documentation.
brown_sentences = brown.sents()
treebank_sentences = treebank.sents()

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


In [12]:
# Every model is trained on the first 1000 sentences of its corpus.

# Unigram models: one for the Brown corpus, one for the TreeBank corpus.
unigrams_brown, unigrams_treebank = Unigrams(), Unigrams()
unigrams_brown.fit(brown_sentences[:1000])
unigrams_treebank.fit(treebank_sentences[:1000])

# Bigram models: one for the Brown corpus, one for the TreeBank corpus.
bigrams_brown, bigrams_treebank = N_Grams(2), N_Grams(2)
bigrams_brown.fit(brown_sentences[:1000])
bigrams_treebank.fit(treebank_sentences[:1000])

In [14]:
# Print one generated sample (at most 20 tokens) per trained model, in the
# same order as before so the random draws are consumed identically.
for heading, language_model in (
  ("Bigrams trained on Brown corpus:", bigrams_brown),
  ("Bigrams trained on TreeBank corpus:", bigrams_treebank),
  ("Unigrams trained on Brown corpus:", unigrams_brown),
  ("Unigrams trained on TreeBank corpus:", unigrams_treebank),
):
  print(heading)
  print(language_model.generate(20))

Bigrams trained on Brown corpus:
-- A & I didn't have been cooperating .
Bigrams trained on TreeBank corpus:
Yasser Arafat has surfaced in markets , on turnover was an early * evaluating teachers , the deal with a
Unigrams trained on Brown corpus:
obtain legislature and statement , ' the take that Club state's both one '' p.m. would the scramble there court's
Unigrams trained on TreeBank corpus:
's Donoghue meet rate * a and dollars all refund when to to , found than `` *T*-2 have said
