In [8]:
import pandas as pd
import numpy as np
import nltk
from collections import defaultdict, Counter


In [3]:
from nltk import ngrams

Build the matrix of prefix → next-word frequencies.
Use the `ngrams` function from `nltk.util` to generate all n-grams from the corpus.
Pad each sequence with `left_pad_symbol = "<s>"` and `right_pad_symbol = "</s>"`.

In [11]:
# Sentence-boundary markers passed to ngrams() as the left / right pad symbols.
left_pad_symbol, right_pad_symbol = "<s>", "</s>"

In [16]:
# Quick look at ngrams() on a plain string: iterating a str yields characters,
# so these are character bigrams. The pad symbols are passed, but pad_left /
# pad_right are not, and no padding shows up in the result below.
list(ngrams('test', 2, left_pad_symbol=left_pad_symbol, right_pad_symbol=right_pad_symbol))

[('t', 'e'), ('e', 's'), ('s', 't')]

In [66]:
# Special tokens:
# - UNK stands in for absent tokens
# - EOS is a special token placed after the end of a sequence
UNK = "_UNK_"
EOS = "_EOS_"

def count_ngrams(lines, n):
    """
    Tally, for every (n - 1)-token prefix, how often each token follows it.

    Each line is split on whitespace and turned into n-grams with nltk's
    ``ngrams``, padded on both sides with the module-level
    ``left_pad_symbol`` / ``right_pad_symbol``. The first n - 1 tokens of
    every n-gram form the prefix; the last token is the one being counted.

    Note: padding uses those pad symbols, not the UNK / EOS constants.

    :param lines: an iterable of strings with space-separated tokens
    :param n: n-gram order (n=3 gives two-token prefixes)
    :returns: a defaultdict { tuple(prefix_tokens): Counter({next_token: count}) }
    """
    counts = defaultdict(Counter)
    for line in lines:
        padded_ngrams = ngrams(
            line.split(),
            n,
            pad_left=True,
            pad_right=True,
            left_pad_symbol=left_pad_symbol,
            right_pad_symbol=right_pad_symbol,
        )
        # Unpack each n-gram into its (n - 1)-token context and the final token.
        for *context, next_token in padded_ngrams:
            counts[tuple(context)][next_token] += 1
    return counts

In [113]:
# Toy corpus: three short token sequences, counted as trigrams
# (two-token prefix -> next token).
freqs = count_ngrams(
    [
        'test this planet hurrah test this planet',
        'another test this planet',
        'test this spoon',
    ],
    3,
)

In [114]:
# Sanity check: how often 'this' follows the prefix ('another', 'test') in the toy corpus.
freqs[('another', 'test')]['this']

1

Write a text generation function that:  
- takes a bigram as its seed and generates the next token;
- iteratively slides the prefix over the generated text so that it always ends with the most recent token, then generates the next token;
- generates each next token by sampling from the words associated with the prefix, weighted by the prefix's probability distribution;
- stops when a given number of words has been generated or the latest token is `</s>`.

In [115]:
# Total number of tokens observed after the prefix ('test', 'this').
sum(freqs[('test', 'this')].values())

4

In [116]:
# Total count of tokens seen after a named prefix: the denominator used when
# turning that prefix's counts into probabilities.
bigram = ('test', 'this')
total_counts = sum(freqs[bigram].values())
total_counts

4

In [135]:
# NOTE(review): `text` is only assigned in a later cell (next to the generate_text
# call), so on a fresh kernel this cell raises NameError; the value shown below
# presumably comes from an earlier, out-of-order run.
text

['test', 'this', 'spoon']

In [117]:
def create_model(freqs):
    """
    Convert n-gram counts into conditional probabilities P(next_token | prefix).

    :param freqs: mapping { prefix: {next_token: count} }, e.g. the result of
        ``count_ngrams``
    :returns: a new defaultdict(Counter) { prefix: {next_token: probability} }
        where the probabilities of each prefix sum to 1

    ``freqs`` is left untouched: the probabilities are written to a fresh
    mapping, so the raw counts stay available and re-running the cell gives
    the same result. A prefix whose counts sum to zero gets an empty
    distribution instead of raising ZeroDivisionError.
    """
    model = defaultdict(Counter)
    for prefix, next_counts in freqs.items():
        total_count = float(sum(next_counts.values()))
        if total_count == 0:
            # No probability mass to distribute for this prefix.
            model[prefix] = Counter()
            continue
        model[prefix] = Counter(
            {word: count / total_count for word, count in next_counts.items()}
        )
    return model

In [118]:
# Normalise the toy-corpus counts into conditional probabilities P(next token | prefix).
model = create_model(freqs)

In [227]:
def generate_text(seed, model, max_length=10):
    """
    Sample a continuation of ``seed`` from a trigram model.

    :param seed: sequence of starting tokens; its last two tokens form the
        first prefix. The sequence is copied, so the caller's list is not
        modified.
    :param model: mapping { (w1, w2): {next_token: probability} }, e.g. the
        result of ``create_model``
    :param max_length: maximum total number of tokens (seed included) in the
        generated text
    :returns: the seed followed by the sampled tokens, joined with single spaces
    :raises AssertionError: if the seed already has ``max_length`` or more tokens

    Generation stops as soon as the text ends with two '</s>' tokens, the
    text reaches ``max_length`` tokens, or the current prefix has no known
    continuation in ``model``.
    """
    assert len(seed) < max_length, "Max length must be greater than the length of the seed"
    # Work on a copy so repeated calls with the same seed list start from the same state.
    tokens = list(seed)

    # Strict '<' keeps the result within max_length tokens.
    while len(tokens) < max_length:
        # .get() avoids inserting an empty entry when the model is a defaultdict.
        continuations = model.get(tuple(tokens[-2:]))
        if not continuations:
            break
        words = list(continuations.keys())
        probs = list(continuations.values())
        # np.random.choice returns a numpy string; store a plain str.
        tokens.append(str(np.random.choice(words, p=probs)))
        if tokens[-2:] == ['</s>', '</s>']:
            break
    return ' '.join(t for t in tokens if t)

In [228]:
# Seed the generator with a bigram that occurs in the toy corpus.
# generate_text samples with np.random.choice, so the output can differ between runs.
text = ["test", "this"]
generate_text(text, model)

'test this planet hurrah test this planet </s> </s>'

Write a function that estimates the probability of a sentence, and use it to select the most probable sentence out of several candidates.
- Split the sentence into trigrams and use the chain rule to compute the sentence probability as the product of the conditional probabilities of each token given its preceding bigram: $P(w_1, \dots, w_m) = \prod_{i} P(w_i \mid w_{i-2}, w_{i-1})$