In [102]:
import pathlib
from tqdm.notebook import trange, tqdm

import pandas as pd
import numpy as np
import nltk
from nltk import ngrams
from collections import defaultdict, Counter, namedtuple
from utils import count_ngrams, create_model, sentence_perplexity

Split the dataset into a training and a testing subset. Use the category “title” as the testing set and the categories “comment” and “post” as the training set. The short length of titles will make them good candidates later on as seeds for text generation.

In [103]:
PROCESSED_DATA_DIR = pathlib.Path('../data/processed/')

In [104]:
so = pd.read_csv(PROCESSED_DATA_DIR / 'tokenized.csv')

In [105]:
so = so.loc[so.text.dropna().index]

In [106]:
train = so.query("category != 'title'")
test = so.query("category == 'title'")

In [107]:
small_train = train.sample(100, random_state=0)
small_test = test.sample(10, random_state=0)

In [116]:
sentences = ['test this planet hurrah test this planet', 'another test this planet', 'test this spoon']
vocab_set = set(' '.join(sentences))

In [117]:
small_vocab_set = set(' '.join(small_train.text.tolist()))

Build the matrix of prefix - word frequencies.
Use the ngrams function from `nltk.utils` to generate all n-grams from the corpus
Set the following `left_pad_symbol = <s>` and `right_pad_symbol = </s>`

In [118]:
Padding = namedtuple('Padding', ['left_pad_symbol', 'right_pad_symbol'])

In [119]:
PADDING = Padding("<s>", "</s>")

In [120]:
list(ngrams('test', 2, left_pad_symbol=PADDING.left_pad_symbol, right_pad_symbol=PADDING.right_pad_symbol))

[('t', 'e'), ('e', 's'), ('s', 't')]

In [122]:
freqs = count_ngrams(sentences, 3)

Sentences: 100%|██████████| 3/3 [00:00<00:00, 194.78it/s]


In [123]:
freqs[('another', 'test')]['this']

1

In [124]:
freqs

defaultdict(collections.Counter,
            {('<s>', '<s>'): Counter({'test': 2, 'another': 1}),
             ('<s>', 'test'): Counter({'this': 2}),
             ('test', 'this'): Counter({'planet': 3, 'spoon': 1}),
             ('this', 'planet'): Counter({'hurrah': 1, '</s>': 2}),
             ('planet', 'hurrah'): Counter({'test': 1}),
             ('hurrah', 'test'): Counter({'this': 1}),
             ('planet', '</s>'): Counter({'</s>': 2}),
             ('<s>', 'another'): Counter({'test': 1}),
             ('another', 'test'): Counter({'this': 1}),
             ('this', 'spoon'): Counter({'</s>': 1}),
             ('spoon', '</s>'): Counter({'</s>': 1})})

Write a text generation function:  
- takes a bigram as input and generates the next token
- iteratively slide the prefix over the generated text so that the new prefix includes the most recent token; generates the next token
- to generate each next token, sample the list of words associated with the prefix using the probability distribution of the prefix.
- stop the text generation when a certain number of words have been generated or the latest token is a `</s>`.

In [125]:
sum(v for _, v in freqs[('test', 'this')].items())

4

In [126]:
bigram = ('test', 'this')
total_counts = sum(count for count in freqs[bigram].values())
total_counts

4

In [128]:
model = create_model(freqs, len(vocab_set), delta=1)

100%|██████████| 11/11 [00:00<00:00, 4556.77it/s]


In [129]:
def generate_text(seed, model, max_length=10):
    """Takes a bigram as input and generates the next token"""
    assert len(seed) < max_length, "Max length must be greater than the length of the seed"
    sentence_finished = False

    while (not sentence_finished) and len(seed) <= max_length:
        probs = list(model[tuple(seed[-2:])].values())
        words = list(model[tuple(seed[-2:])].keys())
        print(probs)
        if not words:
            break
        seed.append(np.random.choice(words, p=probs))
        if seed[-2:] == ['</s>', '</s>']:
            sentence_finished = True
    return ' '.join([t for t in seed if t not in PADDING])

In [130]:
model

defaultdict(collections.Counter,
            {('<s>', '<s>'): Counter({'test': 0.1875, 'another': 0.125}),
             ('<s>', 'test'): Counter({'this': 0.2}),
             ('test',
              'this'): Counter({'planet': 0.23529411764705882,
                      'spoon': 0.11764705882352941}),
             ('this', 'planet'): Counter({'hurrah': 0.125, '</s>': 0.1875}),
             ('planet', 'hurrah'): Counter({'test': 0.14285714285714285}),
             ('hurrah', 'test'): Counter({'this': 0.14285714285714285}),
             ('planet', '</s>'): Counter({'</s>': 0.2}),
             ('<s>', 'another'): Counter({'test': 0.14285714285714285}),
             ('another', 'test'): Counter({'this': 0.14285714285714285}),
             ('this', 'spoon'): Counter({'</s>': 0.14285714285714285}),
             ('spoon', '</s>'): Counter({'</s>': 0.14285714285714285})})

In [131]:
text = ["test", "this"]
generate_text(text, model)

[0.23529411764705882, 0.11764705882352941]


ValueError: probabilities do not sum to 1

Write a function that can estimate the probability of a sentence and use it to select the most probable sentence out of several candidate sentences
- Split the sentence into trigrams and use the chain rule to calculate the probability of the sentence as a product of the bigrams - tokens probabilities

A sequence of $n$ words:

$$w_1^n = w_1 \cdots w_n$$

Joint probability:

$$P(w_1, w_2, \cdots, w_n)$$

Chain Rule:
$$P(w_1^n) = \prod_{k=1}^n P(w_k \mid w_1^{k-1})$$

Markov approximation for trigrams:
$$P(w_n \mid w_1^{n-1}) \approx P(w_n \mid w_{n-2}^{n-1})$$

In [82]:
def is_unknown(model, words):
    prefix, target = words[:-1], words[-1]
    return model[prefix][target] == 0

In [83]:
def compute_missing_prob(model, words, vocab_set):
    prefix, target = words[:-1], words[-1]
    token_probs = model[prefix]
    missing_prob_total = 1.0 - sum(token_probs.values())
    missing_prob_total = max(0, missing_prob_total) # prevent rounding errors
    missing_prob_total = missing_prob_total / max(1, len(vocab_set) - len(token_probs))
    return missing_prob_total

In [84]:
def sentence_prob(model, sentence, vocab_set, min_logprob=np.log(10 ** -50.)):
    sentence_split = sentence.split()
    num_words = len(sentence_split)
    trigrams = ngrams(
        sentence_split, 3, 
        pad_left=True,
        pad_right=True,
        left_pad_symbol=PADDING.left_pad_symbol,
        right_pad_symbol=PADDING.right_pad_symbol
    )
    probs = []
    for words in trigrams:
        prefix, target = words[:-1], words[-1]
        if is_unknown(model, words):
            missing_prob_total = compute_missing_prob(model, words, vocab_set)
            if missing_prob_total == 0 or np.log(missing_prob_total) < min_logprob:
                probs.append(min_logprob)
            else:
                probs.append(np.log(missing_prob_total))
        else:
            probs.append(np.log(model[prefix][target]))
    # return product(probs)
    return np.exp(np.sum((probs))*(-1/num_words))

In [85]:
sentence_perplexity(model, 'test this spoon')

1.8171205928321397

In [86]:
sentence_prob(model, 'test this spoon', vocab_set)

1.8171205928321397

In [32]:
model[('test', 'this')]

Counter({'planet': 0.75, 'spoon': 0.25})

In [33]:
sentences = ['test this planet', 'test this planet hurrah']

In [48]:
from functools import partial

min(sentences, key=lambda x: sentence_prob(model, x, vocab_set))

'test this planet'

Implement the perplexity scoring function for a given sentence and for the training corpus.

In [49]:
def perplexity(model, sentence, vocab_set, min_logprob=np.log(10 ** -50.)):
    """
    :param min_logprob: if log(P(w | ...)) is smaller than min_logprop, set it equal to min_logrob
    :returns: perplexity of a sentence - scalar
    
    Note: do not forget to compute P(w_first | empty) and P(eos | full_sequence)
    
    """
    return sentence_prob(model, sentence, vocab_set)

In [52]:
perplexity(model, 'test this spoon', vocab_set)

1.8171205928321397

In [53]:
model

defaultdict(collections.Counter,
            {('<s>',
              '<s>'): Counter({'test': 0.6666666666666666,
                      'another': 0.3333333333333333}),
             ('<s>', 'test'): Counter({'this': 1.0}),
             ('test', 'this'): Counter({'planet': 0.75, 'spoon': 0.25}),
             ('this',
              'planet'): Counter({'hurrah': 0.3333333333333333,
                      '</s>': 0.6666666666666666}),
             ('planet', 'hurrah'): Counter({'test': 1.0}),
             ('hurrah', 'test'): Counter({'this': 1.0}),
             ('planet', '</s>'): Counter({'</s>': 1.0}),
             ('<s>', 'another'): Counter({'test': 1.0}),
             ('another', 'test'): Counter({'this': 1.0}),
             ('this', 'spoon'): Counter({'</s>': 1.0}),
             ('spoon', '</s>'): Counter({'</s>': 1.0}),
             ('hurrah', '</s>'): Counter(),
             ('this', '</s>'): Counter(),
             ('<s>', 'as'): Counter(),
             ('as', 'an'): Counter(),
    

### StackOverflow Data

In [54]:
freqs = count_ngrams(small_train.text.tolist(), 3)

Sentences: 100%|██████████| 100/100 [00:00<00:00, 2724.94it/s]


In [55]:
model = create_model(freqs, len(vocab_set))

100%|██████████| 5085/5085 [00:00<00:00, 538613.97it/s]


In [56]:
text = ["as", "an"]
generate_text(text, model)

['effect', 'explanatory', '*approximation*']
['variable']
['in']
['a']
['Poisson', 'regression', 'country/state']
['of', 'model']
['and', 'in']
['the']
['structure', '``', 'exponential', 'combined', 'results', 'statistical', 'city', 'real', 'power']


'as an explanatory variable in a regression model and the structure'

Calculate the perplexity of the language model on the test set composed of titles.

In [57]:
tqdm.pandas()

  from pandas import Panel


In [58]:
perplexities = small_test.text.progress_apply(lambda x: perplexity(model, x, vocab_set))

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [59]:
def corpus_perplexity(perplexities, vocab_size):
    return np.prod(perplexities)**(-1/vocab_size)

In [60]:
corpus_perplexity(perplexities, len(vocab_set))

0.7309574178771594

In [62]:
perplexity(model, 'as an explanatory variable in a regression model and the structure', vocab_set)

7123372058.461836