In [400]:
import pathlib
from tqdm.notebook import trange, tqdm
from tqdm.notebook import trange, tqdm

import pandas as pd
import numpy as np
import nltk
from nltk import ngrams
from collections import defaultdict, Counter, namedtuple


In [537]:
# TODO: Make a subset of the corpus for debugging

Split the dataset into a training and a testing subset. Use the category “title” as the testing set and the categories “comment” and “post” as the training set. The short length of titles will make them good candidates later on as seeds for text generation.

In [392]:
PROCESSED_DATA_DIR = pathlib.Path('../data/processed/')

In [393]:
so = pd.read_csv(PROCESSED_DATA_DIR / 'tokenized.csv')

In [433]:
so = so.loc[so.text.dropna().index]

In [434]:
train = so.query("category != 'title'")
test = so.query("category == 'title'")

Build the matrix of prefix - word frequencies.
Use the ngrams function from `nltk.utils` to generate all n-grams from the corpus
Set the following `left_pad_symbol = <s>` and `right_pad_symbol = </s>`

In [250]:
left_pad_symbol = "<s>"
right_pad_symbol = "</s>"

Padding = namedtuple('Padding', ['left_pad_symbol', 'right_pad_symbol'])

In [253]:
PADDING = Padding("<s>", "</s>")

In [254]:
list(ngrams('test', 2, left_pad_symbol=PADDING.left_pad_symbol, right_pad_symbol=PADDING.right_pad_symbol))

[('t', 'e'), ('e', 's'), ('s', 't')]

In [407]:
# Taken from https://github.com/yandexdataschool/nlp_course/blob/master/week03_lm/seminar.ipynb
#
# special tokens: 
# - unk represents absent tokens, 
# - eos is a special token after the end of sequence

UNK, EOS = "_UNK_", "_EOS_"

def count_ngrams(lines, n):
    """
    Count how many times each word occured after (n - 1) previous words
    :param lines: an iterable of strings with space-separated tokens
    :returns: a dictionary { tuple(prefix_tokens): {next_token_1: count_1, next_token_2: count_2}}

    When building counts, please consider the following two edge cases
    - if prefix is shorter than (n - 1) tokens, it should be padded with UNK. For n=3,
      empty prefix: "" -> (UNK, UNK)
      short prefix: "the" -> (UNK, the)
      long prefix: "the new approach" -> (new, approach)
    - you should add a special token, EOS, at the end of each sequence
      "... with deep neural networks ." -> (..., with, deep, neural, networks, ., EOS)
      count the probability of this token just like all others.
    """
    counts = defaultdict(Counter)
    # counts[(word1, word2)][word3] = how many times word3 occured after (word1, word2)
    for line in tqdm(lines, desc='Sentences'):
        words = ngrams(line.split(), n,
                            pad_left=True,
                            pad_right=True,
                            left_pad_symbol=PADDING.left_pad_symbol,
                            right_pad_symbol=PADDING.right_pad_symbol)
        for word in words:
            prefix, target = word[:-1], word[-1]
            counts[prefix][target] += 1
    
    return counts

In [408]:
freqs = count_ngrams(['test this planet hurrah test this planet', 'another test this planet', 'test this spoon'], 3)

HBox(children=(FloatProgress(value=0.0, description='Sentences', max=3.0, style=ProgressStyle(description_widt…




In [114]:
freqs[('another', 'test')]['this']

1

Write a text generation function:  
- takes a bigram as input and generates the next token
- iteratively slide the prefix over the generated text so that the new prefix includes the most recent token; generates the next token
- to generate each next token, sample the list of words associated with the prefix using the probability distribution of the prefix.
- stop the text generation when a certain number of words have been generated or the latest token is a `</s>`.

In [115]:
sum(v for _, v in freqs[('test', 'this')].items())

4

In [116]:
bigram = ('test', 'this')
total_counts = sum(count for count in freqs[bigram].values())
total_counts

4

In [135]:
text

['test', 'this', 'spoon']

In [527]:
def create_model(freqs, vocab_size, delta=1.0):
    """Transform the counts to probabilities"""
    for prefix in tqdm(freqs):
        token_counts = freqs[prefix]
        total_count = float(sum(token_counts.values()) + delta * vocab_size)
        for w3 in freqs[prefix]:
            token_counts[w3] = (token_counts[w3] + delta) / total_count
    return freqs

In [None]:
model = create_model(freqs, len(vocab_set))

In [260]:
def generate_text(seed, model, max_length=10):
    """Takes a bigram as input and generates the next token"""
    assert len(seed) < max_length, "Max length must be greater than the length of the seed"
    sentence_finished = False

    while (not sentence_finished) and len(seed) <= max_length:
        probs = list(model[tuple(seed[-2:])].values())
        words = list(model[tuple(seed[-2:])].keys())
        seed.append(np.random.choice(words, p=probs))
        if seed[-2:] == ['</s>', '</s>']:
            sentence_finished = True
    return ' '.join([t for t in seed if t not in PADDING])

In [261]:
text = ["test", "this"]
generate_text(text, model)

'test this planet hurrah test this spoon'

Write a function that can estimate the probability of a sentence and use it to select the most probable sentence out of several candidate sentences
- Split the sentence into trigrams and use the chain rule to calculate the probability of the sentence as a product of the bigrams - tokens probabilities

A sequence of $n$ words:

$$w_1^n = w_1 \cdots w_n$$

Joint probability:

$$P(w_1, w_2, \cdots, w_n)$$

Chain Rule:
$$P(w_1^n) = \prod_{k=1}^n P(w_k \mid w_1^{k-1})$$

Markov approximation for trigrams:
$$P(w_n \mid w_1^{n-1}) \approx P(w_n \mid w_{n-2}^{n-1})$$

In [339]:
def product(array):
    """Numerically stable product"""
    return np.exp(np.sum(np.log(array)))

In [477]:
def sentence_prob(model, sentence):
    words = sentence.split()
    trigrams = ngrams(
        words, 3, 
        left_pad_symbol=PADDING.left_pad_symbol,
        right_pad_symbol=PADDING.right_pad_symbol
    )
    probs = []
    for words in trigrams:
        prefix, target = words[:-1], words[-1]
        probs.append(model[prefix][target])
    # return product(probs)
    return np.prod(probs)

In [482]:
sentence_prob(model, 'test this planet hurrah')

0

In [377]:
model[('test', 'this')]

Counter({'planet': 0.75, 'spoon': 0.25})

In [378]:
sentences = ['test this planet', 'test this planet hurrah']

In [379]:
from functools import partial

sentence_prob_model = partial(sentence_prob, model=model)
max(sentences, key=sentence_prob_model)

'test this planet'

Implement the perplexity scoring function for a given sentence and for the training corpus.

In [473]:
def perplexity(model, sentence, min_logprob=np.log(10 ** -50.)):
    """
    :param min_logprob: if log(P(w | ...)) is smaller than min_logprop, set it equal to min_logrob
    :returns: perplexity of a sentence - scalar
    
    Note: do not forget to compute P(w_first | empty) and P(eos | full_sequence)
    
    """
    N = len(sentence.split())
    value = sentence_prob(model, sentence)**(-1/N)
    return max(value, min_logprob)

In [483]:
perplexity(model, 'test this')

1.0

### StackOverflow Data

In [437]:
freqs = count_ngrams(train.text.tolist(), 3)

HBox(children=(FloatProgress(value=0.0, description='Sentences', max=653552.0, style=ProgressStyle(description…




In [501]:
vocab_set = set(' '.join(train.text.tolist()))

In [528]:
model = create_model(freqs, len(vocab_set))

HBox(children=(FloatProgress(value=0.0, max=3590926.0), HTML(value='')))




In [None]:
text = ["test", "this"]
generate_text(text, model)

Calculate the perplexity of the language model on the test set composed of titles.

In [None]:
tqdm.pandas()

In [530]:
perplexities = test.text.progress_apply(lambda x: perplexity(model, x))

HBox(children=(FloatProgress(value=0.0, max=78096.0), HTML(value='')))




In [531]:
def corpus_perplexity(perplexities, vocab_size):
    return np.prod(perplexities)**(-1/vocab_size)

In [534]:
corpus_perplexity(perplexities, len(vocab_set))

0.0

In [536]:
perplexities

0        222.464352
1        124.677837
2        143.833772
3         50.165601
4               inf
            ...    
78091           inf
78092           inf
78093    217.444018
78094           inf
78095           inf
Name: text, Length: 78096, dtype: float64