In [48]:
from tqdm import tqdm
import nltk
nltk.download('punkt')
nltk.download('gutenberg')
nltk.download('cess_cat')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package cess_cat to /root/nltk_data...
[nltk_data]   Package cess_cat is already up-to-date!


True

Load a corpus in Catalan or English. The nltk corpora result from tokenizing and segmenting into sentences large collections of text.

The ``gutenberg`` corpus comes from a set of English literature classics. The ``cess_cat`` corpus comes from https://www.cs.upc.edu/~nlp/wikicorpus/, the "120 Million Word Spanish Corpus" which has a subset in Catalan of 50 million words scrapped from Vikipedia in 2006.

In [49]:
name_corpus = 'gutenberg'

if name_corpus=='cess_cat':
    from nltk.corpus import cess_cat as corpus
    # clean the corpus of strange words
    words = []
    words_to_remove = ['*0*', '-Fpa-', '-Fpt-']
    for w in tqdm(corpus.words()):
        if w not in words_to_remove:
            words.append(w)

elif name_corpus=='gutenberg':
    from nltk.corpus import gutenberg as corpus
    print(corpus.fileids())
    words = corpus.words()
else:
    assert False

print('corpus {} : {} words, {} sentences'
      .format(name_corpus, len(words), len(corpus.sents())))

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']
corpus gutenberg : 2621613 words, 98552 sentences


Build a language model from bigrams. A LM is just a dictionary
with key = condition = one word, and value = ``FreqDist``
object = another dictionary with key = next word, value = number
of occurrences.
This is adapted from https://www.nltk.org/book/ch02.html, section 2.4


In [None]:
grams = list(nltk.bigrams(words))
# also trigrams, ngrams, everygrams(max_len)
cfd = nltk.ConditionalFreqDist(grams)
print(cfd.conditions())
for i in [100, 200, 300, 400]:
    print(cfd.conditions()[i])
    print(cfd[cfd.conditions()[i]].most_common())
    print('--------------')

if name_corpus == 'cess_cat':
    freq_dist =cfd['Una']
else:
    freq_dist = cfd['The']

print(freq_dist.items())
print(freq_dist.max())
print(list(freq_dist.elements()))

Sample text from the language model

In [51]:
import random

def sample_bigram_model(cfd_bigrams, last_word, num_words=15):

    # TODO
    result = [last_word]
    for i in range(num_words):
      if last_word in cfd_bigrams:
        words = []
        freqs = []

        for word,freq in freq_dist.items():
          words.append(word)
          freqs.append(freq)

        selected_element = random.choices(words, weights=freqs, k=1)[0]
        result.append(selected_element)
        last_word = selected_element

      else:
        break

    return result

if name_corpus=='cess_cat':
    print(sample_bigram_model(cfd, 'El', 100))
    print(sample_bigram_model(cfd, 'La', 100))
    print(sample_bigram_model(cfd, 'Per', 100))
else:
    print(sample_bigram_model(cfd, 'The', 100))
    print(sample_bigram_model(cfd, 'For', 100))

['The', 'righteousness', 'weather', 'LORD', 'LORD', 'word', 'genius', 'burden', 'righteousness', 'strangers', 'woodman', 'little', 'troubadours', 'LORD', 'mere', 'count', 'WHITE', 'lambs', 'insolence', 'LORD', 'life', 'Emperor', 'landlord', 'king', 'room', 'sons', 'Paradise', 'preparing', 'great', 'two', 'felon', 'passage', 'undiminish', 'summer', 'inhabitants', 'woman', 'Master', 'true', 'word', 'Holy', 'car', 'likeness', 'convict', 'profit', 'cook', 'answer', 'first', 'woman', 'farther', 'sun', 'beans', 'Lord', 'Cat', 'morning', 'Master', 'magistrates', 'penalty', 'Pharisee', 'colloquy', 'priest', 'wake', 'grave', 'light', 'continual', 'land', 'whole', 'whole', 'birth', 'waters', 'strength', 'builder', 'little', 'LORD', 'doctor', 'wicked', 'sons', 'difference', 'three', 'effects', 'danger', 'secret', 'happier', 'banded', 'Miss', 'Country', 'Union', 'stranger', 'young', 'person', 'doors', 'first', 'General', 'law', 'nakedness', 'sun', 'same', 'tempting', 'smile', 'Father', 'rest', 'Ki

Extension of previous function to tri, 4... n-grams is long and complicated
because conditions of cfd are not one word but lists of pairs, triplets, n-1 words. In addition, the probability of not finding the previous 2, 3..n
generated words among the conditions (ngrams) is very high. So better rely
on the ``lm`` package of nltk. It has also support for adding ``<s>``, ``</s>`` symbols to sentences (padding), different types of smoothing and backoff, and sampling text.

Build a proper language model with support for ``<s>``, ``</s>``, smoothing, backoff, sampling and computation of perplexity. See how here
https://www.nltk.org/api/nltk.lm.html

In [52]:
if name_corpus=='cess_cat':
    text = []
    words_to_remove = ['*0*', '-Fpa-', '-Fpt-']
    #for s in tqdm(corpus.sents()[:1000]): # debug or quickly train the network
    for s in tqdm(corpus.sents()):
        new_s = [w for w in s if w not in words_to_remove]
        text.append(new_s[:-1]) # except ending point
else:
    text = []
    for s in tqdm(corpus.sents()):
        text.append(s[:-1]) # except ending point

from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm.models import MLE, Laplace, StupidBackoff

n = 3
# TODO:
# for each of the three types of language model in last import:
#     for n=3, 4, 5 (tri-grams, 4-grams, 5-grams)
#         create a model instance
#         pad sentences in text
#         train the model
#         sample a text with 100 words
# Hint: do like in ' '.join(['These', 'are', 'some', 'words'])
#
# Compare results, which combination seems more realistic ?


100%|██████████| 98552/98552 [00:07<00:00, 13778.31it/s]


In [76]:
num_sent = 5
for model_type in [ StupidBackoff,MLE, Laplace]:
  if model_type == MLE:
      print("MLE")
  elif model_type == Laplace:
      print("Laplace")
  else:
      print("Stupidbackoff")
  for n in [3, 4, 5]:
    sentences = []
    if model_type == StupidBackoff:
      lm = model_type(order = n)
    else:
      lm = model_type(n)
    train, vocab = padded_everygram_pipeline(n,text)
    lm.fit(train, vocab)

    for i in range(num_sent):
      content = []
      if i == 0:
        text_seed = ['<s>']
        generated_words = lm.generate(100,text_seed = text_seed,random_seed = 4)
        for word in generated_words:
          if word == '<s>':
            continue
          if word == '</s>':
            break
          content.append(word)
      else:
        text_seed = sentences[-1][-2:]
        generated_words = lm.generate(100,text_seed = text_seed,random_seed = 4)
        for word in generated_words:
          if word == '<s>':
            continue
          if word == '</s>':
            break
          content.append(word)

      sentences.append(content)

    print(f"For n = {n}:")
    result = ''
    for sentence in sentences:
      result +=' '.join(sentence)
      result +='. '
    print(result)
    print('-------------')








Stupidbackoff
For n = 3:
" I am God , which thou sawest ; Where he abides , Transfused on thee do some fishing himself. . 17 : 12 All the unaccomplished works of engineers , Our Willes and Fates do so quickly towards the door of the strange tower , perhaps. , a knop and a half was the son of Kareah , and a distant voice was now set , and see the suds he makes ready for rest , as Perry says that colds have been murdered with the look of real good - humoured notice of Cutler was a man indeed. . 
-------------
For n = 4:
17 : 12 And he took them the same fiery emotion accumulated within the Leyden jar of his own cottage with resolute steps , however , that his brother should take his wife , and thy redeemer , the Holy One of Israel , and the Inspector into another , while Mrs . Weston , how could you do so. : But I do assure you that you were born and did not I. a Benjamite , a man lefthanded : and by his power , and by her shewn to her mother , and she trembled till she shook the house ,