In [None]:
! pip install wget
import wget

Collecting wget
  Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-cp37-none-any.whl size=9681 sha256=e7f68b6a3ae56becdb04c1eb65e5be8196b33f10cc62d9964c2bde393950a599
  Stored in directory: /root/.cache/pip/wheels/40/15/30/7d8f7cea2902b4db79e3fea550d7d7b85ecb27ef992b618f3f
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [None]:
import os

url = 'https://raw.githubusercontent.com/dirkhovy/NLPclass/master/data/moby_dick.txt'
corpus_path = 'moby_dick.txt'

# Only download when the file is missing: wget.download() does not overwrite an
# existing file but stores a numbered copy next to it, so re-running this cell
# would otherwise pile up duplicates.
if not os.path.exists(corpus_path):
    wget.download(url, corpus_path)
corpus_path

'moby_dick.txt'

# Language Models

Let's start with a simple, Laplace-smoothed trigram model:

In [None]:
from collections import defaultdict
import numpy as np
import nltk

smoothing = 0.001
START = '_***_'
STOP = '_STOP_'

# counts[(u, v)][w] is the smoothed count of the trigram (u, v, w);
# an entry starts at `smoothing` the first time it is touched
counts = defaultdict(lambda: defaultdict(lambda: smoothing))

# fit data on corpus: every line is treated as one sentence, tokenized on whitespace.
# The context manager closes the file handle as soon as the corpus is read, and the
# explicit encoding keeps the result independent of the platform default.
with open('moby_dick.txt', encoding='utf8') as corpus_file:
    corpus = [line.strip().split() for line in corpus_file]

# collect counts for MLE
for sentence in corpus:
    # include special tokens for start and the end of sentence
    tokens = [START, START] + sentence + [STOP]
    for u, v, w in nltk.ngrams(tokens, 3):
        counts[(u, v)][w] += 1

def logP(u, v, w):
    """
    compute the log probability of a trigram
    (u,v,w) => P(w|u,v) = c(u,v,w) / SUM(c(u,v,*))

    An unseen trigram is scored with the `smoothing` pseudo-count, which is
    also added to the total of its history. The lookup is read-only: scoring
    never inserts entries into `counts`, so evaluating sentences cannot alter
    the model used for later scoring or for generation.

    u, v: str, the two history tokens
    w: str, the token being predicted
    returns: float, natural-log probability
    """
    # .get() bypasses the defaultdict factories, which would otherwise store
    # the unseen history/word as a side effect of merely looking it up
    history_counts = counts.get((u, v), {})
    total = sum(history_counts.values())

    if w in history_counts:
        numerator = history_counts[w]
    else:
        # the unseen continuation contributes its pseudo-count to the total
        numerator = smoothing
        total = total + smoothing

    return np.log(numerator) - np.log(total)

def sentence_logP(S):
    """
    log likelihood of a whole sentence: by the chain rule this is the sum of
    the log probabilities of its trigrams, after start/stop padding
    S: list(str)
    """
    padded = [START] * 2 + S + [STOP]
    total = 0
    for first, second, word in nltk.ngrams(padded, 3):
        total += logP(first, second, word)
    return total

In [None]:
sum(counts[('because','they')].values())

4.004

We can now score arbitrary sentences:

In [None]:
sentence_logP('Captain Ahab is a man .'.split())

-27.92672048112014

In [None]:
sentence_logP('Captain Ahab is a woman .'.split())

-32.49967973437645

In [None]:
counts[('you','are')]

In [None]:
sum(counts[('you','are')].values())

## Activity
Implement the perplexity measure for a given corpus, and try it with two LMs that use different smoothing parameters.

$$\text{perplexity} = 2^{-\sum_{x \in X} p(x) \log_2 p(x)}$$

In [None]:
def get_perplexity(corpus):
    """
    perplexity of the model over the sentences of a corpus

    perplexity = 2^entropy(X)        (entropy measured in bits)
               = e^entropy(X)        (entropy measured in nats)
    entropy = -sum(p(x) * log(p(x)))

    corpus: list(list(str)), tokenized sentences
    returns: float
    """
    # accumulate sum(p(x) * ln p(x)); sentence_logP works with natural logs
    neg_entropy_nats = 0.0
    for sentence in corpus:
        sentence_log_prob = sentence_logP(sentence)
        neg_entropy_nats += np.exp(sentence_log_prob) * sentence_log_prob

    # the base of the exponent has to match the base of the logarithm:
    # 2 ** (entropy in bits) == e ** (entropy in nats). Raising 2 to an
    # entropy expressed in nats would understate the perplexity.
    return np.exp(-neg_entropy_nats)

print(get_perplexity(corpus))

4.118431256864183


## Generation

We can re-use the counts to generate language:

In [None]:
def generate():
    """
    sample a sentence from the model, one word at a time, until STOP is drawn
    returns: str, the sampled words joined by single spaces
    """
    # the history starts out as the start-of-sentence padding
    words = [START, START]
    while True:
        drawn = sample_next_word(words[-2], words[-1])
        words.append(drawn)
        if drawn == STOP:
            break

    # drop the two START symbols and the trailing STOP
    return ' '.join(words[2:-1])




def sample_next_word(u, v):
    """
    sample a word w based on the history (u, v)

    u, v: str, the two preceding tokens
    returns: str, a continuation drawn with probability proportional to its count
    raises: ValueError if the history (u, v) has no recorded continuations
    """
    # .get() avoids the defaultdict factory, so asking about an unseen history
    # does not add an empty entry to the model
    continuations = counts.get((u, v))
    if not continuations:
        raise ValueError(
            'no continuations observed for history ({!r}, {!r})'.format(u, v))

    # separate word and their counts into separate variables
    keys, values = zip(*continuations.items())

    # normalize the counts into a probability distribution
    values = np.array(values, dtype=float)
    values /= values.sum()

    # one multinomial trial yields a one-hot vector; its hot position is the pick
    sample = np.random.multinomial(1, values)

    return keys[np.argmax(sample)]

In [None]:
keys, values = zip(*counts[('you','are')].items())
values = np.array(values)
values /= values.sum()
values

array([0.10339363, 0.03448751, 0.03448751, 0.03448751, 0.03448751,
       0.03448751, 0.03448751, 0.03448751, 0.03448751, 0.03448751,
       0.03448751, 0.03448751, 0.10339363, 0.03448751, 0.03448751,
       0.03448751, 0.03448751, 0.03448751, 0.03448751, 0.03448751,
       0.03448751, 0.03448751, 0.03448751, 0.03448751, 0.03448751])

In [None]:
sample = np.random.multinomial(1, values)
sample

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0])

In [None]:
keys[np.argmax(sample)]

'skylarking'

In [None]:
# draw 50 continuations of "you are"; sample_next_word() performs the same
# normalize-and-sample steps, so they are not rebuilt on every iteration
for _ in range(50):
    print(sample_next_word('you', 'are'))

quick
only
just
quick
in
struck
determined
all
.
but
pitched
mistaken
experienced
.
pitched
telling
in
in
all
.
struck
just
but
experienced
experienced
now
that
eating
determined
experienced
in
now
telling
an
determined
in
in
dead
all
dead
quick
all
in
just
that
struck
.
but
,
all


In [None]:
sample_next_word('as', 'a'), counts[('as', 'a')]

('general',
 defaultdict(<function __main__.<lambda>.<locals>.<lambda>>,
             {'Commodore': 1.001,
              'Dish': 1.001,
              'Latin': 1.001,
              'Roman': 1.001,
              'backwoodsman': 1.001,
              'bat': 1.001,
              'birch': 1.001,
              'body': 2.001,
              'candidate': 1.001,
              'cat': 1.001,
              'civilized': 1.001,
              'clam': 1.001,
              'clock': 1.001,
              'coffin': 1.001,
              'conceited': 1.001,
              'cook': 1.001,
              'corpse': 1.001,
              'country': 1.001,
              'cricket': 1.001,
              'crucible': 1.001,
              'dead': 1.001,
              'dinnerless': 1.001,
              'dragon': 1.001,
              'drawing': 1.001,
              'dromedary': 1.001,
              'fin': 1.001,
              'flavorish': 1.001,
              'fly': 1.001,
              'foreshadowing': 1.001,
              

We can now generate nonsensical sentences:

In [None]:
print(generate())

" Yes , Captain Sleet , entitled " The ship !


## Exercise

Modify `generate` to take any number of initial words.

In [None]:
def generate_any(initial):
    """
    continue a sentence from a given beginning
    initial: str, whitespace-separated words that open the sentence (may be empty)
    returns: str, the initial words followed by the sampled continuation
    """
    result = [START, START, *initial.split()]

    # keep drawing from the last two tokens; iter() ends as soon as STOP comes
    # up, so STOP itself is never stored
    for word in iter(lambda: sample_next_word(result[-2], result[-1]), STOP):
        result.append(word)

    # strip the START padding
    return ' '.join(result[2:])

print(generate_any('The whale was'))

The whale was almost intolerable , it thenceforth becomes a sort of a recently concluded repast , turned , and muttered : " A Whaling Voyage to Spitzbergen in the bows of the basement of his ivory limb having been inflicted ; now for the great White Whale , that gorge is in request among jewellers and watchmakers .


In [None]:
print(generate_any('I want'))

I want John .'


In [None]:
print(generate_any('I will do that'))

I will do that last office for the blubber .


In [None]:
print(generate_any('I'))

I ' ll go lunging presently .


In [None]:
print(generate_any(''))

But all in all what mood you are that will drive us on .


## Exercise

Extend the code above to arbitrary $n$-gram sizes. Use another corpus to try it with $n=4$.

It might be helpful to use a `class` for the LM, make the smoothing a parameter, `counts` a class property, and add a function `fit()`.

In [None]:
# Your code here


In [None]:
import wget
url = 'https://raw.githubusercontent.com/dirkhovy/NLPclass/master/data/tweets_en.txt'
wget.download(url, 'tweets_en.txt')
# read inside a context manager so the file handle is closed once the
# tweets are loaded
with open('tweets_en.txt', encoding='utf8') as tweets_file:
    tweets = [line.strip() for line in tweets_file]

# LM is expected to be the class implemented in the exercise cell above
lm = LM(smoothing=0.001, n_grams_size=4)
lm.fit(document=tweets)
print(np.unique([lm.generate(["Trump","should","think","about"]) 
 for _ in range(10)]))