# Assignment 1

Using the text at http://www.gutenberg.org/files/2600/2600-0.txt:
1. Make the text lowercase and remove all punctuation except spaces and dots.
2. Tokenize the text with BPE (byte-pair encoding), using vocab_size = 100.
3. Train a 3-gram language model with Laplace smoothing, $\delta=1$.
4. Using beam search with k=10, generate sequences of length 10 conditioned on the provided inputs. Treat dots as terminal tokens.
5. Calculate the perplexity of the language model on the first sentence.

In [1]:
# Read the whole corpus into memory; the `with` block guarantees the file
# handle is closed instead of being left for the garbage collector.
with open('peace.txt', 'r') as corpus_file:
    # The first two characters of the file are discarded
    # (presumably a BOM / leading marker -- TODO confirm against the download).
    text = corpus_file.read()[2:]
len(text)

3227579

In [2]:
# def preprocess_text(text):
#     # make lowercase
#     # replace all punctuation with spaces
#     # collapse sequential spaces into one space '   ' -> ' '
#     return text

import string
import re

# Every character that is turned into a space.  Dots are deliberately absent so
# that they survive as sentence terminators; digits and letters are untouched.
_PUNCTUATION = "@<>!?\"#%'&()*+,-:;=[]^_`{|}~“”—\\/$’‘"
_PUNCT_TO_SPACE = str.maketrans(_PUNCTUATION, " " * len(_PUNCTUATION))


def preprocess_text(text):
    """Lowercase *text*, blank out punctuation and collapse whitespace.

    Args:
        text: raw input string.

    Returns:
        The lowercased string in which every character of ``_PUNCTUATION`` has
        been replaced by a space and every run of whitespace (spaces, tabs,
        newlines) has been collapsed into a single space.  Dots are kept.
        Leading/trailing whitespace is collapsed but not stripped.
    """
    # One translate() pass replaces all punctuation at once instead of
    # scanning the whole string once per punctuation character.
    text = text.lower().translate(_PUNCT_TO_SPACE)
    return re.sub(r"\s+", " ", text)

# Replace the raw corpus with its normalised form.
text = preprocess_text(text)
# Sanity check: the cleaned corpus is expected to have exactly this many characters.
assert len(text) == 3141169

In [3]:
# Cut the corpus into sentences at every dot and trim the whitespace
# surrounding each piece; `text` becomes a list of sentence strings.
text = [sentence.strip() for sentence in text.split('.')]

In [4]:
from collections import Counter
import nltk
from sklearn.base import TransformerMixin


class BPE(TransformerMixin):
    """Byte-pair-encoding tokenizer trained on a list of sentences.

    The base vocabulary is the set of distinct characters seen during ``fit``.
    The most frequent adjacent pair of token ids is then repeatedly merged into
    a new token until the vocabulary holds ``vocab_size`` entries (or no pair
    is left to merge).

    Args:
        vocab_size: target number of tokens (base characters + merges).
    """

    def __init__(self, vocab_size=100):
        super().__init__()
        self.vocab_size = vocab_size
        # index to token: a single character for base tokens, or a tuple
        # (left_id, right_id) for a learned merge
        self.itos = []
        # token to index (inverse of itos)
        self.stoi = {}

    @staticmethod
    def _merge_pair(snt, pair, new_id):
        """Return a copy of *snt* with each occurrence of *pair* replaced by *new_id*.

        The scan goes left to right and occurrences do not overlap: after a
        match both members of the pair are consumed.
        """
        merged = []
        i = 0
        while i < len(snt) - 1:
            if (snt[i], snt[i + 1]) == pair:
                merged.append(new_id)
                i += 2
            else:
                merged.append(snt[i])
                i += 1
        # The last id is still pending unless it was consumed by a merge.
        if i < len(snt):
            merged.append(snt[i])
        return merged

    def fit(self, text):
        """Learn the base vocabulary and the merge rules from *text*.

        Args:
            text: list of sentences (strings).

        Returns:
            ``self``, so that calls can be chained.
        """
        # Base vocabulary: distinct characters, numbered by first appearance.
        alphabet = {}
        for snt in text:
            alphabet.update(dict.fromkeys(snt))
        self.itos = list(alphabet)
        self.stoi = {tok: idx for idx, tok in enumerate(self.itos)}
        encoded = [[self.stoi[ch] for ch in snt] for snt in text]

        while len(self.itos) < self.vocab_size:
            pair_counts = Counter()
            for snt in encoded:
                # adjacent id pairs of the sentence
                pair_counts.update(zip(snt, snt[1:]))
            if not pair_counts:
                # Every sentence has shrunk to at most one token: nothing
                # left to merge, so stop early with a smaller vocabulary.
                break

            new_token = pair_counts.most_common(1)[0][0]
            new_id = len(self.itos)
            self.itos.append(new_token)
            self.stoi[new_token] = new_id

            encoded = [self._merge_pair(snt, new_token, new_id) for snt in encoded]

        return self

    def transform(self, text):
        """Convert sentences into sequences of token ids.

        Args:
            text: list of sentences (strings).

        Returns:
            A list with one list of token ids per sentence.

        Raises:
            KeyError: if a sentence contains a character not seen by ``fit``.
        """
        encoded = [[self.stoi[ch] for ch in snt] for snt in text]
        # Replay the merges in the order they were learned.
        for tok_id, tok in enumerate(self.itos):
            if not isinstance(tok, tuple):
                # Base characters are not merge rules; they can never equal a
                # pair of ids, so scanning the text for them is pointless.
                continue
            encoded = [self._merge_pair(snt, tok, tok_id) for snt in encoded]

        return encoded

    def decode_token(self, tok):
        """Return the string spelled by token id *tok*, expanding merges recursively."""
        entry = self.itos[tok]
        if isinstance(entry, tuple):
            return ''.join(map(self.decode_token, entry))
        return entry

    def decode(self, text):
        """
        convert token ids into text

        Args:
            text: a sequence of token ids (one sentence).

        Returns:
            The decoded string.
        """
        return ''.join(map(self.decode_token, text))
        
        
# Target number of BPE tokens.
vocab_size = 100
bpe = BPE(vocab_size)
# Fit the tokenizer on the sentence list and encode every sentence as token ids.
tokenized_text = bpe.fit_transform(text)

In [5]:
# Round-trip check: decoding the first tokenized sentence must give back the original sentence.
assert bpe.decode(tokenized_text[0]) == text[0]

In [6]:
import scipy.sparse as sp
import numpy as np
        
    
# Two extra ids, placed right after the BPE ids, mark sentence boundaries:
# start_token pads the beginning of a sentence, end_token closes it.
start_token = vocab_size
end_token = vocab_size + 1
        
class LM:
    """Trigram language model with additive (Laplace) smoothing.

    Args:
        vocab_size: number of regular tokens.  Two more ids are reserved right
            after them: ``vocab_size`` for sentence start and
            ``vocab_size + 1`` for sentence end (the same values as the
            module-level ``start_token`` / ``end_token``).
        delta: smoothing constant added to every trigram count.
    """

    def __init__(self, vocab_size, delta=1):
        self.delta = delta
        # Boundary ids are derived from the model's own vocabulary size so
        # that fit() always indexes inside self.proba.
        self.start_token = vocab_size
        self.end_token = vocab_size + 1
        self.vocab_size = vocab_size + 2
        # NOTE: despite its name this array stores raw trigram counts,
        # indexed as [first, second, third]; smoothing and normalisation
        # happen on lookup.
        self.proba = np.zeros((self.vocab_size, self.vocab_size, self.vocab_size))

    def _weights(self, a, b, tau):
        """Smoothed, temperature-scaled (unnormalised) weights of every token after (a, b)."""
        return (self.proba[a, b, :] + self.delta) ** (1 / tau)

    def infer(self, a, b, tau=1):
        """Return the distribution over the next token given the context (a, b).

        Args:
            a, b: ids of the two preceding tokens.
            tau: temperature; the smoothed counts are raised to ``1 / tau``
                before normalising, so values below 1 sharpen the
                distribution and values above 1 flatten it.

        Returns:
            A 1-D array of length ``self.vocab_size`` that sums to 1.
        """
        # The normaliser is shared by all candidates: compute it once
        # instead of once per candidate token.
        weights = self._weights(a, b, tau)
        return weights / weights.sum()

    def get_proba(self, a, b, c, tau=1):
        """
        get probability of 3-gram (a,b,c)

        i.e. the smoothed probability of token ``c`` following ``(a, b)``,
        with the same temperature handling as ``infer``.
        """
        weights = self._weights(a, b, tau)
        return weights[c] / weights.sum()

    def fit(self, text):
        """
        train language model on text

        Args:
            text: iterable of sentences, each a sequence of token ids.

        Returns:
            ``self``.  Counts are added to the existing ones, so calling
            ``fit`` again accumulates.
        """
        counter = Counter()
        for snt in text:
            # Two start markers let the first real token have a full context;
            # the end marker lets the model learn where sentences stop.
            padded = [self.start_token, self.start_token, *snt, self.end_token]
            counter.update(zip(padded, padded[1:], padded[2:]))
        for (a, b, c), n in counter.items():
            self.proba[a, b, c] += n

        return self
    
# Build the trigram model with delta=1 and count trigrams over the tokenized sentences.
lm = LM(vocab_size, 1).fit(tokenized_text)

In [7]:
def beam_search(input_seq, lm, max_len=10, k=5, tau=1):
    """
    generate sequence from language model *lm* conditioned on input_seq

    Args:
        input_seq: list of token ids used as the prompt.  It may hold fewer
            than two tokens (or none); the missing context is filled with
            ``start_token``.
        lm: model exposing ``infer(a, b, tau=...)`` that returns the
            next-token distribution for the context ``(a, b)``.
        max_len: maximum number of tokens appended to the prompt.
        k: beam width; also the number of continuations tried per hypothesis.
        tau: temperature forwarded to ``lm.infer``.

    Returns:
        A list of at most ``k`` pairs ``(token_ids, log_probability)`` sorted
        from most to least probable.  A hypothesis ending in ``end_token`` is
        finished and is carried over unchanged.
    """
    beam = [(input_seq, 0)]
    for _ in range(max_len):
        candidates = []
        candidates_proba = []
        for snt, snt_proba in beam:
            if len(snt) > 0 and snt[-1] == end_token:
                # Finished hypothesis: keep it in the pool as is.
                candidates.append(snt)
                candidates_proba.append(snt_proba)
                continue

            # Left-pad with start markers so that prompts shorter than two
            # tokens still provide a full trigram context.
            context = ([start_token, start_token] + list(snt[-2:]))[-2:]
            proba = lm.infer(context[0], context[1], tau=tau)

            for tok in np.argsort(-proba)[:k]:
                # Store plain ints so hypotheses do not mix int and np.int64.
                candidates.append(snt + [int(tok)])
                candidates_proba.append(snt_proba + np.log(proba[tok]))

        # Keep the k best hypotheses overall.
        best_candidates = np.argsort(-np.array(candidates_proba))[:k]
        beam = [(candidates[j], candidates_proba[j]) for j in best_candidates]
    return beam
    

In [8]:
# Prompt text; it is BPE-encoded into token ids before the search.
input1 = 'horse '
input1 = bpe.transform([input1])[0]
# Beam of 10 hypotheses, at most 10 generated tokens, temperature tau=0.1.
result = beam_search(input1, lm, max_len=10, k=10, tau=0.1)
for snt, proba in result:
    # Drop the end-of-sentence marker before decoding.
    if snt[-1] == end_token:
        snt = snt[:-1]
    # Decoded hypothesis followed by its cumulative log-probability.
    print(bpe.decode(snt) + '|', round(proba, 4))
    

horse with a smill| -0.3796
horse was not been s| -3.0868
horse was sold not b| -3.8467
horse when said no| -3.9335
horse the cound him and | -4.3461
horse the countess mar| -4.3873
horse who had been s| -4.4334
horse with his heas| -4.764
horse with his fack| -4.8562
horse the counderstand w| -4.8941


In [9]:
# Prompt text; it is BPE-encoded into token ids before the search.
input1 = 'her'
input1 = bpe.transform([input1])[0]
# Beam of 10 hypotheses, at most 10 generated tokens, temperature tau=0.1.
result = beam_search(input1, lm, max_len=10, k=10, tau=0.1)
for snt, proba in result:
    # Drop the end-of-sentence marker before decoding.
    if snt[-1] == end_token:
        snt = snt[:-1]
    # Decoded hypothesis followed by its cumulative log-probability.
    print(bpe.decode(snt) + '|', round(proba, 4))

here with a smil| -0.3141
here was not been | -2.9911
here was sold not | -3.5954
here when said n| -3.8679
here with his fac| -4.1906
here who had been | -4.3377
here was she was not | -4.4389
here was not seem| -4.6451
here with his hea| -4.6961
here with his hear| -4.8287


In [10]:
# Prompt text; it is BPE-encoded into token ids before the search.
input1 = 'what'
input1 = bpe.transform([input1])[0]
# Beam of 10 hypotheses, at most 10 generated tokens; tau=1 leaves the
# model's smoothed distribution unscaled.
result = beam_search(input1, lm, max_len=10, k=10, tau=1)
for snt, proba in result:
    # Drop the end-of-sentence marker before decoding.
    if snt[-1] == end_token:
        snt = snt[:-1]
    # Decoded hypothesis followed by its cumulative log-probability.
    print(bpe.decode(snt) + '|', round(proba, 4))

what| -3.7117
whated| -4.4793
whated to him| -8.4815
what theight| -9.7693
whated to himself| -10.9836
whated to himself s| -11.9375
whated to himself and | -12.0406
whated to himself i| -12.0635
what they with him | -12.1464
whated to himself he | -12.1607


In [11]:
# Prompt text; it is BPE-encoded into token ids before the search.
input1 = 'gun '
input1 = bpe.transform([input1])[0]
# Beam of 10 hypotheses, at most 10 generated tokens, temperature tau=0.1.
result = beam_search(input1, lm, max_len=10, k=10, tau=0.1)
for snt, proba in result:
    # Drop the end-of-sentence marker before decoding.
    if snt[-1] == end_token:
        snt = snt[:-1]
    # Decoded hypothesis followed by its cumulative log-probability.
    print(bpe.decode(snt) + '|', round(proba, 4))

gun and with a smil| -1.4231
gun been said no| -1.8174
gun and said not be| -2.1749
gun and so mussion | -2.5174
gun but was not be| -3.2621
gun and so must been| -3.4557
gun said not been| -3.6806
gun and said not se| -3.7806
gun but was sold n| -3.9078
gun and so musside | -4.1063


In [12]:
def perplexity(snt, lm):
    """Return the perplexity that *lm* assigns to the tokenized sentence *snt*.

    The sentence is framed with two ``start_token`` ids and one ``end_token``
    id, every trigram of the framed sequence is scored with
    ``lm.get_proba(..., tau=1)``, and the result is the exponential of the
    negated mean log-probability.

    Args:
        snt: list of token ids.
        lm: model exposing ``get_proba(a, b, c, tau=...)``.
    """
    padded = [start_token, start_token] + snt + [end_token]
    log_probs = []
    # Slide a window of three ids over the framed sentence.
    for first, second, third in zip(padded, padded[1:], padded[2:]):
        log_probs.append(np.log(lm.get_proba(first, second, third, tau=1)))
    mean_log_prob = np.mean(log_probs)
    return np.exp(-mean_log_prob)

# Perplexity of the trained model on the first tokenized sentence.
perplexity(tokenized_text[0], lm)

13.366222407779398