#### We will address some issues with our vanilla bigram model implementation. One is out-of-vocabulary tokens and the other is bigrams that are never observed in the training data. We will add a special `<UNK>` token to our vocabulary to handle out-of-vocabulary words. For the zero bigram count problem, we will explore some smoothing techniques.

In [1]:
import re
import random
from collections import defaultdict, Counter
import numpy as np

#### The first type of smoothing we explore is `add-k smoothing` for which the bi-gram probability estimate is modified as follows:

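#### $P(w_k|w_{k-1}) = \frac{C(w_{k-1}, w_k) + k}{C(w_{k-1}) + k|V|}$ where $k$ is a positive constant and $C(w_{k-1}, w_k)$ is the number of times $w_k$ immediately follows $w_{k-1}$ in the training data.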

This has the effect of redistributing the probability mass so that bigrams with zero count now have a non-zero probability. Also note that the $k|V|$ term in the denominator can substantially decrease the probabilities that were already non-zero before smoothing, depending on how big $k$ is.


In [18]:
class bigram_LM_addk():
    """Bigram language model with add-k smoothing and an `<UNK>` token.

    Words seen fewer than `count_threshold` times in the training data are
    replaced by `<UNK>` before counting. The smoothed estimate is

        P(w2 | w1) = (C(w1, w2) + k) / (C(w1) + k * |V|)

    Attributes set by `train`:
        unigram_counts: Counter of token counts after `<UNK>` replacement.
        bigram_counts:  Counter keyed by (word1, word2) tuples.
        vocab:          list of tokens, always containing `<UNK>`.
        word2idx:       token -> position in `vocab`.
        num_sentences:  number of training sentences.
    """

    def __init__(self, count_threshold=2, k=0.1):
        """Store hyper-parameters; counts stay None until `train` is called.

        Args:
            count_threshold: minimum raw count for a word to stay in the vocab.
            k: additive smoothing constant.
        """
        self.count_threshold = count_threshold
        self.k = k
        self.bigram_counts = None
        self.unigram_counts = None
        self.vocab = None
        self.word2idx = None
        self.num_sentences = None
        self.unk_token = '<UNK>'

    def train(self, sentences):
        """Collect counts from `sentences` (an iterable of token lists)."""
        self.num_sentences = len(sentences)
        _, self.unigram_counts, self.bigram_counts = self.get_counts(sentences)
        # Vocabulary order follows first occurrence in the training data.
        self.vocab = list(self.unigram_counts.keys())
        # If no training word fell below the threshold, <UNK> was never
        # counted; it must still be in the vocabulary so that unseen words at
        # test time map to a token the model knows about.
        if self.unk_token not in self.unigram_counts:
            self.vocab.append(self.unk_token)
        self.word2idx = {word: i for i, word in enumerate(self.vocab)}
        print("Training complete!")

    def get_counts(self, sentences):
        """Build the vocab and the unigram/bigram counts.

        Returns:
            (vocab, unigram_counts, bigram_counts) where `vocab` is
            `[<UNK>] + sorted(kept words)` and both counts are computed on
            the `<UNK>`-replaced sentences.
        """
        # collect unigram counts
        print("Collecting unigram counts...")
        raw_counts = Counter(word for s in sentences for word in s)

        # keep only the words whose count reaches the threshold
        print("Constructing vocab...")
        kept = {w for w, c in raw_counts.items() if c >= self.count_threshold}
        kept.discard(self.unk_token)  # avoid listing <UNK> twice
        vocab = [self.unk_token] + sorted(kept)

        # replace all oov tokens in training sentences with <UNK>
        print("Replacing with oov tokens in training data...")
        # set membership is O(1); testing against the list was O(|V|) per token
        known = set(vocab)
        sentences_unk = [
            [word if word in known else self.unk_token for word in s]
            for s in sentences
        ]

        # re-collect unigram counts
        print("Re-collecting unigram counts...")
        unigram_counts = Counter()
        for s in sentences_unk:
            unigram_counts.update(s)
        print(f"Num unigrams: {len(unigram_counts)}")

        # collect bigram counts
        print("Collecting bigram counts...")
        bigram_counts = Counter()
        for s in sentences_unk:
            bigram_counts.update(zip(s[:-1], s[1:]))
        print(f"Num bigrams: {len(bigram_counts)}")

        return vocab, unigram_counts, bigram_counts

    def compute_probs(self, word1):
        """Return [P(w | word1) for w in self.vocab], in vocab order."""
        return [self.bg_prob(word1, word2) for word2 in self.vocab]

    def bg_prob(self, word1, word2):
        """Return the add-k smoothed probability P(word2 | word1).

        Missing keys count as 0 (Counter semantics), so unseen bigrams and
        unseen contexts still yield a positive probability when k > 0.
        """
        p = (self.bigram_counts[(word1, word2)] + self.k) / (self.unigram_counts[word1] + self.k*len(self.vocab))
        return p

In [3]:
# prep the training data: read the raw corpus, one line per sentence
with open('shakespeare.txt', 'r') as file:
    lines = file.readlines()

# remove all punctuations (except for apostrophe) and escape characters from the lines, lowercase all characters
sentences_clean = []
for line in lines:
    cleaned = re.sub(r"[^\w\s']", '', line).strip().lower()
    if cleaned:
        sentences_clean.append(cleaned)

# tokenize the sentences (split on whitespaces) and add start and end sentence tokens
start_token = '<s>'
end_token = '</s>'
sentences_tokenized = [[start_token] + s.split() + [end_token] for s in sentences_clean]
print(f"Num sentences: {len(sentences_tokenized)}")

# now we split the data into train and test sentences (10% held out, sampled without replacement)
num_sent = len(sentences_tokenized)
num_test = int(0.1 * num_sent)
test_idx = random.sample(range(num_sent), num_test)
# membership is tested once per sentence; a set makes each test O(1)
# instead of a scan over the whole index list
test_idx_set = set(test_idx)

sentences_train = []
sentences_test = []
for i, sent in enumerate(sentences_tokenized):
    if i in test_idx_set:
        sentences_test.append(sent)
    else:
        sentences_train.append(sent)

print(f"Number of training sentences: {len(sentences_train)}")
print(f"Number of test sentences: {len(sentences_test)}")


Num sentences: 32777
Number of training sentences: 29500
Number of test sentences: 3277


In [12]:
# Build an add-k smoothed bigram model (k=0.01, default count threshold)
# and collect its counts from the training sentences.
model = bigram_LM_addk(k=0.01)
model.train(sentences_train)

Collecting unigram counts...
Constructing vocab...
Replacing with oov tokens in training data...
Re-collecting unigram counts...
Num unigrams: 6474
Collecting bigram counts...
Num bigrams: 77269
Training complete!


In [5]:
def generate_text(model, n=10, start_token='<s>', end_token='</s>'):
    """Sample up to `n` sentences from a bigram model.

    Each sentence starts from `start_token` and is extended one word at a
    time by sampling from `model.compute_probs(current_word)` over
    `model.vocab`, until `end_token` is drawn. A drawn `start_token` is
    discarded and the draw is repeated from the same context.

    Args:
        model: object exposing `vocab` (list of tokens) and
            `compute_probs(word)` (weights aligned with `vocab`).
        n: number of sentences to attempt.
        start_token: token used as the initial context.
        end_token: token that terminates a sentence.

    Returns:
        The non-empty sentences joined by newlines (possibly fewer than `n`
        lines, since sentences that end immediately are dropped).

    Note:
        Sampling only stops when `end_token` is drawn, so the model must
        give it non-zero probability from the contexts that are reached.
    """
    sentences = []
    for _ in range(n):
        current_word = start_token
        words = []
        while True:
            # get probabilities of next word given current context, i.e P(w|w_current)
            probs = model.compute_probs(current_word)
            # now sample from the vocabulary according to this distribution
            next_word = random.choices(model.vocab, weights=probs, k=1)[0]
            if next_word == end_token:
                break
            if next_word == start_token:
                # never emit a start token mid-sentence; resample instead
                continue
            words.append(next_word)
            current_word = next_word
        if words:
            sentences.append(" ".join(words))

    return "\n".join(sentences)

In [19]:
# Sample up to 10 sentences from the current model and print them one per line.
text = generate_text(model, n=10)
print(text)

famous by the head
himself
first citizen
alas alas why that's somewhat doth he make me upon your eyes become a <UNK> 'tis bent of <UNK>
bore my lord
cunning and in this lord he is nothing is he throws not be too
he dies for he turns deadly venom
as lovers can not from heaven my joys with all will tell me
imprison'd in in a sudden mean
second keeper


#### Note that increasing the smoothing factor k will result in longer sentences being generated. This is because for larger k, the probability of the `</s>` token becomes smaller.

In [16]:
def compute_perplexity(model, test_sentences):
    """Return the bigram perplexity of `model` on `test_sentences`.

    Perplexity is exp of the negative mean log-probability over all
    adjacent token pairs. Tokens not in `model.vocab` are mapped to
    `model.unk_token` before scoring.

    Args:
        model: object exposing `vocab`, `unk_token` and `bg_prob(w1, w2)`.
        test_sentences: iterable of token sequences.

    Returns:
        The perplexity as a float.

    Raises:
        ValueError: if `test_sentences` contains no bigram at all.
    """
    # Build the lookup once: membership in a set is O(1), whereas testing
    # against the vocab list costs O(|V|) for every token.
    known = set(model.vocab)
    unk = model.unk_token

    sum_log_probs = 0.0
    n = 0
    for s in test_sentences:
        for w1, w2 in zip(s[:-1], s[1:]):
            # replace any oov token with <UNK>
            if w1 not in known:
                w1 = unk
            if w2 not in known:
                w2 = unk
            sum_log_probs += np.log(model.bg_prob(w1, w2))
            n += 1

    if n == 0:
        raise ValueError("cannot compute perplexity: test_sentences contains no bigrams")

    sum_log_probs *= (-1/n)
    perplexity = np.exp(sum_log_probs)
    return perplexity

In [17]:
# Sweep the add-k constant and report train/test perplexity for each value.
# bg_prob reads model.k at call time, so the trained counts are reused as-is.
kvals = [1.0, 0.1, 0.01, 0.001, 0.0001, 0.00001]
for k in kvals:
    model.k = k
    # training split is scored first, then the test split
    pp_train, pp_test = (
        compute_perplexity(model, split)
        for split in (sentences_train, sentences_test)
    )

    print(f"\nk = {k}")
    print(f"Perplexity computed on training set: {pp_train:.3f}")
    print(f"Perplexity computed on test set: {pp_test:.3f}")



k = 1.0
Perplexity computed on training set: 734.551
Perplexity computed on test set: 849.244

k = 0.1
Perplexity computed on training set: 211.417
Perplexity computed on test set: 384.699

k = 0.01
Perplexity computed on training set: 91.660
Perplexity computed on test set: 293.837

k = 0.001
Perplexity computed on training set: 62.647
Perplexity computed on test set: 356.312

k = 0.0001
Perplexity computed on training set: 56.270
Perplexity computed on test set: 560.490

k = 1e-05
Perplexity computed on training set: 55.360
Perplexity computed on test set: 957.605


#### Note that the best perplexity on the test set seems to be ~294 (at k = 0.01).

#### Now we will try a different type of smoothing which interpolates between bigram, unigram and zerogram probabilities (zerogram probability is defined as just 1/|V|) in the following way:

$\hat{P}(w_k|w_{k-1}) = \lambda_2 P(w_k|w_{k-1}) + \lambda_1 P(w_k) + \lambda_0 P(0)$

where $P(w_k|w_{k-1}) = \frac{C(w_{k-1}, w_k)}{C(w_{k-1})}$, $P(w_k) = \frac{C(w_k)}{\sum_{w \in V} C(w)}$ and $P(0) = \frac{1}{|V|}$

and $\lambda_0$, $\lambda_1$, $\lambda_2$ are constant interpolation weights which sum to 1 and whose values must be chosen such that the performance of the model on a held out test set is maximised. 




In [23]:
class bigram_LM_interp():
    """Bigram language model smoothed by linear interpolation.

    The estimate mixes bigram, unigram and uniform ("zerogram") terms:

        P(w2 | w1) = lmda[2] * C(w1, w2) / C(w1)
                   + lmda[1] * C(w2) / total_tokens
                   + lmda[0] * 1 / |V|

    Words seen fewer than `count_threshold` times in the training data are
    replaced by `<UNK>` before counting.

    Attributes set by `train`:
        unigram_counts: Counter of token counts after `<UNK>` replacement.
        bigram_counts:  Counter keyed by (word1, word2) tuples.
        total_tokens:   total number of training tokens.
        vocab:          list of tokens, always containing `<UNK>`.
        word2idx:       token -> position in `vocab`.
    """

    def __init__(self, count_threshold=2, lmda=None):
        """Store hyper-parameters; counts stay None until `train` is called.

        Args:
            count_threshold: minimum raw count for a word to stay in the vocab.
            lmda: interpolation weights [lambda_0, lambda_1, lambda_2] for the
                zerogram, unigram and bigram terms. Defaults to
                [0.01, 0.4, 0.59].
        """
        self.count_threshold = count_threshold
        # A fresh list per instance: a list literal as the default argument
        # would be shared (and mutated) across every model.
        self.lmda = [0.01, 0.4, 0.59] if lmda is None else lmda
        self.bigram_counts = None
        self.unigram_counts = None
        self.vocab = None
        self.word2idx = None
        self.total_tokens = None
        self.unk_token = '<UNK>'

    def train(self, sentences):
        """Collect counts from `sentences` (an iterable of token lists)."""
        _, self.unigram_counts, self.bigram_counts, self.total_tokens = self.get_counts(sentences)
        # Vocabulary order follows first occurrence in the training data.
        self.vocab = list(self.unigram_counts.keys())
        # If no training word fell below the threshold, <UNK> was never
        # counted; it must still be in the vocabulary so that unseen words at
        # test time map to a token the model knows about.
        if self.unk_token not in self.unigram_counts:
            self.vocab.append(self.unk_token)
        self.word2idx = {word: i for i, word in enumerate(self.vocab)}
        print("Training complete!")

    def get_counts(self, sentences):
        """Build the vocab, the unigram/bigram counts and the token total.

        Returns:
            (vocab, unigram_counts, bigram_counts, total_tokens) where
            `vocab` is `[<UNK>] + sorted(kept words)` and the counts are
            computed on the `<UNK>`-replaced sentences.
        """
        # collect unigram counts
        print("Collecting unigram counts...")
        raw_counts = Counter(word for s in sentences for word in s)

        # keep only the words whose count reaches the threshold
        print("Constructing vocab...")
        kept = {w for w, c in raw_counts.items() if c >= self.count_threshold}
        kept.discard(self.unk_token)  # avoid listing <UNK> twice
        vocab = [self.unk_token] + sorted(kept)

        # replace all oov tokens in training sentences with <UNK>
        print("Replacing with oov tokens in training data...")
        # set membership is O(1); testing against the list was O(|V|) per token
        known = set(vocab)
        sentences_unk = [
            [word if word in known else self.unk_token for word in s]
            for s in sentences
        ]

        # re-collect unigram counts
        print("Re-collecting unigram counts...")
        unigram_counts = Counter()
        for s in sentences_unk:
            unigram_counts.update(s)
        total_tokens = sum(unigram_counts.values())
        print(f"Num unigrams: {len(unigram_counts)}")

        # collect bigram counts
        print("Collecting bigram counts...")
        bigram_counts = Counter()
        for s in sentences_unk:
            bigram_counts.update(zip(s[:-1], s[1:]))
        print(f"Num bigrams: {len(bigram_counts)}")

        return vocab, unigram_counts, bigram_counts, total_tokens

    def compute_probs(self, word1):
        """Return [P(w | word1) for w in self.vocab], in vocab order."""
        return [self.bg_prob(word1, word2) for word2 in self.vocab]

    def bg_prob(self, word1, word2):
        """Return the linearly interpolated probability P(word2 | word1).

        When `word1` was never seen in training its bigram estimate is
        undefined (0/0); the bigram term is then taken as 0 so that only the
        unigram and zerogram terms contribute instead of raising
        ZeroDivisionError.
        """
        p_zerogram = self.lmda[0] * 1 / len(self.vocab)
        p_unigram = self.lmda[1] * self.unigram_counts[word2] / self.total_tokens
        context_count = self.unigram_counts[word1]
        if context_count > 0:
            p_bigram = self.lmda[2] * self.bigram_counts[(word1, word2)] / context_count
        else:
            p_bigram = 0.0
        p = p_zerogram + p_unigram + p_bigram
        return p

In [24]:
# Build an interpolated bigram model with the default count threshold and
# interpolation weights, then collect its counts from the training sentences.
model = bigram_LM_interp()
model.train(sentences_train)

Collecting unigram counts...
Constructing vocab...
Replacing with oov tokens in training data...
Re-collecting unigram counts...
Num unigrams: 6474
Collecting bigram counts...
Num bigrams: 77269
Training complete!


In [25]:
# Sample up to 100 sentences from the current model and print them one per line.
text = generate_text(model, n=100)
print(text)

performed by the so
'tis to tame you and is my lord
if you and he bloody
thine that i
and apprehend thee arms and thy not so well pleasing tongue milk your <UNK> a week and mannerly <UNK>
covert
gaze his i say <UNK> measure still thou in thy that unhappy brother' let mowbray highest
coriolanus
follow satisfied
first yet thy dry to the north
so
you
<UNK> bianca practise his mercy you <UNK> <UNK>
alonso tell who but if you not so much better ere service for't he be <UNK>
lute
lions that my true
attend
was this
raise up
this young and to this same
her
form fain
rosaline meet
the happy days buried in the keys there speak <UNK> him go ask him to be considered
<UNK>
godden i can behold you hence her i am content content
grace but knaves live
here master
<UNK> war
no bigger women are ruled <UNK> keeps
friends as your honour my was against thy invite
biondello
is at
though angel doth stand was made
that
all on nay faith then been too have think'st more too <UNK>
juliet bleeding slaughter thoug

In [26]:
# Sweep the bigram weight lambda_2 with lambda_0 fixed at 0.01; lambda_1 takes
# the remainder so the three weights sum to 1. bg_prob reads model.lmda at
# call time, so the trained counts are reused as-is.
lambda2_vals = [0.5, 0.6, 0.7, 0.8, 0.9, 0.95]
for l2 in lambda2_vals:
    model.lmda = [0.01, 0.99-l2, l2]
    # training split is scored first, then the test split
    pp_train, pp_test = (
        compute_perplexity(model, split)
        for split in (sentences_train, sentences_test)
    )

    print(f"\nlambdas = {model.lmda}")
    print(f"Perplexity computed on training set: {pp_train:.3f}")
    print(f"Perplexity computed on test set: {pp_test:.3f}")



lambdas = [0.01, 0.49, 0.5]
Perplexity computed on training set: 84.996
Perplexity computed on test set: 200.927

lambdas = [0.01, 0.39, 0.6]
Perplexity computed on training set: 76.123
Perplexity computed on test set: 197.657

lambdas = [0.01, 0.29000000000000004, 0.7]
Perplexity computed on training set: 69.202
Perplexity computed on test set: 199.101

lambdas = [0.01, 0.18999999999999995, 0.8]
Perplexity computed on training set: 63.654
Perplexity computed on test set: 207.434

lambdas = [0.01, 0.08999999999999997, 0.9]
Perplexity computed on training set: 59.127
Perplexity computed on test set: 231.764

lambdas = [0.01, 0.040000000000000036, 0.95]
Perplexity computed on training set: 57.178
Perplexity computed on test set: 265.303


#### Note that with interpolation, we get much lower perplexity on the test set compared to add-k smoothing. The best value is ~198 (at $\lambda_2 = 0.6$). The quality of the generated text also seems to be slightly better, but that's hard to tell for sure.

In [64]:
# Put more interpolation weight on the bigram term (lambda_2 = 0.8, with
# lambda_0 = 0.01 and lambda_1 = 0.99 - 0.8).
# No recomputation step follows: bg_prob reads model.lmda on every call, and
# compute_probs requires a context word, so the former bare
# model.compute_probs() call would raise TypeError.
model.lmda = [0.01, 0.99-0.8, 0.8]

Computing bigram probabilities...


In [65]:
# Sample up to 100 sentences using the model's current interpolation weights
# and print them one per line.
text = generate_text(model, n=100)
print(text)

the precious jewel strong purpose not exempt in thy rocky bosom of the maid hath banish'd haughty mind
and all
that princely knee rise we marry i think but a man life
your subject made disgraced <UNK> <UNK> will of virtue
it will by this while a <UNK> <UNK> then shepherd
sir there
escalus
servant
my memory of prompt my have forgot
prince
thy loss well he you
he shall i are a concealment
find a consul
to thrust myself
these other home
and
you
whose feeling but we have knowledge find love
petruchio
nay good brother i shall be there to us
some pretty i' faith the maid's mild entreaty shall wear the high'st my <UNK>
no is well that moving
sir
what say'st thou take this
broke off send tybalt's doomsday is
and to god on and sir king richard moe
beg starve
where's barnardine partial to <UNK>
would <UNK> night
but till he that warwick's daughter is but
northumberland
corioli wear their king usurping him but this there brother die to pass
all kneel for exile him mistress and your hand that i kn

#### Let's now train using the NLTK brown corpus

In [7]:
from nltk.corpus import brown

# get pre-tokenized sentences
sentences = list(brown.sents())

# make everything lowercase and add start and end tokens
start_token = '<s>'
end_token = '</s>'
sentences_tokenized = [[start_token] + [w.lower() for w in s] + [end_token] for s in sentences]

# now we split the data into train and test sentences (10% held out, sampled without replacement)
num_sent = len(sentences_tokenized)
num_test = int(0.1 * num_sent)
test_idx = random.sample(range(num_sent), num_test)
# membership is tested once per sentence; a set makes each test O(1)
# instead of a scan over the whole index list
test_idx_set = set(test_idx)

sentences_train = []
sentences_test = []
for i, sent in enumerate(sentences_tokenized):
    if i in test_idx_set:
        sentences_test.append(sent)
    else:
        sentences_train.append(sent)

print(f"Number of training sentences: {len(sentences_train)}")
print(f"Number of test sentences: {len(sentences_test)}")

Number of training sentences: 51606
Number of test sentences: 5734


In [8]:
# Build an add-k smoothed bigram model (k=0.001, default count threshold)
# and collect its counts from the current training sentences.
model = bigram_LM_addk(k=0.001)
model.train(sentences_train)

Collecting unigram counts...
Constructing vocab...
Replacing with oov tokens in training data...
Re-collecting unigram counts...
Collecting bigram counts...
Computing bigram probabilities...


KeyboardInterrupt: 

In [None]:
# Sample up to 100 sentences from the current model and print them one per line.
text = generate_text(model, n=100)
print(text)