#### We will address some issues with our vanilla bigram model implementation. One is out-of-vocabulary tokens and the other is bigrams that are never observed in the training data. We will add a special `<UNK>` token to our vocabulary to address out-of-vocabulary words. For the zero bigram count problem, we will explore a smoothing technique (add-k smoothing).

In [40]:
import re
import random
from collections import defaultdict, Counter
import numpy as np

In [31]:
class bigram_LM():
    """Bigram language model with an ``<UNK>`` token and add-k smoothing.

    Words whose training count is below ``count_threshold`` are replaced by
    ``unk_token`` before counting, and conditional probabilities are computed
    as ``(count(w1, w2) + k) / (count(w1) + k * |V|)``.

    Attributes set by ``train``:
        vocab: list of words, ``unk_token`` first, then the kept words sorted.
        word2idx: mapping from each vocab word to its position in ``vocab``.
        unigram_counts / bigram_counts: ``Counter`` objects collected after
            the OOV replacement.
        bigram_probs: dict mapping each vocab word ``w1`` to a list of
            ``P(w2 | w1)`` values aligned with ``vocab``.
        num_sentences: number of training sentences.
    """

    def __init__(self, count_threshold=2, k=0.1):
        """Store the hyperparameters; all statistics stay ``None`` until ``train``.

        Args:
            count_threshold: minimum training count for a word to stay in the
                vocabulary.
            k: add-k smoothing constant used by ``bg_prob``.
        """
        self.count_threshold = count_threshold
        self.k = k
        self.bigram_counts = None
        self.unigram_counts = None
        self.vocab = None
        self.word2idx = None
        self.bigram_probs = None
        self.num_sentences = None
        self.unk_token = '<UNK>'

    def train(self, sentences):
        """Collect counts from ``sentences`` and build the probability table.

        Args:
            sentences: list of token lists.
        """
        self.num_sentences = len(sentences)
        # Keep the vocab built by get_counts: it always contains the <UNK>
        # token, even when no training word fell below the threshold, so the
        # smoothing denominator and the OOV mapping stay consistent.
        self.vocab, self.unigram_counts, self.bigram_counts = self.get_counts(sentences)
        self.word2idx = {word: i for i, word in enumerate(self.vocab)}
        self.compute_probs()
        print("Training complete!")

    def get_counts(self, sentences):
        """Build the vocabulary and the unigram/bigram counts.

        Args:
            sentences: list of token lists.

        Returns:
            Tuple ``(vocab, unigram_counts, bigram_counts)`` where the counts
            are taken after rare words have been replaced by ``unk_token``.
        """
        # collect unigram counts
        print("Collecting unigram counts...")
        raw_counts = Counter(word for s in sentences for word in s)

        # keep only the words that reach the count threshold; a literal
        # unk_token in the data is excluded so it is not listed twice
        print("Constructing vocab...")
        kept_words = {
            word for word, count in raw_counts.items()
            if count >= self.count_threshold and word != self.unk_token
        }
        vocab = [self.unk_token] + sorted(kept_words)

        # replace all oov tokens in training sentences with <UNK>
        # (set membership keeps this linear in the number of tokens)
        print("Replacing with oov tokens in training data...")
        sentences_unk = [
            [word if word in kept_words else self.unk_token for word in s]
            for s in sentences
        ]

        # re-collect unigram counts
        print("Re-collecting unigram counts...")
        unigram_counts = Counter(word for s in sentences_unk for word in s)

        # collect bigram counts over adjacent token pairs
        print("Collecting bigram counts...")
        bigram_counts = Counter()
        for s in sentences_unk:
            bigram_counts.update(zip(s[:-1], s[1:]))

        return vocab, unigram_counts, bigram_counts

    def compute_probs(self):
        """Rebuild ``bigram_probs`` from the current counts and ``k``.

        Must be called again after changing ``k`` for ``bigram_probs`` to
        reflect the new value.
        """
        print("Computing bigram probabilities...")
        # one row per context word, columns aligned with self.vocab
        self.bigram_probs = {
            word1: [self.bg_prob(word1, word2) for word2 in self.vocab]
            for word1 in self.vocab
        }

    def bg_prob(self, word1, word2):
        """Return the add-k smoothed estimate of ``P(word2 | word1)``.

        Raises:
            ZeroDivisionError: if ``k`` is 0 and ``word1`` has a zero count.
        """
        numerator = self.bigram_counts[(word1, word2)] + self.k
        denominator = self.unigram_counts[word1] + self.k*len(self.vocab)
        return numerator / denominator

In [4]:
# prep the training data
with open('shakespeare.txt', 'r') as file:
    lines = file.readlines()

# remove every character that is not a word character, whitespace or an
# apostrophe, trim surrounding whitespace and lowercase; lines that end up
# empty are dropped
sentences_clean = []
for line in lines:
    cleaned = re.sub(r"[^\w\s']",'',line).strip().lower()
    if len(cleaned) > 0:
        sentences_clean.append(cleaned)

# tokenize the sentences (split on whitespaces) and add start and end sentence tokens
start_token = '<s>'
end_token = '</s>'
sentences_tokenized = [[start_token]+s.split()+[end_token] for s in sentences_clean]
print(f"Num sentences: {len(sentences_tokenized)}")

# now we split the data into train and test sentences (10% held out for test)
num_sent = len(sentences_tokenized)
num_test = int(0.1 * num_sent)
# a dedicated, seeded generator makes the split reproducible across kernel
# restarts without touching the global `random` state used elsewhere
split_rng = random.Random(0)
test_idx = split_rng.sample(range(num_sent), num_test)
# set membership avoids scanning the whole index list for every sentence
test_idx_lookup = set(test_idx)

sentences_train = []
sentences_test = []
for i, sentence in enumerate(sentences_tokenized):
    if i in test_idx_lookup:
        sentences_test.append(sentence)
    else:
        sentences_train.append(sentence)

print(f"Number of training sentences: {len(sentences_train)}")
print(f"Number of test sentences: {len(sentences_test)}")


Num sentences: 32777
Number of training sentences: 29500
Number of test sentences: 3277


In [32]:
# Build a bigram model with its default hyperparameters and fit it on the
# training split; train() prints its progress as it goes.
model = bigram_LM()
model.train(sentences_train)

Collecting unigram counts...
Constructing vocab...
Replacing with oov tokens in training data...
Re-collecting unigram counts...
Collecting bigram counts...
Computing bigram probabilities...
Training complete!


In [33]:
def generate_text(model, n=10, start_token='<s>', end_token='</s>'):
    """Sample up to ``n`` sentences from a trained bigram model.

    Every sentence starts with a word drawn in proportion to how often it
    followed ``start_token`` in the training data, and is then extended by
    sampling from ``model.bigram_probs`` until ``end_token`` is drawn.

    Args:
        model: object exposing ``vocab``, ``bigram_counts`` and
            ``bigram_probs`` (rows aligned with ``vocab``).
        n: number of sentences to attempt.
        start_token: token that marks the beginning of a sentence.
        end_token: token that terminates a sentence.

    Returns:
        The generated sentences joined by newlines. Sentences that end
        before producing any word are skipped.

    Raises:
        ValueError: if no bigram starting with ``start_token`` was observed.
    """
    # First-word weights are proportional to count(start_token, word). They
    # do not change between sentences, so they are computed once; a constant
    # scale factor is irrelevant to random.choices and is therefore omitted.
    first_word_weights = [
        model.bigram_counts[(start_token, word)] for word in model.vocab
    ]
    if not any(first_word_weights):
        raise ValueError(
            f"no bigram starting with {start_token!r} was observed in training"
        )

    sentences = []
    for _ in range(n):
        # restart every sentence from the start-of-sentence context instead
        # of continuing from the last word of the previous sentence
        current_word = random.choices(model.vocab, weights=first_word_weights, k=1)[0]
        words = []
        while current_word != end_token:
            words.append(current_word)
            # get probabilities of next word given current context, i.e P(w|w_current)
            probs = model.bigram_probs[current_word]
            # now sample from the vocabulary according to this distribution
            current_word = random.choices(model.vocab, weights=probs, k=1)[0]
        if words:
            sentences.append(" ".join(words))

    return "\n".join(sentences)

In [38]:
# Use a smaller add-k constant. bigram_probs is only rebuilt by
# compute_probs(), so it has to be called again after changing k.
model.k = 0.001
model.compute_probs()

Computing bigram probabilities...


In [39]:
# Ask for 100 generated sentences; generate_text returns them joined by
# newlines, so printing shows one sentence per line.
text = generate_text(model, n=100)
print(text)

that theme rightly knees shall whiles i thank gentle norfolk
and practise out no
sleep
that cries aloud
instinct divines
lose him not endure her lord angelo have by action
is to make her father
be the stony stockings
bashful years to bear me to marry sir stephen scroop aboard a sudden
business
with a <UNK> tongue
may weep my heart and charity would
sell it was a bloody brow orchard peace' iron wit i
play as you shall have the common people mutinous and 'twere to taste as they come sir 'tis beauty from this
what's in thought
to bristol purpose dragon leg forgetfulness shall make your steed
index aloof news unusual topmast dispatch'd against thou dost unwillingly loves faithful finds visage cherish weeds
which i sup upon thy life him although apparent
statue paradise bug selfsame name
of york
be great anchors indirectly debt
it boots i had no cause shrift sulphurous fie fie upon your highness' shout
quarrels as my poor boy the great part exton gripe into a <UNK> them that my state surfei

#### Note that increasing the smoothing factor k will result in longer sentences being generated. This is because for larger k, the probability of the `</s>` token becomes smaller.

In [41]:
def compute_perplexity(model, test_sentences):
    """Compute the per-bigram perplexity of ``model`` on ``test_sentences``.

    Perplexity is ``exp(-(1/n) * sum(log P(w2 | w1)))`` over all ``n``
    adjacent token pairs, with tokens missing from ``model.vocab`` mapped to
    ``model.unk_token`` first.

    Args:
        model: object exposing ``vocab``, ``unk_token`` and ``bg_prob(w1, w2)``.
        test_sentences: list of token lists.

    Returns:
        The perplexity as a float.

    Raises:
        ValueError: if ``test_sentences`` contains no bigram at all.
    """
    # build the lookup once: membership tests against the vocab list would
    # scan it for every single token
    known_words = set(model.vocab)
    unk = model.unk_token

    sum_log_probs = 0.0
    n = 0
    for s in test_sentences:
        for w1, w2 in zip(s[:-1], s[1:]):
            # replace any oov token with <UNK>
            if w1 not in known_words:
                w1 = unk
            if w2 not in known_words:
                w2 = unk
            sum_log_probs += np.log(model.bg_prob(w1, w2))
            n += 1

    if n == 0:
        # perplexity is undefined without at least one scored bigram
        raise ValueError("cannot compute perplexity: test_sentences contains no bigrams")

    sum_log_probs *= (-1/n)
    perplexity = np.exp(sum_log_probs)
    return perplexity

In [44]:
# now lets compute perplexity on both the training and test data for different k values
kvals = [1.0, 0.1, 0.01, 0.001, 0.0001, 0.00001]
for k in kvals:
    model.k = k
    model.compute_probs()
    pp_train = compute_perplexity(model, sentences_train)
    pp_test = compute_perplexity(model, sentences_test)

    print(f"\nk = {k}")
    print(f"Perplexity computed on training set: {pp_train:.3f}")
    print(f"Perplexity computed on test set: {pp_test:.3f}")


Computing bigram probabilities...

k = 1.0
Perplexity computed on training set: 743.252
Perplexity computed on test set: 855.281
Computing bigram probabilities...

k = 0.1
Perplexity computed on training set: 213.299
Perplexity computed on test set: 383.594
Computing bigram probabilities...

k = 0.01
Perplexity computed on training set: 92.112
Perplexity computed on test set: 291.047
Computing bigram probabilities...

k = 0.001
Perplexity computed on training set: 62.757
Perplexity computed on test set: 351.426
Computing bigram probabilities...

k = 0.0001
Perplexity computed on training set: 56.286
Perplexity computed on test set: 550.344
Computing bigram probabilities...

k = 1e-05
Perplexity computed on training set: 55.361
Perplexity computed on test set: 934.920
