In [1]:
import nltk
import sys
from nltk.probability import ConditionalFreqDist
from itertools import chain
from collections import defaultdict
import numpy as np

### 1. First train a unigram language model using maximum likelihood estimation.

In [2]:
def unigram(file):
    # Find the total number of words in the file
    total_words = len(file)

    # Find the total number of DISTINCT words in the file
    total_distinct_words = len (set(w for w in file))
    
    # Count frequency over the words in the file
    freqs = {}
    for w in file:
        freqs[w] = freqs.get(w, 0) + 1
    
    # Get probability of unigram model
    prob = {k: v / total_words for k, v in freqs.items()}
   
    # Return the word counts
    return (total_words, total_distinct_words, freqs, prob)

train1 = 'AAABACBABBBCCACBCC'
unigram(train1)

{'A': 0.3333333333333333, 'B': 0.3333333333333333, 'C': 0.3333333333333333}

In [3]:
file = train1
(tokens,types,freqs,prob) = unigram(file)
print ('Total words in train1: %s' %(tokens))
print ('Total distinct words in train1: %s' %(types))
print ('The frequency of words in train1: %s' %(freqs))
print ('The probability of words in train1: %s' %(prob))

Total words in train1: 18
Total distinct words in train1: 3
The frequency of words in train1: {'A': 6, 'B': 6, 'C': 6}
The probability of words in train1: {'A': 0.3333333333333333, 'B': 0.3333333333333333, 'C': 0.3333333333333333}


### 2. Next train a bigram language model using maximum likelihood estimation.

In [4]:
def bigram(w,w0,file):
    p = 0.0

    file_w = [w for w in file]
    file_bigram = zip(file_w[:-1], file_w[1:])  # list of bigrams as tuples
    # (above doesn't include begin/end of corpus: but basically this is fine)
    file_bigram = (list(file_bigram))

    # Compute probability of w given w0
    p = float(file_bigram.count((w0, w))) / (file_w.count(w0))

    # Return probability
    # print ('The Probability of', w, 'given', w0, 'is', p)
    return p
    
train2 = 'AAABACBABBBCCACBCC$'
bigram('A', 'C', train2)

0.16666666666666666

### 3. Now evaluate your language models on the corpus

In [5]:
test1 = 'ABACABB'

def unigram_perplexity(test):
    n = len(test)
    
    # Count frequency over the words in the test file
    freqs = {}
    for w in test:
        freqs[w] = freqs.get(w, 0) + 1
        
    # Get probability of each word from the model trained on train1
    get_prob = unigram(train1)[3]
    
    # Calculate prob(letter) ** freqs of the letter
    result_dict = {}
    for w in freqs:
        if w in get_prob:
            result_dict[w] = get_prob[w]**freqs[w]
    
    # Calculate perplexity
    perplexity = 1
    for key in result_dict:    
        perplexity = perplexity * result_dict[key]
    
    perplexity = perplexity **(-1/n)
    
    return perplexity

unigram_perplexity(test1)


3.0

### Calculating perplexity on this test file will give an infinite because I haven't dealt with unknown words yet.

In [6]:
test2 = 'ABACABB$'

def bigram_complexity(test):
    n = len(test)
    
    # Find pairs of letters
    test_w = [w for w in test]
    test_bigram = zip(test_w[:-1], test_w[1:])
    test_bigram = (list(test_bigram))
    
    prob = []
    for w0, w in test_bigram:
        prob += [bigram(w,w0, train2)]
    prob = np.log(prob)
    prob = np.exp(sum(prob))
    perplexity = prob **(-1/n)
    
    return (perplexity)

bigram_complexity(test2)

  
  app.launch_new_instance()


inf

### 4. Now repeat everything above for add-1 (Laplace) smoothing.

In [11]:
def unigram_laplace(file):
    # Find the total number of words in the file
    total_words = len(file)

    # Find the total number of DISTINCT words in the file
    total_distinct_words = len (set(w for w in file))
    
    # Count frequency over the words in the file
    freqs = {}
    for w in file:
        freqs[w] = freqs.get(w, 0) + 1
    
    # Get probability of unigram model
    prob = {k: (v+1) / (total_words +total_distinct_words) for k, v in freqs.items()}
   
    # Return the word counts
    return (total_words, total_distinct_words, freqs, prob)

train1 = 'AAABACBABBBCCACBCC'
unigram_laplace(train1)

(18,
 3,
 {'A': 6, 'B': 6, 'C': 6},
 {'A': 0.3333333333333333, 'B': 0.3333333333333333, 'C': 0.3333333333333333})

In [19]:
def bigram_laplace(w,w0,file):
    p = 0.0

    file_w = [w for w in file]
    file_bigram = zip(file_w[:-1], file_w[1:])  # list of bigrams as tuples
    # (above doesn't include begin/end of corpus: but basically this is fine)
    file_bigram = (list(file_bigram))
    
    # Get the total number of different combination types
    v = len (set(x for x in file_w[:-1])) * len (set(x for x in file_w[1:]))

    # Compute probability of w given w0
    p = float(file_bigram.count((w0, w))+1) / (file_w.count(w0) + v )

    # Return probability
    # print ('The Probability of', w, 'given', w0, 'is', p)
    return p
    
train2 = 'AAABACBABBBCCACBCC$'
bigram_laplace('A', '$', train2)

0.07692307692307693

In [20]:
test1 = 'ABACABB'

def unigram_laplace_perplexity(test):
    n = len(test)
    
    # Count frequency over the words in the test file
    freqs = {}
    for w in test:
        freqs[w] = freqs.get(w, 0) + 1
        
    # Get probability of each word from the model trained on train1
    get_prob = unigram_laplace(train1)[3]
    
    # Calculate prob(letter) ** freqs of the letter
    result_dict = {}
    for w in freqs:
        if w in get_prob:
            result_dict[w] = get_prob[w]**freqs[w]
    
    # Calculate perplexity
    perplexity = 1
    for key in result_dict:    
        perplexity = perplexity * result_dict[key]
    
    perplexity = perplexity **(-1/n)
    
    return perplexity

unigram_laplace_perplexity(test1)


3.0

In [21]:
test2 = 'ABACABB$'

def bigram_laplace_complexity(test):
    n = len(test)
    
    # Find pairs of letters
    test_w = [w for w in test]
    test_bigram = zip(test_w[:-1], test_w[1:])
    test_bigram = (list(test_bigram))
    
    prob = []
    for w0, w in test_bigram:
        prob += [bigram_laplace(w,w0, train2)]
    prob = np.log(prob)
    prob = np.exp(sum(prob))
    perplexity = prob **(-1/n)
    
    return (perplexity)

bigram_laplace_complexity(test2)

5.788071779761857