In [1]:
import numpy as np
from collections import Counter, defaultdict
import io, sys, math, re
import pandas as pd

In [2]:
class language_model:

    def __init__(self, ngram=2):
        self.ngram = ngram

    def unigram(self, text):
        
        data = {}
        words = [word for sentence in text for word in sentence.split()]
        word_length = len(set(words))
        self.c = Counter(words)

        for word in words:
            data[word] = (self.c[word] / word_length)
        return data, word_length
    
    def text_clean(self, text):
        
        text = text.strip('\n').lower()
        text = text.replace('\n', ' ')

        text = text.translate ({ord(c): "." for c in "!:?"})
        text = text.translate ({ord(c): "" for c in "\"\''""@#$%^&*()[]{};,/<>\|`~=_+"})
        text = text.translate ({ord(c): " " for c in "-"})

        sentences = ['<s> ' +" ".join(sentence.split())+ ' </s>' for sentence in text.split('.') if sentence != ""]
        return sentences

    def ngram_generation(self, sentence, n):
        
        tokens = [token for token in sentence.split(" ") if token != ""]
        ngrams = zip(*[tokens[i:] for i in range(n)])
        ngrams = [" ".join(ngram) for ngram in ngrams]
        ngrams = [tuple(sent.split()) for sent in ngrams]
        return ngrams

    
    def perplexity(self, data):   
        exp = [np.log2(v) for k,v in data.items()]
        exp = sum(exp) / len(exp)
        return np.power(2, -exp)


    def train(self, file_name) :
        with open(file_name, 'r', encoding="utf8") as f:
            text = f.read()
        clean_text = self.text_clean(text)

        if self.ngram == 1:
            self.uni_data, self.uni_count = self.unigram(clean_text)
            
            new_dict = {}
            for k,v in self.uni_data.items():
                tup = []
                tup = [tupl for tupl in k[0]]
                tup.append(k[0])
                new_dict[tuple(tup)] = v
            self.uni_data = new_dict
            self.uni_data_count = len(self.uni_data)
        pass
    

    def test(self, file_name) :
        with open(file_name, 'r', encoding="utf8") as f:
            test_text = f.read()
        clean_test_text = self.text_clean(test_text)
        test_dict = {}
        self.zero_count = 0
        self.sparsity = 0

        if self.ngram == 1:
            for sentence in clean_test_text:
                for word in sentence.split():
                    if word not in self.uni_data.keys():
                        test_dict[word] = 1/self.uni_count
                        self.zero_count += 1
                    else:
                        test_dict[word] = self.uni_data[word]
            if self.zero_count:
                self.sparsity = self.zero_count/len(test_dict)
        return self.perplexity(test_dict)


In [3]:
lm1= language_model(1)
lm1.train("ted.txt")
lm1.test("test.ted.txt")

53681.0000000428

In [4]:
lm1.train("ted.txt")
lm1.test("test.news.txt")

53680.99999999785

In [5]:
lm1.train("ted.txt")
lm1.test("test.reddit.txt")

53681.000000047825

# Part 3

In [6]:
def load_data(filename):
    fin = io.open(filename, 'r', encoding='utf-8')
    data = []
    vocab = defaultdict(lambda:0)
    for line in fin:
        sentence = line.split()
        data.append(sentence)
        for word in sentence:
            vocab[word] += 1
    return data, vocab

In [7]:
def remove_rare_words(data, vocab, mincount):
    data_with_unk = data[:]
    for i in range(len(data_with_unk)):
        for j in range(len(data_with_unk[i])):
            if vocab[data_with_unk[i][j]] < mincount:
                data_with_unk[i][j] = '<unk>'
    return data_with_unk

In [8]:
train_data, vocab = load_data("reddit.txt")
train_data = remove_rare_words(train_data, vocab, 5)
test_data, _ = load_data("test.ted.txt")
test_data = remove_rare_words(test_data, vocab, 5)

In [9]:
print("Training and testing dataset loaded.")

Training and testing dataset loaded.


In [10]:
def build_ngram(data, n):
    total_number_words = 0
    counts = defaultdict(lambda: defaultdict(lambda: 0.0))
    assert n >= 1, 'n should be greater than 1'
    for sentence in data:
        sentence = tuple(sentence)
        for gram_size in range(n):
            for idx in range(len(sentence)):
                total_number_words += 1.
                if gram_size+idx < len(sentence):
                    counts[sentence[idx:gram_size+idx]][sentence[idx+gram_size]] += 1.
    total_number_words /= n
    freq  = defaultdict(lambda: defaultdict(lambda: 0.0))
    for context in counts:
        for word in counts[context]:
            freq[context][word] = counts[context][word]/sum(counts[context].values())
    return freq

In [11]:
def get_prob(model, context, w):
    return model[context][w] if model[context][w] != 0.0 else 0.4*get_prob(model, context[1:],w)

def perplexity(model, data, n):
    perp = 0.0
    for sentence in data:
        sentence = tuple(sentence)
        probs = 0.0
        for idx in range(1,len(sentence)):
            probs += (-1.0/len(sentence))*np.log(get_prob(model, sentence[max(0,idx-n+1):idx], sentence[idx]))
        perp += probs/len(data)
    return np.exp(perp)

In [12]:
n = 3
print("build ngram model with n = ", n)
model = build_ngram(train_data, n)

build ngram model with n =  3


In [13]:
perplexity(model, test_data, n)

145.48119062006754

In [14]:
print("load training set")
train_data2, vocab2 = load_data("ted.txt")
train_data2 = remove_rare_words(train_data2, vocab2, 5)

load training set


In [15]:
print("load validation set")
test_data2, _ = load_data("test.ted.txt")
test_data2 = remove_rare_words(test_data2, vocab2, 5)

load validation set


In [16]:
n = 3
print("build ngram model with n = ", n)
model2 = build_ngram(train_data2, n)

build ngram model with n =  3


In [17]:
perplexity(model2, test_data2, n)

82.1259110330566

Part 4 Smoothing 

In [18]:
def smoothing(model, context, w):
    alpha=1
    n = max([len(key) for key in model])+alpha
    lambda_s = 1./n
    s = len(context)
    probs = 0.0
    for i in range(n):
        probs += lambda_s*get_prob(model, context[s-1-i:], w)
    return probs

def smooth_perplexity(model, data, n):
    perp = 0.0
    for sentence in data:
        sentence = tuple(sentence)
        probs = 0.0
        for idx in range(1,len(sentence)):
            probs += (-1.0/len(sentence))*np.log(smoothing(model, sentence[max(0,idx-n+1):idx], sentence[idx]))
        perp += probs/len(data)
    return np.exp(perp)

In [19]:
train_data_s, vocab_s = load_data("reddit.txt")
train_data_s = remove_rare_words(train_data_s, vocab_s, 5)
test_data_s, _ = load_data("test.ted.txt")
test_data = remove_rare_words(test_data_s, vocab_s, 5)

In [20]:
print("Training and testing dataset loaded")

Training and testing dataset loaded


In [21]:
n = 2
print("build ngram model with n = ", n)
model_s = build_ngram(train_data_s, n)

build ngram model with n =  2


In [22]:
smooth_perplexity(model_s, test_data_s, n)

114.17708228241057

In [23]:
train_data_s2, vocab_s2 = load_data("ted.txt")
train_data_s2 = remove_rare_words(train_data_s2, vocab_s2, 5)
test_data_s2, _ = load_data("test.ted.txt")
test_data_s2 = remove_rare_words(test_data_s2, vocab_s2, 5)

load training set


In [27]:
print("Training and testing dataset.")

Training and testing dataset.


In [25]:
n = 2
print("build ngram model with n = ", n)
model_s2 = build_ngram(train_data_s2, n)

build ngram model with n =  2


In [26]:
smooth_perplexity(model_s2, test_data_s2, n)

90.6521823804276