In [3]:
import re
import os
import math
import numpy as np
import torch
import torch.nn as nn
import random
from sklearn.model_selection import train_test_split

In [2]:
def tokenize(text):
    """Normalise one piece of text and split it into whitespace-delimited tokens.

    The text goes through a fixed sequence of regex substitutions:

    1. Hashtags, e-mail addresses, URLs and @-mentions are replaced by
       placeholders. This happens *before* lower-casing, so these four
       placeholders come out lower-cased (e.g. ``<mention>``).
    2. The text is lower-cased.
    3. Bracketed footnotes, repeated punctuation, dates, times, phone-like
       digit runs and stand-alone numbers are replaced or collapsed. These
       placeholders are inserted after lower-casing and stay upper-case
       (e.g. ``<NUM>``).
    4. Characters repeated three or more times are cut to two, "can't" and
       "won't" are expanded, and hyphenated words are split.

    Parameters
    ----------
    text : str
        Raw text (typically one line of a corpus).

    Returns
    -------
    list of str
        The tokens obtained by ``str.split()`` on the normalised text.
    """
    s = text

    # --- entity placeholders (applied on the original casing) ---
    s = re.sub(r'\#[a-zA-Z]\w+', '<HASHTAG>', s)
    s = re.sub(r'\S*[\w\~\-]\@[\w\~\-]\S*', r'<EMAIL>', s)
    s = re.sub(r'(https?:\/\/|www\.)?\S+[a-zA-Z0-9]{2,}\.[a-zA-Z0-9]{2,}\S+', r'<URL>', s)
    s = re.sub(r'@\w+', '<MENTION>', s)

    s = s.lower()

    s = re.sub(r'\[.*\s+-\s+.*\]', '<FOOTNOTE>', s)

    # Collapse runs of the same punctuation character ("!!!" -> "!").
    s = re.sub(r'(!|"|\#|\$|%|&|\'|\(|\)|\*|\+|,|-|\.|\/|:|;|<|=|>|\?|@|\[|\\|\]|\^|_|‘|\{|\||\}|~)\1{1,}', r'\1', s)
    s = re.sub(r'\d{2,4}\-\d\d-\d{2,4}|\d{2,4}\/\d\d\/\d{2,4}|\d{2,4}:\d\d:?\d{2,4}', '<DATE>', s)
    s = re.sub(r'\d+:\d\d:?\d{0,2}?( am|am| pm|pm)', r'<TIME>', s)
    s = re.sub(r'[\+0-9\-\(\)\.]{3,}[\-\.]?[0-9\-\.]{3,}', r'<MOB>', s)

    # Stand-alone numbers surrounded by whitespace. At least one digit is
    # required: with every part optional the pattern also matches the empty
    # string between two consecutive whitespace characters (and a lone "." or
    # ":"), which would inject a <NUM> token wherever the text has a double
    # space.
    s = re.sub(r'(?<=\s)[\:\.]?\d+[\:\.]?\d*[\:\.]?(?=\s)', r'<NUM>', s)

    # Squash elongated words ("sooooo" -> "soo").
    s = re.sub(r'(\w)\1{2,}', r'\1\1', s)
    s = re.sub(r'can\'t', r'can not', s)
    s = re.sub(r'won\'t', r'will not', s)
    # Split hyphenated words into their parts.
    s = re.sub(r'([a-zA-Z]+)-([a-zA-Z]+)', r'\1 \2', s)

    tokens = s.split()
    return tokens

def addStartEnd(tokens, n):
    """Pad a token list in place with ``n`` boundary markers on each side.

    ``n`` copies of "<start>" are placed at the front and ``n`` copies of
    "</start>" at the back. The list passed in is modified and the same list
    object is returned. A non-positive ``n`` leaves the list untouched.
    """
    # Slice assignment at position 0 prepends without rebinding the list.
    tokens[:0] = ["<start>"] * n
    tokens.extend(["</start>"] * n)
    return tokens

In [3]:
def cleanCorpus(corpus, fileName):
    """Normalise every line of a corpus and write the result to disk.

    Each element of ``corpus`` is replaced *in place* by its normalised form
    (placeholders for hashtags, e-mails, URLs, mentions, footnotes, dates,
    times, phone-like digit runs and numbers; lower-casing; collapsed
    punctuation and elongated words). Whitespace, including line endings, is
    kept as it is. The cleaned lines are then written, concatenated, to
    ``cleanCorpus/<fileName>``; the directory is created when missing and an
    existing file of that name is overwritten.

    Parameters
    ----------
    corpus : list of str
        Lines of text; modified in place.
    fileName : str
        Name of the output file inside the ``cleanCorpus`` directory.

    Returns
    -------
    list of str
        The same list object that was passed in, now holding cleaned lines.

    Raises
    ------
    OSError
        If the output directory or file cannot be created or written.
    """
    for i in range(len(corpus)):
        corpus[i] = _normalizeLine(corpus[i])

    # exist_ok avoids swallowing unrelated errors just to tolerate a re-run.
    os.makedirs('cleanCorpus', exist_ok=True)

    newFile = 'cleanCorpus/' + fileName
    # Mode 'w' truncates a previous copy; the context manager closes the
    # handle even if a write fails.
    with open(newFile, 'w') as f:
        f.writelines(corpus)

    return corpus


def _normalizeLine(s):
    """Apply the regex normalisation pipeline to a single string and return it."""
    # Entity placeholders are substituted on the original casing, so they are
    # lower-cased by the step that follows.
    for pattern, replacement in (
        (r'\#[a-zA-Z]\w+', '<HASHTAG>'),
        (r'\S*[\w\~\-]\@[\w\~\-]\S*', r'<EMAIL>'),
        (r'(https?:\/\/|www\.)?\S+[a-zA-Z0-9]{2,}\.[a-zA-Z0-9]{2,}\S+', r'<URL>'),
        (r'@\w+', '<MENTION>'),
    ):
        s = re.sub(pattern, replacement, s)

    s = s.lower()

    for pattern, replacement in (
        (r'\[.*\s+-\s+.*\]', '<FOOTNOTE>'),
        # collapse runs of one punctuation character
        (r'(!|"|\#|\$|%|&|\'|\(|\)|\*|\+|,|-|\.|\/|:|;|<|=|>|\?|@|\[|\\|\]|\^|_|‘|\{|\||\}|~)\1{1,}', r'\1'),
        (r'\d{2,4}\-\d\d-\d{2,4}|\d{2,4}\/\d\d\/\d{2,4}|\d{2,4}:\d\d:?\d{2,4}', '<DATE>'),
        (r'\d+:\d\d:?\d{0,2}?( am|am| pm|pm)', r'<TIME>'),
        (r'[\+0-9\-\(\)\.]{3,}[\-\.]?[0-9\-\.]{3,}', r'<MOB>'),
        # At least one digit is required; an all-optional pattern also matches
        # the empty string between two consecutive whitespace characters and
        # would insert <NUM> at every double space.
        (r'(?<=\s)[\:\.]?\d+[\:\.]?\d*[\:\.]?(?=\s)', r'<NUM>'),
        # squash characters repeated three or more times down to two
        (r'(\w)\1{2,}', r'\1\1'),
        (r'can\'t', r'can not'),
        (r'won\'t', r'will not'),
        # split hyphenated words
        (r'([a-zA-Z]+)-([a-zA-Z]+)', r'\1 \2'),
    ):
        s = re.sub(pattern, replacement, s)

    return s

In [4]:
# The two source texts; both are read from the local 'corpus/' directory.
fileName1 = 'Ulysses - James Joyce.txt'
fileName2 = 'Pride and Prejudice - Jane Austen.txt'

# Read the first text line by line, then normalise it
# (cleanCorpus also writes the cleaned copy to disk).
string2 = 'corpus/%s' % fileName1
with open(string2, 'r') as f:
    corpus1 = list(f)
corpus1 = cleanCorpus(corpus1, fileName1)

# Same for the second text.
string4 = 'corpus/%s' % fileName2
with open(string4, 'r') as f:
    corpus2 = list(f)
corpus2 = cleanCorpus(corpus2, fileName2)

In [5]:
def perplexityScore(prob):
    """Return the perplexity of a sequence of probabilities.

    The score is ``exp(-mean(log p))`` using natural logarithms. A probability
    equal to zero is replaced by ``1e-10`` before taking the logarithm.

    Parameters
    ----------
    prob : sequence of float
        Per-token probabilities.

    Returns
    -------
    float

    Raises
    ------
    ZeroDivisionError
        If ``prob`` is empty.
    """
    logTotal = 0
    for p in prob:
        # floor zero probabilities so the logarithm stays finite
        logTotal += math.log(1e-10 if p == 0 else p)

    return math.exp(-logTotal / len(prob))

In [6]:
def getFrequencyCount(trainSample, minCount=10):
    """Build unigram to 4-gram count tables from training sentences.

    The function works in three passes over ``trainSample``:

    1. Tokenise every sentence and count unigrams. Words seen fewer than
       ``minCount`` times are removed from the unigram table and their total
       count is stored under ``"<unk>"``.
    2. Rewrite every sentence with those rare words replaced by ``"<unk>"``.
    3. Re-tokenise every rewritten sentence, pad it with three "<start>" /
       "</start>" markers on each side, and count bigrams, trigrams and
       4-grams over the padded token sequence.

    ``trainSample`` is modified in place: after the call each element is the
    space-joined, "<unk>"-replaced and padded token sequence.

    Parameters
    ----------
    trainSample : list of str
        Training sentences; rewritten in place.
    minCount : int, optional
        Words with a unigram count below this value are treated as unknown.
        Defaults to 10.

    Returns
    -------
    freqCount : list
        ``[unigram, bigram, trigram, quadgram]`` where ``unigram`` maps
        token -> count and the higher orders are nested dicts indexed token
        by token (``bigram[w1][w2]``, ``trigram[w1][w2][w3]``, ...).
    freqCountFullString : list of dict
        Four flat tables; index 0 is the same unigram dict, indices 1-3 map
        the space-joined n-gram string to its count.
    """
    # ---- pass 1: raw unigram counts ----
    unigramCount = {}
    for sentence in trainSample:
        for token in tokenize(sentence):
            unigramCount[token] = unigramCount.get(token, 0) + 1

    # Fold rare words into a single <unk> entry. A set keeps the membership
    # test in pass 2 constant-time instead of scanning a list for every token.
    wordsToReplace = {word for word, count in unigramCount.items() if count < minCount}
    unknownTotal = 0
    for word in wordsToReplace:
        unknownTotal += unigramCount.pop(word)
    unigramCount["<unk>"] = unknownTotal

    print("Unknowns identified")

    # ---- pass 2: replace rare words in the sentences themselves ----
    for index in range(len(trainSample)):
        replaced = [
            "<unk>" if token in wordsToReplace else token
            for token in tokenize(trainSample[index])
        ]
        trainSample[index] = ' '.join(replaced)

    print("Unknowns replaced")

    # ---- pass 3: higher-order counts over the padded sentences ----
    bigramCount = {}
    trigramCount = {}
    quadgramCount = {}
    nestedTables = {2: bigramCount, 3: trigramCount, 4: quadgramCount}
    freqCountFullString = [unigramCount, {}, {}, {}]

    for index in range(len(trainSample)):
        tokens = addStartEnd(tokenize(trainSample[index]), 3)

        for order in (2, 3, 4):
            flatTable = freqCountFullString[order - 1]
            for start in range(len(tokens) - order + 1):
                gram = tokens[start:start + order]

                # nested table: walk or create one level per history token
                node = nestedTables[order]
                for token in gram[:-1]:
                    node = node.setdefault(token, {})
                node[gram[-1]] = node.get(gram[-1], 0) + 1

                # flat table keyed by the space-joined n-gram
                gramKey = " ".join(gram)
                flatTable[gramKey] = flatTable.get(gramKey, 0) + 1

        trainSample[index] = ' '.join(tokens)

    freqCount = [unigramCount, bigramCount, trigramCount, quadgramCount]
    return freqCount, freqCountFullString

def returnCount(string, freqCount):
    """Look up the count of an n-gram given as a space-separated string.

    The n-gram order is the number of pieces obtained by splitting ``string``
    on single spaces; the tokens used for the lookup come from
    ``tokenize(string)``.

    Parameters
    ----------
    string : str
        The n-gram, words separated by single spaces.
    freqCount : list
        Nested count tables, ``freqCount[k - 1]`` holding the k-gram table
        (token -> ... -> count).

    Returns
    -------
    int
        The stored count, or 0 when the string is empty, has more than four
        words, or the n-gram is not present in the table.
    """
    if len(string) == 0:
        return 0

    lenString = len(string.split(" "))
    if lenString > 4:
        # only unigram..4-gram tables exist; answer 0 rather than None
        return 0

    tokens = tokenize(string)
    try:
        node = freqCount[lenString - 1]
        for position in range(lenString):
            node = node[tokens[position]]
    except (KeyError, IndexError):
        # KeyError: n-gram never seen. IndexError: tokenisation produced fewer
        # tokens than the space split (e.g. the string had extra spaces).
        return 0
    return node

def returnPositiveCount(history, freqCount):
    """Count the distinct tokens that follow ``history`` with a positive count.

    Parameters
    ----------
    history : str
        Zero to three context words separated by single spaces. An empty
        string means "no context" and is answered from the unigram table.
    freqCount : list
        Nested count tables, ``freqCount[k - 1]`` holding the k-gram table.

    Returns
    -------
    int
        Number of entries with a count greater than zero below ``history``;
        0 when the history is unseen or longer than three words.
    """
    # "".split(" ") yields [''], so the empty history needs its own case to
    # reach the unigram table.
    historyLength = 0 if history == "" else len(history.split(" "))
    if historyLength > 3:
        return 0

    tokens = tokenize(history) if historyLength else []
    try:
        followers = freqCount[historyLength]
        for position in range(historyLength):
            followers = followers[tokens[position]]
    except (KeyError, IndexError):
        # unseen history, or fewer tokens than words after tokenisation
        return 0

    return sum(1 for val in followers.values() if val > 0)

def returnSumCount(history, freqCount):
    """Sum the counts of all tokens that follow ``history``.

    Parameters
    ----------
    history : str
        Zero to three context words separated by single spaces. An empty
        string means "no context" and is answered from the unigram table.
    freqCount : list
        Nested count tables, ``freqCount[k - 1]`` holding the k-gram table.

    Returns
    -------
    int
        Total count of the entries below ``history``; 0 when the history is
        unseen or longer than three words.
    """
    # "".split(" ") yields [''], so the empty history needs its own case to
    # reach the unigram table.
    historyLength = 0 if history == "" else len(history.split(" "))
    if historyLength > 3:
        return 0

    tokens = tokenize(history) if historyLength else []
    try:
        followers = freqCount[historyLength]
        for position in range(historyLength):
            followers = followers[tokens[position]]
    except (KeyError, IndexError):
        # unseen history, or fewer tokens than words after tokenisation
        return 0

    return sum(followers.values())

def continuationCount(fullFreqCount, history, str):
    """Count the distinct n-grams that end with the word sequence ``str``.

    The n-gram order is one more than the number of space-separated words in
    ``history``; the matching table is ``fullFreqCount[order - 1]``, whose
    keys are space-joined n-grams. Every key is inspected, so the cost is
    linear in the size of that table.

    Parameters
    ----------
    fullFreqCount : list of dict
        Flat count tables keyed by space-joined n-gram strings.
    history : str
        Context words; only their number is used, to pick the table.
    str : str
        Space-separated word sequence the n-gram has to end with.

    Returns
    -------
    int
        Number of keys equal to ``str`` or ending with ``" " + str``.
    """
    n = len(history.split(" ")) + 1
    # Match on token boundaries: a plain endswith(str) would also count keys
    # whose last word merely ends with the same letters ("see the" vs "he").
    boundarySuffix = " " + str
    return sum(
        1
        for key in fullFreqCount[n - 1]
        if key == str or key.endswith(boundarySuffix)
    )

In [19]:
def kneserNeySmoothing(freqCount, history, str, fullFreqCount):
    """Estimate P(str | history) with a recursive, Kneser-Ney-style back-off.

    Behaviour by n-gram order (number of history words + 1):

    * order 1 (empty history): relative unigram frequency; a word with no
      count gets the relative frequency of "<unk>".
    * order 2 and 3: discounted continuation-count ratio (discount 0.75 for
      bigrams, 0.9 for trigrams) plus a back-off weight times the estimate for
      the history shortened by its first word.
    * order 4: count(history + str) / count(history) with no discount and no
      back-off weight.
      NOTE(review): the top level is therefore an unsmoothed relative
      frequency, so an unseen 4-gram with a seen history gets probability 0.
      Confirm that this is intended.

    When the denominator of the first term is 0, the function returns
    ``0.75 / freqCount[0]["<unk>"]``.

    Parameters
    ----------
    freqCount : list
        Nested count tables (unigram .. 4-gram).
    history : str
        Zero to three context words separated by single spaces.
    str : str
        The word whose probability is estimated.
    fullFreqCount : list of dict
        Flat count tables keyed by space-joined n-gram strings.

    Returns
    -------
    float

    Raises
    ------
    ValueError
        If ``history`` has more than three words.
    KeyError, ZeroDivisionError
        If "<unk>" is missing from the unigram table or has a zero count when
        it is needed as a fallback.
    """
    if history == "":
        n = 1
        fullString = str
    else:
        n = len(history.split(" ")) + 1
        fullString = history + " " + str

    if n > 4:
        raise ValueError("kneserNeySmoothing supports at most a 3-word history")

    countofString = returnCount(fullString, freqCount)

    if n == 1:
        totalCount = sum(freqCount[0].values())
        if countofString == 0:
            return freqCount[0]["<unk>"] / totalCount
        return countofString / totalCount

    countofHistory = returnCount(history, freqCount)

    if n == 4:
        # Highest order: raw counts, no discount, no back-off weight.
        firstTermNumerator = max(0, countofString)
        firstTermDenominator = countofHistory
        lamb = 0
    else:
        disFactor = 0.75 if n == 2 else 0.9
        try:
            lamb = disFactor * (returnPositiveCount(history, freqCount) / returnSumCount(history, freqCount))
        except ZeroDivisionError:
            # history never seen as a context
            lamb = 0
        firstTermDenominator = continuationCount(fullFreqCount, history, history)
        firstTermNumerator = max(0, continuationCount(fullFreqCount, history, fullString) - disFactor)

    if firstTermDenominator == 0:
        return 0.75 / freqCount[0]["<unk>"]

    firstTerm = firstTermNumerator / firstTermDenominator
    if lamb == 0:
        # The lower-order estimate would be multiplied by zero; skipping it
        # avoids the n-gram table scans the recursion performs.
        return firstTerm

    newHistory = " ".join(tokenize(history)[1:])
    return firstTerm + lamb * kneserNeySmoothing(freqCount, newHistory, str, fullFreqCount)


In [20]:
def splitData(text, testSize=1000, seed=None):
    """Split a list of lines into a training part and a random test sample.

    ``testSize`` lines are drawn at random without replacement as the test
    sample. The training part keeps, in their original order, all lines whose
    *value* does not occur in the test sample; a line that is textually
    identical to a sampled line is therefore dropped from training as well.

    Parameters
    ----------
    text : list of str
        The lines to split.
    testSize : int, optional
        Number of test lines to draw (default 1000). Capped at ``len(text)``
        so that short inputs do not raise.
    seed : int or None, optional
        When given, a dedicated ``random.Random(seed)`` is used so the split
        is reproducible; with ``None`` the global ``random`` state is used.

    Returns
    -------
    (list of str, list of str)
        ``(trainSample, testSample)``.
    """
    allLines = text
    rng = random if seed is None else random.Random(seed)
    testSample = rng.sample(allLines, min(testSize, len(allLines)))

    # set membership instead of scanning the sample list for every line
    heldOut = set(testSample)
    trainSample = [x for x in allLines if x not in heldOut]
    return trainSample, testSample


In [21]:
# Hold out a random test sample from the cleaned second corpus (corpus2);
# the remaining lines become the training data.
trainSample, testSample = splitData(corpus2)

In [22]:
# Build the n-gram count tables. getFrequencyCount also rewrites trainSample
# in place: rare words become "<unk>" and each sentence is padded with
# "<start>" / "</start>" markers.
freqCount, fullFreqCount = getFrequencyCount(trainSample)

Unknowns identified
Unknowns replaced


In [None]:
# Perplexity of every training sentence under the 4-gram model.
# The sentences in trainSample already carry their boundary markers.
perplexity = []
for sentence in trainSample:
    print("Sentence: ", sentence)
    tokens = tokenize(sentence)

    # one probability per token that has a full three-word history
    prob = []
    for i in range(len(tokens) - 3):
        history = " ".join(tokens[i:i + 3])
        nextWord = tokens[i + 3]
        prob.append(kneserNeySmoothing(freqCount, history, nextWord, fullFreqCount))

    sentencePerplexity = perplexityScore(prob)
    perplexity.append(sentencePerplexity)
    print("Perplexity is: %.3f" % sentencePerplexity)
    print("")

In [24]:
# Vocabulary: every unigram with a positive count, followed by the three
# special tokens. "<unk>" is appended unconditionally, so it appears twice
# when the unigram table already holds it with a positive count.
vocab = [key for key, val in freqCount[0].items() if val > 0]
vocab.extend(["<unk>", "<start>", "</start>"])

In [None]:
# Perplexity of every held-out test sentence under the 4-gram model.

# Set copy of the vocabulary: membership is checked once per token, and
# scanning the list each time is needlessly slow.
knownWords = set(vocab)

perplexityTest = []
for line in testSample:
    # map out-of-vocabulary words to <unk>, then add the boundary markers
    tokens = [token if token in knownWords else "<unk>" for token in tokenize(line)]
    tokens = addStartEnd(tokens, 3)

    print("Sentence: ", " ".join(tokens))

    prob = []
    for i in range(len(tokens) - 3):
        history = " ".join(tokens[i:i + 3])
        prob.append(kneserNeySmoothing(freqCount, history, tokens[i + 3], fullFreqCount))

    sentencePerplexity = perplexityScore(prob)
    print("Perplexity is: %.3f" % sentencePerplexity)
    print("")
    perplexityTest.append(sentencePerplexity)

# Report the average once, after all sentences have been scored.
if perplexityTest:
    print("Average perplexity is: %.3f" % (sum(perplexityTest) / len(perplexityTest)))