# Word prediction using Quadgram 
### Kneser-Ney Smoothing Used with Interpolation
### Time Complexity for word prediction only: O(1)



## <u>Import corpus</u>

In [2]:
from nltk.util import ngrams
from collections import defaultdict
from collections import OrderedDict
import string
import time
import gc
from math import log10
# Wall-clock reference captured when this cell runs; the "Processing Time"
# messages printed for corpus loading report the time elapsed since this point.
start_time = time.time()

## <u>Do preprocessing</u>:

### Remove the punctuations and lowercase the tokens

In [3]:
def removePunctuations(sen):
    """Lowercase a sentence and blank out its punctuation.

    Every character listed in ``string.punctuation`` is replaced by a single
    space, except an apostrophe that is immediately followed by ``s`` (as in
    ``it's``), which is kept.  Each character is judged on its own position,
    so one stray apostrophe elsewhere in a word no longer wipes out the
    apostrophe that is supposed to be retained.

    Args:
        sen: The raw input string.

    Returns:
        The cleaned, lowercased words joined by single spaces.  Punctuation is
        turned into spaces rather than deleted (``here---so`` becomes
        ``here   so``), so callers should ``split()`` the result again to get
        clean tokens.
    """
    cleaned_words = []
    for word in sen.split():
        chars = []
        for idx, ch in enumerate(word):
            if ch not in string.punctuation:
                chars.append(ch)
            elif ch == "'" and word[idx + 1:idx + 2] == 's':
                # apostrophe of a possessive/contraction such as "it's": keep it
                chars.append(ch)
            else:
                chars.append(' ')
        cleaned_words.append(''.join(chars).lower())

    return " ".join(cleaned_words)

### Tokenize and load the corpus data


In [None]:
def _clean_line_tokens(line):
    """Return the lowercase word tokens of *line* with punctuation removed.

    Punctuation becomes a space (so ``here---so`` yields two tokens); an
    apostrophe directly followed by ``s`` is kept.
    """
    tokens = []
    for word in line.split():
        chars = []
        for idx, ch in enumerate(word):
            if ch not in string.punctuation:
                chars.append(ch)
            elif ch == "'" and word[idx + 1:idx + 2] == 's':
                chars.append(ch)
            else:
                chars.append(' ')
        tokens.extend(''.join(chars).lower().split())
    return tokens


def _sliding_ngrams(tokens, size):
    """Return every run of *size* consecutive tokens as a tuple."""
    return list(zip(*(tokens[offset:] for offset in range(size))))


def loadCorpus(file_path, bi_dict, tri_dict, quad_dict, vocab_dict):
    """Read a text corpus and accumulate unigram, bigram, trigram and quadgram counts.

    The file is read line by line.  To avoid losing n-grams that straddle a
    line break, the last (up to) three tokens seen so far are carried over and
    prepended to the next line before the n-grams are formed.  The carried
    tokens are used only as n-gram context: they are NOT counted a second time
    in ``vocab_dict``.

    Args:
        file_path: Path of the corpus text file.
        bi_dict: Mapping "w1 w2" -> count, updated in place.
        tri_dict: Mapping "w1 w2 w3" -> count, updated in place.
        quad_dict: Mapping "w1 w2 w3 w4" -> count, updated in place.
        vocab_dict: Mapping word -> count, updated in place.

    Returns:
        The total number of word tokens read from the file.
    """
    #up to the last three tokens seen so far, oldest first
    context = []
    #total no. of words in the corpus
    word_len = 0

    with open(file_path,'r') as file:
        for line in file:
            tokens = _clean_line_tokens(line)
            if not tokens:
                continue

            word_len += len(tokens)

            #count only the words that belong to this line
            for word in tokens:
                vocab_dict[word] = vocab_dict.get(word, 0) + 1

            #an n-gram needs n-1 words of left context from the previous lines
            for size, counts in ((2, bi_dict), (3, tri_dict), (4, quad_dict)):
                window = context[-(size - 1):] + tokens
                for gram in _sliding_ngrams(window, size):
                    sen = ' '.join(gram)
                    counts[sen] = counts.get(sen, 0) + 1

            #remember the last few words for the next line
            context = (context + tokens)[-3:]

    return word_len

In [4]:
def _load_ngram_counts(path, size, ngram_dict, vocab_dict):
    """Load one "<count> w1 ... wn" file into *ngram_dict*; return the tokens accounted for.

    Lines without at least a count and one word (e.g. blank lines) are skipped.
    """
    loaded = 0
    with open(path,'r',encoding='ISO-8859-1') as file:
        #each line contains first the frequency then the sentence
        for line in file:
            fields = line.split()
            if len(fields) < 2:
                continue

            #the first count seen for an n-gram wins
            sen = ' '.join(fields[1:])
            if sen not in ngram_dict:
                ngram_dict[sen] = int(fields[0])

            #every listed word adds one to its vocabulary count
            for word in fields[1:]:
                vocab_dict[word] = vocab_dict.get(word, 0) + 1
            loaded += size
    return loaded


def loadCorpus1(bi_dict,tri_dict,quad_dict,vocab_dict):
    """Load pre-computed n-gram counts from ``w2_.txt``, ``w3_.txt`` and ``w4_.txt``.

    Each file is read from the current working directory as ISO-8859-1 text
    with one n-gram per line: the frequency first, then the words.

    Args:
        bi_dict: Mapping "w1 w2" -> count, filled from ``w2_.txt``.
        tri_dict: Mapping "w1 w2 w3" -> count, filled from ``w3_.txt``.
        quad_dict: Mapping "w1 w2 w3 w4" -> count, filled from ``w4_.txt``.
        vocab_dict: Mapping word -> number of n-gram lines mentioning it
            (one per mention), updated in place.

    Returns:
        2, 3 or 4 added per accepted bigram, trigram or quadgram line; this is
        the value the rest of the notebook uses as the token count.

    Raises:
        ValueError: If the first field of a line is not an integer.
    """
    token_len = 0
    sources = (('w2_.txt', 2, bi_dict), ('w3_.txt', 3, tri_dict), ('w4_.txt', 4, quad_dict))
    for path, size, ngram_dict in sources:
        token_len += _load_ngram_counts(path, size, ngram_dict, vocab_dict)

    print(len(bi_dict),len(tri_dict),len(quad_dict))
    return token_len

### Create a Hash Table for Probable words for Trigram sentences

In [None]:
def findQuadgramProb(vocab_dict, bi_dict, tri_dict, quad_dict, quad_prob_dict):
    """Build the trigram -> candidate-next-word table with add-one smoothing.

    For every quadgram "w1 w2 w3 w4" the estimate
    ``(count(w1 w2 w3 w4) + 1) / (count(w1 w2 w3) + V)`` is appended as
    ``[prob, w4]`` to ``quad_prob_dict["w1 w2 w3"]``, where ``V`` is the number
    of distinct words in ``vocab_dict``.  ``bi_dict`` is accepted but not used.

    Returns nothing; ``quad_prob_dict`` is updated in place and the table sizes
    are printed before and after.
    """
    print(len(vocab_dict),len(bi_dict),len(tri_dict),len(quad_dict))
    vocab_size = len(vocab_dict)

    for quad_sen, quad_count in quad_dict.items():
        words = quad_sen.split()
        #the first three words are the lookup key
        context = ' '.join(words[:3])
        #add-one smoothed estimate
        smoothed = (quad_count + 1) / (tri_dict[context] + vocab_size)
        quad_prob_dict.setdefault(context, []).append([smoothed, words[-1]])

    print('Quad_Prb_dict len:',len(quad_prob_dict))

### Sort the probable words

In [5]:
def sortProbWordDict(bi_prob_dict, tri_prob_dict, quad_prob_dict):
    """Keep only the two best candidates for every trigram context.

    Each value of ``quad_prob_dict`` is a list of ``[prob, word]`` pairs; lists
    holding more than one pair are replaced by their two largest pairs in
    descending order.  ``bi_prob_dict`` and ``tri_prob_dict`` are accepted but
    left untouched.
    """
    for context, candidates in quad_prob_dict.items():
        if len(candidates) <= 1:
            continue
        ranked = sorted(candidates, reverse = True)
        #only at most top 2 most probable words are kept
        quad_prob_dict[context] = ranked[:2]

### For writing the Quad_Dict values to a file

In [None]:
def writeQuads(bi_dict,tri_dict,quad_dict):
    """Dump the n-gram keys (without their counts) to text files, one per line.

    Writes ``quad_dict.txt``, ``tri_dict.txt`` and ``bi_dict.txt``, in that
    order, into the current working directory, overwriting existing files.
    """
    targets = (
        ('quad_dict.txt', quad_dict),
        ('tri_dict.txt', tri_dict),
        ('bi_dict.txt', bi_dict),
    )
    for path, table in targets:
        with open(path,'w') as file:
            file.writelines(sen + '\n' for sen in table)

In [6]:
def writeProbDicts(bi_prob_dict, tri_prob_dict, quad_prob_dict):
    """Write the quadgram probability table to ``quad_prob_dict_KN.txt``.

    One ``<trigram>: <list of [prob, word] pairs>`` entry is written per
    context, each followed by a blank line.  ``bi_prob_dict`` and
    ``tri_prob_dict`` are accepted but not written.
    """
    with open('quad_prob_dict_KN.txt','w') as file:
        for context, candidates in quad_prob_dict.items():
            entry = context + ": " + str(candidates) + '\n\n'
            file.write(entry)
   

In [None]:
def findTrigramProb(vocab_dict, bi_dict, tri_dict, tri_prob_dict):
    """Build the bigram -> candidate-next-word table with add-one smoothing.

    For every trigram "w1 w2 w3" the estimate
    ``(count(w1 w2 w3) + 1) / (count(w1 w2) + V)`` is appended as ``[prob, w3]``
    to ``tri_prob_dict["w1 w2"]``, where ``V`` is the number of distinct words
    in ``vocab_dict``.

    Returns nothing; ``tri_prob_dict`` is updated in place and the table sizes
    are printed before and after.
    """
    print(len(vocab_dict),len(bi_dict),len(tri_dict))
    #vocabulary length
    vocab_size = len(vocab_dict)

    for tri, tri_count in tri_dict.items():
        words = tri.split()
        #the first two words are the lookup key
        context = ' '.join(words[:2])
        #add-one smoothed estimate
        smoothed = (tri_count + 1) / (bi_dict[context] + vocab_size)
        tri_prob_dict.setdefault(context, []).append([smoothed, words[-1]])

    print('Tri_Prb_dict len:',len(tri_prob_dict))

In [None]:
def findBigramProb(vocab_dict, bi_dict, bi_prob_dict):
    """Build the word -> candidate-next-word table with add-one smoothing.

    For every bigram "w1 w2" the estimate
    ``(count(w1 w2) + 1) / (count(w1) + V)`` is appended as ``[prob, w2]`` to
    ``bi_prob_dict[w1]``, where ``V`` is the number of distinct words in
    ``vocab_dict`` measured before the loop starts.

    Returns nothing; ``bi_prob_dict`` is updated in place and the table sizes
    are printed before and after.
    """
    print(len(vocab_dict),len(bi_dict))
    vocab_size = len(vocab_dict)

    for bi, bi_count in bi_dict.items():
        words = bi.split()
        #the first word is the lookup key
        first_word = words[0]
        #add-one smoothed estimate
        smoothed = (bi_count + 1) / (vocab_dict[first_word] + vocab_size)
        bi_prob_dict.setdefault(first_word, []).append([smoothed, words[-1]])

    print('Bi_Prb_dict len:',len(bi_prob_dict))

## <u>Driver function for doing the prediction</u>

In [None]:
def doPrediction(sen, prob_dict, rank = 1):
    """Return the predicted next word for the context ``sen``.

    Args:
        sen: Context string used as the key into ``prob_dict``.
        prob_dict: Mapping context -> list of ``[prob, word]`` pairs.
        rank: 1-based position of the wanted candidate; when it exceeds the
            number of candidates the first candidate is used instead.

    Returns:
        The word of the selected candidate, or ``"Can't predict"`` when the
        context is not in ``prob_dict``.
    """
    if sen not in prob_dict:
        return "Can't predict"

    candidates = prob_dict[sen]
    position = rank - 1 if rank <= len(candidates) else 0
    return candidates[position][1]

In [None]:
def doInterpolatedPrediction(sen, bi_dict, tri_dict, quad_dict, 
                             vocab_dict,token_len, word_choice, param):
    """Pick the candidate with the highest interpolated add-one probability.

    For each ``[prob, word]`` entry of ``word_choice`` the quadgram, trigram,
    bigram and unigram add-one estimates of "``sen`` word" are computed,
    weighted by ``param[0]`` .. ``param[3]`` and summed.  The raw counts and
    every partial result are printed as a debugging trace.

    Args:
        sen: Three-word context string.
        bi_dict, tri_dict, quad_dict: n-gram count tables keyed by
            space-joined words.
        vocab_dict: Word count table; its size is the smoothing constant V.
        token_len: Token total used in the unigram estimate.
        word_choice: Candidate entries; element ``[1]`` of each is the word.
        param: Four interpolation weights (quad, tri, bi, unigram).

    Returns:
        The winning entry of ``word_choice`` (the whole entry, not only the
        word), or ``''`` when no candidate scores above zero.
    """
    best = ''
    best_prob = 0.0
    vocab_size = len(vocab_dict)

    #for each word choice find the interpolated probability and decide
    for candidate in word_choice:
        quad_key = sen + ' ' + candidate[1]
        words = quad_key.split()

        #quadgram estimate
        tri_context = ' '.join(words[0:3])
        quad_count = quad_dict[quad_key]
        tri_context_count = tri_dict[tri_context]
        quad_term = (quad_count + 1) / (tri_context_count + vocab_size)
        print('quad_dict[' + quad_key + ']:' + str(quad_count),
              '  tri_dict[' + tri_context + ']:' + str(tri_context_count),
              ' Res:', quad_term)

        #trigram estimate
        tri_key = ' '.join(words[1:4])
        bi_context = ' '.join(words[1:3])
        tri_count = tri_dict[tri_key]
        bi_context_count = bi_dict[bi_context]
        tri_term = (tri_count + 1) / (bi_context_count + vocab_size)
        print('tri_dict[' + tri_key + ']:' + str(tri_count),
              '  bi_dict[' + bi_context + ']:' + str(bi_context_count),
              ' Res:', tri_term)

        #bigram estimate
        bi_key = ' '.join(words[2:4])
        bi_count = bi_dict[bi_key]
        prev_word = words[2]
        prev_count = vocab_dict[prev_word]
        bi_term = (bi_count + 1) / (prev_count + vocab_size)
        print('bi_dict[' + bi_key + ']:' + str(bi_count),
              '  vocab_dict[' + prev_word + ']:' + str(prev_count),
              ' Res:', bi_term)

        #unigram estimate
        next_word = words[3]
        next_count = vocab_dict[next_word]
        uni_term = (next_count + 1) / (token_len + vocab_size)
        print('vocab_dict[' + next_word + ']:' + str(next_count),
              '  token_len:' + str(token_len),
              ' Res:', uni_term)

        #weighted parts, added left to right
        quad_part = param[0] * quad_term
        tri_part = param[1] * tri_term
        bi_part = param[2] * bi_term
        uni_part = param[3] * uni_term
        prob = quad_part + tri_part + bi_part + uni_part

        print(candidate[1], prob, quad_part, " ", tri_part, " ", bi_part, " ", uni_part)
        print('\n\n')

        if prob > best_prob:
            best_prob = prob
            best = candidate
    return best

## <u>For Taking input from the User</u>

In [None]:
def takeInput():
    """Prompt until the user enters at least three words.

    The typed text is cleaned with ``removePunctuations`` and split into
    words; inputs with fewer than three words are rejected with a message and
    the prompt is repeated.

    Returns:
        The last three words of the accepted input joined by single spaces.
    """
    while True:
        words = removePunctuations(input('Enter the string\n')).split()
        if len(words) >= 3:
            return " ".join(words[-3:])
        print("Please enter atleast 3 words !")

## <u>Test Score ,Perplexity Calculation:</u>

### For computing the Test Score

In [None]:
def computeTestScore(test_sent,tri_dict,quad_dict,vocab_dict,prob_dict):
    """Count how many test quadgrams have their fourth word predicted correctly.

    For every item of ``test_sent`` the first three words form the context
    passed to ``doPrediction``; the prediction is compared with the fourth
    word.  Each correct prediction is logged to ``test_result.txt`` as
    ``<context> : <word>``.  The file is handled by a ``with`` block so it is
    closed even if a prediction raises.

    Args:
        test_sent: Iterable of word sequences with at least four words each.
        tri_dict, quad_dict, vocab_dict: Accepted but not used.
        prob_dict: Mapping context -> ``[prob, word]`` candidates.

    Returns:
        The number of correct predictions.

    Raises:
        IndexError: If an item of ``test_sent`` has fewer than four words.
    """
    score = 0
    with open('test_result.txt','w') as w:
        for sent in test_sent:
            sen = " ".join(sent[:3])
            correct_word = sent[3]

            result = doPrediction(sen,prob_dict)
            if result == correct_word:
                w.write(sen +" : "+result+'\n')
                score+=1

    return score

### For Computing the Perplexity

In [None]:
def computePerplexity(test_quadgrams,token_len,tri_dict,quad_dict,vocab_dict,prob_dict):
    """Return the product of ``P(w4 | w1 w2 w3) ** (1 / token_len)`` over ``quad_dict``.

    ``P`` is the unsmoothed ratio ``count(quadgram) / count(leading trigram)``.
    ``test_quadgrams``, ``vocab_dict`` and ``prob_dict`` are accepted but not
    used.

    NOTE(review): the value is accumulated over the quadgram table passed in
    rather than over ``test_quadgrams``, and the product is not inverted --
    confirm this is the intended definition of perplexity.

    Raises:
        ZeroDivisionError: If ``token_len`` is 0 or a leading trigram has a
            zero count.
    """
    exponent_base = token_len
    result = float(1.0)

    for quad_sen, quad_count in quad_dict.items():
        context = ' '.join(quad_sen.split()[0:3])
        ratio = quad_count/tri_dict[context]
        result = result * ( ratio**(1./exponent_base))

    return result

## <u> For Computing Interpolated Probability</u>

In [None]:
def interpolatedProbability(quad_token,token_len, vocab_dict, bi_dict, tri_dict, quad_dict, qc, tc, bc,
                            l1 = 0.25, l2 = 0.25, l3 = 0.25 , l4 = 0.25):
    """Return the interpolated add-one probability of a quadgram.

    The quadgram, trigram, bigram and unigram add-one estimates for the last
    word of ``quad_token`` are weighted by ``l1`` .. ``l4`` and summed.

    Counts are read with ``.get(key, 0)`` so that looking up an unseen n-gram
    never inserts it into a ``defaultdict`` table.  This keeps the tables
    unchanged and makes the hit counters below meaningful.

    Args:
        quad_token: Sequence of four words.
        token_len: Token total used in the unigram estimate.
        vocab_dict: Word count table; its size is the smoothing constant V.
        bi_dict, tri_dict, quad_dict: n-gram count tables keyed by
            space-joined words.
        qc, tc, bc: One-element lists used as counters; element 0 is
            incremented when the quadgram, the trailing trigram or the
            trailing bigram, respectively, is present in its table.
        l1, l2, l3, l4: Interpolation weights (quad, tri, bi, unigram).

    Returns:
        The interpolated probability, or 1 if it is not positive.
    """
    V = len(vocab_dict)

    sen = ' '.join(quad_token)
    tri_context = ' '.join(quad_token[0:3])
    tri_key = ' '.join(quad_token[1:4])
    bi_context = ' '.join(quad_token[1:3])
    bi_key = ' '.join(quad_token[2:4])

    prob = (
              l1*((quad_dict.get(sen, 0) + 1)/ (tri_dict.get(tri_context, 0) + V))
            + l2*((tri_dict.get(tri_key, 0) + 1) / (bi_dict.get(bi_context, 0) + V))
            + l3*((bi_dict.get(bi_key, 0) + 1) / (vocab_dict.get(quad_token[2], 0) + V))
            + l4*((vocab_dict.get(quad_token[3], 0) + 1) / (token_len + V))
           )

    #count how often each n-gram order was actually seen in training
    if sen in quad_dict:
        qc[0] += 1
    if tri_key in tri_dict:
        tc[0] += 1
    if bi_key in bi_dict:
        bc[0] += 1

    #a non-positive value is mapped to 1 (log10(1) is 0, log10(0) is undefined)
    if prob <= 0:
        return 1
    return prob

In [None]:
def computeKnesserNeyProb(tri_dict, quad_dict, quad_prob_dict):
    """Fill ``quad_prob_dict`` with Kneser-Ney style estimates for every quadgram.

    For a quadgram "w1 w2 w3 w4" with discount ``d = 0.75`` the estimate is::

        max(count(quad) - d, 0) / count(w1 w2 w3)
          + (ends(w4) / len(quad_dict)) * (d * starts(w1 w2 w3) / count(w1 w2 w3))

    where ``ends(w4)`` is the number of distinct quadgrams ending in ``w4`` and
    ``starts(w1 w2 w3)`` the number of distinct quadgrams beginning with that
    trigram.  Both tallies are collected in one pass over ``quad_dict`` instead
    of rescanning the whole table for every quadgram.

    Each estimate is appended as ``[prob, w4]`` to ``quad_prob_dict["w1 w2 w3"]``.

    Side effects: the trigram and its count are printed for every quadgram,
    and a trigram whose count is 0 is set to 1 in ``tri_dict`` to avoid a
    division by zero (a corpus-specific workaround).
    """
    d = 0.75
    #number of quadgrams ending with a given word
    ending_counts = {}
    #number of quadgrams starting with a given trigram
    context_counts = {}

    for quad in quad_dict:
        quad_token = quad.split()
        last_word = quad_token[-1]
        tri_sen = ' '.join(quad_token[:3])
        ending_counts[last_word] = ending_counts.get(last_word, 0) + 1
        context_counts[tri_sen] = context_counts.get(tri_sen, 0) + 1

    total_quads = len(quad_dict)

    for quad in quad_dict:
        quad_token = quad.split()
        last_word = quad_token[-1]
        tri_sen = ' '.join(quad_token[:3])

        # NOTE: the zero-count workaround below is required for this corpus
        # only; disable it for a different corpus
        print(tri_sen,tri_dict[tri_sen])
        if tri_dict[tri_sen] == 0:
            tri_dict[tri_sen] = 1

        prob = (
                ( max(quad_dict[quad]-d,0) / tri_dict[tri_sen] ) +
                ( (ending_counts[last_word]/total_quads) * (d*context_counts[tri_sen]/tri_dict[tri_sen]) )
               )

        quad_prob_dict.setdefault(tri_sen, []).append([prob,last_word])

In [8]:
def computeKnesserNeyProb1(tri_dict, quad_dict, quad_prob_dict):
    """Fill ``quad_prob_dict`` with Kneser-Ney style estimates in two linear passes.

    The first pass tallies, over the distinct quadgrams in ``quad_dict``, how
    many end with each word and how many start with each trigram.  The second
    pass computes, with discount ``d = 0.75``::

        max(count(quad) - d, 0) / count(w1 w2 w3)
          + (ends(w4) / len(quad_dict)) * (d * starts(w1 w2 w3) / count(w1 w2 w3))

    and appends ``[prob, w4]`` to ``quad_prob_dict["w1 w2 w3"]``.

    Side effect: a trigram whose count is 0 is set to 1 in ``tri_dict`` to
    avoid a division by zero (a corpus-specific workaround).
    """
    d = 0.75
    #count of quadgrams ending with wn, key: unigram
    first_dict = {}
    #count of quadgrams starting with the trigram, key: trigram sentence
    sec_dict = {}

    for quad in quad_dict:
        #split the quad sentence into tokens
        quad_token = quad.split()
        tri_sen = ' '.join(quad_token[:3])

        sec_dict[tri_sen] = sec_dict.get(tri_sen, 0) + 1
        first_dict[quad_token[-1]] = first_dict.get(quad_token[-1], 0) + 1

    for quad in quad_dict:
        quad_token = quad.split()
        tri_sen = ' '.join(quad_token[:3])

        # NOTE: the zero-count workaround below is required for this corpus
        # only; disable it for a different corpus
        if tri_dict[tri_sen] == 0:
            tri_dict[tri_sen] = 1

        prob = (
                ( max(quad_dict[quad]-d,0) / tri_dict[tri_sen] ) +
                ( ( first_dict[quad_token[-1]]/len(quad_dict)) * (d*sec_dict[tri_sen]/tri_dict[tri_sen]) )
               )

        #add the word to probability dictionary
        quad_prob_dict.setdefault(tri_sen, []).append([prob,quad_token[-1]])

## <u>Driver Function for Testing the Language Model</u>

In [9]:
def trainCorpus(train_file,test_file,bi_dict,tri_dict,quad_dict,vocab_dict,prob_dict):
    """Load the n-gram count tables and report how long it took.

    The counts are loaded with ``loadCorpus1``; ``train_file``, ``test_file``
    and ``prob_dict`` are accepted but not used here.  Building and sorting
    the probable-word table (``findQuadgramProb`` / ``sortProbWordDict``) and
    writing the n-gram keys (``writeQuads``) are currently disabled in this
    function, so the second timing message only covers the garbage
    collection.

    Returns:
        The token count reported by ``loadCorpus1``.
    """
    #load the training corpus for the dataset
    token_len = loadCorpus1(bi_dict,tri_dict,quad_dict,vocab_dict)
    print("---Processing Time for Corpus Loading: %s seconds ---" % (time.time() - start_time))

    prob_dict_started = time.time()

    #the probable-word dictionary steps are disabled; only memory is reclaimed
    gc.collect()
    print("---Processing Time for Creating Probable Word Dict: %s seconds ---" % (time.time() - prob_dict_started))
    return token_len
# Disabled test driver: the score / perplexity evaluation below is kept as a
# bare module-level string literal, so none of it is executed.
"""
    test_data = ''
    #Now load the test corpus
    with open('testing_corpus.txt','r') as file :
        test_data = file.read()

    #remove punctuations from the test data
    test_data = removePunctuations(test_data)
    test_token = test_data.split()

    #split the test data into 4 words list
    test_token = test_data.split()
    test_quadgrams = list(ngrams(test_token,4))

    #print(len(test_token))
    start_time1 = time.time()
    score = computeTestScore(test_quadgrams,tri_dict,quad_dict,vocab_dict,prob_dict)
    print('Score:',score)
    print("---Processing Time for computing score: %s seconds ---" % (time.time() - start_time1))

    start_time2 = time.time()
    perplexity = computePerplexity(test_token,token_len,tri_dict,quad_dict,vocab_dict,prob_dict)
    print('Perplexity:',perplexity)
    print("---Processing Time for computing Perplexity: %s seconds ---" % (time.time() - start_time2))

    test_result += 'TEST RESULTS\nScore: '+str(score) + '\nPerplexity: '+str(perplexity)
    with open('test_results.txt','w') as file:
      	file.write(test_result)
"""

'\n    test_data = \'\'\n    #Now load the test corpus\n    with open(\'testing_corpus.txt\',\'r\') as file :\n        test_data = file.read()\n\n    #remove punctuations from the test data\n    test_data = removePunctuations(test_data)\n    test_token = test_data.split()\n\n    #split the test data into 4 words list\n    test_token = test_data.split()\n    test_quadgrams = list(ngrams(test_token,4))\n\n    #print(len(test_token))\n    start_time1 = time.time()\n    score = computeTestScore(test_quadgrams,tri_dict,quad_dict,vocab_dict,prob_dict)\n    print(\'Score:\',score)\n    print("---Processing Time for computing score: %s seconds ---" % (time.time() - start_time1))\n\n    start_time2 = time.time()\n    perplexity = computePerplexity(test_token,token_len,tri_dict,quad_dict,vocab_dict,prob_dict)\n    print(\'Perplexity:\',perplexity)\n    print("---Processing Time for computing Perplexity: %s seconds ---" % (time.time() - start_time2))\n\n    test_result += \'TEST RESULTS\nScore: \

## <u>main function</u>

In [None]:
# Disabled stand-alone entry point.  It is kept inside a string literal so that
# running this cell defines nothing; remove the surrounding triple quotes (and
# those around the __main__ guard in the next cell) to enable it.
"""
def main():

    #variable declaration
    vocab_dict = defaultdict(int)          #for storing the different words with their frequencies
    bi_dict = defaultdict(int)             #for keeping count of sentences of two words
    tri_dict = defaultdict(int)            #for keeping count of sentences of three words
    quad_dict = defaultdict(int)           #for keeping count of sentences of four words
    prob_dict = OrderedDict()              #for storing the probabilities of probable words for a sentence

    #load the corpus for the dataset
    loadCorpus('corpusfile.txt',bi_dict,tri_dict,quad_dict,vocab_dict)
    print("---Preprocessing Time for Corpus loading: %s seconds ---" % (time.time() - start_time))

    start_time1 = time.time()
    #for writing the n-gram keys to files
    writeQuads(bi_dict,tri_dict,quad_dict)
    #creates a dictionary of probable words
    findQuadgramProb(vocab_dict,bi_dict,tri_dict,quad_dict,prob_dict)
    #sort the dictionary of probable words (only the quadgram table is sorted)
    sortProbWordDict(OrderedDict(),OrderedDict(),prob_dict)

    gc.collect()
    print("---Preprocessing Time for Creating Probable Word Dict: %s seconds ---" % (time.time() - start_time1))

    input_sen = takeInput()

    start_time2 = time.time()
    prediction = doPrediction(input_sen,prob_dict)
    print('Word Prediction:',prediction)
    print("---Time for Prediction Operation: %s seconds ---" % (time.time() - start_time2))
"""

In [None]:
# Disabled script entry guard: kept as a string literal, so main() is never
# called from this cell.
"""
if __name__ == '__main__':
    main()
"""

## <i><u>For Debugging Purpose Only</u></i>
<i>To run the notebook normally, uncomment the two cells above and skip the cells below; the cells below are only needed for debugging.</i>

In [10]:
#count tables: word frequencies and the 2-, 3- and 4-word sentence counts;
#a missing key reads as 0
vocab_dict, bi_dict, tri_dict, quad_dict = (defaultdict(int) for _ in range(4))

#probable-word tables, one per n-gram order
quad_prob_dict, tri_prob_dict, bi_prob_dict = OrderedDict(), OrderedDict(), OrderedDict()

#corpus loading from the raw text file is disabled here, so this only reports
#the time elapsed since start_time was taken
print("---Preprocessing Time for Corpus loading: %s seconds ---" % (time.time() - start_time))

---Preprocessing Time for Corpus loading: 60.84304690361023 seconds ---


In [11]:
# NOTE: trainCorpus does not read train_file or test_file; it loads the n-gram
# count files through loadCorpus1 and returns the resulting token count.
train_file = 'training_corpus.txt'
test_file = 'test_corpus.txt'
#load the corpus for the dataset
token_len = trainCorpus(train_file,test_file,bi_dict,tri_dict,quad_dict,vocab_dict,quad_prob_dict)


1020385 1020009 1034307
---Processing Time for Corpus Loading: 74.39337587356567 seconds ---
---Processing Time for Creating Probable Word Dict: 0.059902191162109375 seconds ---


In [12]:
# Fill quad_prob_dict (trigram context -> [[prob, next word], ...]) with the
# Kneser-Ney style estimates.
computeKnesserNeyProb1(tri_dict, quad_dict, quad_prob_dict)

In [None]:
# Fill bi_prob_dict (first word -> [[prob, next word], ...]) with add-one
# smoothed bigram estimates.
findBigramProb(vocab_dict, bi_dict, bi_prob_dict)

In [None]:
# NOTE(review): this appends add-one estimates to the same quad_prob_dict that
# computeKnesserNeyProb1 fills; running both cells mixes the two kinds of
# estimates in one candidate list -- presumably only one of them should be run.
findQuadgramProb(vocab_dict, bi_dict, tri_dict, quad_dict, quad_prob_dict)

In [13]:
# Keep the two most probable next words per trigram context; only
# quad_prob_dict is changed by this call.
sortProbWordDict(bi_prob_dict, tri_prob_dict, quad_prob_dict)

In [14]:
# Write the quadgram probability table to quad_prob_dict_KN.txt, then run a
# garbage collection pass.
writeProbDicts(bi_prob_dict, tri_prob_dict, quad_prob_dict)
gc.collect()

0