In [49]:
import nltk
from nltk import bigrams,trigrams 
from nltk.corpus import reuters
from collections import Counter, defaultdict
from gensim.test.utils import datapath
from gensim.corpora import WikiCorpus



WikiDatasetPath = datapath('enwiki-latest-pages-articles1.xml-p000000010p000030302-shortened.bz2')
WikiSentences = WikiCorpus(WikiDatasetPath).get_texts()
print(WikiSentences)

<generator object WikiCorpus.get_texts at 0x7f0ca08cc0f8>


In [50]:
# punkt tokenizer
nltk.download('punkt')
# nltk reuters dataset
nltk.download('reuters')
ReutersSentences  = reuters.sents()
print(ReutersSentences)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', '.', 'S', '.-', 'JAPAN', 'RIFT', 'Mounting', 'trade', 'friction', 'between', 'the', 'U', '.', 'S', '.', 'And', 'Japan', 'has', 'raised', 'fears', 'among', 'many', 'of', 'Asia', "'", 's', 'exporting', 'nations', 'that', 'the', 'row', 'could', 'inflict', 'far', '-', 'reaching', 'economic', 'damage', ',', 'businessmen', 'and', 'officials', 'said', '.'], ['They', 'told', 'Reuter', 'correspondents', 'in', 'Asian', 'capitals', 'a', 'U', '.', 'S', '.', 'Move', 'against', 'Japan', 'might', 'boost', 'protectionist', 'sentiment', 'in', 'the', 'U', '.', 'S', '.', 'And', 'lead', 'to', 'curbs', 'on', 'American', 'imports', 'of', 'their', 'products', '.'], ...]


In [51]:
# calculating probabilities for every possible next words
def calculateWordsProbabilities(sentenceModel):
  for nextWord in sentenceModel:
    nextWords = sentenceModel[nextWord]
    totalWordCount = float(sum(nextWords.values()))
    for previousWord in nextWords:
      nextWords[previousWord] /= totalWordCount

In [52]:
# calculating single word probabilities among every words
def calculateSingleWordProbability(sentenceModel,wordCount):
  for word in sentenceModel:
    sentenceModel[word] /= wordCount

In [53]:
# converting any string to lower
def convertToLower(s):
  if type(s)==str:
    return s.lower()
  return s

In [54]:
sentenceModel4 = defaultdict(lambda: set()) # default value of model's keys is set as set
sentenceModel5 = defaultdict(lambda: set())

# calculating overall word count in the given sentence model
def calculateWordCount(sentenceModel1,sentenceModel2,sentenceModel3,sentences):
  wordCount = 0
  for sentence in sentences:
    for word in sentence:
      wordCount += 1
      sentenceModel1[word] += 1 #storing counts of each word in the first model
    for previousWord2,previousWord1,nextWord in trigrams(sentence,pad_right=True,pad_left=True):
      previousWord1 = convertToLower(previousWord1)
      previousWord2 = convertToLower(previousWord2)
      nextWord = convertToLower(nextWord)
      sentenceModel2[nextWord][previousWord1] += 1 # storing count of just the previous words in case of specific word occuring
      sentenceModel3[nextWord][previousWord2] += 1 # storing count of 2nd previous words in case of specific word occuring
      sentenceModel4[previousWord1].add(nextWord) # adding new word based on the just previous word in the trigram to the sentence model
      sentenceModel5[previousWord2].add(nextWord) # adding new word based on the 2nd previous word in the trigram to the sentence model

  return wordCount

In [55]:
sentenceModel1 = defaultdict(lambda:0)
sentenceModel2 = defaultdict(lambda: defaultdict(lambda:0)) # The argument will be called when we try to access a key that doesn't exist
sentenceModel3 = defaultdict(lambda: defaultdict(lambda:0))

WikiWordCount = calculateWordCount(sentenceModel1,sentenceModel2,sentenceModel3,WikiSentences) # counting words in wiki corpus dataset
print(WikiWordCount)

452944


In [56]:
ReutersWordCount = calculateWordCount(sentenceModel1,sentenceModel2,sentenceModel3,ReutersSentences) # counting words in Reuters dataset
print(ReutersWordCount)

1720917


In [57]:
calculateWordsProbabilities(sentenceModel2) # checking probabilities in model 2
calculateWordsProbabilities(sentenceModel3) # checking probabilities in model 3

totalWord = WikiWordCount + ReutersWordCount
calculateSingleWordProbability(sentenceModel1,totalWord) # checking each word probability in model 1

In [58]:
ProbabilityWordsList = [] 
# getting word suggestion by placing two words
def WordSuggestionsByTrigram(previousWord2,previousWord1): 
  for nextWord in sentenceModel4[previousWord1] & sentenceModel5[previousWord2]:
    naiveBayesTrigramValue = sentenceModel1[nextWord]*sentenceModel2[nextWord][previousWord1]*sentenceModel3[nextWord][previousWord2] # using naive bayes to get the weight of the each trigram 
    ProbabilityWordsList.append((nextWord,naiveBayesTrigramValue)) # storing the predicted words and the weights of trigram 


In [59]:
# test of suggestions given by placing words "i have" 
WordSuggestionsByTrigram('i','have')
ProbabilityWordsList.sort(key=lambda o:o[1],reverse=True)
print(*ProbabilityWordsList[:10])

('not', 3.385947855674401e-07) ('been', 2.524795704851632e-07) ('no', 9.284582792204195e-08) ('talked', 7.240914088224012e-08) ('deeply', 6.272877445413316e-08) ('to', 5.211120999664473e-08) ('encouraged', 3.606486339282962e-08) ('committed', 2.872077082187158e-08) ('nothing', 2.564017843979544e-08) ('never', 2.2837426160606398e-08)


In [None]:
while(True):
    text = input("Enter your Sentence: ")
    if text == "":
        print("Stopping The Program.....")
        break
    
    else:
        try:
            ProbabilityWordsList = [] 
            text = text.split(" ")
            WordSuggestionsByTrigram(text[0],text[1])
            ProbabilityWordsList.sort(key=lambda o:o[1],reverse=True)
            print(*ProbabilityWordsList[:10])
            
        except:
            continue

Enter your Sentence: how will
('proceed', 1.5090538782291351e-07) ('take', 9.567398144104542e-08) ('affect', 9.25827376927431e-08) ('come', 7.443465428810637e-08) ('interpret', 7.077092502517588e-08) ('make', 6.015915796212672e-08) ('become', 5.640078870849599e-08) ('use', 5.615341814810161e-08) ('increase', 4.462008976316627e-08) ('determine', 4.2299863233438456e-08)
Enter your Sentence: we shall
('fare', 4.600110126636432e-07) ('be', 3.386337237335402e-08) ('not', 2.7202118079413788e-08) ('touch', 1.6428964737987255e-08) ('probably', 7.616076368603364e-09) ('have', 6.670696472976096e-09) ('continue', 5.769111788649496e-09) ('use', 5.104856195281964e-09) ('immediately', 2.0675112263718385e-09) ('any', 1.068033779773917e-09)
Enter your Sentence: will you
('have', 4.5482021406655204e-08) ('need', 3.227643487147699e-08) ('saved', 2.2936837196525143e-08) ('asked', 1.739182376766841e-08) ('do', 1.5861249283525236e-08) ('cannot', 1.5856039804193656e-08) ('assess', 1.533370042212144e-08) ('c