In [29]:
import nltk
from collections import Counter, defaultdict

# Preparing Corpus

In [30]:
def getReutersSentences():
  nltk.download('reuters')
  nltk.download('punkt')
  from nltk.corpus import reuters
  return reuters.sents()


In [31]:
def getBrownSentences():
  nltk.download('brown')
  from nltk.corpus import brown
  return brown.sents()

In [32]:
def getWikiSentenes():
  from gensim.test.utils import datapath
  from gensim.corpora import WikiCorpus
  path_to_wiki_dump = datapath("enwiki-latest-pages-articles1.xml-p000000010p000030302-shortened.bz2")
  wiki=WikiCorpus(path_to_wiki_dump)
  return wiki.get_texts()

In [33]:
'''
Combine all sentences
'''

reutersSentences=getReutersSentences()
# brownSentences=getBrownSentences()
wikiSentences=getWikiSentenes()

sentencesCorpus=[]

for s in wikiSentences:
  sentencesCorpus.append(s)

for s in reutersSentences:
  sentencesCorpus.append(s)

# for s in brownSentences:
#   sentencesCorpus.append(s)

[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [34]:
print(len(reutersSentences))
print(len(sentencesCorpus))

54716
54822


# Naive Bayes Model

In [35]:
'''
Modeling Naive Bayes
'''
class NaiveBayes:
  def __init__(self):
    pass
  
  def tryToLower(self,u):
    if type(u)==str:
      return u.lower()
    return u

  def countWordFrequency(self):
    self.wordCount=0
    self.wordFrequency = defaultdict(lambda: 0)
    for sentence in self.sentences:
      for word in sentence:
        word=self.tryToLower(word)
        self.wordFrequency[word]+=1
        self.wordCount+=1

  def countBigramFrequency(self):
    from nltk import bigrams
    self.bigramFrequency = defaultdict(lambda: defaultdict(lambda: 0))
    self.nextWords = defaultdict(lambda: set())

    for sentence in self.sentences:
      for wPrev, wCurrent in bigrams(sentence, pad_right=True, pad_left=True):
        wCurrent=self.tryToLower(wCurrent)
        wPrev=self.tryToLower(wPrev)
        self.bigramFrequency[wCurrent][wPrev]+=1
        self.nextWords[wPrev].add(wCurrent)

  def countTrigramFrequency(self):
    from nltk import trigrams
    self.bigramFrequency = defaultdict(lambda: defaultdict(lambda: 0))
    self.partialTrigramFrequency = defaultdict(lambda: defaultdict(lambda: 0))
    self.nextWords = defaultdict(lambda: set())
    self.secondNextWords = defaultdict(lambda: set())

    for sentence in  self.sentences:
      for wPrev2, wPrev1, wCurrent in trigrams(sentence, pad_right=True, pad_left=True):
          wPrev2=self.tryToLower(wPrev2)
          wPrev1=self.tryToLower(wPrev1)
          wCurrent=self.tryToLower(wCurrent)
          self.bigramFrequency[wCurrent][wPrev1] += 1 # number of times w2 is previous word when w3 is current word
          self.partialTrigramFrequency[wCurrent][wPrev2] += 1 # number of times w1 is 2nd previous word when w3 is current word
          self.nextWords[wPrev1].add(wCurrent)
          self.secondNextWords[wPrev2].add(wCurrent)
    
  def calculateConditionalProbablities(self,model):
    for wCurrent in model:
      total_count = float(sum(model[wCurrent].values()))
      for wPrev in model[wCurrent]:
        model[wCurrent][wPrev] /= total_count

  def calculateProbablity(self):
    for word in self.wordFrequency:
      self.wordFrequency[word] /=self.wordCount
  
  def calculateNaiveBayesBigram(self,wPrev,wCurrent):
    return self.wordFrequency[wCurrent]*self.bigramFrequency[wCurrent][wPrev]
   
  def calculateNaiveBayesTrigram(self,wPrev2,wPrev1,wCurrent):
    return self.wordFrequency[wCurrent]*self.bigramFrequency[wCurrent][wPrev1]* self.partialTrigramFrequency[wCurrent][wPrev2]

  def trainGivenWord(self,corpus):
    self.sentences=corpus

    self.countWordFrequency()
    self.countBigramFrequency()
    self.calculateConditionalProbablities(self.bigramFrequency)
    self.calculateProbablity()
  
  def trainGiven2Word(self,corpus):
    self.sentences=corpus
    
    self.countWordFrequency()
    self.countTrigramFrequency()
    self.calculateConditionalProbablities(self.bigramFrequency)
    self.calculateConditionalProbablities(self.partialTrigramFrequency)
    self.calculateProbablity()

  def predictGivenOneWord(self,wPrev):
    predictions=[]
    for wNext in self.nextWords[wPrev]:
      sc=self.calculateNaiveBayesBigram(wPrev,wNext)
      predictions.append((wNext,sc))
    predictions.sort(key=lambda o: o[1],reverse=True)
    return predictions;
  
  def predictGiven2Word(self,wP2,wP1):
    predictions=[]
    for wNext in self.nextWords[wP1] & self.secondNextWords[wP2]:
      sc=self.calculateNaiveBayesTrigram(wP2,wP1,wNext)
      predictions.append((wNext,sc))
    predictions.sort(key=lambda o: o[1],reverse=True)
    return predictions


# Applying Naive Bayes on Corpus

In [36]:
'''
Applying Naive bayes given One word 
'''

nb=NaiveBayes()
nb.trainGivenWord(sentencesCorpus)
secondWords=nb.predictGivenOneWord('is')
print(secondWords)



In [37]:
'''
Applying Naive bayes given Two word 
'''

nbt=NaiveBayes()
nbt.trainGiven2Word(sentencesCorpus)
thirdWords=nbt.predictGiven2Word('is','the')
print(thirdWords)

[('first', 1.1230247545308053e-05), ('largest', 8.776546244970887e-06), ('most', 4.957163150812947e-06), ('world', 4.237934309173298e-06), ('second', 3.834429935177943e-06), ('main', 3.771881682067424e-06), ('only', 3.625161413229904e-06), ('possibility', 3.206242170971408e-06), ('same', 2.886704401821027e-06), ('oldest', 2.6943502170299097e-06), ('highest', 2.3809054837257654e-06), ('country', 2.3760201846755153e-06), ('predominant', 2.300055063318216e-06), ('next', 2.007179660430002e-06), ('case', 1.938009358907015e-06), ('third', 1.862854514092439e-06), ('fact', 1.7957056397954267e-06), ('u', 1.7922581757470854e-06), ('fourth', 1.756405684715728e-06), ('best', 1.7138316428479416e-06), ('major', 1.6851046500869955e-06), ('name', 1.6654413305194673e-06), ('federal', 1.6481623988300014e-06), ('latest', 1.6048946512098984e-06), ('need', 1.5631442177890789e-06), ('number', 1.5499470156414644e-06), ('nation', 1.4605349652070673e-06), ('problem', 1.3355710070271575e-06), ('study', 1.307983

# Model Evaluation 

In [38]:
!pip install language_tool_python



In [39]:
'''
Setting up python language tool to check grammer 
'''

import language_tool_python
tool = language_tool_python.LanguageTool('en-US')
def checkGrammaticalMistakes(words=[]):
  text = ' '.join(words)  
  matches = tool.check(text)
  return len(matches)-1



In [40]:
def getEvaluationPercantage(givenWords,predictedWords):
  total=len(predictedWords)
  correct=0
  for pWord in predictedWords:
    if checkGrammaticalMistakes(givenWords+[pWord[0]])==0:
      correct+=1
  
  return (correct/total)*100 

In [41]:
'''
got score 99.0 when given one word
'''
print(getEvaluationPercantage(['is'],secondWords[:100]))

99.0


In [None]:
'''
got score 97.0 when given two word less than given one word because naive bayes' independance property
'''
print(getEvaluationPercantage(['is','the'],thirdWords[:100]))

97.0
