In [22]:
import nltk
from collections import Counter, defaultdict

# Preparing Corpus

In [23]:
def getReutersSentences():
  '''
  Download the NLTK resources used by this notebook and return the Reuters sentences.

  Calls nltk.download for 'reuters' and then 'punkt', and returns the result
  of reuters.sents().
  '''
  for resource in ('reuters', 'punkt'):
    nltk.download(resource)
  from nltk.corpus import reuters as reutersCorpus
  return reutersCorpus.sents()


In [24]:
def getWikiSentenes():
  '''
  Return the tokenised texts of gensim's bundled, shortened Wikipedia dump.

  The dump is located with gensim.test.utils.datapath and read through
  WikiCorpus; the value returned is whatever WikiCorpus.get_texts() yields
  (presumably a one-shot iterator of token lists, one per article --
  TODO confirm against the gensim version in use).
  '''
  from gensim.test.utils import datapath
  from gensim.corpora import WikiCorpus
  path_to_wiki_dump = datapath("enwiki-latest-pages-articles1.xml-p000000010p000030302-shortened.bz2")
  # Only the raw texts are needed here. Supplying a dictionary up front keeps
  # WikiCorpus from making an extra full pass over the dump just to build a
  # vocabulary that this notebook never reads.
  wiki = WikiCorpus(path_to_wiki_dump, dictionary={})
  return wiki.get_texts()

In [25]:
'''
Build the combined corpus: every Wikipedia text first, followed by every
Reuters sentence.
'''

reutersSentences = getReutersSentences()
wikiSentences = getWikiSentenes()

# Materialise the wiki texts into a list, then append the Reuters sentences.
sentencesCorpus = list(wikiSentences)
sentencesCorpus.extend(reutersSentences)

[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [26]:
# Size of the Reuters portion, then of the combined corpus (one number per line).
print(len(reutersSentences), len(sentencesCorpus), sep='\n')

54716
54822


# Naive Bayes Model

In [27]:
'''
Modeling Naive Bayes
'''
class NaiveBayes:
  '''
  Next-word predictor that ranks candidates with a Naive Bayes style product.

  Every training token is lower-cased (see tryToLower). After training:
    wordFrequency[w]                  -> share of all tokens that are w
    bigramFrequency[w][p]             -> share of w's occurrences preceded by p
    partialTrigramFrequency[w][p2]    -> share of w's occurrences whose second
                                         previous token is p2 (two-word model only)
    nextWords[p] / secondNextWords[p2] -> candidate words seen one / two
                                         positions after p / p2

  A candidate w is scored as wordFrequency[w] * bigramFrequency[w][prev1]
  (times partialTrigramFrequency[w][prev2] for the two-word model).

  Notes:
    * The n-gram windows are padded on both sides, so the padding marker
      (presumably None, NLTK's default -- TODO confirm) can show up as a
      candidate. It is never counted in wordFrequency, so its score is 0.
    * trainGivenWord and trainGiven2Word both rebuild bigramFrequency and
      nextWords, and the corpus is iterated more than once, so it must be
      re-iterable (e.g. a list).
  '''
  def __init__(self):
    pass

  def tryToLower(self,u):
    '''Return u lower-cased when it is a string; return anything else unchanged.'''
    if isinstance(u, str):
      return u.lower()
    return u

  def countWordFrequency(self):
    '''Count every (lower-cased) token of self.sentences into wordFrequency / wordCount.'''
    self.wordCount=0
    self.wordFrequency = defaultdict(int)
    for sentence in self.sentences:
      for word in sentence:
        self.wordFrequency[self.tryToLower(word)]+=1
        self.wordCount+=1

  def countBigramFrequency(self):
    '''Count padded bigrams: bigramFrequency[current][previous] and nextWords[previous].'''
    from nltk import bigrams
    self.bigramFrequency = defaultdict(lambda: defaultdict(int))
    self.nextWords = defaultdict(set)

    for sentence in self.sentences:
      for wPrev, wCurrent in bigrams(sentence, pad_right=True, pad_left=True):
        wCurrent=self.tryToLower(wCurrent)
        wPrev=self.tryToLower(wPrev)
        self.bigramFrequency[wCurrent][wPrev]+=1
        self.nextWords[wPrev].add(wCurrent)

  def countTrigramFrequency(self):
    '''
    Count padded trigrams (wPrev2, wPrev1, wCurrent).

    Fills bigramFrequency[current][prev1], partialTrigramFrequency[current][prev2],
    nextWords[prev1] and secondNextWords[prev2]. The two previous words are
    counted independently of each other, not as a joint pair.
    '''
    from nltk import trigrams
    self.bigramFrequency = defaultdict(lambda: defaultdict(int))
    self.partialTrigramFrequency = defaultdict(lambda: defaultdict(int))
    self.nextWords = defaultdict(set)
    self.secondNextWords = defaultdict(set)

    for sentence in self.sentences:
      for wPrev2, wPrev1, wCurrent in trigrams(sentence, pad_right=True, pad_left=True):
        wPrev2=self.tryToLower(wPrev2)
        wPrev1=self.tryToLower(wPrev1)
        wCurrent=self.tryToLower(wCurrent)
        self.bigramFrequency[wCurrent][wPrev1] += 1 # times wPrev1 directly precedes wCurrent
        self.partialTrigramFrequency[wCurrent][wPrev2] += 1 # times wPrev2 is two positions before wCurrent
        self.nextWords[wPrev1].add(wCurrent)
        self.secondNextWords[wPrev2].add(wCurrent)

  def calculateConditionalProbablities(self,model):
    '''Normalise model[current][previous] counts in place so each model[current] sums to 1.'''
    for wCurrent in model:
      total_count = float(sum(model[wCurrent].values()))
      for wPrev in model[wCurrent]:
        model[wCurrent][wPrev] /= total_count

  def calculateProbablity(self):
    '''Turn the raw wordFrequency counts into relative frequencies (count / wordCount).'''
    for word in self.wordFrequency:
      self.wordFrequency[word] /=self.wordCount

  def calculateNaiveBayesBigram(self,wPrev,wCurrent):
    '''
    Score wCurrent as the word following wPrev.

    Uses .get so that scoring an unseen word yields 0 without inserting empty
    entries into the model tables.
    '''
    return self.wordFrequency.get(wCurrent,0)*self.bigramFrequency.get(wCurrent,{}).get(wPrev,0)

  def calculateNaiveBayesTrigram(self,wPrev2,wPrev1,wCurrent):
    '''Score wCurrent as the word following the pair (wPrev2, wPrev1); read-only like the bigram score.'''
    return (self.wordFrequency.get(wCurrent,0)
            * self.bigramFrequency.get(wCurrent,{}).get(wPrev1,0)
            * self.partialTrigramFrequency.get(wCurrent,{}).get(wPrev2,0))

  def trainGivenWord(self,corpus):
    '''Train the one-word-context model on corpus (an iterable of token sequences).'''
    self.sentences=corpus

    self.countWordFrequency()
    self.countBigramFrequency()
    self.calculateConditionalProbablities(self.bigramFrequency)
    self.calculateProbablity()

  def trainGiven2Word(self,corpus):
    '''Train the two-word-context model on corpus (an iterable of token sequences).'''
    self.sentences=corpus

    self.countWordFrequency()
    self.countTrigramFrequency()
    self.calculateConditionalProbablities(self.bigramFrequency)
    self.calculateConditionalProbablities(self.partialTrigramFrequency)
    self.calculateProbablity()

  def predictGivenOneWord(self,words=[]):
    '''
    Rank the words seen after words[0], best score first.

    words: sequence whose first element is the context word (the default list
      is never mutated; an empty sequence raises IndexError).
    Returns a list of (word, score) tuples, also stored in self.predictions.
    self.givenWords keeps the context exactly as supplied.
    '''
    wPrev=words[0]
    self.givenWords=[wPrev]
    # Training lower-cases every token, so the lookup key must be lower-cased
    # too; otherwise a query such as 'Is' would never match anything.
    key=self.tryToLower(wPrev)
    # .get avoids growing the defaultdict with an empty set for unseen words.
    self.predictions=[(wNext,self.calculateNaiveBayesBigram(key,wNext))
                      for wNext in self.nextWords.get(key,())]
    self.predictions.sort(key=lambda o: o[1],reverse=True)
    return self.predictions

  def predictGiven2Word(self,words=[]):
    '''
    Rank the words seen both right after words[1] and two positions after words[0].

    words: sequence of exactly two context words, oldest first (unpacking any
      other length raises ValueError).
    Returns a list of (word, score) tuples, also stored in self.predictions.
    self.givenWords keeps the context exactly as supplied.
    '''
    wP2,wP1=words
    self.givenWords=[wP2,wP1]
    # Same case normalisation as applied to the training tokens.
    key2=self.tryToLower(wP2)
    key1=self.tryToLower(wP1)
    candidates=self.nextWords.get(key1,set()) & self.secondNextWords.get(key2,set())
    self.predictions=[(wNext,self.calculateNaiveBayesTrigram(key2,key1,wNext))
                      for wNext in candidates]
    self.predictions.sort(key=lambda o: o[1],reverse=True)
    return self.predictions


# Applying Naive Bayes on Corpus

In [28]:
'''
Train the one-word-context model on the combined corpus and show its ten
best-scoring continuations of the context word.
'''
TestData = ['is']

nbOneWordModel = NaiveBayes()
nbOneWordModel.trainGivenWord(sentencesCorpus)

secondWords = nbOneWordModel.predictGivenOneWord(TestData)
secondWords[:10]

[('the', 0.00042183009861256076),
 ('not', 0.00023920572658509443),
 ('expected', 0.00023552563848378528),
 ('a', 0.00019642470240737562),
 ('also', 0.00012006287430521087),
 ('to', 0.00011592277519123807),
 ('an', 9.614230164670142e-05),
 ('in', 9.384224658338319e-05),
 ('likely', 8.234197126679214e-05),
 ('still', 8.050192721613755e-05)]

In [29]:
'''
Train the two-word-context model on the combined corpus and show its ten
best-scoring continuations of the context pair.
'''
TestData = ['is', 'the']

nbTwoWordModel = NaiveBayes()
nbTwoWordModel.trainGiven2Word(sentencesCorpus)

thirdWords = nbTwoWordModel.predictGiven2Word(TestData)
thirdWords[:10]

[('first', 1.1230247545308053e-05),
 ('largest', 8.776546244970887e-06),
 ('most', 4.957163150812947e-06),
 ('world', 4.237934309173298e-06),
 ('second', 3.834429935177943e-06),
 ('main', 3.771881682067424e-06),
 ('only', 3.625161413229904e-06),
 ('possibility', 3.206242170971408e-06),
 ('same', 2.886704401821027e-06),
 ('oldest', 2.6943502170299097e-06)]

# Model Evaluation 

In [30]:
!pip install language_tool_python



In [31]:
 import language_tool_python
 class NaiveBayesModelEvaluation:
   '''
   Scores a model's suggestions by running "given words + suggestion" through
   LanguageTool (en-US) and counting how many phrases come back clean.
   '''
   def __init__(self):
     self.tool = language_tool_python.LanguageTool('en-US')

   def checkGrammaticalMistakes(self,words=[]):
     '''
     Return the number of grammar issues LanguageTool reports for the words
     joined with single spaces, minus one, never going below zero.

     One match is discounted on every phrase -- presumably the
     sentence-start capitalisation hit that lower-cased input always
     triggers (TODO confirm). The result is clamped so that a phrase with
     no matches at all reports 0 mistakes instead of -1.
     '''
     text = ' '.join(words)
     matches = self.tool.check(text)
     return max(len(matches)-1, 0)

   def score(self,model,top=100):
     '''
     Percentage (0-100) of the model's `top` best suggestions that are
     grammatically clean when appended to model.givenWords.

     model: object exposing `predictions` (list of (word, score) tuples, best
       first) and `givenWords` (list of context words).
     top: how many of the best suggestions to evaluate.

     Non-string candidates (e.g. n-gram padding markers) are ignored because
     they cannot be joined into a phrase. Returns 0.0 when there is nothing
     to evaluate instead of dividing by zero.
     '''
     candidates=[p for p in model.predictions if isinstance(p[0], str)][:top]
     total=len(candidates)
     if total==0:
       return 0.0
     correct=0
     for pWord in candidates:
       if self.checkGrammaticalMistakes(model.givenWords+[pWord[0]])==0:
         correct+=1

     return (correct/total)*100

# Evaluator instance used by the scoring cells; constructing it creates the en-US LanguageTool.
modelEval = NaiveBayesModelEvaluation()

In [34]:
'''
Score the one-word model: percentage of its best suggestions that pass the
grammar check when appended to the given word.

Recorded result: 99.0 for the top 100 suggestions.

Function definition:
  modelEval.score(model, top=N) -> evaluates the N best suggestions of model
'''

# top=100 matches the figure quoted above; a score of 99.0 cannot come from 30 samples.
print(modelEval.score(nbOneWordModel,top=100))

99.0


In [35]:
'''
Score the two-word model: percentage of its best suggestions that pass the
grammar check when appended to the two given words.

Recorded result: 97.0 for the top 100 suggestions. This is lower than the
one-word model, attributed here to Naive Bayes' independence assumption
(the two previous words are modelled independently of each other).

Function definition:
  modelEval.score(model, top=N) -> evaluates the N best suggestions of model
'''
# top=100 matches the figure quoted above; a score of 97.0 cannot come from 30 samples.
print(modelEval.score(nbTwoWordModel,top=100))

97.0
