In [1]:
import glob
import json
import math
import os
import pickle

In [2]:
import re
from nltk import bigrams,trigrams
from collections import Counter

In [44]:
# Function words (articles, pronouns, prepositions, conjunctions, auxiliaries)
# that the predictors should not recommend as the "next word".
stopwords = [
    'the', 'as', 'he', 'she', 'they', 'a', 'into', 'of', 'with', 'this',
    'in', 'and', 'for', 'is', 'was', 'an', 'on', 'at', 'to',
]

In [109]:
# Q1: Corpus Creation
def extract_body_text(filename: str) -> str:
    """Return the lower-cased body text of one paper stored as a JSON file.

    Args:
        filename: Path to a JSON file. When the decoded document contains a
            'body_text' entry, that entry is iterated and the 'text' field of
            each item is concatenated in order.

    Returns:
        The concatenated text followed by a single newline, lower-cased.
        A document without 'body_text' yields just the newline.
    """
    # The context manager closes the handle even if json.load raises; the
    # encoding is explicit so decoding does not depend on the platform locale.
    with open(filename, encoding='utf-8') as file:
        paper_content = json.load(file)
    paragraphs = paper_content['body_text'] if 'body_text' in paper_content else []
    # A single join avoids rebuilding the growing string once per paragraph.
    body_text = ''.join(bt['text'] for bt in paragraphs)
    return (body_text + '\n').lower()


# Write one plain-text file per paper: pdf_json/<id>.json -> textfiles/<id>.txt
os.makedirs('textfiles', exist_ok=True)  # the output folder may not exist on a fresh checkout
for name in glob.glob('pdf_json/*'):
    body = extract_body_text(name)
    # Derive the output name from the path itself instead of slicing at fixed
    # offsets, which silently breaks if the folder name or extension changes.
    stem = os.path.splitext(os.path.basename(name))[0]
    # 'with' flushes and closes every file, including the last one, which would
    # otherwise stay open for as long as the kernel keeps the variable alive.
    with open(os.path.join('textfiles', stem + '.txt'), 'w', encoding='utf-8') as f:
        f.write(body)

In [110]:
#Q2- Preprocessing step
# Reads every file in textfiles/, normalises it and writes the result under
# the same file name in tfpp/.
os.makedirs('tfpp', exist_ok=True)  # the output folder may not exist yet
parent_list = os.listdir("textfiles")
for child in parent_list:
    # Context managers close both handles so the last output file is fully
    # flushed before later cells read tfpp/.
    with open(os.path.join('textfiles', child), 'r', encoding='utf-8') as f:
        pptext = f.read()
    # Inside a character class '+' and '?' are literal: commas, digits, '+',
    # '%' and '?' are each replaced by a space.
    pptext = re.sub(r'[,\d+%?]',' ',pptext)
    # Drop everything that is not an ASCII letter, whitespace, '.' or '-'
    # (foreign characters, brackets, remaining punctuation).
    pptext = re.sub(r'([^a-zA-Z\s\.\-]|[\[\]])','',pptext)
    # Put a space after each hyphen so a hyphenated compound splits into
    # separate tokens, the first one keeping its trailing '-'.
    pptext = re.sub(r'-','- ',pptext)
    # Lower-case and collapse all whitespace runs into single spaces.
    pptext = ' '.join(pptext.lower().split())
    with open(os.path.join('tfpp', child), 'w', encoding='utf-8') as w:
        w.write(pptext)

In [3]:
#Q3- Finding Vocab Count
# vocab maps each distinct word to its corpus frequency; len(vocab) is the
# vocabulary size used later as the smoothing constant.
parent_list = os.listdir('tfpp')
vocab = Counter()
for child in parent_list:
    # Close each file as soon as it has been read instead of leaking handles.
    with open(os.path.join('tfpp', child), 'r', encoding='utf-8') as f:
        # Full stops are only sentence separators here, so turn them into
        # spaces before tokenising. Counting file by file avoids holding one
        # list with every token of the corpus in memory.
        vocab.update(f.read().replace('.', ' ').split())
print('Vocab length is: '+str(len(vocab)))

Vocab length is: 1163688


In [112]:
#Q-4- Building Bigram Model
# Only raw bigram counts are stored; probabilities are derived on demand.
parent_list=os.listdir('tfpp')
bigram_freq=Counter()

for child in parent_list:
    # Read with the same encoding the files were written with, and close the
    # handle right after reading.
    with open(os.path.join('tfpp', child), 'r', encoding='utf-8') as f:
        sents = f.read().split('.')
    # Sentences are counted separately so no bigram spans a full stop.
    for sentence in sents:
        words = sentence.split()
        bigram_freq.update(bigrams(words))

NameError: name 'pickle' is not defined

In [4]:
#Building Trigram Model
# List the input files here rather than relying on a 'parent_list' left behind
# by a previously executed cell, so this cell also works on a fresh kernel.
parent_list = os.listdir('tfpp')
trigram_freq=Counter()
for child in parent_list:
    # Read with the same encoding the files were written with, and close the
    # handle right after reading.
    with open(os.path.join('tfpp', child), 'r', encoding='utf-8') as f:
        sents = f.read().split('.')
    # Sentences are counted separately so no trigram spans a full stop.
    for sentence in sents:
        words = sentence.split()
        trigram_freq.update(trigrams(words))  # trigrams() comes from nltk

In [5]:
# Persist the trigram counts so the model can be reloaded without recounting.
with open('trigram_model.pickle', 'wb') as trigram_out:
    pickle.dump(trigram_freq, trigram_out)

In [116]:
# Persist the bigram counts so the model can be reloaded without recounting.
with open('bigram_model.pickle', 'wb') as bigram_out:
    pickle.dump(bigram_freq, bigram_out)

In [9]:
# Restore both count tables from the pickles written by the cells above.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles produced by this notebook.
with open('bigram_model.pickle', 'rb') as bigram_in:
    bigram_freq = pickle.load(bigram_in)
with open('trigram_model.pickle', 'rb') as trigram_in:
    trigram_freq = pickle.load(trigram_in)
def prob_words_bi(atuple):
    """Add-one smoothed estimate of P(w(i+1) | w(i)) for one bigram.

    Args:
        atuple: (w(i), w(i+1)) pair of words.

    Returns:
        (count(bigram) + 1) / (count(w(i)) + vocabulary size). When w(i+1) is
        in ``stopwords`` the bigram count is treated as 0, so a stopword
        scores as if the pair had never been observed.
    """
    if atuple[1] in stopwords:
        smoothed_count = 1
    else:
        smoothed_count = bigram_freq[atuple] + 1
    return smoothed_count / (vocab[atuple[0]] + len(vocab))
def pred_next_bigram(word):
    """Predict the most probable word to follow ``word`` under the bigram model.

    Args:
        word: The preceding word.

    Returns:
        The second element of the observed bigram starting with ``word`` that
        has the highest ``prob_words_bi`` score (first one wins on ties).

    Raises:
        ValueError: If no bigram in the model starts with ``word``.
    """
    # Linear scan over every observed bigram: bigram_freq is keyed by the full
    # pair, so there is no index by first word.
    candidates = [pair for pair in bigram_freq if pair[0] == word]
    if not candidates:
        # Without this guard max() fails with the uninformative
        # "max() arg is an empty sequence".
        raise ValueError('no bigram in the model starts with {!r}'.format(word))
    return max(candidates, key=prob_words_bi)[1]
def prob_words_tri(atuple):
    """Add-one smoothed estimate of P(w(i+2) | w(i), w(i+1)) for one trigram.

    Args:
        atuple: (w(i), w(i+1), w(i+2)) triple of words.

    Returns:
        (count(trigram) + 1) / (count(leading bigram) + vocabulary size). When
        the last word is in ``stopwords`` the trigram count is treated as 0.
    """
    if atuple[2] in stopwords:
        smoothed_count = 1
    else:
        smoothed_count = trigram_freq[atuple] + 1
    return smoothed_count / (bigram_freq[atuple[:2]] + len(vocab))
def pred_next_trigram(atuple):
    """Predict the most probable word to follow a two-word context.

    Args:
        atuple: The two preceding words, in order. Any two-item sequence is
            accepted; it is converted to a tuple before matching.

    Returns:
        The third element of the observed trigram starting with the context
        that has the highest ``prob_words_tri`` score (first one wins on ties).

    Raises:
        ValueError: If no trigram in the model starts with the context.
    """
    # Trigram keys are tuples and a tuple never compares equal to a list, so
    # normalise the context first; otherwise a list would silently match nothing.
    context = tuple(atuple)
    candidates = [triple for triple in trigram_freq if triple[:2] == context]
    if not candidates:
        # Without this guard max() fails with the uninformative
        # "max() arg is an empty sequence".
        raise ValueError('no trigram in the model starts with {!r}'.format(context))
    return max(candidates, key=prob_words_tri)[2]

In [38]:
def ten_probable_bi(word):
    """Return up to ten observed bigrams starting with ``word``, best first.

    Bigrams are ranked by ``prob_words_bi`` in descending order; the result is
    a list of (word, next_word) tuples and is empty when ``word`` never starts
    a bigram in the model.
    """
    starting_with_word = (pair for pair in bigram_freq if pair[0] == word)
    ranked = sorted(starting_with_word, key=prob_words_bi, reverse=True)
    return ranked[:10]
def ten_probable_tri(atuple):
    """Return up to ten observed trigrams starting with ``atuple``, best first.

    ``atuple`` is the two-word context as a tuple. Trigrams are ranked by
    ``prob_words_tri`` in descending order; the result is a list of 3-tuples
    and is empty when the context never starts a trigram in the model.
    """
    starting_with_context = (triple for triple in trigram_freq if triple[:2] == atuple)
    ranked = sorted(starting_with_context, key=prob_words_tri, reverse=True)
    return ranked[:10]

In [7]:
def perplexity_bi(sentence):
    """Perplexity of ``sentence`` under the smoothed bigram model.

    Args:
        sentence: Whitespace-separated words.

    Returns:
        (1 / product of the bigram probabilities) ** (1 / n), where n is the
        number of words in the sentence.

    Raises:
        ZeroDivisionError: If the sentence contains no words.
    """
    words = sentence.split()
    n = len(words)
    # Sum log-probabilities instead of multiplying raw probabilities: the
    # product of many tiny values underflows to 0.0 on longer sentences.
    log_prob = sum(math.log(prob_words_bi(pair)) for pair in bigrams(words))
    # NOTE(review): the root is taken over the word count n, not over the n-1
    # bigram terms; kept as is so previously reported scores stay comparable.
    return math.exp(-log_prob / n)
def perplexity_tri(sentence):
    """Perplexity of ``sentence`` under the smoothed trigram model.

    Args:
        sentence: Whitespace-separated words.

    Returns:
        (1 / product of the trigram probabilities) ** (1 / n), where n is the
        number of words in the sentence.

    Raises:
        ZeroDivisionError: If the sentence contains no words.
    """
    words = sentence.split()
    n = len(words)
    # Sum log-probabilities instead of multiplying raw probabilities: the
    # product of many tiny values underflows to 0.0 on longer sentences.
    log_prob = sum(math.log(prob_words_tri(triple)) for triple in trigrams(words))
    # NOTE(review): the root is taken over the word count n, not over the n-2
    # trigram terms; kept as is so previously reported scores stay comparable.
    return math.exp(-log_prob / n)

In [10]:
def pred_missing_text_bi(sentence):
    """Fill every '%%' placeholder in ``sentence`` using the bigram model.

    Placeholders are filled left to right, each from the word immediately
    before it (which may itself be a word predicted for an earlier blank).

    Args:
        sentence: Whitespace-separated words with '%%' marking each blank.

    Returns:
        The sentence with every placeholder replaced, words joined by single
        spaces.

    Raises:
        ValueError: If the sentence starts with a placeholder, because there
            is no preceding word to condition on.
    """
    words = sentence.split()
    for i, token in enumerate(words):
        if token != '%%':
            continue
        if i < 1:
            # words[i-1] would wrap around to the LAST word of the sentence
            # and silently produce a prediction from the wrong context.
            raise ValueError("'%%' at the start of the sentence has no preceding word")
        words[i] = pred_next_bigram(words[i - 1])
    return ' '.join(words)
def pred_missing_text_tri(sentence):
    """Fill every '%%' placeholder in ``sentence`` using the trigram model.

    Placeholders are filled left to right, each from the two words immediately
    before it (which may include words predicted for earlier blanks).

    Args:
        sentence: Whitespace-separated words with '%%' marking each blank.

    Returns:
        The sentence with every placeholder replaced, words joined by single
        spaces.

    Raises:
        ValueError: If a placeholder is the first or second word, because two
            preceding words are needed as context.
    """
    words = sentence.split()
    for i, token in enumerate(words):
        if token != '%%':
            continue
        if i < 2:
            # Negative indices would wrap around to the END of the sentence
            # and silently produce a prediction from the wrong context.
            raise ValueError("'%%' needs two preceding words for a trigram prediction")
        words[i] = pred_next_trigram((words[i - 2], words[i - 1]))
    return ' '.join(words)

In [30]:
#Q5 find missing text: bigrams
# '%%' marks each blank the model has to fill in.
sen1=r'all houses were %% ventilated'
sen2=r'it aims to develop an integrated %% to reach mmps exposed to malaria with prevention diagnosis and treatment %% by involving non- health %% stakeholders from provincial to community level'
sen3=r'this is because engineers do not work in %% but rather as a team'
for number, test_sentence in enumerate((sen1, sen2, sen3), start=1):
    print('Sentence ' + str(number) + ': ' + pred_missing_text_bi(test_sentence))

Sentence 1: all houses were not ventilated
Sentence 2: it aims to develop an integrated dna to reach mmps exposed to malaria with prevention diagnosis and treatment group by involving non- health care stakeholders from provincial to community level
Sentence 3: this is because engineers do not work in addition but rather as a team


In [31]:
#Q5 find missing text: trigrams
# Same three test sentences as the bigram run, filled with the trigram model.
for number, test_sentence in enumerate((sen1, sen2, sen3), start=1):
    print('Sentence ' + str(number) + ': ' + pred_missing_text_tri(test_sentence))

Sentence 1: all houses were made ventilated
Sentence 2: it aims to develop an integrated approach to reach mmps exposed to malaria with prevention diagnosis and treatment strategies by involving non- health care stakeholders from provincial to community level
Sentence 3: this is because engineers do not work in ensuring but rather as a team


In [45]:
#Q5: top 10 words: bigrams
# One (label, preceding word) entry per blank in the three test sentences.
bigram_queries = [
    ('Sentence 1: ', 'were'),
    ('Sentence 2 word 1: ', 'integrated'),
    ('sentence 2 word 2: ', 'treatment'),
    ('sentence 2 word 3: ', 'health'),
    ('Sentence 3: ', 'in'),
]
for position, (label, previous_word) in enumerate(bigram_queries):
    # Every line except the last is followed by an extra blank line.
    spacer = '\n' if position < len(bigram_queries) - 1 else ''
    print(label + str(ten_probable_bi(previous_word)) + spacer)

Sentence 1: [('were', 'not'), ('were', 'used'), ('were', 'also'), ('were', 'performed'), ('were', 'found'), ('were', 'collected'), ('were', 'obtained'), ('were', 'observed'), ('were', 'identified'), ('were', 'detected')]

Sentence 2 word 1: [('integrated', 'dna'), ('integrated', 'approach'), ('integrated', 'care'), ('integrated', 'moving'), ('integrated', 'system'), ('integrated', 'model'), ('integrated', 'analysis'), ('integrated', 'health'), ('integrated', 'management'), ('integrated', 'within')]

sentence 2 word 2: [('treatment', 'group'), ('treatment', 'options'), ('treatment', 'or'), ('treatment', 'groups'), ('treatment', 'strategies'), ('treatment', 'were'), ('treatment', 'should'), ('treatment', 'has'), ('treatment', 'may'), ('treatment', 'option')]

sentence 2 word 3: [('health', 'care'), ('health', 'organization'), ('health', 'system'), ('health', 'services'), ('health', 'systems'), ('health', 'emergency'), ('health', 'professionals'), ('health', 'authorities'), ('health', 'st

In [47]:
#Q5: top 10 words: trigrams
# One (label, two-word context) entry per blank in the three test sentences.
trigram_queries = [
    ('Sentence 1: ', ('houses', 'were')),
    ('Sentence 2 word 1: ', ('an', 'integrated')),
    ('sentence 2 word 2: ', ('and', 'treatment')),
    ('sentence 2 word 3: ', ('non-', 'health')),
    ('Sentence 3: ', ('work', 'in')),
]
for position, (label, context) in enumerate(trigram_queries):
    # Every line except the last is followed by an extra blank line.
    spacer = '\n' if position < len(trigram_queries) - 1 else ''
    print(label + str(ten_probable_tri(context)) + spacer)

Sentence 1: [('houses', 'were', 'made'), ('houses', 'were', 'constructed'), ('houses', 'were', 'built'), ('houses', 'were', 'investigated'), ('houses', 'were', 'malaria'), ('houses', 'were', 'contacted'), ('houses', 'were', 'tested'), ('houses', 'were', 'then'), ('houses', 'were', 'no'), ('houses', 'were', 'not')]

Sentence 2 word 1: [('an', 'integrated', 'approach'), ('an', 'integrated', 'system'), ('an', 'integrated', 'model'), ('an', 'integrated', 'analysis'), ('an', 'integrated', 'framework'), ('an', 'integrated', 'health'), ('an', 'integrated', 'platform'), ('an', 'integrated', 'part'), ('an', 'integrated', 'view'), ('an', 'integrated', 'one')]

sentence 2 word 2: [('and', 'treatment', 'strategies'), ('and', 'treatment', 'options'), ('and', 'treatment', 'are'), ('and', 'treatment', 'plan'), ('and', 'treatment', 'protocol'), ('and', 'treatment', 'guidelines'), ('and', 'treatment', 'protocols'), ('and', 'treatment', 'services'), ('and', 'treatment', 'program'), ('and', 'treatment', 

In [48]:
#Q6: perplexity score:bigrams
# Three evaluation sentences, already written in the preprocessed form
# (lower case, no digits or punctuation).
sena='it appears that the overall code stroke volume has decreased since the covid- pandemic'
senb='half a century ago hypertension was not treatable'
senc='sarahs tv is broadcasting an advert for private healthcare'
for number, eval_sentence in enumerate((sena, senb, senc), start=1):
    print('Sentence ' + str(number) + ' perplexity score: ' + str(perplexity_bi(eval_sentence)))

Sentence 1 perplexity score: 3585.683485011883
Sentence 2 perplexity score: 9957.038437205303
Sentence 3 perplexity score: 58126.10000276572


In [49]:
#Q6: perplexity score: trigrams
# Same three evaluation sentences as the bigram run, scored with the trigram model.
for number, eval_sentence in enumerate((sena, senb, senc), start=1):
    print('Sentence ' + str(number) + ' perplexity score: ' + str(perplexity_tri(eval_sentence)))

Sentence 1 perplexity score: 9921.30537874307
Sentence 2 perplexity score: 5682.088884112009
Sentence 3 perplexity score: 35534.18736702119
