In [30]:
import math
import os
import urllib.request
import zipfile
from collections import Counter
from tqdm import tqdm
from pathlib import Path
import pathlib
import nltk
from nltk import bigrams

In [31]:
data_dir = 'MIR2-data'
with zipfile.ZipFile('{}.zip'.format(data_dir), 'r') as zip_fh:
    zip_fh.extractall()
print('Data unzipped to {}...\n'.format(data_dir))
print('Directory Structure:')
print(data_dir + os.path.sep)
for sub_dir in os.listdir(data_dir):
    if not sub_dir.startswith('.'):
        print('  - ' + sub_dir + os.path.sep)

Data unzipped to MIR2-data...

Directory Structure:
MIR2-data/
  - dev_set/
  - corpus/
  - training_set/


In [32]:
class LanguageModel:
    def __init__(self, lambda_=0.1):
        data_dir = 'MIR2-data/corpus'
        self.lambda_ = lambda_
        self.total_num_tokens = 0 
        self.unigram_counts = Counter()
        self.bigram_counts = Counter()
        for filenames in sorted(os.listdir(data_dir)):
            with open(os.path.join(data_dir, filenames)) as myfile:
                for line in myfile:
                    corpus_token = line.split()
                    self.total_num_tokens += len(corpus_token)
                    self.unigram_counts.update(corpus_token)
                    bigram = list(nltk.bigrams(corpus_token))
                    self.bigram_counts.update(bigram)

In [40]:
class LanguageModel(LanguageModel):
    def get_unigram_logp(self, unigram):
        numerator = self.unigram_counts.get(unigram)
        denominator = self.total_num_tokens
        if numerator is None: 
            return 0
        pmle_uni = math.log(float(numerator) / float(denominator))
        return pmle_uni
    def get_bigram_logp(self, w_1, w_2):
        numerator = self.bigram_counts.get((w_1, w_2))
        denominator = self.unigram_counts.get(w_1)
        pmle_bi = 0
        if numerator is not None:
            pmle_bi = math.log(float(numerator) / float(denominator))
        p_bi = self.lambda_ * (self.get_unigram_logp(w_2)) + (1- self.lambda_)* pmle_bi
        return p_bi
    def get_query_logp(self, query):
        query = query.split()
        p_q = 0
        p_w_0 =  self.get_unigram_logp(query[0])
        if len(query) == 1:
            return p_w_0
        for i in range(0, len(query)-1):
            p_query = self.get_bigram_logp(query[i], query[i+1])
            print(p_query, query[i], query[i+1])
            p_q += p_query
        return p_q
        
        
           

In [42]:
lm = LanguageModel()

print('num. unigrams("{}")'.format(len(lm.unigram_counts))) 
print('num. bigrams("{}")'.format(len(lm.bigram_counts)))
print('num. tokens("{}")'.format(lm.total_num_tokens))


# Test a reasonable query with and without typos (you should try your own)!
query_wo_typo = "sharif university" # write a query without typo
query_w_typo = "sharaf university"  # write a query with typo

p_wo_typo = math.exp(lm.get_query_logp(query_wo_typo))
p_w_typo = math.exp(lm.get_query_logp(query_w_typo))
print('P("{}") == {}'.format(query_wo_typo, p_wo_typo))
print('P("{}") == {}'.format(query_w_typo, p_w_typo))
if p_wo_typo <= p_w_typo:
    print('Are you sure "{}" should be assigned higher probability than "{}"?'
          .format(query_w_typo, query_wo_typo))
    
print("done!")

num. unigrams("347071")
num. bigrams("4454471")
num. tokens("25498340")
-0.8908602215578134 sharif university
-0.5259416242604654 sharaf university
P("sharif university") == 0.41030264971618385
P("sharaf university") == 0.5909986036551534
Are you sure "sharaf university" should be assigned higher probability than "sharif university"?
done!


In [24]:
class BaseEditProbabilityModel:
    def get_edit_logp(self, edited, original):
       # edited_word = edit_function(edited, original)
       # print(edited_word)
        self.edited = edited
        self.original = original
        edit_obj = LanguageModel()
        prob = edit_obj.get_bigram_logp(self.edited, self.original)
        return prob 
       

In [21]:
#epm =  BaseEditProbabilityModel()  # You will define such a subclass later
#original = 'user'
#edited = 'usre'                      # Edited by transposing 'r' and 'e'
#score = epm.get_edit_logp(edited, original)
#print(score)

In [25]:
class EditProbabilityModelSubclass(BaseEditProbabilityModel):
    def edit_function(self,str_1 , str_2):
        str_1 = self.edited
        str_2 = self.original
        a = len(str_1)+1
        b = len(str_2)+1
        d = [[0 for x in range(a)] for x in range(b)]
        for i in range(1, a):
            d[i][0] = i
        for i in range(1, b):
             d[0][i] = i
        for i in range(1, a):
            for j in range(1, b):
                if str_1[i-1] == str_2[j-1]:
                    cost = 0
                else:
                    cost = 1
                d[i][j] = min(d[i-1][j] + 1,     # deletion
                               d[i][j-1] + 1,     # insertion
                               d[i-1][j-1] + cost)  #substitution
                if i>1 and j>1 and str_1[i-1] == str_2[j-2] and str_1[i-2] == str_2[j-1]:
                        d[i][j] = min(d[i][j],
                                   d[i-2][j-2] + 1)  #transposition
        return d[i][j]
    
    

In [26]:
epm = EditProbabilityModelSubclass()  # You will define such a subclass later
original = 'user'
edited = 'usre'                      # Edited by transposing 'r' and 'e'
score = epm.get_edit_logp(edited, original)
print(score)

5.688213428795757e-05


In [27]:
class UniformEditProbabilityModel(BaseEditProbabilityModel):
    def __init__(self, edit_prob):
        self.edit_prob = edit_prob
    def get_edit_logp(self, edited, original):
        edit_obj = LanguageModel()
        prob = edit_obj.get_bigram_logp(edited,original)
        return prob 
       
        

In [28]:
EDIT_PROB = 0.01
epm = UniformEditProbabilityModel(edit_prob=EDIT_PROB)
edited, original = 'usre', 'user'
print(math.isclose(epm.get_edit_logp(edited, original), math.log(EDIT_PROB)))
print(math.isclose(epm.get_edit_logp(original, original), math.log(1. - EDIT_PROB)))



False
False
