In [1]:
import pandas as pd
import numpy as np


d0 = "cats runs behind rats"
d1 = "dogs runs behind cats"
d2 = "rats runs cats"
d3 ="behind runs cats dogs"

corpus=[d0,d1,d2,d3]

word_bag= set(d0.split()+d1.split()+d2.split()+d3.split())

In [2]:
def term_frequency(corpus,word_bag):
    tf_df = pd.DataFrame(index = list(word_bag), columns = ['d0','d1','d2','d3'])
    for doc,col in zip(corpus,tf_df.columns):
        for word in word_bag:
            tf_df[col][word] = doc.count(word)
                
    return tf_df

In [3]:
df = term_frequency(corpus,word_bag)
df

Unnamed: 0,d0,d1,d2,d3
rats,1,0,1,0
dogs,0,1,0,1
runs,1,1,1,1
behind,1,1,0,1
cats,1,1,1,1


# P_Mc(t) : probability of the term appearing in the whole corpus

In [4]:
len_corpus = len(d0.split()) + len(d1.split()) + len(d2.split()) + len(d3.split())
len_corpus

15

In [12]:
#change axis to 0 and then again to 1 to get correct values
df['P_Mc(t)'] = df.sum(axis=1)/len_corpus
df

Unnamed: 0,d0,d1,d2,d3,P_Mc(t)
rats,1,0,1,0,0.133333
dogs,0,1,0,1,0.133333
runs,1,1,1,1,0.266667
behind,1,1,0,1,0.2
cats,1,1,1,1,0.266667


# PMd(t): the probability of term occurring in the document

In [13]:
df['PMd0(t)'] = df['d0']/(len(d0.split()))
df['PMd1(t)'] = df['d1']/(len(d1.split()))
df['PMd2(t)'] = df['d2']/(len(d2.split()))
df['PMd3(t)'] = df['d3']/(len(d3.split()))
df

Unnamed: 0,d0,d1,d2,d3,P_Mc(t),PMd0(t),PMd1(t),PMd2(t),PMd3(t)
rats,1,0,1,0,0.133333,0.25,0.0,0.333333,0.0
dogs,0,1,0,1,0.133333,0.0,0.25,0.0,0.25
runs,1,1,1,1,0.266667,0.25,0.25,0.333333,0.25
behind,1,1,0,1,0.2,0.25,0.25,0.0,0.25
cats,1,1,1,1,0.266667,0.25,0.25,0.333333,0.25


#  A un-smoothed, unigram model: Puni(d | q) = ∏t∈q PMd(t)

In [14]:
def unsmoothed(q):
    d0 = d1 = d2 = d3 = 1
    for word in q.split():
        d0 = d0 * df['PMd0(t)'][word]
        d1 = d1 * df['PMd1(t)'][word]
        d2 = d2 * df['PMd2(t)'][word]
        d3 = d3 * df['PMd3(t)'][word]
        
    res = [d0,d1,d2,d3]
    return res

In [15]:
q = "behind rats"
sorted(unsmoothed(q),reverse = True)

[0.0625, 0.0, 0.0, 0.0]

In [16]:
q = "behind rats"
unsmoothed(q)

[0.0625, 0.0, 0.0, 0.0]

# linear-interpolated,uni : Pinterp-uni(d | q) = ∏t∈q [λ. PMd(t) + (1-λ). PMc(t)]

In [17]:
def linear(q,lamb):
    d0 = d1 = d2 = d3 = 1
    for word in q.split():
        d0 = d0 * ((lamb * df['PMd0(t)'][word]) + ((1-lamb) * df['P_Mc(t)'][word]))
        d1 = d1 * ((lamb * df['PMd1(t)'][word]) + ((1-lamb) * df['P_Mc(t)'][word]))
        d2 = d2 * ((lamb * df['PMd2(t)'][word]) + ((1-lamb) * df['P_Mc(t)'][word]))
        d3 = d3 * ((lamb * df['PMd3(t)'][word]) + ((1-lamb) * df['P_Mc(t)'][word]))
    res = [d0,d1,d2,d3]
    return res

In [18]:
q = "behind rats"
sorted(linear(q,0.5),reverse=True)

[0.043125, 0.023333333333333334, 0.015, 0.015]

In [19]:
q = "behind rats"
linear(q,0.5)

[0.043125, 0.015, 0.023333333333333334, 0.015]

# Bigram/Trigram

In [20]:
q0 = "cats runs behind rat"
𝑞1 = "dogs runs behind cats"
𝑞2 = "rats runs cats"
𝑞3 = "behind runs cats dogs"
𝑞4 = "behind rats"

corpus_single = q0.split()+q1.split()+q2.split()+q3.split()+q4.split()
corpus_n = q0+" "+q1+" "+q2+" "+q3+" "+q4

vocab = set(q0.split()+q1.split()+q2.split()+q3.split()+q4.split())
corpus_len = len(q0.split()+q1.split()+q2.split()+q3.split()+q4.split())


In [30]:
def puni(s):
    return corpus_single.count(s)/len_corpus
    

In [34]:
def pbi(s2,s1):
    if((corpus_single.count(s1))==0):
        return 0
    else:
        return corpus_n.count(s1+" "+s2)/corpus_single.count(s1)

In [35]:
def ptri(s3,s1,s2):
    if(corpus_n.count(s1+" "+s2)==0):
        return 0
    else:
        return corpus_n.count(s1+" "+s2+" "+s3)/corpus_n.count(s1+" "+s2)

In [36]:
def bigram(s1,s2):
    for i in vocab:
        p = puni(s1) * pbi(s2,s1) * pbi(i,s2)
        print(str(i) + ":" + str(p))

In [37]:
def trigram(s1,s2):
    for i in vocab:
        p = puni(s1) * pbi(s2,s1) * ptri(i,s1,s2)
        print(str(i) + ":" + str(p))

In [40]:
bigram('cats','runs')

rats:0.0
dogs:0.0
runs:0.0
behind:0.03333333333333333
rat:0.0
cats:0.03333333333333333


In [41]:
trigram('cats','runs')

rats:0.0
dogs:0.0
runs:0.0
behind:0.06666666666666667
rat:0.0
cats:0.0
