In [1]:
import pandas as pd
import numpy as np


# Toy corpus: four short documents of whitespace-separated terms.
d0 = "vaccine research corona virus research"
d1 = "research research cancer vaccine vaccine"
d2 = "virus virus corona vaccine lab"
d3 = "cancer lab research lab"

# Documents in index order; position i corresponds to column "d<i>".
corpus = [d0, d1, d2, d3]

# Vocabulary: every distinct term that appears in at least one document.
word_bag = {term for doc in corpus for term in doc.split()}

In [2]:
def term_frequency(corpus,word_bag):
    """Build a raw term-count table for a corpus.

    Parameters
    ----------
    corpus : sequence of str
        Documents; each one is tokenised on whitespace.
    word_bag : iterable of str
        Vocabulary terms; one row is produced per term, in iteration order.

    Returns
    -------
    pandas.DataFrame
        Rows are the terms of ``word_bag``; there is one column per document,
        named ``d0``, ``d1``, ... in corpus order. Each cell holds the number
        of whole tokens in that document equal to the term.
    """
    terms = list(word_bag)
    counts = {}
    for position, doc in enumerate(corpus):
        # Count whole tokens: str.count on the raw text would also match the
        # term inside longer words (e.g. "lab" inside "labs").
        tokens = doc.split()
        counts["d{}".format(position)] = [tokens.count(term) for term in terms]
    # Build the frame in one step instead of writing cell by cell through
    # chained indexing, which is not guaranteed to modify the frame.
    return pd.DataFrame(counts, index=terms)

In [3]:
# Raw term-count table: one row per vocabulary term, one column per document.
df = term_frequency(corpus,word_bag)
df

Unnamed: 0,d0,d1,d2,d3
vaccine,1,2,1,0
cancer,0,1,0,1
virus,1,0,2,0
research,2,2,0,1
lab,0,0,1,2
corona,1,0,1,0


# $P_{M_c}(t)$: probability of term $t$ in the whole corpus (its total count divided by the total number of tokens in the corpus)

In [4]:
# Total number of whitespace-separated tokens across the four documents.
len_corpus = sum(len(text.split()) for text in (d0, d1, d2, d3))
len_corpus

19

In [5]:
# Corpus-level probability of each term: its total count over the four
# documents divided by the total number of tokens in the corpus.
# Only the raw count columns are summed, so re-running this cell (or running
# it after more columns have been added to df) gives the same result, and the
# denominator comes from len_corpus rather than a hard-coded 19.
df['P_Mc(t)'] = df[['d0', 'd1', 'd2', 'd3']].sum(axis=1) / len_corpus
df

Unnamed: 0,d0,d1,d2,d3,P_Mc(t)
vaccine,1,2,1,0,0.210526
cancer,0,1,0,1,0.105263
virus,1,0,2,0,0.157895
research,2,2,0,1,0.263158
lab,0,0,1,2,0.157895
corona,1,0,1,0,0.105263


# $P_{M_d}(t)$: the probability of term $t$ occurring in document $d$ (its count in $d$ divided by the number of tokens in $d$)

In [6]:
# Per-document term probability: count of the term in the document divided by
# the document's length in tokens.
# Each entry maps an output column to (count column, document text).
per_document = {
    'PMd0(t)': ('d0', d0),
    'PMd1(t)': ('d1', d1),
    'PMd2(t)': ('d2', d2),
    'PMd3(t)': ('d3', d3),
}
for prob_col, (count_col, text) in per_document.items():
    df[prob_col] = df[count_col] / len(text.split())
df

Unnamed: 0,d0,d1,d2,d3,P_Mc(t),PMd0(t),PMd1(t),PMd2(t),PMd3(t)
vaccine,1,2,1,0,0.210526,0.2,0.4,0.2,0.0
cancer,0,1,0,1,0.105263,0.0,0.2,0.0,0.25
virus,1,0,2,0,0.157895,0.2,0.0,0.4,0.0
research,2,2,0,1,0.263158,0.4,0.4,0.0,0.25
lab,0,0,1,2,0.157895,0.0,0.0,0.2,0.5
corona,1,0,1,0,0.105263,0.2,0.0,0.2,0.0


# An unsmoothed unigram model: $P_{\text{uni}}(d \mid q) = \prod_{t \in q} P_{M_d}(t)$

In [7]:
def unsmoothed(q):
    """Score every document for query ``q`` with the unsmoothed unigram model.

    The score of a document is the product, over the whitespace-separated
    query terms (repeats included), of the term's per-document probability
    read from the global ``df`` columns ``PMd0(t)`` .. ``PMd3(t)``.

    Parameters
    ----------
    q : str
        Query text.

    Returns
    -------
    list
        Four scores in document order ``[d0, d1, d2, d3]``. An empty query
        yields ``[1, 1, 1, 1]``. A query containing a term that is not in the
        index of ``df`` yields all zeros, because such a term has probability
        zero in every document.
    """
    doc_cols = ['PMd0(t)', 'PMd1(t)', 'PMd2(t)', 'PMd3(t)']
    scores = [1] * len(doc_cols)
    for word in q.split():
        if word not in df.index:
            # Out-of-vocabulary term: zero probability everywhere, so every
            # product collapses to zero (previously this raised KeyError).
            return [0.0] * len(doc_cols)
        scores = [score * df.at[word, col] for score, col in zip(scores, doc_cols)]
    return scores

In [8]:
q = "vaccine vaccine research cancer"
# NOTE: sorting the bare scores discards which document each score belongs to;
# unsmoothed() itself returns them in document order [d0, d1, d2, d3].
sorted(unsmoothed(q),reverse = True)

[0.012800000000000004, 0.0, 0.0, 0.0]

In [9]:
# Scores come back in document order [d0, d1, d2, d3].
q = "vaccine research"
unsmoothed(q)

[0.08000000000000002, 0.16000000000000003, 0.0, 0.0]

# Linearly interpolated unigram model: $P_{\text{interp-uni}}(d \mid q) = \prod_{t \in q} \left[\lambda \, P_{M_d}(t) + (1-\lambda) \, P_{M_c}(t)\right]$

In [10]:
def linear(q,lamb):
    """Score every document for query ``q`` with linear interpolation.

    For each whitespace-separated query term (repeats included) the factor is
    ``lamb * PMd(t) + (1 - lamb) * P_Mc(t)``, read from the global ``df``;
    a document's score is the product of its factors.

    Parameters
    ----------
    q : str
        Query text.
    lamb : float
        Weight of the per-document probability; ``1 - lamb`` is the weight of
        the corpus-level probability.

    Returns
    -------
    list
        Four scores in document order ``[d0, d1, d2, d3]``. An empty query
        yields ``[1, 1, 1, 1]``. A query containing a term that is not in the
        index of ``df`` yields all zeros: such a term has zero probability in
        both the document and the corpus column, so its factor is zero.
    """
    doc_cols = ['PMd0(t)', 'PMd1(t)', 'PMd2(t)', 'PMd3(t)']
    scores = [1] * len(doc_cols)
    for word in q.split():
        if word not in df.index:
            # Out-of-vocabulary term (previously this raised KeyError).
            return [0.0] * len(doc_cols)
        # The corpus-level part is the same for all documents; look it up once.
        background = (1 - lamb) * df.at[word, 'P_Mc(t)']
        scores = [
            score * ((lamb * df.at[word, col]) + background)
            for score, col in zip(scores, doc_cols)
        ]
    return scores

In [11]:
q = "vaccine vaccine research cancer"
# lamb = 0.5 weights the document and corpus probabilities equally.
# NOTE: sorting the bare scores discards which document each score belongs to.
sorted(linear(q,0.5),reverse=True)

[0.004716068784002579,
 0.0007352844131030303,
 0.0005050030309773557,
 0.000291779529009139]

In [12]:
# Scores come back in document order [d0, d1, d2, d3].
q = "vaccine research"
linear(q,0.5)

[0.06806094182825485,
 0.10121883656509696,
 0.027008310249307478,
 0.027008310249307475]

# Bigram/Trigram

In [1]:
# Training sentences for the n-gram estimates.
# Identifiers are plain ASCII: the original q1..q4 were spelled with the
# mathematical-italic letter U+1D45E, which Python only maps to "q" through
# identifier normalisation and which text search for "q1" does not find.
q0 = "regression weak classification intelligence kernel"
q1 = "network weights weak classification"
q2 = "regression weak tangent"
q3 = "weak classification artificial"
q4 = "regression weak tangent intelligence"

# All sentences joined by single spaces into one string, and the same text as
# a flat token list. Sentence boundaries are not marked, so the last word of
# one sentence is adjacent to the first word of the next.
corpus_n = " ".join((q0, q1, q2, q3, q4))
corpus_single = corpus_n.split()

# Distinct tokens, and the total number of tokens.
vocab = set(corpus_single)
corpus_len = len(corpus_single)


In [2]:
def puni(s):
    """Return the relative frequency of token ``s`` in the corpus.

    This is the number of entries of the global ``corpus_single`` equal to
    ``s`` divided by the global ``corpus_len``.
    """
    occurrences = corpus_single.count(s)
    return occurrences / corpus_len
    

In [3]:
def pbi(s2,s1):
    """Estimate the bigram probability of ``s2`` following ``s1``.

    Parameters
    ----------
    s2 : str
        The following token.
    s1 : str
        The preceding token.

    Returns
    -------
    int or float
        ``0`` if ``s1`` does not occur in the global ``corpus_single``;
        otherwise the number of adjacent token pairs ``(s1, s2)`` in the
        global ``corpus_n`` divided by the number of occurrences of ``s1``.
    """
    s1_count = corpus_single.count(s1)
    if s1_count == 0:
        return 0
    # Count adjacent token pairs rather than using str.count on the joined
    # text: a substring search also matches inside longer words ("a b" inside
    # "xa b") and skips overlapping occurrences ("a a" in "a a a").
    tokens = corpus_n.split()
    pair_count = sum(
        1 for first, second in zip(tokens, tokens[1:])
        if first == s1 and second == s2
    )
    return pair_count / s1_count

In [4]:
def ptri(s3,s1,s2):
    """Estimate the trigram probability of ``s3`` following ``s1 s2``.

    Parameters
    ----------
    s3 : str
        The following token.
    s1, s2 : str
        The two preceding tokens, in order.

    Returns
    -------
    int or float
        ``0`` if the adjacent token pair ``(s1, s2)`` never occurs in the
        global ``corpus_n``; otherwise the number of adjacent token triples
        ``(s1, s2, s3)`` divided by the number of pairs ``(s1, s2)``.
    """
    # Work on whole tokens: str.count on the joined text also matches inside
    # longer words and skips overlapping occurrences.
    tokens = corpus_n.split()
    context = (s1, s2)
    context_count = sum(1 for pair in zip(tokens, tokens[1:]) if pair == context)
    if context_count == 0:
        return 0
    target = (s1, s2, s3)
    triple_count = sum(
        1 for triple in zip(tokens, tokens[1:], tokens[2:]) if triple == target
    )
    return triple_count / context_count

In [5]:
def bigram(s1,s2):
    """Print a bigram-chain score for each candidate word after ``s1 s2``.

    For every word ``w`` in the global ``vocab`` one line ``w:score`` is
    printed, where ``score = puni(s1) * pbi(s2, s1) * pbi(w, s2)``.
    Nothing is returned.
    """
    for candidate in vocab:
        context = puni(s1) * pbi(s2, s1)
        score = context * pbi(candidate, s2)
        print(candidate, score, sep=":")

In [6]:
def trigram(s1,s2):
    """Print a trigram-chain score for each candidate word after ``s1 s2``.

    For every word ``w`` in the global ``vocab`` one line ``w:score`` is
    printed, where ``score = puni(s1) * pbi(s2, s1) * ptri(w, s1, s2)``.
    Nothing is returned.
    """
    for candidate in vocab:
        context = puni(s1) * pbi(s2, s1)
        score = context * ptri(candidate, s1, s2)
        print(candidate, score, sep=":")

In [7]:
# Prints, for each vocabulary word w, puni('regression') * pbi('weak', 'regression') * pbi(w, 'weak').
bigram('regression','weak')

tangent:0.06315789473684211
weak:0.0
regression:0.0
network:0.0
intelligence:0.0
weights:0.0
kernel:0.0
classification:0.09473684210526315
artificial:0.0


In [8]:
# Prints, for each vocabulary word w, puni('regression') * pbi('weak', 'regression') * ptri(w, 'regression', 'weak').
trigram('regression','weak')

tangent:0.10526315789473684
weak:0.0
regression:0.0
network:0.0
intelligence:0.0
weights:0.0
kernel:0.0
classification:0.05263157894736842
artificial:0.0
