In [97]:
import pandas as pd
import numpy as np
import math
from math import log10, sqrt

# Query/Document Vector

In [108]:
D0="preliminary findings in corona research"
D1="novel corona research findings"
D2="new research to corona healing"
D3="healing novel corona research"
q="novel novel preliminary new research"

corpus=[D0,D1,D2,D3,q]
word_bag= set(D0.split()+D1.split()+D2.split()+D3.split()+q.split())

In [109]:
def term_frequency(corpus,word_bag):
    tf_df = pd.DataFrame(index = list(word_bag), columns = ['d0','d1','d2','d3','q'])
    for doc,col in zip(corpus,tf_df.columns):
        for word in word_bag:
            tf_df[col][word] = doc.count(word)
                
    return tf_df

In [110]:
tf_df = term_frequency(corpus,word_bag)
tf_df

Unnamed: 0,d0,d1,d2,d3,q
corona,1,1,1,1,0
preliminary,1,0,0,0,1
research,1,1,1,1,1
to,0,0,1,0,0
new,0,0,1,0,1
healing,0,0,1,1,0
in,4,2,1,1,1
novel,0,1,0,1,2
findings,1,1,0,0,0


In [111]:
tf_df['d0'][6]=1
tf_df['d1'][6]=0
tf_df['d2'][6]=0
tf_df['d3'][6]=0
tf_df['q'][6]=0
tf_df

Unnamed: 0,d0,d1,d2,d3,q
corona,1,1,1,1,0
preliminary,1,0,0,0,1
research,1,1,1,1,1
to,0,0,1,0,0
new,0,0,1,0,1
healing,0,0,1,1,0
in,1,0,0,0,0
novel,0,1,0,1,2
findings,1,1,0,0,0


In [119]:

def document_frequency(corpus,word_bag):
    
    for word in word_bag:
        for doc in corpus[-1]:#[:-1]: #excluding q else just corpus
            if word not in document_frequencies.keys():
                document_frequencies[word]=1
            elif word in doc:
                document_frequencies[word]+=1

In [120]:
document_frequencies = {}
document_frequency(corpus,word_bag)
document_frequencies

{'corona': 1,
 'preliminary': 1,
 'research': 1,
 'to': 1,
 'new': 1,
 'healing': 1,
 'in': 1,
 'novel': 1,
 'findings': 1}

In [87]:
#document_frequencies['jar'] = 4
#document_frequencies

In [92]:
tf_idf_df = pd.DataFrame(index = list(word_bag), columns = ['d0','d1','d2','d3','d4','q'])

def tf_idf():
    for word in word_bag:
        tf_idf_df.loc[word] = tf_df.loc[word]*(log10((len(corpus)-1)/document_frequencies[word])) #len(corpus)-1 for query

In [93]:
tf_idf()
tf_idf_df

Unnamed: 0,d0,d1,d2,d3,d4,q
coffee,0.443697,0.0,0.221849,0.665546,0.0,0.0
cup,0.0,0.221849,0.443697,0.665546,0.0,0.221849
water,0.0,0.0,0.0,0.0,1.39794,0.0
jar,0.0,0.19382,0.09691,0.29073,0.19382,0.09691
tea,0.0,0.79588,0.0,0.39794,0.0,0.0


# Rocchio Feedback

In [9]:
#Rocchio feedback
def rocchio(alpha,beta,gamma):
    q_updates = (alpha*tf_idf_df['q']) + (beta*tf_idf_df['d1']) - (gamma*tf_idf_df['d3'])
    return q_updates

In [10]:
rocchio(1,0.8,0.1)

dogs      0.087457
rats       0.30103
behind    0.212396
cats           0.0
runs           0.0
dtype: object

In [11]:
rocchio(1,0.1,0.9)

dogs     -0.099951
rats       0.30103
behind    0.024988
cats           0.0
runs           0.0
dtype: object

In [12]:
rocchio(1,1,1)

dogs           0.0
rats       0.30103
behind    0.124939
cats           0.0
runs           0.0
dtype: object

# Practice

In [121]:
d0 = "preliminary findings in corona research"
d1 = "novel corona research findings"
d2 = "new research to corona healing"
d3 = "healing novel corona research"
query = "novel novel preliminary new research"
D={d0, d1, d2, d3}
vocab = {"preliminary", "findings", "in", "corona", "research", "novel",
"new", "to", "healing"}

In [122]:
def term_frequency2(vocab):
    d0_dict = dict.fromkeys(vocab,0)
    d1_dict = dict.fromkeys(vocab,0)
    d2_dict = dict.fromkeys(vocab,0)
    d3_dict = dict.fromkeys(vocab,0)
    query_dict = dict.fromkeys(vocab,0)
    
    for word in d0.split():
        d0_dict[word]+=1
    for word in d1.split():
        d1_dict[word]+=1
    for word in d2.split():
        d2_dict[word]+=1
    for word in d3.split():
        d3_dict[word]+=1
    for word in query.split():
        query_dict[word]+=1
    
    return pd.DataFrame([d0_dict,d1_dict,d2_dict,d3_dict,query_dict], index = ['d0','d1','d2','d3','query']).transpose()

In [123]:
df = term_frequency2(vocab)
df

Unnamed: 0,d0,d1,d2,d3,query
corona,1,1,1,1,0
preliminary,1,0,0,0,1
research,1,1,1,1,1
to,0,0,1,0,0
new,0,0,1,0,1
healing,0,0,1,1,0
in,1,0,0,0,0
novel,0,1,0,1,2
findings,1,1,0,0,0


In [125]:
#Document Frequency
df['docfreq'] = [4,1,4,1,1,2,1,2,2]
df

Unnamed: 0,d0,d1,d2,d3,query,docfreq
corona,1,1,1,1,0,4
preliminary,1,0,0,0,1,1
research,1,1,1,1,1,4
to,0,0,1,0,0,1
new,0,0,1,0,1,1
healing,0,0,1,1,0,2
in,1,0,0,0,0,1
novel,0,1,0,1,2,2
findings,1,1,0,0,0,2


In [17]:
#global weights
df['global_weights'] = np.log10((len(D))/df['docfreq'])
df

Unnamed: 0,d0,d1,d2,d3,query,docfreq,global_weights
new,0,0,1,0,1,1,0.60206
research,1,1,1,1,1,4,0.0
preliminary,1,0,0,0,1,1,0.60206
to,0,0,1,0,0,1,0.60206
novel,0,1,0,1,2,2,0.30103
findings,1,1,0,0,0,2,0.30103
healing,0,0,1,1,0,2,0.30103
in,1,0,0,0,0,1,0.60206
corona,1,1,1,1,0,4,0.0


In [18]:
#query vector
df['d0_tfidf'] = df['global_weights']*df['d0']
df['d1_tfidf'] = df['global_weights']*df['d1']
df['d2_tfidf'] = df['global_weights']*df['d2']
df['d3_tfidf'] = df['global_weights']*df['d3']
df['query_tfidf'] = df['global_weights']*df['query']
df

Unnamed: 0,d0,d1,d2,d3,query,docfreq,global_weights,d0_tfidf,d1_tfidf,d2_tfidf,d3_tfidf,query_tfidf
new,0,0,1,0,1,1,0.60206,0.0,0.0,0.60206,0.0,0.60206
research,1,1,1,1,1,4,0.0,0.0,0.0,0.0,0.0,0.0
preliminary,1,0,0,0,1,1,0.60206,0.60206,0.0,0.0,0.0,0.60206
to,0,0,1,0,0,1,0.60206,0.0,0.0,0.60206,0.0,0.0
novel,0,1,0,1,2,2,0.30103,0.0,0.30103,0.0,0.30103,0.60206
findings,1,1,0,0,0,2,0.30103,0.30103,0.30103,0.0,0.0,0.0
healing,0,0,1,1,0,2,0.30103,0.0,0.0,0.30103,0.30103,0.0
in,1,0,0,0,0,1,0.60206,0.60206,0.0,0.0,0.0,0.0
corona,1,1,1,1,0,4,0.0,0.0,0.0,0.0,0.0,0.0


# Cosine Similarity

In [19]:
import math
def magnitude(vector): 
    return math.sqrt(sum(pow(element, 2) for element in vector))

In [20]:
#cosine similarity

def cos(query,document):
    return np.dot(query,document)/(magnitude(query) * magnitude(document))
    

#np.dot(df['query_tfidf'],df['d3_tfidf'])/(magnitude(df['query_tfidf'] * magnitude(df['d3_tfidf'])))

In [21]:
#query to doc 3
cos(df['query_tfidf'],df['d3_tfidf'])

0.4082482904638631

In [22]:
cos(df['query_tfidf'],df['d2_tfidf'])

0.3849001794597505

In [23]:
cos(df['query_tfidf'],df['d1_tfidf'])

0.4082482904638631

In [24]:
cos(df['query_tfidf'],df['d0_tfidf'])

0.3849001794597505

In [28]:
magnitude(df['query_tfidf'])

1.0427984941845085

# T-D Matrix

In [6]:
D0="coffee coffee"
D1="tea cup jar jar tea"
D2="cup coffee jar cup"
D3="coffee cup coffee jar tea cup coffee jar cup jar"
D4 = 'jar water water jar'
q="behind rats"

corpus=[D0,D1,D2,D3,D4,q]
word_bag= set(D0.split()+D1.split()+D2.split()+D3.split()+D4.split()+q.split())

In [11]:
def td_matrix(corpus,word_bag):
    td = pd.DataFrame(index = list(word_bag), columns = ['d0','d1','d2','d3','d4','q'])
    for doc,col in zip(corpus,td.columns):
        for word in word_bag:
            if((doc.count(word))>=1):
                td[col][word]=1
            else:
                td[col][word]=0
                
    return td

In [12]:
td_matrix(corpus,word_bag)

Unnamed: 0,d0,d1,d2,d3,d4,q
coffee,1,0,1,1,0,0
cup,0,1,1,1,0,0
behind,0,0,0,0,0,1
rats,0,0,0,0,0,1
water,0,0,0,0,1,0
jar,0,1,1,1,1,0
tea,0,1,0,1,0,0
