In [13]:
import pandas as pd
import numpy as np
from math import log10, sqrt

# Query/Document Vector

In [14]:
D0="linear venn venn artificial"
D1="linear artificial scikit artificial regression artificial"
D2="scikit regression intelligence regression"
D3="artificial venn tandem intelligence artificial"
D4 = "regression scikit regression"
q="scikit regression artificial"

corpus=[D0,D1,D2,D3,D4,q]
word_bag= set(D0.split()+D1.split()+D2.split()+D3.split()+D4.split()+q.split())

In [15]:
def term_frequency(corpus,word_bag):
    tf_df = pd.DataFrame(index = list(word_bag), columns = ['d0','d1','d2','d3','d4','q'])
    for doc,col in zip(corpus,tf_df.columns):
        for word in word_bag:
            tf_df[col][word] = doc.count(word)
                
    return tf_df

In [16]:
tf_df = term_frequency(corpus,word_bag)
tf_df

Unnamed: 0,d0,d1,d2,d3,d4,q
artificial,1,3,0,2,0,1
scikit,0,1,1,0,1,1
venn,2,0,0,1,0,0
linear,1,1,0,0,0,0
intelligence,0,0,1,1,0,0
tandem,0,0,0,1,0,0
regression,0,1,2,0,2,1


In [19]:
document_frequencies = {}
def document_frequency(corpus,word_bag):
    
    for word in word_bag:
        for doc in corpus[:-1]: #excludingb q
            if word not in document_frequencies.keys():
                document_frequencies[word]=1
            elif word in doc:
                document_frequencies[word]+=1

In [20]:
document_frequency(corpus,word_bag)
document_frequencies


{'artificial': 3,
 'scikit': 4,
 'venn': 2,
 'linear': 2,
 'intelligence': 3,
 'tandem': 2,
 'regression': 4}

In [22]:
document_frequencies['scikit'] = 3
document_frequencies['intelligence'] = 2
document_frequencies['tandem'] = 1
document_frequencies['regression'] = 3
document_frequencies

{'artificial': 3,
 'scikit': 3,
 'venn': 2,
 'linear': 2,
 'intelligence': 2,
 'tandem': 1,
 'regression': 3}

In [23]:
tf_idf_df = pd.DataFrame(index = list(word_bag), columns = ['d0','d1','d2','d3','d4','q'])

def tf_idf():
    for word in word_bag:
        tf_idf_df.loc[word] = tf_df.loc[word]*(log10((len(corpus)-1)/document_frequencies[word]))

In [24]:
tf_idf()
tf_idf_df

Unnamed: 0,d0,d1,d2,d3,d4,q
artificial,0.221849,0.665546,0.0,0.443697,0.0,0.221849
scikit,0.0,0.221849,0.221849,0.0,0.221849,0.221849
venn,0.79588,0.0,0.0,0.39794,0.0,0.0
linear,0.39794,0.39794,0.0,0.0,0.0,0.0
intelligence,0.0,0.0,0.39794,0.39794,0.0,0.0
tandem,0.0,0.0,0.0,0.69897,0.0,0.0
regression,0.0,0.221849,0.443697,0.0,0.443697,0.221849


In [28]:
#idf
log10((len(corpus)-1))/document_frequencies['tandem']

0.6989700043360189

# Rocchio Feedback

In [9]:
#Rocchio feedback
def rocchio(alpha,beta,gamma):
    q_updates = (alpha*tf_idf_df['q']) + (beta*tf_idf_df['d1']) - (gamma*tf_idf_df['d3'])
    return q_updates

In [10]:
rocchio(1,0.8,0.1)

dogs      0.087457
rats       0.30103
behind    0.212396
cats           0.0
runs           0.0
dtype: object

In [11]:
rocchio(1,0.1,0.9)

dogs     -0.099951
rats       0.30103
behind    0.024988
cats           0.0
runs           0.0
dtype: object

In [12]:
rocchio(1,1,1)

dogs           0.0
rats       0.30103
behind    0.124939
cats           0.0
runs           0.0
dtype: object

# Query/Document vector using df

In [13]:
d0 = "preliminary findings in corona research"
d1 = "novel corona research findings"
d2 = "new research to corona healing"
d3 = "healing novel corona research"
query = "novel novel preliminary new research"
D={d0, d1, d2, d3}
vocab = {"preliminary", "findings", "in", "corona", "research", "novel",
"new", "to", "healing"}

In [14]:
def term_frequency2(vocab):
    d0_dict = dict.fromkeys(vocab,0)
    d1_dict = dict.fromkeys(vocab,0)
    d2_dict = dict.fromkeys(vocab,0)
    d3_dict = dict.fromkeys(vocab,0)
    query_dict = dict.fromkeys(vocab,0)
    
    for word in d0.split():
        d0_dict[word]+=1
    for word in d1.split():
        d1_dict[word]+=1
    for word in d2.split():
        d2_dict[word]+=1
    for word in d3.split():
        d3_dict[word]+=1
    for word in query.split():
        query_dict[word]+=1
    
    return pd.DataFrame([d0_dict,d1_dict,d2_dict,d3_dict,query_dict], index = ['d0','d1','d2','d3','query']).transpose()

In [15]:
df = term_frequency2(vocab)
df

Unnamed: 0,d0,d1,d2,d3,query
new,0,0,1,0,1
research,1,1,1,1,1
preliminary,1,0,0,0,1
to,0,0,1,0,0
novel,0,1,0,1,2
findings,1,1,0,0,0
healing,0,0,1,1,0
in,1,0,0,0,0
corona,1,1,1,1,0


In [16]:
#Document Frequency
df['docfreq'] = df['d0'] + df['d1'] + df['d2'] + df['d3']
df

Unnamed: 0,d0,d1,d2,d3,query,docfreq
new,0,0,1,0,1,1
research,1,1,1,1,1,4
preliminary,1,0,0,0,1,1
to,0,0,1,0,0,1
novel,0,1,0,1,2,2
findings,1,1,0,0,0,2
healing,0,0,1,1,0,2
in,1,0,0,0,0,1
corona,1,1,1,1,0,4


In [17]:
#global weights
df['global_weights'] = np.log10((len(D))/df['docfreq'])
df

Unnamed: 0,d0,d1,d2,d3,query,docfreq,global_weights
new,0,0,1,0,1,1,0.60206
research,1,1,1,1,1,4,0.0
preliminary,1,0,0,0,1,1,0.60206
to,0,0,1,0,0,1,0.60206
novel,0,1,0,1,2,2,0.30103
findings,1,1,0,0,0,2,0.30103
healing,0,0,1,1,0,2,0.30103
in,1,0,0,0,0,1,0.60206
corona,1,1,1,1,0,4,0.0


In [18]:
#query vector
df['d0_tfidf'] = df['global_weights']*df['d0']
df['d1_tfidf'] = df['global_weights']*df['d1']
df['d2_tfidf'] = df['global_weights']*df['d2']
df['d3_tfidf'] = df['global_weights']*df['d3']
df['query_tfidf'] = df['global_weights']*df['query']
df

Unnamed: 0,d0,d1,d2,d3,query,docfreq,global_weights,d0_tfidf,d1_tfidf,d2_tfidf,d3_tfidf,query_tfidf
new,0,0,1,0,1,1,0.60206,0.0,0.0,0.60206,0.0,0.60206
research,1,1,1,1,1,4,0.0,0.0,0.0,0.0,0.0,0.0
preliminary,1,0,0,0,1,1,0.60206,0.60206,0.0,0.0,0.0,0.60206
to,0,0,1,0,0,1,0.60206,0.0,0.0,0.60206,0.0,0.0
novel,0,1,0,1,2,2,0.30103,0.0,0.30103,0.0,0.30103,0.60206
findings,1,1,0,0,0,2,0.30103,0.30103,0.30103,0.0,0.0,0.0
healing,0,0,1,1,0,2,0.30103,0.0,0.0,0.30103,0.30103,0.0
in,1,0,0,0,0,1,0.60206,0.60206,0.0,0.0,0.0,0.0
corona,1,1,1,1,0,4,0.0,0.0,0.0,0.0,0.0,0.0


# Cosine Similarity(Relevance(d,q))

In [30]:
import math
def magnitude(vector): 
    return math.sqrt(sum(pow(element, 2) for element in vector))

In [31]:
#cosine similarity

def cos(query,document):
    return np.dot(query,document)/(magnitude(query) * magnitude(document))
    

#np.dot(df['query_tfidf'],df['d3_tfidf'])/(magnitude(df['query_tfidf'] * magnitude(df['d3_tfidf'])))

In [21]:
#query to doc 3
cos(df['query_tfidf'],df['d3_tfidf'])

0.4082482904638631

In [22]:
cos(df['query_tfidf'],df['d2_tfidf'])

0.3849001794597505

In [23]:
cos(df['query_tfidf'],df['d1_tfidf'])

0.4082482904638631

In [34]:
cos(tf_idf_df['q'],tf_idf_df['d4'])

0.7745966692414833

In [33]:
magnitude(tf_idf_df['q'])

0.38425330593115575

# T-D Matrix

In [1]:
D0="coffee coffee"
D1="tea cup jar jar tea"
D2="cup coffee jar cup"
D3="coffee cup coffee jar tea cup coffee jar cup jar"
D4 = 'jar water water jar'
q="behind rats"

corpus=[D0,D1,D2,D3,D4,q]
word_bag= set(D0.split()+D1.split()+D2.split()+D3.split()+D4.split()+q.split())

In [6]:
def td_matrix(corpus,word_bag):
    td = pd.DataFrame(index = list(word_bag), columns = ['d0','d1','d2','d3','d4','q'])
    for doc,col in zip(corpus,td.columns):
        for word in word_bag:
            if((doc.count(word))>=1):
                td[col][word]=1
            else:
                td[col][word]=0
                
    return td

In [7]:
td_matrix(corpus,word_bag)

Unnamed: 0,d0,d1,d2,d3,d4,q
rats,0,0,0,0,0,1
coffee,1,0,1,1,0,0
water,0,0,0,0,1,0
behind,0,0,0,0,0,1
tea,0,1,0,1,0,0
cup,0,1,1,1,0,0
jar,0,1,1,1,1,0
