# Vector Space Model
#### TF-IDF dictionary (checked with Elina's solution)

In [2]:
import pandas as pd
import math
import copy

In [4]:
# Directory that holds the NFCorpus files -- edit this to match your machine.
path = 'C:/Users/48668/Desktop/IR/project/nfcorpus/'

# Read the tab-separated, already preprocessed dev documents; the two column
# names are supplied explicitly.
corpus = pd.read_csv(
    path + 'dev.docs',
    sep='\t',
    names=['ID', 'TEXT'],
)

# Show the first rows as a quick sanity check.
corpus.head()

Unnamed: 0,ID,TEXT
0,MED-118,alkylphenols human milk relations dietary habi...
1,MED-329,phosphate vascular toxin pubmed ncbi abstract ...
2,MED-330,dietary phosphorus acutely impairs endothelial...
3,MED-332,public health impact dietary phosphorus excess...
4,MED-334,differences total vitro digestible phosphorus ...


In [175]:
# Turn a document string into its list of tokens.
def tokenize(string):
    """Return the tokens obtained by splitting `string` on whitespace."""
    tokens = string.split()
    return tokens

# Relative term frequencies of a single document string.
def tf(string):
    """Map every distinct token of `string` to its relative frequency.

    The document is tokenized with `tokenize`; each token's number of
    occurrences is divided by the total number of tokens in the document.
    A document without tokens yields an empty dictionary.
    """
    tokens = tokenize(string)
    token_total = len(tokens)

    # Raw occurrence counts, in order of first appearance.
    occurrences = {}
    for token in tokens:
        occurrences[token] = occurrences.get(token, 0) + 1

    # Normalise the counts by the document length.
    return {token: count / token_total for token, count in occurrences.items()}


# Compute the tf dictionary of every document, keyed by the DataFrame row index.
tf_dict = {index: tf(row['TEXT']) for index, row in corpus.iterrows()}

# Display the result; tf_dict[0]["alkylphenols"] is a handy single-value check.
tf_dict

# alkylphenols for doc 0 : 0.008547008547008548

{0: {'alkylphenols': 0.008547008547008548,
  'human': 0.02564102564102564,
  'milk': 0.02564102564102564,
  'relations': 0.008547008547008548,
  'dietary': 0.017094017094017096,
  'habits': 0.017094017094017096,
  'central': 0.008547008547008548,
  'taiwan': 0.008547008547008548,
  'pubmed': 0.008547008547008548,
  'ncbi': 0.008547008547008548,
  'abstract': 0.008547008547008548,
  'aims': 0.008547008547008548,
  'study': 0.008547008547008548,
  'determine': 0.008547008547008548,
  'concentrations': 0.017094017094017096,
  'num': 0.13675213675213677,
  'nonylphenol': 0.008547008547008548,
  'np': 0.017094017094017096,
  'octylphenol': 0.008547008547008548,
  'op': 0.03418803418803419,
  'samples': 0.008547008547008548,
  'examine': 0.008547008547008548,
  'related': 0.008547008547008548,
  'factors': 0.008547008547008548,
  'including': 0.008547008547008548,
  'mothers': 0.017094017094017096,
  'demographics': 0.008547008547008548,
  'women': 0.008547008547008548,
  'consumed': 0.01709

In [6]:
# Corpus size: the DataFrame holds one row per document.
no_of_docs = corpus.shape[0]
print(no_of_docs)

3193


In [91]:
# Document frequency: term -> number of documents the term occurs in.
def count_occurances(tf_by_doc=None):
    """Count, for every term, how many documents contain it.

    Parameters
    ----------
    tf_by_doc : dict, optional
        Mapping ``doc -> {term: weight}``; only the inner keys (terms) are
        used. When omitted, the notebook-level ``tf_dict`` is read, which is
        what the original zero-argument call did.

    Returns
    -------
    dict
        ``{term: number of inner dictionaries that contain the term}``.
    """
    if tf_by_doc is None:
        tf_by_doc = tf_dict

    count_dict = {}
    # An inner dict holds each term at most once, so adding 1 per inner key
    # counts documents rather than total occurrences.
    for doc_terms in tf_by_doc.values():
        for term in doc_terms:
            count_dict[term] = count_dict.get(term, 0) + 1
    return count_dict

# Build the document-frequency dictionary and keep it as `count_oc`
# (the idf computation reads it), then spot-check one known term.
count_oc = count_occurances()
count_oc["alkylphenols"] # checked with Elina, good

# number of documents in the corpus that contain "alkylphenols" = 7


7

In [95]:
# Total number of tokens in the corpus, duplicates included (just fyi).
total = sum(len(tokenize(text)) for text in corpus['TEXT'])
print(total)

467016


In [11]:
# With the total number of documents N and, for each term t, the number of
# documents df(t) that contain it, the inverse document frequency is
#     idf(t) = log(N / df(t))
# idf is a per-term quantity, so the result is a dictionary {term: idf value}.
def idf(doc_count=None, doc_freq=None):
    """Compute the inverse document frequency of every term.

    Parameters
    ----------
    doc_count : int, optional
        Total number of documents. Defaults to the notebook-level
        ``no_of_docs``.
    doc_freq : dict, optional
        Mapping ``term -> number of documents containing the term``.
        Defaults to the notebook-level ``count_oc``.

    Returns
    -------
    dict
        ``{term: math.log(doc_count / doc_freq[term])}`` (natural logarithm).
    """
    if doc_count is None:
        doc_count = no_of_docs
    if doc_freq is None:
        doc_freq = count_oc

    return {term: math.log(doc_count / freq) for term, freq in doc_freq.items()}

# NOTE: this rebinds the name `idf` from the function to its result dictionary.
# After this line idf() can no longer be called until the `def idf` is executed
# again; later cells index `idf` as a dict ({term: idf value}).
idf = idf()

# test if idf function works
idf["alkylphenols"]

# alkylphenols idf = 6.122806043659469

6.122806043659469

In [103]:
# Constructing the final tf-idf dictionary: tf-idf(t, d) = tf(t, d) * idf(t),
# so every tf value is multiplied by the idf value of its term.
def tf_idf(tf_by_doc=None, idf_by_term=None):
    """Build the nested tf-idf dictionary ``{doc: {term: tf * idf}}``.

    Parameters
    ----------
    tf_by_doc : dict, optional
        Mapping ``doc -> {term: tf value}``. Defaults to the notebook-level
        ``tf_dict``.
    idf_by_term : dict, optional
        Mapping ``term -> idf value``; must contain every term that appears
        in ``tf_by_doc``. Defaults to the notebook-level ``idf`` dictionary.

    Returns
    -------
    dict
        A newly built nested dictionary; the inputs are not modified.

    Raises
    ------
    KeyError
        If a term of ``tf_by_doc`` is missing from ``idf_by_term``.
    """
    if tf_by_doc is None:
        tf_by_doc = tf_dict
    if idf_by_term is None:
        idf_by_term = idf

    # Fresh dictionaries are built directly, so no deep copy of the input is
    # needed to keep it untouched.
    return {
        doc: {term: tf_value * idf_by_term[term] for term, tf_value in doc_terms.items()}
        for doc, doc_terms in tf_by_doc.items()
    }

# Spot check: the value tf_idf() produces for (alkylphenols, doc 0) ...
a = tf_idf()
print('Result from def:', a[0]["alkylphenols"], sep='\n')

# ... against the same product computed by hand:
# 0.008547008547008548 * 6.122806043659469 ~= 0.05
print('Manual result:')
tf_dict[0]["alkylphenols"] * idf["alkylphenols"]

# it works :) 


Result from def:
0.05233167558683307
Manual result:


0.05233167558683307

#### Comparing TF-IDF vectors (cosine similarity)

In [111]:
import numpy as np 

In [115]:
# Build the TF-IDF matrix from the tf-idf dictionary obtained above.
# Rows correspond to the documents of the corpus, columns to the unique words:
#
#              word1       ...          wordn
#  doc1   tf_idf_value   ...      tf_idf_value
#  ...    tf_idf_value   ...      tf_idf_value
#  docn   tf_idf_value   ...      tf_idf_value
#

tf_idf_matrix = (
    pd.DataFrame
    .from_dict(a, orient='index')
    .fillna(0)  # a word that does not appear in a doc gives NaN -> replace with 0
)
tf_idf_matrix.head(10)

Unnamed: 0,alkylphenols,human,milk,relations,dietary,habits,central,taiwan,pubmed,ncbi,...,tuscany,studies-depression,suicides,eurosave,self-inflicted,eurostat,upward,suicide-recording,scarcity,trim-and-fill
0,0.052332,0.041372,0.079999,0.046407,0.021178,0.060818,0.029952,0.047041,0.002278,0.002334,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001777,0.00182,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.028372,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.022663,0.0,0.0,0.0,0.001625,0.001665,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001549,0.001588,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.109472,0.0,0.007245,0.0,0.0,0.0,0.001559,0.001597,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002082,0.002133,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001945,0.001993,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.016298,0.0,0.0,0.0,0.0,0.0,0.0,0.002692,0.002758,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [113]:
# Check that the dataframe really has one column per unique word.
# The words are the column labels of tf_idf_matrix; its row index holds the
# document keys, so the columns (not the index) must be inspected here.
words = sorted(tf_idf_matrix.columns)

wordsSet = set(words)
if len(wordsSet) == len(words):
    print("matrix has unique words")
else:
    print("Error: matrix has not unique words")

matrix has unique words


In [130]:
# Now we compare documents by computing the cosine similarity between each pair of row vectors in the dataframe.
# For that we need 1. the vector magnitude and 2. the dot product of two vectors.

def vector_magnitude(v):
    """Return the norm of `v` as computed by `np.linalg.norm` with default arguments."""
    magnitude = np.linalg.norm(v)
    return magnitude

def dot_product(v1, v2):
    """Return `np.dot(v1, v2)`."""
    product = np.dot(v1, v2)
    return product

# Cosine similarity of two vectors; applied to every pair of document rows it
# gives the cosine similarity table (should be 3193 x 3193).
def cosine_similarity(v1, v2):
    """Return the cosine of the angle between `v1` and `v2`.

    cos(v1, v2) = (v1 . v2) / (|v1| * |v2|)

    Parameters
    ----------
    v1, v2 : array-like
        Vectors of equal length.

    Returns
    -------
    float
        The cosine similarity. If either vector has zero magnitude the
        cosine is undefined, and 0.0 is returned instead of dividing by zero.
    """
    # The product of the magnitudes must be formed first: without the
    # grouping, `a / b * c` is evaluated as `(a / b) * c`.
    norm_product = vector_magnitude(v1) * vector_magnitude(v2)
    if norm_product == 0:
        return 0.0
    return dot_product(v1, v2) / norm_product

# TODO: cosine_similarity() should be performed here on the entire dataframe,
# i.e. for every pair of document rows of tf_idf_matrix (not implemented yet).

#### Inverted index, tiered index

In [191]:
# First, let's try to implement a champion list, i.e. two-tier indexing.
# The result for each term should be {term: {0: [top_rank_list], 1: [lower_rank_list]}}.
# For weighting we are going to use the tf() function.


# Simple inverted index built from the per-document term dictionaries:
# {term: [list of docs], term1: [list of docs]}
def inverted_index(tf_by_doc=None):
    """Map every term to the list of documents that contain it.

    Parameters
    ----------
    tf_by_doc : dict, optional
        Mapping ``doc -> {term: weight}``; only the keys are used. Defaults
        to the notebook-level ``tf_dict``.

    Returns
    -------
    dict
        ``{term: [doc, ...]}`` with documents listed in the iteration order
        of ``tf_by_doc``.
    """
    if tf_by_doc is None:
        tf_by_doc = tf_dict

    ii_dict = {}
    for doc, doc_terms in tf_by_doc.items():
        for term in doc_terms:
            # setdefault creates the posting list the first time a term is seen.
            ii_dict.setdefault(term, []).append(doc)
    return ii_dict

# Build the inverted index once and look up the documents of a known term.
ii_dict = inverted_index()
ii_dict["alkylphenols"]

[0, 1262, 1263, 1270, 1271, 1277, 1280]

In [192]:
# Now the inverted index is extended with the tf value of each document:
# {term: {0: tf_value, 1: tf_value, ... , n: tf_value }}

def tf_inverted_index(tf_by_doc=None):
    """Map every term to ``{doc: tf value}`` for the documents containing it.

    Parameters
    ----------
    tf_by_doc : dict, optional
        Mapping ``doc -> {term: tf value}``. Defaults to the notebook-level
        ``tf_dict``.

    Returns
    -------
    dict
        ``{term: {doc: tf value}}`` -- the input with its two key levels
        swapped.
    """
    if tf_by_doc is None:
        tf_by_doc = tf_dict

    tf_ii_dict = {}
    for doc, doc_terms in tf_by_doc.items():
        for term, tf_value in doc_terms.items():
            # setdefault creates the per-term dictionary on first sight.
            tf_ii_dict.setdefault(term, {})[doc] = tf_value
    return tf_ii_dict

# Build the tf-weighted inverted index once and inspect the entry of a known term.
tf_ii_dict = tf_inverted_index()
tf_ii_dict["alkylphenols"]

{0: 0.008547008547008548,
 1262: 0.016260162601626018,
 1263: 0.023255813953488372,
 1270: 0.008547008547008548,
 1271: 0.010309278350515464,
 1277: 0.02127659574468085,
 1280: 0.012121212121212121}

In [None]:
# Next, the tf-weighted inverted index is to be split into k tiers according to
# thresholds t1 and t2; the thresholds are derived from each term's maximum tf.

def max_term(tf_index=None):
    """Return the largest tf value of every term.

    Parameters
    ----------
    tf_index : dict, optional
        Mapping ``term -> {doc: tf value}``. Defaults to the notebook-level
        ``tf_ii_dict``.

    Returns
    -------
    dict
        ``{term: max tf value over the term's documents}``. Terms with no
        documents are skipped, since they have no maximum.
    """
    if tf_index is None:
        tf_index = tf_ii_dict

    # The innermost values are plain numbers, so the maximum is taken over the
    # values of each term's {doc: tf} dictionary.
    return {
        term: max(doc_weights.values())
        for term, doc_weights in tf_index.items()
        if doc_weights
    }

max_term()
# TODO: tier thresholds as fractions of a term's maximum tf; `term_max` is not
# defined in the cells shown here, so these lines stay disabled for now.
#t1 = 0.8 * term_max
#t2 = 0.5 * term_max
        