In [1]:
import pandas as pd
import math
import copy
import numpy as np 
import itertools
import more_itertools as mit
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer 
import string
import re

from IPython import get_ipython

# Execute the preprocessing notebook inside this notebook's namespace (-i).
# The cells below call preprocess_corpus / preprocess_queries, which are not defined
# in this file -- presumably they come from that notebook (TODO confirm).
# run_line_magic() replaces the deprecated InteractiveShell.magic().
get_ipython().run_line_magic('run', '-i "1_preprocessing_corpus_queries.ipynb"')

## Term frequency

In [2]:
# Term frequency
def tf(corpus):
    """Compute relative term frequencies for every document of a corpus.

    Parameters
    ----------
    corpus:    dataframe with a "TEXT" column of whitespace-separated tokens

    Returns
    -------
    dict       {row index: {term: occurrences of term / number of tokens in the document}}
    """

    def relative_frequencies(text):
        # bag of words: plain whitespace tokenisation
        tokens = text.split()
        total = len(tokens)

        counts = {}
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1

        # normalise raw counts by the document length
        return {token: count / total for token, count in counts.items()}

    # one term-frequency dictionary per document, keyed by the row index
    return {index: relative_frequencies(row["TEXT"]) for index, row in corpus.iterrows()}

## Inverse document frequency

In [3]:
# Inverse document frequency
def idf(corpus):
    """Compute the inverse document frequency of every term in the corpus.

    Parameters
    ----------
    corpus:    dataframe accepted by tf(); its number of rows is the number of documents

    Returns
    -------
    dict       {term: ln(number of documents / number of documents containing the term)}
    """
    # per-document term dictionaries; only their keys matter here
    terms_per_doc = tf(corpus)

    # number of documents in the corpus
    n_docs = len(corpus.index)

    # document frequency: in how many documents each term occurs
    doc_freq = {}
    for doc_terms in terms_per_doc.values():
        for term in doc_terms:
            doc_freq[term] = doc_freq.get(term, 0) + 1

    return {term: math.log(n_docs / occurrences) for term, occurrences in doc_freq.items()}

## TF-IDF

In [4]:
# TF-IDF
def tf_idf(corpus):
    """Weight every term frequency of the corpus by the term's inverse document frequency.

    Parameters
    ----------
    corpus:    dataframe passed on to tf() and idf()

    Returns
    -------
    dict       {document: {term: tf * idf}}, same layout as the output of tf()
    """
    term_freqs = tf(corpus)
    inverse_doc_freqs = idf(corpus)

    # build a fresh nested dictionary, the tf() result itself is left untouched
    return {
        doc: {term: freq * inverse_doc_freqs[term] for term, freq in terms.items()}
        for doc, terms in term_freqs.items()
    }

## Cosine similarity

In [5]:
# Cosine similarity
def cosine_similarity(v1, v2):
    """Cosine of the angle between two vectors.

    Parameters
    ----------
    v1, v2:    array-likes accepted by np.dot and np.linalg.norm

    Returns
    -------
    np.dot(v1, v2) divided by the product of the two norms. The result has the
    shape of np.dot(v1, v2): a scalar for two 1-d vectors.
    If either vector has zero norm the similarity is defined as 0 instead of
    the NaN that 0/0 would produce; NaN scores cannot be ranked reliably.
    """
    dot = np.dot(v1, v2)
    scale = np.linalg.norm(v1) * np.linalg.norm(v2)

    if scale == 0:
        # multiplying keeps the shape/type of the dot product while forcing a 0 score
        return dot * 0.0

    return dot / scale

## TF-IDF to matrix

In [6]:
# Convert tf_idf_dict to matrix
def tf_idf_to_matrix(tf_idf_dict):
    """Turn a nested {document: {term: weight}} dictionary into a dataframe.

    Parameters
    ----------
    tf_idf_dict:    nested dictionary keyed by document, then by term

    Returns
    -------
    dataframe with one row per document (sorted by index) and one column per term
    """
    # outer keys become the rows, inner keys become the columns
    matrix = pd.DataFrame.from_dict(tf_idf_dict, orient='index')
    # if a word does not appear in a document we change NaN to 0
    matrix = matrix.fillna(0)
    return matrix.sort_index()

## Build tf-idf vectors for docs (once for the entire corpus)

In [7]:
def build_doc_vectors(corpus):
    """Build the tf-idf matrix of a corpus.

    Computes the tf-idf dictionary of the documents with tf_idf() and converts
    it to a matrix with tf_idf_to_matrix().
    """
    return tf_idf_to_matrix(tf_idf(corpus))

## Build tf-idf vector for one query

In [8]:
def build_q_vector(query, doc_vectors, idf_dict):
    """Build the tf-idf vector of one query over the vocabulary of doc_vectors.

    Parameters
    ----------
    query:          a string (split on whitespace) or an iterable of tokens
    doc_vectors:    dataframe whose columns are the vocabulary terms
    idf_dict:       {term: idf}; must contain every column of doc_vectors

    Returns
    -------
    numpy array of shape (1, number of columns). The entry for a term that
    occurs n > 0 times in the query is (1 + ln(n)) * idf_dict[term]; all other
    entries are 0. Query tokens that are not columns of doc_vectors are ignored.

    Raises
    ------
    KeyError        if a column of doc_vectors is missing from idf_dict
    """
    tokens = query.split() if isinstance(query, str) else list(query)

    vocabulary = list(doc_vectors.columns)
    position_of = {term: position for position, term in enumerate(vocabulary)}

    # raw term frequency of the query, counted in one pass over its tokens
    counts = np.zeros(len(vocabulary), dtype=float)
    for token in tokens:
        position = position_of.get(token)
        if position is not None:
            counts[position] += 1

    # log term frequency; terms absent from the query keep weight 0
    weights = np.zeros_like(counts)
    present = counts > 0
    weights[present] = 1.0 + np.log(counts[present])

    # scale every term by its inverse document frequency
    idf_values = np.array([idf_dict[term] for term in vocabulary], dtype=float)
    weights = weights * idf_values

    # single-row 2-d array (one row, one column per term)
    return weights.reshape(1, -1)

## Basic retrieval

In [9]:
def basic_retrieve(q_vector, doc_vectors, top_k, random_projections=False):
    """
    Retrieve the top relevant documents for an input query.

    Parameters
    ----------
    doc_vectors:              tf-idf matrix of a corpus: build_doc_vectors() function
    q_vector:                 tf-idf vector of the query (1-d, or 2-d with a single row)
    top_k:                    number of most relevant documents to be output; values larger
                              than the number of documents return all documents
    random_projections=False  if True the score is the plain dot product (the vectors are
                              expected to be unit-normalized), otherwise cosine similarity

    Returns
    -------
    df.iloc[ids]     rows of the global `corpus` dataframe for the top_k best scoring
                     documents, best first

    Notes
    -----
    Reads the notebook-level variable `corpus`; it has to be defined before calling.
    Row i of doc_vectors is assumed to describe row i of `corpus`.
    """
    df = corpus.copy()

    # flatten so that every score is a scalar, also for (1, n) query vectors
    query = np.ravel(q_vector)
    score = np.dot if random_projections else cosine_similarity

    scored = []                                                  # (row position, similarity)
    for position, doc in enumerate(doc_vectors.values):
        similarity = float(score(query, doc))
        # NaN (e.g. a zero vector) cannot be ordered; rank such documents last
        if np.isnan(similarity):
            similarity = -np.inf
        scored.append((position, similarity))

    # stable sort: documents with equal scores keep their original order
    scored.sort(key=lambda pair: pair[1], reverse=True)

    # clamp so that top_k > number of documents does not raise IndexError
    limit = max(0, min(top_k, len(scored)))
    ids = [position for position, _ in scored[:limit]]

    return df.iloc[ids]

## TEST (find 5 most relevant documents for first query)

In [12]:
# Load the corpus from a tab-separated file (columns ID and TEXT) and preprocess it.
# preprocess_corpus is not defined in this notebook -- presumably it comes from the
# preprocessing notebook executed in the first cell (TODO confirm).
# NOTE: basic_retrieve() reads this notebook-level `corpus` variable.
corpus = preprocess_corpus(pd.read_csv('nfcorpus/dev.docs', sep='\t', names=['ID', 'TEXT']))

# Load the queries from a file with the same layout and preprocess them
queries = preprocess_queries(corpus, pd.read_csv('nfcorpus/dev.all.queries', sep='\t', names=['ID', 'TEXT']))

# Build the arguments needed for retrieval: document matrix, idf weights and
# the tf-idf vector of the query with index label 0
doc_vectors = build_doc_vectors(corpus)
idf_dict = idf(corpus)
q_vector = build_q_vector(queries['TEXT'][0], doc_vectors, idf_dict)

# Retrieve the 5 best scoring documents for that query and display them
test = basic_retrieve(q_vector = q_vector,
                      doc_vectors = doc_vectors, 
                      top_k = 5)
test

Unnamed: 0,ID,TEXT
1142,MED-2423,dietari pattern breast cancer risk women pubm ...
956,MED-2195,influenc deep fri veget oil acrylamid format s...
1794,MED-3498,dietari acrylamid exposur french popul result ...
1141,MED-2422,statist regress model estim acrylamid concentr...
1138,MED-2418,consumpt deep-fri food risk prostat cancera b ...


## Evaluation for queries

## Evaluation for query titles