In [None]:
import pandas as pd
import math
import copy
import numpy as np 
import itertools
import more_itertools as mit
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer 
import string
import re

In [180]:
# load some queries for testing
queries_text = pd.read_csv('nfcorpus/dev.all.queries', sep='\t', names=['ID', 'TEXT'])
queries_text.head(10)

Unnamed: 0,ID,TEXT
0,PLAIN-1,why deep fried foods may cause cancer in the l...
1,PLAIN-1007,"ddt - - persistent organic pollutants , indust..."
2,PLAIN-101,how to treat multiple sclerosis with diet mult...
3,PLAIN-1017,"detoxification - - cancer , raw food , heart h..."
4,PLAIN-1027,"dietary guidelines - - heart disease , cardiov..."
5,PLAIN-1038,"dogs - - meat , animal products , cats , heart..."
6,PLAIN-1049,"dr. david spence - - heart health , heart dise..."
7,PLAIN-1065,"dr. walter kempner - - mortality , heart disea..."
8,PLAIN-1077,"dulse - - thyroid health , hijiki , sushi , io..."
9,PLAIN-1087,"easter island - - mortality , muscle strength ..."


## Corpus Preprocessing

In [None]:
def preprocess_corpus(data):
    
    ps = PorterStemmer()
    
    def stemSentence(sentence,ps):
        token_words = word_tokenize(sentence)
        stem_sentence = []
        for word in token_words:
            stem_sentence.append(ps.stem(word))
            stem_sentence.append(" ")
        return "".join(stem_sentence)
    
    data['TEXT'] = data.apply(lambda x: stemSentence(x['TEXT'],ps), axis=1)
    
    return data

In [None]:
# enter your path of the corpus
path = 'C:/Users/48668/Desktop/FSS2020/IR/project/nfcorpus/'

# load corpus as preprocessed set of documents
corpus = pd.read_csv(path + 'dev.docs', sep='\t', names=['ID', 'TEXT'])

# corpus preprocessing
corpus = preprocess_corpus(corpus)
# preview first rows
corpus.head()

 ## Query Preprocessing

In [136]:
def preprocess_queries(corpus, queries):
    
    def remove_punctuations(text): # remove punctuation
        for punctuation in string.punctuation:
            text = text.replace(punctuation, '')
        return text

    def remove_numbers(text): # remove numbers
        return re.sub('[0-9]+', '', text)
    
    def lower_case(text): # lower case
        text = text.lower()
        return text 
    
    def tokenize(text): #tokenize
        return word_tokenize(text)
    
    stop = set(stopwords.words('english'))   
    def stop_words(tokens): # stop words 
        filtered_words = []
        for word in tokens:
            if word not in stop:
                filtered_words.append(word)
        return filtered_words
    
    ps = PorterStemmer()
    def stemming(tokens, ps): # stemming
        return [ps.stem(w) for w in tokens] 
    
    def corpus_vocab(corpus):
        vocab = []
        corpus_tokens = corpus.apply(lambda x: word_tokenize(x['TEXT']), axis=1)
        for i, j in corpus_tokens.iteritems():
            for token in j:
                if token not in vocab:
                    vocab.append(token)        
        return vocab
    
    v = corpus_vocab(corpus)    
    def filter_query(tokens):
        t = []
        for token in tokens:
            if token in v:
                t.append(token)
        return t
    
    # apply functions
    queries['TEXT'] = queries.apply(lambda x: remove_punctuations(x['TEXT']), axis=1)
    queries['TEXT'] = queries.apply(lambda x: remove_numbers(x['TEXT']), axis=1)
    queries['TEXT'] = queries.apply(lambda x: lower_case(x['TEXT']), axis=1)
    queries['TEXT'] = queries.apply(lambda x: tokenize(x['TEXT']), axis=1)
    queries['TEXT'] = queries.apply(lambda x: stop_words(x['TEXT']), axis=1)
    queries['TEXT'] = queries.apply(lambda x: stemming(x['TEXT'],ps), axis=1)
    queries['TEXT'] = queries.apply(lambda x: filter_query(x['TEXT']), axis=1)
    
    return queries

In [137]:
# load some queries for testing
queries_text = pd.read_csv('nfcorpus/dev.all.queries', sep='\t', names=['ID', 'TEXT'])

queries_text = preprocess_queries(corpus, queries_text)
queries_text.head(10)

Unnamed: 0,ID,TEXT
0,PLAIN-1,"[deep, fri, food, may, caus, cancer, latest, s..."
1,PLAIN-1007,"[ddt, persist, organ, pollut, industri, toxin,..."
2,PLAIN-101,"[treat, multipl, sclerosi, diet, multipl, scle..."
3,PLAIN-1017,"[detoxif, cancer, raw, food, heart, health, he..."
4,PLAIN-1027,"[dietari, guidelin, heart, diseas, cardiovascu..."
5,PLAIN-1038,"[dog, meat, anim, product, cat, heart, health,..."
6,PLAIN-1049,"[dr, heart, health, heart, diseas, egg, choles..."
7,PLAIN-1065,"[dr, walter, mortal, heart, diseas, heart, hea..."
8,PLAIN-1077,"[thyroid, health, hijiki, sushi, iodin, sea, v..."
9,PLAIN-1087,"[easter, island, mortal, muscl, strength, morb..."


## Term frequency

In [158]:
# Term frequency
def tf(corpus, column_name):
    
    def tokenize(string):
        return string.split()
    
    def tf_string(string): 
        # create bag of words from the string
        bow = tokenize(string)
    
        tf_dict = {}
        for word in bow:
            if word in tf_dict:
                tf_dict[word] += 1
            else:
                tf_dict[word] = 1
            
        for word in tf_dict:
            tf_dict[word] = tf_dict[word]/len(bow)### ??
    
        return tf_dict
    
    # call our function on every doc and store all these tf dictionaries. 
    tf_dict = {}
    for index, row in corpus.iterrows():
        doc_dict = tf_string(row[column_name])
        tf_dict[index] = doc_dict
            
    return tf_dict

# Inversed document frequency

In [159]:
# Inversed document frequency
def idf(corpus, tf_dict):
    
    # nomber of documents in corpus
    no_of_docs = len(corpus.index)
    
    # term - key, number of docs term occured in
    def count_occurances(tf_dict):
        count_dict = {}
        for key in tf_dict:
            for key in tf_dict[key]:
                if key in count_dict:
                    count_dict[key] += 1
                else:
                    count_dict[key] = 1
        return count_dict

    idf_dict = {}
    
    count_dict = count_occurances(tf_dict)
    
    for key in count_dict:
        idf_dict[key] = math.log(no_of_docs/count_dict[key])
    
    return idf_dict

## TF-IDF

In [160]:
# TF-IDF
def tf_idf(tf_dict, idf_dict):   
    tf_idf_dict = copy.deepcopy(tf_dict)
    for doc, value in tf_idf_dict.items():
        for word, value in tf_idf_dict[doc].items():
            tf_idf_dict[doc][word] = value * idf_dict[word]
    return tf_idf_dict

## Cosine similarity

In [6]:
# Convert tf_idf_dict to matrix
def tf_idf_to_matrix(tf_idf_dict):
    tf_idf_matrix = pd.DataFrame.from_dict(tf_idf_dict, 
                                           orient = 'index').fillna(0) # if word does not appear in doc we change NaN to
    return tf_idf_matrix.sort_index()

In [7]:
# Cosine similarity
def cosine_similarity(v1, v2):
    
    def vector_magnitude(v):
        return np.linalg.norm(v)
    
    def dot_product(v1, v2):
        return np.dot(v1,v2)
    
    return dot_product(v1, v2)/ (vector_magnitude(v1) * vector_magnitude(v2))

## Inverted indexing

In [161]:
# Inverted index
def inverted_index(tf_dict):
    ii_dict = {}
    for doc in tf_dict:
        for term in tf_dict[doc]:            
            if term in ii_dict:
                ii_dict[term].append(doc)
            else:           
                ii_dict[term] = list()
                ii_dict[term].append(doc)
    return ii_dict

## Tiered indexing

In [173]:
# Tiered index
def tiered_index(corpus, chunks):
    
    print('Function is tested on term \'human\'. It performs following steps:')
    
    tf_dict = tf(corpus, 'TEXT')
    
    def tf_inverted_index(tf_dict):
        tf_ii_dict = {}
        for doc in tf_dict:
            for term in tf_dict[doc]:
                if term not in tf_ii_dict:
                    inner_dict = {}
                    tf_ii_dict[term] = inner_dict
                    inner_dict[doc] = tf_dict[doc][term]
                else:
                    tf_ii_dict[term][doc] = tf_dict[doc][term]
        return tf_ii_dict
    
    tf_ii_dict = tf_inverted_index(tf_dict)
    #print("\nInverted index:")
    #print(tf_ii_dict["human"])
    
    def sort_dict(tf_ii_dict):
        for doc in tf_ii_dict:
             tf_ii_dict[doc] = {k: v for k, v in sorted(tf_ii_dict[doc].items(), 
                                                        key=lambda item: item[1], reverse=True)} #explain
        return tf_ii_dict
    
    
    tf_ii_dict_sorted = sort_dict(tf_ii_dict)
    #print("\nSorted inverted index by tf(term, doc):")
    #print(tf_ii_dict_sorted["human"])
    
    def transform_dict(tf_ii_dict_sorted):
        new = {}
        for k,v in tf_ii_dict_sorted.items():
            new[k] = list(v)
        return new
    
    transformed = transform_dict(tf_ii_dict_sorted)
    #print("\nSorted inverted index without tf(term,doc) values:")
    #print(transformed["human"])
    
    def chunk_list(lst, chunks):
        return [list(x) for x in mit.divide(chunks, lst)]
    
    def chunk_dict(transformed, chunks):
        for term in transformed:
            doc_chunks = chunk_list(transformed[term],chunks)
            new = {}
            for i in range(0,len(doc_chunks)):
                new[i] = doc_chunks[i]
            transformed[term] = new
        return transformed
    
    tf_ii_dict_sorted = chunk_dict(transformed, chunks)
    
    def split_dict(tf_ii_dict_sorted, chunks):      
        i = itertools.cycle(range(chunks))       
        split = [dict() for _ in range(chunks)]
        for k, v in tf_ii_dict_sorted.items():
            split[next(i)][k] = v
        return split
    
    #for doc in tf_ii_dict_sorted:
    #    tf_ii_dict_sorted[doc] = split_dict(tf_ii_dict_sorted[doc],chunks)
        
    #print("\nChunked inverted index:")
    #print(tf_ii_dict_sorted["human"])
    
    def sort_chunks(tf_ii_dict_sorted):
        for term, tier in tf_ii_dict_sorted.items():
            for tier, lst in tf_ii_dict_sorted[term].items():
                lst.sort()
        return tf_ii_dict_sorted
    
    tf_ii_dict_sorted = sort_chunks(tf_ii_dict_sorted)
    
    #print("\nChunked inverted index with sorted chunks (tiered index):")
    #print(tf_ii_dict_sorted["human"])
    return tf_ii_dict_sorted

In [59]:
# Call the function on the corpus
tiered_index_dict = tiered_index(corpus, 4) 

Function is tested on term 'human'. It performs following steps:

Inverted index:
{66: 0.00558659217877095, 67: 0.00510204081632653, 141: 0.006097560975609756, 246: 0.006802721088435374, 885: 0.018292682926829267, 1179: 0.00625, 1444: 0.018633540372670808, 1570: 0.006622516556291391, 1806: 0.008, 1807: 0.004975124378109453, 1830: 0.0064516129032258064, 1933: 0.004524886877828055, 2114: 0.006211180124223602, 2176: 0.009900990099009901, 2177: 0.00625, 2679: 0.006134969325153374, 2723: 0.012269938650306749, 3085: 0.007194244604316547}

Sorted inverted index by tf(term, doc):
{1444: 0.018633540372670808, 885: 0.018292682926829267, 2723: 0.012269938650306749, 2176: 0.009900990099009901, 1806: 0.008, 3085: 0.007194244604316547, 246: 0.006802721088435374, 1570: 0.006622516556291391, 1830: 0.0064516129032258064, 1179: 0.00625, 2177: 0.00625, 2114: 0.006211180124223602, 2679: 0.006134969325153374, 141: 0.006097560975609756, 66: 0.00558659217877095, 67: 0.00510204081632653, 1807: 0.0049751243781

## Intersection algorithm

In [214]:
def inter_one_list(p1,p2): #posting 1 list, posting 2 list
    i=0
    j=0
    intersection = []
    
    while i < len(p1) and j < len(p2):
        if p1[i] == p2[j]:
            if i== 0 or p1[i] != p1[i-1]:
                intersection.append(p1[i])
            i += 1
            j += 1           
        elif p1[i] < p2[j]:
            i += 1
        else: # p[i] > p[j]
            j += 1     
    return intersection
        
def inter_n_lists(lst):
    
    rank_lst = sorted(lst, key = len)   
    intersection = []
    for i in range(len(rank_list)):
        if len(rank_lst) <= 1:
            intersection = rank_lst[0]
        elif len(rank_lst) == 2:
            intersection = inter_one_list(rank_lst[0], rank_lst[1])
        else:
            if i == 0:
                intersection = inter_one_list(rank_lst[i], rank_lst[i+1])
            elif i > 1:
                intersection = inter_one_list(rank_lst[i], intersection)
    return intersection


## Retrieve postings for terms in query

In [174]:
def retrieve_postings(corpus, queries, t_chunks):   
    
    tiered_index_dict = tiered_index(corpus, t_chunks)
   
    postings_tiers = []
    
    for i in range(0,len(queries)): 
        try:
            dic = {}
            dic[queries[i]] = tiered_index_dict[queries[i]]
            postings_tiers.append(dic)           
        except KeyError:
            pass

    return postings_tiers

In [175]:
# Retrieve postings for query tokens from tiered index
query_postigs = retrieve_postings(corpus, queries_text.iloc[0][1], 1)
query_postigs

Function is tested on term 'human'. It performs following steps:


[{'deep': {0: [377, 951, 956, 958, 1302, 1379, 1497, 2335, 2512, 2866, 3053]}},
 {'fri': {0: [13,
    109,
    142,
    169,
    179,
    942,
    951,
    956,
    966,
    1136,
    1138,
    1140,
    1141,
    1142,
    1143,
    1302,
    1419,
    1794,
    2221,
    2229,
    2231,
    2512,
    2790,
    2842,
    2866,
    2913,
    2917,
    2919,
    2961,
    2993,
    3001,
    3004,
    3020]}},
 {'food': {0: [0,
    3,
    4,
    5,
    16,
    20,
    22,
    27,
    50,
    51,
    52,
    55,
    58,
    59,
    60,
    65,
    77,
    80,
    88,
    90,
    93,
    97,
    99,
    103,
    109,
    117,
    123,
    125,
    126,
    129,
    130,
    131,
    132,
    136,
    139,
    140,
    141,
    142,
    145,
    148,
    149,
    150,
    154,
    155,
    157,
    158,
    160,
    162,
    169,
    179,
    181,
    186,
    187,
    191,
    192,
    194,
    195,
    197,
    213,
    215,
    218,
    219,
    223,
    230,
    234,
    236,
    240,


## Iterating through tieres

In [183]:
# Retrieve postings for query tokens from tiered index
query_postigs = retrieve_postings(queries_text.iloc[0][1])

# Extract i-th tier from all postings

def tieres(query_postigs, n):
    l = []
    for i in range(len(query_postigs)): # for each posting
        d = query_postigs[i]
        key = [key for key in d.keys()][0]
        l.append(d[key][n]) # append i-th tier list

    return l

l = tieres(query_postigs,3)
 # change to for in in no of tieres
print(len(l))
        

1845


In [195]:
# Intersection on tieres 
for i in range(325):
    retrieve_postings(queries_text.iloc[0][1])
    #print(inter_n_lists(tieres(retrieve_postings(queries_text.iloc[i][1]),2)))

    print(queries_text.iloc[i][1])
    

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [222]:
# Try custom query

a = retrieve_postings('sugar blod')
a = tieres(a,0)
a = inter_n_lists(a)

a

# Vanilla approach vs. splitting queries (??) 

[538, 569, 571, 572, 1772, 2118, 2607, 2825, 2832, 2925, 2948, 2974, 2976]