**Informaiton Retrieval Programming Assignment #3**
<br>**Real-world Indexing System(final part)**
<br>Build a simple but true to practice retrieval engine based on the cosine similarity vector space model
<br>- Build index of file(dictionary + inverted file) for a 360 MB sized test collection of biomedical articles related to COVID-19
<br>- Inverted file is written to disk as a binary file
<br>- Use TF/IDF term weighting for both documents and the query
<br>- Compute cosine similarity for all documents in collection
<br>- Have a query processing program to load the dictionary and retrive posting lists for terms from inverted file in order to rank documents for provided set of queries
<br>- Run a comparision test of whether longer queries lead to more accurate retrieval results

<br><br>**Author:** Helen Ting He; **Date:** Oct 16, 2021

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import nltk
from nltk import word_tokenize
import string #remove punctuation
from collections import Counter, defaultdict, OrderedDict
import re
import time
import json
import math

nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# save to google colab
from google.colab import drive
drive.mount('/content/gdrive')
root_path = 'gdrive/My Drive/' 

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
# read original files
with open('cord19.txt') as f:
    lines = f.readlines()
cord_file = pd.DataFrame(list(zip(lines)))

cord_key_url = 'http://pmcnamee.net/744/data/cord19.topics.keyword.txt'
cord_key = pd.read_csv(cord_key_url,sep='\n',header=None)

cord_qes_url = 'http://pmcnamee.net/744/data/cord19.topics.question.txt'
cord_qes = pd.read_csv(cord_qes_url,sep='\n',header=None)

animal_url = 'http://pmcnamee.net/744/data/animal.txt'
animal = pd.read_csv(animal_url,sep='\n',header=None)

ani_top_url = 'http://pmcnamee.net/744/data/animal.topics.txt'
ani_top = pd.read_csv(ani_top_url,sep='\n',header=None)

In [10]:
# Test files with 3000 documents 
with open('cord19_test.txt') as f:
    lines = f.readlines()
test_file = pd.DataFrame(list(zip(lines)))

with open('animal.txt') as f:
    lines = f.readlines()
test_animal_file = pd.DataFrame(list(zip(lines)))

with open('animal.topics.txt') as f:
    lines = f.readlines()
test_animal_query = pd.DataFrame(list(zip(lines)))

"with open('animal.txt') as f:\n    lines = f.readlines()\ntest_animal_file = pd.DataFrame(list(zip(lines)))\n\nwith open('animal.topics.txt') as f:\n    lines = f.readlines()\ntest_animal_query = pd.DataFrame(list(zip(lines)))"

In [4]:
# Num of docs in file
substring = "<P ID="
substring_q = "<Q ID="
def num_doc(data_frame):
  doc = 0
  for sent_i in range(len(data_frame[0])):
    if substring in data_frame[0][sent_i] or substring_q in data_frame[0][sent_i]:
      doc += 1
  return doc
print("cord_19 dataset has " + str(num_doc(cord_file)) + " documents")
print("cord_key dataset has " + str(num_doc(cord_key)) + " documents")
print("cord_key (full) dataset has " + str(num_doc(cord_qes)) + " documents")

cord_19 dataset has 191175 documents
cord_key dataset has 50 documents
cord_key (full) dataset has 50 documents


In [5]:
#######################################
# Initilization
# contains 1) normalization
# 2) create posing list 
# 3) create index (dictionary and inverted file)
#######################################

def normalization(each_para):
  # To normalize each paragraph
  # @input: each phragraph, string
  # @output: counter of word for each paragraph, dict
  # normalized method:
  # 1. lower case 
  # 2. complete notation of apostrophe
  # 3. lemmenization using wordnetLemmatizer()
  # 4. remove word contain non-alphabetic character (except letter, number, underscore)
  output_list_para = []
  counter_list_para = {}
  each_para = each_para.lower()
  each_para = each_para.replace("'re",' are').replace("'s",' is').replace("'ll",' will').replace("n't",' not')
  each_para = each_para.replace("'",'')
  each_para = each_para.translate(str.maketrans('','',string.punctuation))
  each_para = each_para.replace("–", "")
  each_para = each_para.replace("-", "_")
  lem = nltk.stem.wordnet.WordNetLemmatizer()
  tokens = each_para.strip().split()
  clean_tokens = [t for t in tokens if re.match(r'[^\W\d]*$', t)]
  for word in clean_tokens:
    word = lem.lemmatize(word)
    output_list_para.append(word)   
    counter_list_para = Counter(output_list_para)
  return counter_list_para

def readFile(whole_text): 
  # To read the text and process relevant methods
  # @input: input raw text, dataframe
  # @output: paragraph one by one, string
  i = 0
  substring1 = "<P ID="
  substring2 = "</P>"
  substring1_q = "<Q ID="
  substring2_q = "</Q>"
  len_file = len(whole_text)
  output_wordlist_dict = {}
  for sent_i in range(len(whole_text)):
    if substring1 in whole_text[0][sent_i] or substring1_q in whole_text[0][sent_i]:
      para = ""
    if (substring1 not in whole_text[0][sent_i] and substring2 not in whole_text[0][sent_i]) and (substring1_q not in whole_text[0][sent_i] and substring2_q not in whole_text[0][sent_i]):
      para = para + whole_text[0][sent_i]
    if substring2 in whole_text[0][sent_i] or substring2_q in whole_text[0][sent_i]:
      output_wordlist_dict[i] = normalization(para)
      i = i + 1
  return output_wordlist_dict

def posting_list(docid_counter):
  # convert into posting list
  # @input:dict{ [docid]:dict{[word]: word freq per document} }, dict of dict
  # @ouput:dict{ [word]:dict{[docid]: term count} }, dict of dict
  posting_list_output = defaultdict(lambda: Counter([]))
  for doc, cnt in docid_counter.items():
    for word, word_cnt in cnt.items():
      posting_list_output[word][doc] += word_cnt
  return posting_list_output

def dictionary(posting_list):
  # To generate the unique sorted vocabulary list as key
  # DF and sorted offset as value 
  # @input: dict{ [word]:dict{[docid]: term count} }, dict of dict
  # @output: sorted unique vocabulary dict with DF and offset 
  sort_dict = {}
  result_sort_dict = {}
  offset_sum = 0
  offset_i = 0
  od = OrderedDict(sorted(posting_list.items()))
  sort_dict = OrderedDict(od)
  for i, word_i in enumerate(sort_dict.keys()):
    offset_i = len(sort_dict[word_i]) * 2 #offset 
    result_sort_dict[word_i] = len(sort_dict[word_i].values()),offset_sum #DF
    offset_sum = offset_sum + offset_i 
  return result_sort_dict

def inverted_file(key,posting_list):
  # To generate inverted file
  # @input: sorted key, list
  #         posting list, dict of dict
  # @output: inverted file, list
  inverted_list = []
  for word_i in key:
    for docid, cnt in posting_list[word_i].items():
      inverted_list.append(docid)
      inverted_list.append(cnt)
  return inverted_list

In [6]:
#######################################
# Cacualate weighting
# contain (1)IDF for corpus
# (2)TF-IDF for each document and query
# (3)length of each document or query
#######################################
def idf_corpus(dict_corpus,original_file):
  # To caculate IDF for corpus
  # @input: dictionary file of corpus
  #         original file
  # @output: dictionary (word as key, IDF as value)
  idf_dict = {}
  N_corpus = num_doc(original_file)
  for key_i in dict_corpus.keys():
    tf_i = dict_corpus[key_i][0]
    idf_i = math.log(N_corpus/tf_i,2)
    idf_dict[key_i] = idf_i
  return idf_dict

def tf_idf(post_list,idf_matrix):
  # To caculate TF-IDF for word
  # @input: posting list of doc
  #         idf of corpus
  # @output: dictionary(docid as key: word as key: TF-IDF)
  weight_matrix = defaultdict(lambda: defaultdict(float))
  for doc_i in post_list.keys():
    for word_i in post_list[doc_i].keys():
      if word_i not in idf_matrix:
        temp = 0.0
      else:
        temp = idf_matrix[word_i];
      weight_i = post_list[doc_i][word_i] * temp
      weight_matrix[doc_i][word_i] = weight_i
  return weight_matrix

def length_M(weight):
  # To caculate length of each doc/query
  # @input: dict of TF-IDF
  # @output: dictionary(doci as key: length as value)
  length_matrix = defaultdict(float)
  for doc_i in weight.keys():
    voc_weight_square_sum = 0.0
    voc_weight_sum = 0.0
    for word_i in weight[doc_i].keys():
      if weight[doc_i][word_i] !=0.0:
        voc_weight_square_sum = math.pow(weight[doc_i][word_i],2) + voc_weight_square_sum
    voc_weight_sum = math.sqrt(voc_weight_square_sum)
    length_matrix[doc_i] = voc_weight_sum
  return length_matrix


In [7]:
#######################################
# Cosine Similarity
#######################################

def cos_sim_pre(original_file, weight_query,weight_corpus,dict_corpus,invertd_corpus,length_corpus, threshold):
  # To caculate cos similarity for each query
  # @input: original file of corpus
  #         weight matrix of query
  #         weight matrix of corpus
  #         dictionary of corpus
  #         inverted file of corpus
  #         length of corpus
  #         threshold: decide which weight will be calculate
  # @output: cos_sim for each query and each doc
  temp = defaultdict(lambda: defaultdict(lambda:defaultdict(float)))
  for query_i in weight_query.keys():
    for query_word_i in dict_corpus.keys():
      if weight_query[query_i][query_word_i] > threshold: 
        df_word_i = dict_corpus[query_word_i][0]
        offset_word_i = dict_corpus[query_word_i][1]
        for invert_i in range(offset_word_i, offset_word_i + df_word_i * 2,2):
          temp[query_i][invertd_corpus[invert_i]][query_word_i] = weight_query[query_i][query_word_i] * weight_corpus[invertd_corpus[invert_i]][query_word_i]
  return temp

def cos_sim(original_file, weight_query,weight_corpus,dict_corpus,invertd_corpus,length_corpus, threshold):
  cos_sim_M = defaultdict(lambda: defaultdict(float))
  temp = cos_sim_pre(original_file, weight_query,weight_corpus,dict_corpus,invertd_corpus,length_corpus, threshold)
  for query_i in weight_query.keys():
    for doc_i in range(num_doc(original_file)):
      temp_sum = 0
      for word_j in temp[query_i][doc_i]:        
        temp_sum = temp_sum + temp[query_i][doc_i][word_j]
        cos_sim_M[query_i][doc_i] = 0.0
        if length_corpus[doc_i]!=0.0:
          cos_sim_M[query_i][doc_i] = temp_sum/(length_corpus[doc_i])
  return cos_sim_M


In [22]:
#######################################
# Ranked List
#######################################

def rank_list(cos_matrix):
  rank_file = []
  temp = {}
  for query_i in cos_matrix.keys():
    temp = sorted(cos_matrix[query_i].items(), key = lambda x: x[1], reverse= True)
    if len(temp) > 100:
      for i in range(100):
        row = []
        row.append(query_i+1)
        row.append("Q0")
        row.append(temp[i][0])
        row.append(i+1)
        row.append(temp[i][1])
        row.append('the14' )
        rank_file.append(row)
  return rank_file


In [14]:
temp = sorted(cos_M_covid_key[0].items(), key = lambda x: x[1], reverse= True)

In [21]:
for i in range(50):
  temp = sorted(cos_M_covid_key[i].items(), key = lambda x: x[1], reverse= True)
  if len(temp) == 26:
    print(i)

40


In [8]:
#######################################
# Main Function (part 1)
# run shorter queries
# (1)create index
#######################################
# create index for query
start_norm = time.time()
normal_query_key = readFile(cord_key) #nomarlized query dataset
print("--- %s seconds to normalize query key----"% (time.time() - start_norm))

#posting list
start0 = time.time()
postlist_query_key_result = posting_list(normal_query_key) 
print("--- %s seconds to buid posting list for query key----"% (time.time() - start0))

#dictionary
start1 = time.time()
dict_query_key_output = dictionary(postlist_query_key_result) 
print("cord_key dataset has " + str(len(dict_query_key_output)) + " unique vocabulary set")
print("--- %s seconds to buid dictionary for query key----"% (time.time() - start1))
#save dictionary
with open('gdrive/My Drive/dictionary_covid_query_key.txt', 'w') as f: 
    f.write(json.dumps(dict_query_key_output))

#inverted file
start2 = time.time()
inverted_query_key = inverted_file(dict_query_key_output.keys(),postlist_query_key_result)
print("--- %s seconds to buid inverted file for query key----"% (time.time() - start2))
# save inverted file
with open("gdrive/My Drive/inverted_file_covid_query_key.bin", "wb") as fb:
  for num in inverted_query_key:
    fb.write(num.to_bytes(4, "big"))

########################
# create index for covid
start_norm_file = time.time()
normal_file = readFile(cord_file) #nomarlized query dataset
print("--- %s seconds to normalize covid dataset----"% (time.time() - start_norm_file))

#posting list
start3 = time.time()
postlist_result = posting_list(normal_file) 
print("--- %s seconds to buid posting list for covid dataset----"% (time.time() - start3))

#dictionary
start4 = time.time()
dict_output = dictionary(postlist_result) 
print("covid dataset has " + str(len(dict_output)) + " unique vocabulary set")
print("--- %s seconds to buid dictionary for covid dataset----"% (time.time() - start4))
#save dictionary
with open('gdrive/My Drive/dictionary_covid.txt', 'w') as f: 
    f.write(json.dumps(dict_output))

#inverted file
start5 = time.time()
covid_inverted_file= inverted_file(dict_output.keys(),postlist_result)
print("--- %s seconds to buid inverted file for covid dataset---"% (time.time() - start5))
# save inverted file
with open("gdrive/My Drive/inverted_file_covid.bin", "wb") as fb:
  for num in covid_inverted_file:
    fb.write(num.to_bytes(4, "big"))


--- 1.5164504051208496 seconds to normalize query key----
--- 0.0004076957702636719 seconds to buid posting list for query key----
cord_key dataset has 98 unique vocabulary set
--- 0.00023055076599121094 seconds to buid dictionary for query key----
--- 0.000186920166015625 seconds to buid inverted file for query key----
--- 1667.0581352710724 seconds to normalize covid dataset----
--- 21.818220138549805 seconds to buid posting list for covid dataset----
covid dataset has 404445 unique vocabulary set
--- 3.108849287033081 seconds to buid dictionary for covid dataset----
--- 5.089956283569336 seconds to buid inverted file for covid dataset---


In [9]:
#######################################
# Main Function (part 1)
# run shorter queries
# (2)caculate weighting
#######################################

#####################
# caculate weight
start6 = time.time()
idf_matrix = idf_corpus(dict_output,cord_file)
covid_weight = tf_idf(normal_file,idf_matrix)
# length of corpus
length_covid = length_M(covid_weight)

# query(short)
covid_query_key_weight = tf_idf(normal_query_key,idf_matrix)
length_covid_key_query = length_M(covid_query_key_weight)
# TF-IDF for first query and doc
print("weight(TF-IDF) for first query of covid_key dataset:", "\n", covid_query_key_weight[0])
print("weight(TF-IDF) for first doc of covid dataset:", "\n", covid_weight[0])
print("--- %s seconds to parse query including caculate IDF over corpus---"% (time.time() - start6 ))


weight(TF-IDF) for first query of covid_key dataset: 
 defaultdict(<class 'float'>, {'coronavirus': 2.0378548589196765, 'origin': 5.312113421342007})
weight(TF-IDF) for first doc of covid dataset: 
 defaultdict(<class 'float'>, {'clinical': 6.649261361053281, 'feature': 12.176869732844686, 'of': 4.800052306944922, 'cultureproven': 42.66781876089216, 'mycoplasma': 30.59886634084056, 'pneumoniae': 51.21842883572701, 'infection': 24.710641457969302, 'at': 6.4621211341642475, 'king': 26.946875772891033, 'abdulaziz': 37.91293125872869, 'university': 15.802612827662887, 'hospital': 13.183165588455166, 'jeddah': 31.566499606468884, 'saudi': 28.477273780874796, 'arabia': 28.765550092080005, 'objective': 3.3861613422893524, 'this': 4.928699874612593, 'retrospective': 4.9446215063309555, 'chart': 15.855971009478187, 'review': 2.978599270977105, 'describes': 5.690445168647509, 'the': 4.069357622991405, 'epidemiology': 10.431157973473505, 'and': 5.498784773411431, 'patient': 17.51172596207369, 'wi

In [12]:
# 8 mins
keys = list(idf_matrix.keys())
values = [float(idf_matrix[k]) for k in keys]
pd.Series(values).describe()

count    404445.000000
mean         16.408394
std           1.989375
min           0.290668
25%          15.959572
50%          17.544534
75%          17.544534
max          17.544534
dtype: float64

In [10]:
#######################################
# Main Function (part 1)
# run shorter queries
# (3)caculate cos-similarity
#######################################
##########
# caculate cos-similarity
# run program
start7= time.time()
cos_M_covid_key = cos_sim(cord_file,covid_query_key_weight,covid_weight,dict_output,covid_inverted_file,length_covid, 1)
# save cos-smilarity matrix
with open('gdrive/My Drive/covid_key_cos_similairy.txt', 'w') as f: 
    f.write(json.dumps(cos_M_covid_key))
print("--- %s seconds to caculate cos similarity for covid dataset with short query---"% (time.time() - start7))

--- 611.5406761169434 seconds to caculate cos similarity for covid dataset with short query---


In [27]:
#######################################
# Main Function (part 1)
# run shorter queries
# (4) create ranked list file
#######################################
rank_file_key = rank_list(cos_M_covid_key )
#rank_file_full = rank_list(cos_M_covid_key_full )
# save rank_file
#with open('gdrive/My Drive/covid_rank_file_full.txt', 'w') as f: 
    #f.write(json.dumps(cos_M_covid_key_full))
with open('gdrive/My Drive/covid_rank_file_key.txt', 'w') as f: 
  for i in range(len(rank_file_key)):
    f.write("\t".join(str(x) for x in rank_file_key[i]) + '\n')

In [28]:
#######################################
# Main Function (part 2)
# run longer queries
# (1)create index
#######################################

########################
# create index for query(full)
start_norm = time.time()
normal_query_key_full = readFile(cord_qes) #nomarlized query dataset
print("--- %s seconds to normalize query key(full)----"% (time.time() - start_norm))

#posting list
start0 = time.time()
postlist_query_key_result_full = posting_list(normal_query_key_full) 
print("--- %s seconds to buid posting list for query key(full)----"% (time.time() - start0))

#dictionary
start1 = time.time()
dict_query_key_output_full = dictionary(postlist_query_key_result_full) 
print("cord_key (full) dataset has " + str(len(dict_query_key_output_full)) + " unique vocabulary set")
print("--- %s seconds to buid dictionary for query key(full)----"% (time.time() - start1))
#save dictionary
with open('gdrive/My Drive/dictionary_covid_query_key_full.txt', 'w') as f: 
    f.write(json.dumps(dict_query_key_output_full))

#inverted file
start2 = time.time()
inverted_query_key_full = inverted_file(dict_query_key_output_full.keys(),postlist_query_key_result_full)
print("--- %s seconds to buid inverted file for query key(full)----"% (time.time() - start2))
# save inverted file
with open("gdrive/My Drive/inverted_file_covid_query_key_full.bin", "wb") as fb:
  for num in inverted_query_key_full:
    fb.write(num.to_bytes(4, "big"))



--- 0.013148784637451172 seconds to normalize query key(full)----
--- 0.0007472038269042969 seconds to buid posting list for query key(full)----
cord_key (full) dataset has 223 unique vocabulary set
--- 0.0004363059997558594 seconds to buid dictionary for query key(full)----
--- 0.00040221214294433594 seconds to buid inverted file for query key(full)----


In [29]:
#######################################
# Main Function (part 2)
# run longer queries
# (2)caculate weighting
#######################################

# query(long)
idf_matrix = idf_corpus(dict_output,cord_file)
covid_query_key_weight_full = tf_idf(normal_query_key_full,idf_matrix)
length_covid_key_query_full = length_M(covid_query_key_weight_full)
# TF-IDF for first query 
print("weight(TF-IDF) for first query of covid_key dataset:", "\n", covid_query_key_weight_full[0])
print("--- %s seconds to parse query including caculate IDF over corpus---"% (time.time() - start6 ))


weight(TF-IDF) for first query of covid_key dataset: 
 defaultdict(<class 'float'>, {'what': 4.535405558978827, 'is': 0.7889556162382244, 'the': 0.2906684016422432, 'origin': 5.312113421342007, 'of': 0.3000032691840576})
--- 1832.507230758667 seconds to parse query including caculate IDF over corpus---


In [32]:
#######################################
# Main Function (part 2)
# run longer queries
# (3)caculate cos-similarity
#######################################

##########
# caculate cos-similarity
# run program
start7= time.time()
cos_M_covid_key_full = cos_sim(cord_file,covid_query_key_weight_full,covid_weight,dict_output,covid_inverted_file,length_covid,1)
# save cos-smilarity matrix
with open('gdrive/My Drive/covid_full_cos_similairy.txt', 'w') as f: 
    f.write(json.dumps(cos_M_covid_key_full))
print("--- %s seconds to caculate cos similarity for covid dataset with full query---"% (time.time() - start7))

--- 617.6173405647278 seconds to caculate cos similarity for covid dataset with full query---


In [34]:
#######################################
# Main Function (part 1)
# run longer queries
# (4) create ranked list file
#######################################

rank_file_full = rank_list(cos_M_covid_key_full )
# save rank_file
with open('gdrive/My Drive/covid_rank_file_full.txt', 'w') as f: 
  for i in range(len(rank_file_full)):
    f.write("\t".join(str(x) for x in rank_file_full[i]) + '\n')