In [1]:
import nltk
from nltk.tokenize import RegexpTokenizer
from collections import Counter, defaultdict, OrderedDict
import re
import sys
import os
import math
import string
import time
import operator
from itertools import islice

In [2]:
start_time = time.time()

In [3]:
cord19_file=open('data/cord19/cord19.txt',"r", encoding="UTF-8")
cord19_content = cord19_file.read()

In [4]:
cord19_key_file =open('data/cord19.topics.keyword.txt',"r")
cord19_key_content = cord19_key_file.read()

cord19_qs_file =open('data/cord19.topics.question.txt',"r")
cord19_qs_content = cord19_qs_file.read()

In [5]:
regextoken_P = RegexpTokenizer(r'<P ID=\d+>(.*?)</P>')
regextoken_Q = RegexpTokenizer(r'<Q ID=\d+>(.*?)</Q>')

cord19_text_list = regextoken_P.tokenize(cord19_content)
cord19_key_text_list = regextoken_Q.tokenize(cord19_key_content)
cord19_qs_text_list = regextoken_Q.tokenize(cord19_qs_content)

In [6]:
# Normalization
#     lower-case words
#     Change short term to long terms for verb.
#     remove punctuation
#         https://www.geeksforgeeks.org/python-remove-punctuation-from-string/
#     remove numbers

def normalization(word):
    word= word.lower()
    word = word.replace("'re",' are').replace("'m'", ' am').replace("'s",' is').replace("n't",' not').replace("'ve",' have').replace("'d",' had').replace("'ll",' will')
    word = word.replace("'",'')
    word  = re.sub(r'[^\w\s]', '', word)
    word = word.translate(str.maketrans('', '', string.punctuation))
    word = re.sub('[0-9]', '', word)
    return word

In [7]:
# Calculate Terms
# 1. Normalized the text 
# 2. It tokenized the text and count the occurence of the text
# 3. It returns the document id and count for each of the terms.
# 4. It also count the number of terms and documents. 

def calculate_terms(listed):
    normalized_text = []
    collection_frequency = Counter()
    document_frequency = Counter()
    output_wordlist_dict ={}
    terms_frequency = defaultdict(lambda: Counter([]))
    
    for i in range(len(listed)):
        normalized_text.append(normalization(listed[i]))
    
    for i in range(len(normalized_text)):
        tokenized_list= []
        for j in normalized_text[i].split():
            tokenized_list.append(j)
        output_wordlist_dict[i] = Counter(tokenized_list)
        collection_frequency.update(tokenized_list)
        document_frequency.update(set(tokenized_list))
        
    for key, value in output_wordlist_dict.items():
        for term, term_cnt in value.items():
            terms_frequency[term][key] += term_cnt
    
    return normalized_text, collection_frequency, document_frequency, output_wordlist_dict, terms_frequency

In [8]:
# Dictionary List
# 1. It stores information about a term (Document Frequency and offset)
# 2. The dictionary is sorted by term. 
# 3. It starts from 0 and counts the number document frequency and term count. 

def dictionary_list(listed):
    sort_dict = {}
    result_sort_dict = {}
    offset_sum = 0
    offset_i = 0
    sort_dict = OrderedDict(sorted(listed.items()))
    for i, value in enumerate(sort_dict.keys()):
        offset_i = len(sort_dict[value]) * 2 
        result_sort_dict[value] = len(sort_dict[value].values()),offset_sum
        offset_sum = offset_sum + offset_i 
    return result_sort_dict

In [9]:
# Inverted file
# 1. It stores the sorted entries as an inverted file

def inverted_file(key, dict_listed):
    inverted_list = []
    for i in key:
        for docid, term_cnt in dict_listed[i].items():
            inverted_list.append(docid)
            inverted_list.append(term_cnt)
    return inverted_list

In [10]:
# Store_Inverted_bin
# 1. It stores the inverted file as binary file. 
# 2. It stored this binary file as 4-byte integers. 

def Store_Inverted_bin(file, name):
    with open("Inverted_File/inverted_file_"+name+"_binary.bin", "wb") as fb:
        for num in file:
            fb.write(num.to_bytes(4, "big"))
    print("Inverted File " + name +" is created.")

In [11]:
# Calculate IDF
# 1. It gest posting list and length of the document
# 2. In posting list, it contains (frequency of the terms, offset of the terms)
# 3. To calculate the IDF, log2(Number of document / document frequency) 
def idf_corpus(dict_corpus,N_corpus):
    idf_dict = {}
    for key_i in dict_corpus.keys():
        tf_i = dict_corpus.get(key_i)[0]
        idf_i = math.log2(N_corpus/tf_i)
        idf_dict[key_i] = idf_i
    return idf_dict

In [12]:
#DFx IDF
# It gets term frequency for each documents.
# 1. It iterates through documents.
# 2. It iterates through terms in document.
# 3. If the term in document does not exit in IDF, it sets to 0.
# 4. Else it multiplies term freqeuncy by IDF.

def tf_idf(post_list,idf_matrix):
    weight_matrix =[]
    for i, j in post_list.items():
        idf ={}
        for k in j:
            if k not in idf_matrix:
                idf_matrix[k] = 0
            else:
                idf[k] = idf_matrix[k]*j[k]
        weight_matrix.append(idf)                       
    return weight_matrix

In [13]:
def vector_length(weight):
    length_matrix = {}
    for doc_i in range(len(weight)):
        length_matrx = []
        for i in weight[doc_i].values():
            length_matrx.append(i)
        sum_of_squares = sum(map(lambda k : k * k, length_matrx))
        vlength = math.sqrt(sum_of_squares)
        length_matrix[doc_i] = vlength
    return length_matrix

In [14]:
def cosine_similarities(doc_weight, query_weight, doc_length, query_length, query_term_freq):
    N = len(doc_weight)
    cos_score = []
    for i in range(len(query_term_freq)):
        cos_score.append([0]*N)
        for j  in query_term_freq[i].keys():
            query_tfidf = 0
            if query_weight[i].get(j):
                query_tfidf = query_weight[i].get(j)
            for k in range(len(doc_weight)):
                if(query_length[i] != 0) & (doc_length[k] != 0):
                    if(doc_weight[k].get(j)):
                        #Document Length * Query Length
                        doc_query_length = doc_length[k] * query_length[i]
                        # tf-idf weight of term in document * tf-idf weight of term in query
                        doc_query_vector = doc_weight[k].get(j) * query_tfidf
                        cos_score[i][k] += doc_query_vector / doc_query_length  
    return cos_score

In [15]:
def score_ranking(doc_weight, cos_score, jhuid, filename):
    print("Creating Score Ranking")
    N = len(doc_weight)
    score_results = []
    for score in cos_score:
        result =[]
        for i in range(N):
            result.append((score[i], i))
        result.sort(reverse= True)
        score_results.append(result)
    score_output = open(filename, "w")
    for query_id in range(len(score_results)):
        for j in range(100):
            doc_id, cos_score = score_results[query_id][j]
            score_output.write(str(query_id+1) + " Q0 " + str(cos_score) + " " +  str(j+1) + " " + str(doc_id) + " " + jhuid + '\n')
    score_output.close()
    print("Score Ranking file (" + filename+ " ) is created")

In [16]:
cord19_normalized_text, cord19_collection_freq, cord19_document_freq, cord19_term_freq, cord19_posting_list_output = calculate_terms(cord19_text_list)

In [17]:
print("Cord19")
print('Number of paragraph:', len(cord19_normalized_text))
print('Number of unique words observed:', len(cord19_document_freq))
print('The total number of words encountered:', sum(cord19_collection_freq.values()))

Cord19
Number of paragraph: 191175
Number of unique words observed: 450118
The total number of words encountered: 50097612


In [18]:
cord19_key_normalized_text, cord19_key_collection_freq, cord19_key_document_freq, cord19_key_term_freq, cord19_key_posting_list_output = calculate_terms(cord19_key_text_list)
cord19_qs_normalized_text, cord19_qs_collection_freq, cord19_qs_document_freq, cord19_qs_term_freq, cord19_qs_posting_list_output = calculate_terms(cord19_qs_text_list)

In [19]:
# Cord19 Keyword
print("Cord19 Keyword")
print('Number of paragraph:', len(cord19_key_normalized_text))
print('Number of unique words observed:', len(cord19_key_document_freq))
print('The total number of words encountered:', sum(cord19_key_collection_freq.values()))

print("Cord19 Question")
print('Number of paragraph:', len(cord19_qs_normalized_text))
print('Number of unique words observed:', len(cord19_qs_document_freq))
print('The total number of words encountered:', sum(cord19_qs_collection_freq.values()))

Cord19 Keyword
Number of paragraph: 50
Number of unique words observed: 101
The total number of words encountered: 162
Cord19 Question
Number of paragraph: 50
Number of unique words observed: 233
The total number of words encountered: 530


In [20]:
%%time
cord19_dict_pos_output = dictionary_list(cord19_posting_list_output)
cord19_byte_file = inverted_file(cord19_dict_pos_output.keys(), cord19_posting_list_output)
Store_Inverted_bin(cord19_byte_file, "cord19")

Inverted File cord19 is created.
CPU times: total: 33.5 s
Wall time: 36.8 s


In [21]:
print('Size of original_text: ' + str(os.path.getsize('data/cord19/cord19.txt')) + ' bytes')
print('Size of Cord19 Inverted File: ' + str(os.path.getsize('Inverted_File/inverted_fiile_cord19_binary.bin')) + ' bytes')
print('Size of Dictionary: ' + str(sys.getsizeof(cord19_dict_pos_output)) + ' bytes')

Size of original_text: 359302564 bytes
Size of Cord19 Inverted File: 190324096 bytes
Size of Dictionary: 20971608 bytes


In [22]:
print("Computing IDF, TF-IDF, Vector Length for Cord19 Document")
cord19_pre_start = time.time()
idf_matrix = idf_corpus(cord19_dict_pos_output,len(cord19_normalized_text))
cord19_weight = tf_idf(cord19_term_freq, idf_matrix)
cord19_length = vector_length(cord19_weight)
cord19_pre_end = time.time()
print("The time of execution of Cord19(IDF, TF-IDF, Vector Length) is :",(cord19_pre_end-cord19_pre_start), "seconds")

Computing IDF, TF-IDF, Vector Length for Cord19 Document
The time of execution of Cord19(IDF, TF-IDF, Vector Length) is : 51.611387491226196 seconds


## Cord19 Keyword

In [23]:
# %%time
# cord19_key_dict_pos_output = dictionary_list(cord19_key_posting_list_output)
# cord19_key_byte_file = inverted_file(cord19_key_dict_pos_output.keys(), cord19_key_posting_list_output)
# Store_Inverted_bin(cord19_key_byte_file, "cord19_key")

In [24]:
print("Computing IDF, TF-IDF, Vector Length for Cord19 Keyword Query")

cord19_key_pre_start = time.time()
#cord19_key_idf_matrix = idf_corpus(cord19_key_dict_pos_output,len(cord19_key_normalized_text))
cord19_key_weight = tf_idf(cord19_key_term_freq, idf_matrix)
cord19_key_length = vector_length(cord19_key_weight)
cord19_key_pre_end = time.time()
print("The time of execution Cord19 Keyword(IDF, TF-IDF, Vector Length) is :",(cord19_key_pre_end-cord19_key_pre_start), "seconds")

Computing IDF, TF-IDF, Vector Length for Cord19 Keyword Query
The time of execution Cord19 Keyword(IDF, TF-IDF, Vector Length) is : 0.003785848617553711 seconds


In [25]:
# cord19_key_idf_matrix

In [26]:
cord19_key_weight[0]

{'coronavirus': 2.026618955602212, 'origin': 5.533656625511048}

In [27]:
cord19_key_length[0]

5.893092570142482

## Cord19 Question

In [28]:
# %%time
# cord19_qs_dict_pos_output = dictionary_list(cord19_qs_posting_list_output)
# cord19_qs_byte_file = inverted_file(cord19_qs_dict_pos_output.keys(), cord19_qs_posting_list_output)
# Store_Inverted_bin(cord19_qs_byte_file, "cord19_qs")

In [29]:
print("Computing IDF, TF-IDF, Vector Length for Cord19 Question Query")
cord19_qs_pre_start = time.time()
# cord19_qs_idf_matrix = idf_corpus(cord19_qs_dict_pos_output,len(cord19_qs_normalized_text))
cord19_qs_weight = tf_idf(cord19_qs_term_freq, idf_matrix)
cord19_qs_length = vector_length(cord19_qs_weight)
cord19_qs_pre_end = time.time()
print("The time of execution of Cord19 Question (IDF, TF-IDF, Vector Length) is :",(cord19_qs_pre_end-cord19_qs_pre_start), "seconds")

Computing IDF, TF-IDF, Vector Length for Cord19 Question Query
The time of execution of Cord19 Question (IDF, TF-IDF, Vector Length) is : 0.004987001419067383 seconds


In [30]:
cord19_qs_weight[0]

{'what': 4.531386177063322,
 'is': 0.7887470080604575,
 'the': 0.29054840512573527,
 'origin': 5.533656625511048,
 'of': 0.3000032691840576,
 'covid': 1.3763323422808138}

In [31]:
cord19_qs_length[0]

7.3379594732539655

In [32]:
print("Cosine Score for Keyword")
cord19_key_start = time.time()
cos_score_keyword = cosine_similarities(cord19_weight, cord19_key_weight, cord19_length, cord19_key_length, cord19_key_term_freq)
score_ranking(cord19_weight, cos_score_keyword, 'dcho13','testing2/dcho13-a.txt' )
cord19_key_end = time.time()
print("The time of execution of Cord19 Keyword Cosine Score and Ranking is :",(cord19_key_end-cord19_key_start), "seconds")

Cosine Score for Keyword
Creating Score Ranking
Score Ranking file (testing2/dcho13-a.txt ) is created
The time of execution of Cord19 Keyword Cosine Score and Ranking is : 33.861650228500366 seconds


In [33]:
print("Cosine Score for Question")
cord19_qs_start = time.time()
cos_score_question = cosine_similarities(cord19_weight, cord19_qs_weight, cord19_length, cord19_qs_length, cord19_qs_term_freq)
score_ranking(cord19_weight, cos_score_question, 'dcho13','testing2/dcho13-b.txt' )
cord19_qs_end = time.time()
print("The time of execution of Cord19 Question Cosine Score and Ranking is :",(cord19_qs_end-cord19_qs_start), "seconds")

Cosine Score for Question
Creating Score Ranking
Score Ranking file (testing2/dcho13-b.txt ) is created
The time of execution of Cord19 Question Cosine Score and Ranking is : 93.19122266769409 seconds


In [34]:
print("Total execution time: %s seconds" % (time.time() - start_time))

Total execution time: 319.1791760921478 seconds
