## This program is used to query the index with search terms

In [1]:
from bs4 import BeautifulSoup
import string
import re
from nltk.tokenize import word_tokenize
from collections import Counter
import matplotlib.pyplot as plt
import nltk as nltk
import numpy as np
import pickle
from scipy.sparse import coo_matrix
import math

In [2]:
# Declare the paths of all index files generated in part 2A.
# NOTE: these are Windows-style relative paths (backslash separators).
# Every backslash is escaped explicitly: an unescaped "\i" is an invalid
# escape sequence and triggers a SyntaxWarning on Python 3.12+.
vocabulary_index_file_path = ".\\data\\vocabulary.pkl"
document_index_file_path = ".\\data\\file_index.pkl"
inverted_index_file_path = ".\\data\\inverted_index.pkl"

# The search queries are defined in the following block. Change them as needed.

In [3]:
# Input search queries: one sample query per category.
# Edit the strings below to try other searches
# (alternatives tried for the common-nouns case: "American writer", "church of england").
common_nouns_query = "Irish language"
proper_nouns_query = "Joy and Lewis"
rare_terms_query = "tuomas Cosmos holopainen sojourner lakeview Testament"
ambiguous_query = "Chinese England Testament"
complex_query = "Chinese writer sitting at Church of England"

# Order here is the order in which the queries are processed and reported.
queries = [
    common_nouns_query,
    proper_nouns_query,
    rare_terms_query,
    ambiguous_query,
    complex_query,
]

In [4]:
# SECURITY NOTE: pickle.load can execute arbitrary code while unpickling.
# Only load index files that were generated by part 2A, never untrusted files.
# Each file is opened in a `with` block so the handle is closed even if
# unpickling raises.

# Read vocabulary index (token -> row index of the term in the matrix)
with open(vocabulary_index_file_path, "rb") as pickle_file:
    vocabulary_dict = pickle.load(pickle_file)

# Read document index (document id -> column index of the document in the matrix)
with open(document_index_file_path, "rb") as pickle_file:
    document_dict = pickle.load(pickle_file)

# Read term frequency matrix (rows are terms, columns are documents)
with open(inverted_index_file_path, "rb") as pickle_file:
    inverted_index_matrix = pickle.load(pickle_file)

In [5]:
# Vocabulary size; the query cell below uses it as the length of every
# dense term vector (one entry per vocabulary term).
total_number_of_unique_words = len(vocabulary_dict)
print("Total number of unique words : ", total_number_of_unique_words)

Total number of unique words :  42040


In [6]:
# Reverse the document index (matrix column index -> document id); the final
# ranking step uses it to report document ids instead of column positions.
indexed_docs = {
    column_index: document_id
    for document_id, column_index in document_dict.items()
}

## Methods for calculating lnc and ltc scores

In [7]:
# calculating LTC for the query
def calc_ltc(document, indexes, term_document_matrix=None):
    """Compute ltc weights (log tf, idf, cosine normalization) for a query vector.

    Parameters
    ----------
    document : array-like
        Raw term-frequency vector of the query, one entry per vocabulary term.
    indexes : iterable of int
        Vocabulary indexes of the terms to weight. Every other position of
        the result is 0.
    term_document_matrix : scipy sparse matrix, optional
        Term (rows) by document (columns) frequency matrix used to derive the
        document frequencies. Defaults to the notebook-level
        ``inverted_index_matrix``.

    Returns
    -------
    numpy.ndarray
        Vector with the same length as ``document`` holding the
        cosine-normalized ltc weights. An all-zero vector is returned when
        every weight is zero (e.g. no indexes, or every term occurs in every
        document so that idf is 0), instead of dividing by zero.
    """
    if term_document_matrix is None:
        term_document_matrix = inverted_index_matrix

    vector_length = len(document)
    total_number_of_documents = term_document_matrix.shape[1]

    weights = np.zeros(vector_length)
    for token_index in indexes:
        term_frequency = document[token_index]
        if term_frequency <= 0:
            # log10 is undefined for a zero count; the term contributes nothing
            continue

        # document frequency = number of documents in which the term occurs
        document_frequency = term_document_matrix.getrow(token_index).count_nonzero()
        if document_frequency == 0:
            # term occurs in no document: it cannot match anything, and the
            # idf ratio would divide by zero
            continue

        log_tf_value = 1 + np.log10(term_frequency)
        idf_value = np.log10(total_number_of_documents / document_frequency)
        weights[token_index] = log_tf_value * idf_value

    # cosine normalization: scale the weight vector to unit length
    vector_norm = np.sqrt(np.dot(weights, weights))
    if vector_norm == 0:
        return weights
    return weights / vector_norm

# calculating LNC for the document
def calc_lnc(document, indexes):
    """Compute lnc weights (log tf, no idf, cosine normalization) for a document.

    Parameters
    ----------
    document : array-like
        Raw term-frequency vector of the document.
    indexes : iterable of int
        Positions of the terms to weight. Only these positions receive a
        weight and take part in the normalization; all others stay 0.

    Returns
    -------
    numpy.ndarray
        Vector with the same length as ``document`` holding the
        cosine-normalized log-tf weights. An all-zero vector is returned when
        none of the indexed terms occurs in the document.
    """
    vector_length = len(document)

    # log tf component; "n" means no idf, so the weight is the log tf itself
    weights = np.zeros(vector_length)
    for index in indexes:
        term_frequency = document[index]
        if term_frequency > 0:
            weights[index] = 1 + np.log10(term_frequency)

    # Cosine normalization. The norm is taken over the weight vector itself,
    # so a repeated entry in `indexes` is not counted twice.
    vector_norm = np.sqrt(np.dot(weights, weights))
    if vector_norm == 0:
        return weights
    return weights / vector_norm

## The following block processes the queries and prints the output for each query

In [8]:
# for each query, search the top 10 relevant documents

# Row slicing requires CSR format; convert once, outside the query loop.
term_rows_matrix = inverted_index_matrix.tocsr()
total_number_of_documents = inverted_index_matrix.shape[1]

for query in queries:
    query = query.lower()
    query_tokens = word_tokenize(query)

    # Map the query tokens to vocabulary indexes and count the raw query term
    # frequencies; tokens that are missing from the vocabulary are ignored.
    query_indexes = []
    query_doc = np.zeros(total_number_of_unique_words)
    for token in query_tokens:
        token_index = vocabulary_dict.get(token)
        if token_index is None:
            continue
        if token_index not in query_indexes:
            query_indexes.append(token_index)
        query_doc[token_index] += 1

    print("Searching documents for query = ", query)
    if not query_indexes:
        # Without any known term every score would be meaningless, so report
        # it instead of listing arbitrary documents.
        print("None of the query terms are present in the vocabulary.")
        print("\n")
        continue

    # find ltc for query (weights ordered like query_indexes)
    ltc_array = calc_ltc(query_doc, query_indexes)[query_indexes]

    # Raw counts of the query terms in every document, extracted once per
    # query: shape (number of query terms, number of documents).
    query_term_counts = np.nan_to_num(term_rows_matrix[query_indexes, :].toarray())
    compact_indexes = list(range(len(query_indexes)))

    # find similarity with all documents
    # NOTE(review): each document vector is normalized over the query terms
    # only, not over the document's full term vector as in textbook lnc -
    # confirm this is intended.
    relevency_array = []
    for i in range(total_number_of_documents):
        # calculating LNC for the document, restricted to the query terms
        lnc_array = calc_lnc(query_term_counts[:, i], compact_indexes)
        relevency = np.sum(lnc_array * ltc_array)
        relevency_array.append((indexed_docs[i], relevency))

    sorted_relevency_array = sorted(relevency_array, key=lambda sim: sim[1], reverse=True)

    # Show relevancy score for top 10 documents
    for doc_id, score in sorted_relevency_array[:10]:
        print("Document with ID = ", doc_id, " has relevancy score of ", score)
    print("\n")

print("Search Completed!")

Searching documents for query =  irish language
Document with ID =  5813  has relevancy score of  0.9999490723489033
Document with ID =  5830  has relevancy score of  0.999501739384538
Document with ID =  6501  has relevancy score of  0.9985334533915411
Document with ID =  6514  has relevancy score of  0.9971121745966887
Document with ID =  6561  has relevancy score of  0.9935252057683959
Document with ID =  6532  has relevancy score of  0.9784680085744129
Document with ID =  6643  has relevancy score of  0.9784680085744129
Document with ID =  6677  has relevancy score of  0.9784680085744129
Document with ID =  6546  has relevancy score of  0.9646280018786675
Document with ID =  6678  has relevancy score of  0.9217706352224505


Searching documents for query =  joy and lewis
Document with ID =  5813  has relevancy score of  0.7252621949553562
Document with ID =  6652  has relevancy score of  0.3900677404351355
Document with ID =  6533  has relevancy score of  0.37657739061234785
Docume