# Part2: Vector-space based IR System
## Query Lookup
### Objectives: 
#### 1. Load the saved the python dictionaries as pickle files to be used for query optimization.
#### 2. Make the query using query lookup() function.

In [1]:
# Import Required Libraries
from collections import Counter
import nltk
import numpy as np
import pickle
from scipy import spatial
import pickle

In [2]:
# Read the pickle files 
with open('documents_dict.pkl', 'rb') as documents_dict_handle:
    documents_dict = pickle.load(documents_dict_handle)
    
with open('tf_idf_vector.pkl', 'rb') as tf_idf_vector_handle:
    tf_idf_vector = pickle.load(tf_idf_vector_handle)
    
with open('bag_of_words.pkl', 'rb') as bag_of_words_handle:
    bag_of_words = pickle.load(bag_of_words_handle)
    
with open('documents_title_dict.pkl', 'rb') as documents_title_dict_handle:
    documents_title_dict = pickle.load(documents_title_dict_handle)

#### SMART Notation used
#### lnc.ltc(ddd.qqq)
#### ltc ==> Logarithmic tf + IDF + Cosine Normalization

In [3]:
# Function to generate query vector
def make_query_vector(documents_dict, bag_of_words, tokens):  
    documents_count = len(documents_dict.keys())
    all_tokens = list(bag_of_words.keys())

    token_with_index = {}
    for index, token in enumerate(all_tokens):
        token_with_index[token] = index

    query = np.zeros((len(all_tokens)))
    query_counter = Counter(tokens)

    # normalize
    cnt = 0
    for token in np.unique(tokens):
        #Logarithmic tf 
        tf = 1 + np.log10(query_counter[token]+1)        
        try:
            df = bag_of_words[token]['df']
        except:
            df = 0
        idf = np.log10((documents_count) / (df + 1))        
        cnt += (tf * tf * idf * idf)        
    cnt = np.sqrt(cnt)  
    for token in np.unique(tokens):
        tf = 1 + np.log10(query_counter[token]+1)
        try:
            df = bag_of_words[token]['df']
        except:
            df = 0
        idf = np.log10((documents_count) / (df + 1))      
        try:
            # Cosine Normalization
            query[token_with_index[token]] = (tf * idf) / cnt            
        except:
            pass
    return query

In [4]:
# Function for query lookup
def query_lookup(query, documents_dict, tf_idf_vector, bag_of_words, documents_title_dict):
    tokens = nltk.word_tokenize(query)

    print("\nQuery:", query)
    print("\nTokens:", tokens)

    all_cosines = {}

    query_vector = make_query_vector(documents_dict, bag_of_words, tokens)
    
    query_vector.sort() 
    print(query_vector[-10:]) 
    
    #print("\nQuery Vector: " + str(query_vector.tolist()))
    #print("\nQuery Vector length:", len(query_vector.tolist()))

    for doc_id, doc_value in tf_idf_vector.items():
        all_cosines[doc_id] = 1 - spatial.distance.cosine(query_vector, np.asarray(doc_value['tf_idf_vector']))

    all_cosines_sorted = {k: v for k, v in sorted(all_cosines.items(), reverse=True, key=lambda item: item[1])}
    all_cosines_sorted_counter = Counter(all_cosines_sorted)

    # Finding Top K=10 documents
    top_10_documents = all_cosines_sorted_counter.most_common(10)
    print(top_10_documents)
    for index, element in enumerate(top_10_documents):
        title = documents_title_dict[element[0]]
        print("Document:", index + 1, " Title:", title, " Score:", element[1])

### Execute the query as required ( This requires python dictionaries which needs to be in memory
### We load the saved pickle file.

In [5]:
query ='Madalena is a former civil parish in the municipality of Tomar, Portugal'
query_lookup(query, documents_dict, tf_idf_vector, bag_of_words, documents_title_dict)


Query: Madalena is a former civil parish in the municipality of Tomar, Portugal

Tokens: ['Madalena', 'is', 'a', 'former', 'civil', 'parish', 'in', 'the', 'municipality', 'of', 'Tomar', ',', 'Portugal']
[0.         0.00237658 0.00320984 0.00411547 0.00478828 0.00958532
 0.11030909 0.1620805  0.19991    0.22611323]
[(2065119, 0.009485075072133475), (2058487, 0.007289733416690147), (2059316, 0.007289733416690147), (2060672, 0.007289733416690147), (2059854, 0.007289643218205244), (2055317, 0.007289609889421844), (2056349, 0.007289609889421844), (2056662, 0.007289609889421844), (2057762, 0.007289609889421844), (2057900, 0.007289609889421844)]
Document: 1  Title: Madalena (Tomar)  Score: 0.009485075072133475
Document: 2  Title: Subfund  Score: 0.007289733416690147
Document: 3  Title: Gluggy  Score: 0.007289733416690147
Document: 4  Title: Haruko  Score: 0.007289733416690147
Document: 5  Title: T28  Score: 0.007289643218205244
Document: 6  Title: Scienceworks  Score: 0.007289609889421844
Do