# Part2: Vector-space based IR System
## Query Lookup
### Objectives: 
#### 1. Load the saved Python dictionaries from pickle files to be used for query optimization.
#### 2. Make the query using the `query_lookup()` function.

In [2]:
# Import Required Libraries
from collections import Counter
import nltk
import numpy as np
import pickle
from scipy import spatial
import pickle

In [3]:
# Read the pickle files
def _load_pickle(path):
    """Return the object deserialized from the pickle file at `path`.

    SECURITY NOTE: `pickle.load` can execute arbitrary code embedded in the
    file. Only load pickle files that come from a trusted source.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Each name below is bound to whatever object was pickled into the
# corresponding file; later cells pass these objects to the query functions.
documents_dict = _load_pickle('documents_dict.pkl')
tf_idf_vector = _load_pickle('tf_idf_vector.pkl')
bag_of_words = _load_pickle('bag_of_words.pkl')
documents_title_dict = _load_pickle('documents_title_dict.pkl')

#### SMART Notation used
#### lnc.ltc(ddd.qqq)
#### ltc ==> Logarithmic tf + IDF + Cosine Normalization

In [4]:
# Function to generate query vector
def make_query_vector(documents_dict, bag_of_words, tokens):
    """Build the cosine-normalized tf-idf vector for a tokenized query.

    Parameters
    ----------
    documents_dict : mapping
        Only its length is used, as the total number of documents N.
    bag_of_words : mapping
        Vocabulary. Its iteration order defines the vector dimensions, and
        ``bag_of_words[token]['df']`` is read as the document frequency.
    tokens : sequence of str
        Query tokens; repeated tokens raise the term frequency.

    Returns
    -------
    numpy.ndarray
        1-D float array with one entry per vocabulary token. Entry i holds
        ``tf * idf / norm`` for the i-th vocabulary token if it occurs in the
        query, else 0, where

        * ``tf  = 1 + log10(count + 1)``
        * ``idf = log10(N / (df + 1))``
        * ``norm`` is the Euclidean norm of the un-normalized weights.

        An all-zero vector is returned when the norm is zero or not finite
        (for example an empty query, or a query made only of unknown tokens).

    Notes
    -----
    Tokens that are not in `bag_of_words` have no dimension in the vector, so
    they are left out of the norm as well. This keeps the result unit length
    whenever it is non-zero.
    """
    documents_count = len(documents_dict)

    # Position of every vocabulary token in the output vector.
    token_with_index = {token: index for index, token in enumerate(bag_of_words)}
    query = np.zeros(len(token_with_index))

    # Un-normalized tf-idf weight per vector position, computed once per token.
    weights = {}
    for token, count in Counter(tokens).items():
        index = token_with_index.get(token)
        if index is None:
            # Out-of-vocabulary token: no component, no contribution to norm.
            continue

        # Logarithmic tf
        tf = 1 + np.log10(count + 1)
        try:
            df = bag_of_words[token]['df']
        except (KeyError, TypeError):
            # Vocabulary entry carries no usable 'df' field.
            df = 0
        idf = np.log10(documents_count / (df + 1))
        weights[index] = tf * idf

    # Cosine Normalization
    norm = np.sqrt(sum(weight * weight for weight in weights.values()))
    if norm == 0 or not np.isfinite(norm):
        # Dividing would yield NaN/inf entries; fall back to the zero vector.
        return query

    for index, weight in weights.items():
        query[index] = weight / norm
    return query

In [20]:
# Function for query lookup
def query_lookup(query, documents_dict, tf_idf_vector, bag_of_words, documents_title_dict):
    """Rank documents against a free-text query and return the top 10.

    The query is tokenized with ``nltk.word_tokenize``, turned into a vector
    by ``make_query_vector`` and compared with every document vector using
    cosine similarity. The query text is also printed.

    Parameters
    ----------
    query : str
        Free-text query.
    documents_dict : mapping
        Forwarded to ``make_query_vector``.
    tf_idf_vector : mapping
        ``doc_id -> entry`` where ``entry['tf_idf_vector']`` is the document
        vector (converted with ``np.asarray`` before comparison).
    bag_of_words : mapping
        Forwarded to ``make_query_vector``.
    documents_title_dict : mapping
        ``doc_id -> title``; must contain every key of `tf_idf_vector`.

    Returns
    -------
    list of dict
        At most 10 ``{"score": ..., "title": ...}`` dicts sorted by descending
        score.
    """
    tokens = nltk.word_tokenize(query)

    print("\nQuery:", query)

    query_vector = make_query_vector(documents_dict, bag_of_words, tokens)

    search_results = {}
    for doc_id, doc_value in tf_idf_vector.items():
        doc_vector = np.asarray(doc_value['tf_idf_vector'])
        cosine_similarity = 1 - spatial.distance.cosine(query_vector, doc_vector)
        if not np.isfinite(cosine_similarity):
            # A zero-length vector on either side gives a NaN distance; NaN
            # compares False with everything and would make the sort order
            # arbitrary, so treat such a document as having no similarity.
            cosine_similarity = 0.0
        search_results[doc_id] = {"score": cosine_similarity, "title": documents_title_dict[doc_id]}

    # Get Top 10 documents
    ranked = sorted(search_results.values(), key=lambda result: result['score'], reverse=True)
    return ranked[:10]

### Execute the queries as required (this requires the Python dictionaries to be in memory,
### so we load them from the saved pickle files).

#### Query 1

In [29]:

# Query 1: run the lookup; the returned top-10 list is the cell output.
query = 'Final Fantasy Legend III, originally released in Japan as , is a role-playing video game'
query_lookup(
    query,
    documents_dict=documents_dict,
    tf_idf_vector=tf_idf_vector,
    bag_of_words=bag_of_words,
    documents_title_dict=documents_title_dict,
)


Query: Final Fantasy Legend III, originally released in Japan as , is a role-playing video game


[{'score': 0.014713003730583929, 'title': "Wonder Boy III: The Dragon's Trap"},
 {'score': 0.01356252380584011, 'title': 'Final Fantasy Legend II'},
 {'score': 0.013395957891806143, 'title': 'Tabula Rasa (video game)'},
 {'score': 0.013230702911939729, 'title': 'Hot Shots Golf (series)'},
 {'score': 0.01314483283893686, 'title': 'Neutopia'},
 {'score': 0.012952913274225875, 'title': 'Battles (band)'},
 {'score': 0.012724472117589869, 'title': 'Last Order: Final Fantasy VII'},
 {'score': 0.012722718754266893, 'title': 'Final Fantasy Legend III'},
 {'score': 0.01245510395024374,
  'title': 'Ragnarok Online 2: The Gate of the World'},
 {'score': 0.01240167757489763, 'title': 'The Legend of Dragoon'}]

#### Query 2

In [30]:

# Query 2: run the lookup; the returned top-10 list is the cell output.
query = 'Butyllithium is commercially available as solutions'
query_lookup(
    query,
    documents_dict=documents_dict,
    tf_idf_vector=tf_idf_vector,
    bag_of_words=bag_of_words,
    documents_title_dict=documents_title_dict,
)


Query: Butyllithium is commercially available as solutions


[{'score': 0.009969559113528592, 'title': 'N-Butyllithium'},
 {'score': 0.009284767947575379, 'title': 'Euler–Bernoulli beam theory'},
 {'score': 0.008325833087772017, 'title': 'ICT 1301'},
 {'score': 0.00821838000934072, 'title': 'Fluoride therapy'},
 {'score': 0.007949716759227954, 'title': 'Dubai Metro'},
 {'score': 0.007897685227115203, 'title': 'StatSoft'},
 {'score': 0.007835736635082302, 'title': 'Clinical audit'},
 {'score': 0.007802578144195382, 'title': 'Marlin Model Golden 39A'},
 {'score': 0.007684068104029884, 'title': 'Augusta, Ontario'},
 {'score': 0.007660139658023368, 'title': 'Gemini Ganesan'}]

#### Query 3

In [28]:
# Query 3: run the lookup; the returned top-10 list is the cell output.
query = 'In 2014, the RTA approved the recent proposal of extending the red line from Al Rashidiya station to Mirdif City Center'
query_lookup(
    query,
    documents_dict=documents_dict,
    tf_idf_vector=tf_idf_vector,
    bag_of_words=bag_of_words,
    documents_title_dict=documents_title_dict,
)


Query: In 2014, the RTA approved the recent proposal of extending the red line from Al Rashidiya station to Mirdif City Center


[{'score': 0.017683325711255082, 'title': 'Dubai Metro'},
 {'score': 0.01414932582494699, 'title': 'Angolan Civil War'},
 {'score': 0.01295386573528623, 'title': 'Vale of Glamorgan Line'},
 {'score': 0.012757267090577118, 'title': 'Barry Island railway station'},
 {'score': 0.012709483315089476, 'title': 'London and Greenwich Railway'},
 {'score': 0.012623781890072294, 'title': 'Tiverton Parkway railway station'},
 {'score': 0.012587999362503322, 'title': 'GWR 5700 Class'},
 {'score': 0.012551711386156517, 'title': 'Rail transport in Indonesia'},
 {'score': 0.012498544394775046, 'title': 'Michele Bachmann'},
 {'score': 0.01248763570965794, 'title': 'Baumholder'}]

#### Query 4

In [27]:
# Query 4: run the lookup; the returned top-10 list is the cell output.
query = 'Mount Kenya National Park was established in 1949 to protect Mount Kenya, the wildlife and surrounding environment'
query_lookup(
    query,
    documents_dict=documents_dict,
    tf_idf_vector=tf_idf_vector,
    bag_of_words=bag_of_words,
    documents_title_dict=documents_title_dict,
)


Query: Mount Kenya National Park was established in 1949 to protect Mount Kenya, the wildlife and surrounding environment


[{'score': 0.013040758618234727, 'title': 'Augusta, Ontario'},
 {'score': 0.012464897912184525, 'title': 'Wildlife of Costa Rica'},
 {'score': 0.012327917162507362, 'title': 'Mount Kenya National Park'},
 {'score': 0.011690337056265099, 'title': 'Rupununi'},
 {'score': 0.011623434074465644, 'title': 'Lake Nakuru'},
 {'score': 0.011386239915655127, 'title': 'Pilanesberg Game Reserve'},
 {'score': 0.011290887033561292, 'title': 'Ballygar'},
 {'score': 0.011265056149003527, 'title': 'Angolan Civil War'},
 {'score': 0.011161552577373168, 'title': 'Breakup of Yugoslavia'},
 {'score': 0.011119598591297963, 'title': 'Marine life'}]

#### Query 5

In [26]:
# Query 5: run the lookup; the returned top-10 list is the cell output.
query = 'The Eric Liddell Centre, a local charity, named after the 1924 Olympic 400m gold medalist athlete'
query_lookup(
    query,
    documents_dict=documents_dict,
    tf_idf_vector=tf_idf_vector,
    bag_of_words=bag_of_words,
    documents_title_dict=documents_title_dict,
)


Query: The Eric Liddell Centre, a local charity, named after the 1924 Olympic 400m gold medalist athlete


[{'score': 0.013146725319662211, 'title': 'Morningside, Edinburgh'},
 {'score': 0.011378599602942963, 'title': 'Sisters of St. Joseph'},
 {'score': 0.011356502614479957, 'title': 'Augusta, Ontario'},
 {'score': 0.011350205461572394, 'title': 'Woburn Collegiate Institute'},
 {'score': 0.01126346462642236, 'title': 'Iowa Barnstormers'},
 {'score': 0.011135180202726191, 'title': 'Toofan (1989 film)'},
 {'score': 0.01113460414240175, 'title': 'Court Moor School'},
 {'score': 0.011118527807628897, 'title': 'Dizzy Mizz Lizzy'},
 {'score': 0.011101564340164294, 'title': 'List of Teen Titans characters'},
 {'score': 0.011064646180786286, 'title': 'Donie Bush'}]

#### Query 6

In [31]:
# Query 6: run the lookup; the returned top-10 list is the cell output.
query = 'Francis Campbell Boileau Cadell RSA 12 April 1883 – 6 December 1937) was a Scottish Colourist painter, renowned for his depictions of the elegant New Town interiors of his native Edinburgh, and for his work on Iona'
query_lookup(
    query,
    documents_dict=documents_dict,
    tf_idf_vector=tf_idf_vector,
    bag_of_words=bag_of_words,
    documents_title_dict=documents_title_dict,
)


Query: Francis Campbell Boileau Cadell RSA 12 April 1883 – 6 December 1937) was a Scottish Colourist painter, renowned for his depictions of the elegant New Town interiors of his native Edinburgh, and for his work on Iona


[{'score': 0.016990983402207838, 'title': 'Francis Cadell (artist)'},
 {'score': 0.014747844928845177, 'title': 'Sisters of St. Joseph'},
 {'score': 0.014507219676292094, 'title': 'P. C. Skovgaard'},
 {'score': 0.014433439467057663, 'title': 'Arthur Dove'},
 {'score': 0.014412506249721901, 'title': 'Harold Davidson'},
 {'score': 0.014321457924243997, 'title': 'Augusta, Ontario'},
 {'score': 0.014296134177969488, 'title': 'Ed Mirvish Theatre'},
 {'score': 0.013953297699574763, 'title': 'Angolan Civil War'},
 {'score': 0.013890122367402724, 'title': 'Brainpower'},
 {'score': 0.013849185262256536, 'title': 'Mary MacKillop'}]