# Part2: Vector-space based IR System
## Objectives: 
### 1. Building Inverted Index
### 2. Saving the python dictionaries as pickle files to be used for query optimization

In [2]:
# Import Required Libraries
import re
import string
from collections import Counter
import nltk
import numpy as np
import pickle
from bs4 import BeautifulSoup
from scipy import spatial

In [9]:
# Quick look at how nltk.word_tokenize splits hyphenated words, contractions,
# quotes, brackets and possessives before the cleaning helpers are written.
cleaned_text = (
    'Its father-in-law I\'m James Leslie "Hippo" Vaughn '
    '(April 9, 1888 – May 29, 1966)  Brisbane\'s was an American left-handed'
)
tokens = nltk.word_tokenize(cleaned_text)
print(tokens)

['Its', 'father-in-law', 'I', "'m", 'James', 'Leslie', '``', 'Hippo', "''", 'Vaughn', '(', 'April', '9', ',', '1888', '–', 'May', '29', ',', '1966', ')', 'Brisbane', "'s", 'was', 'an', 'American', 'left-handed']


In [2]:
# remove punctuation from each word
def remove_punctuation(words):
    """Delete ASCII punctuation from every element of ``words`` and join them.

    ``words`` is iterated element by element: a plain string is walked one
    character at a time, a list of strings one item at a time. Every
    character listed in ``string.punctuation`` is dropped from each element
    and the pieces are concatenated with no separator, so the result is
    always a single string.
    """
    # Mapping a character to None in a translation table deletes it.
    punctuation_eraser = str.maketrans(dict.fromkeys(string.punctuation))
    return ''.join(piece.translate(punctuation_eraser) for piece in words)

In [3]:
# remove special not needed characters from each word
def remove_special_characters(s):
    """Reduce ``s`` to lower-case ASCII letters, digits and single spaces.

    Every character that is not an ASCII letter, an ASCII digit or
    whitespace is deleted (not replaced by a space). Runs of whitespace,
    including newlines, are then collapsed to one space and the ends are
    trimmed.

    Args:
        s: The text to clean.

    Returns:
        The cleaned text in lower case, or ``None`` when nothing is left
        after cleaning. Callers must be prepared for the ``None`` case.
    """
    # The character class used to read ``A-z``. That ASCII range also covers
    # ``[``, ``\``, ``]``, ``^``, ``_`` and the backtick, so those characters
    # were silently kept. ``A-Z`` keeps letters only, which also removes the
    # underscore here and makes a separate underscore pass unnecessary.
    stripped = re.sub(r'[^a-zA-Z0-9\s]', '', s)

    # Collapse any whitespace run (spaces, tabs, newlines) to a single space
    # and drop leading/trailing whitespace.
    stripped = re.sub(r'\s+', ' ', stripped).strip()

    if not stripped:
        return None
    return stripped.lower()

In [4]:
# Basic text Preprocessing 
def text_preprocessing(documents_dict):
    """Clean every document text in ``documents_dict`` in place.

    Each value is passed through ``remove_punctuation`` and then
    ``remove_special_characters``; the cleaned text is written back under
    the same key.

    Args:
        documents_dict: Mapping of document id to raw text. Only the values
            are replaced; no keys are added or removed.

    Returns:
        None. The mapping is modified in place.
    """
    for key, value in documents_dict.items():
        without_punctuation = remove_punctuation(value)
        cleaned = remove_special_characters(without_punctuation)
        # remove_special_characters returns None when nothing survives the
        # cleaning. Store an empty string instead so later steps always
        # receive text. (A former ``str.replace("[(,.]", "")`` call here was
        # dropped: its result was discarded and str.replace is not a regex.)
        documents_dict[key] = cleaned if cleaned is not None else ''

In [5]:
# Function to read document file...file used is "wiki_62"
def read_corpus_file(documents_dict, titles_dict=None, corpus_path='wiki_62'):
    """Load every ``<doc>`` element of the corpus file into the given dicts.

    The file is parsed with BeautifulSoup's ``html.parser``. For each
    ``<doc>`` element the integer value of its ``id`` attribute becomes the
    key; the element text goes into ``documents_dict`` and the ``title``
    attribute into the titles mapping.

    Args:
        documents_dict: Mapping filled in place with ``doc id -> text``.
        titles_dict: Mapping filled in place with ``doc id -> title``. When
            omitted, the module-level ``documents_title_dict`` is used, as
            the function did before this parameter existed.
        corpus_path: Path of the corpus file; defaults to ``'wiki_62'``.

    Returns:
        None. Both mappings are modified in place.

    Raises:
        KeyError: If a ``<doc>`` element has no ``id`` attribute.
        ValueError: If an ``id`` attribute is not an integer.
    """
    if titles_dict is None:
        # Backward compatibility: existing callers pass only documents_dict
        # and rely on the global created by the main script.
        titles_dict = documents_title_dict

    with open(corpus_path, 'r', encoding='utf8') as corpus_file:
        corpus_soup = BeautifulSoup(corpus_file.read(), 'html.parser')

    for doc in corpus_soup.find_all('doc'):
        doc_id = int(doc['id'])
        documents_dict[doc_id] = doc.get_text()
        # A <doc> without a title attribute yields None from .get(); store an
        # empty title instead of failing.
        titles_dict[doc_id] = doc.get('title') or ''

In [6]:
# Function to make complete bag of words model
def make_bag_of_words(bag_of_words, documents_dict):
    """Fill ``bag_of_words`` with postings and document frequencies.

    For every token produced by ``nltk.word_tokenize`` on each document, the
    document id is appended to ``bag_of_words[token]['doc_ids']`` (once per
    occurrence, so ids can repeat). Afterwards each entry gets a ``'df'``
    field holding the number of distinct document ids in its list.

    The mapping is modified in place; nothing is returned.
    """
    for doc_id, doc_text in documents_dict.items():
        for token in nltk.word_tokenize(doc_text):
            entry = bag_of_words.get(token)
            if entry:
                entry['doc_ids'].append(doc_id)
            else:
                bag_of_words[token] = {'doc_ids': [doc_id]}

    # Document frequency = number of different documents containing the token.
    for entry in bag_of_words.values():
        entry['df'] = len(set(entry['doc_ids']))

### Build inverted index using SMART Notation
#### SMART Notation used : lnc.ltc(ddd.qqq)
#### lnc (document side, built here) ==> Logarithmic tf + NO IDF + Cosine Normalization

In [7]:
# Building inverted index using dictionary
def build_inverted_index(tf_idf_vector, documents_dict, bag_of_words):
    """Build one cosine-normalised log-tf vector per document.

    Each document is tokenised with ``nltk.word_tokenize`` and turned into a
    vector with one position per key of ``bag_of_words`` (positions follow
    the iteration order of that mapping). The weight of a term that occurs
    ``count`` times in the document is ``1 + log10(count)``; a term that
    does not occur has weight 0. No idf factor is applied. The vector is
    then divided by its Euclidean length.

    Earlier versions computed ``1 + log10(count + 1)``, which gave every
    absent term a weight of 1 and so made all document vectors dense and
    nearly identical. Absent terms now contribute nothing.

    Args:
        tf_idf_vector: Mapping filled in place with
            ``doc id -> {'tf_idf_vector': list of floats}``.
        documents_dict: Mapping of document id to cleaned text.
        bag_of_words: Mapping whose keys form the vocabulary.

    Returns:
        The same ``tf_idf_vector`` mapping that was passed in.
    """
    tokens_with_index = {
        token: index for index, token in enumerate(bag_of_words)
    }
    vocabulary_size = len(tokens_with_index)

    for doc_id, doc_text in documents_dict.items():
        term_counts = Counter(nltk.word_tokenize(doc_text))
        weights = np.zeros(vocabulary_size)

        # Only terms that occur in the document can be non-zero, so walk the
        # document's own counts instead of the whole vocabulary.
        for token, count in term_counts.items():
            index = tokens_with_index.get(token)
            if index is not None:
                # Logarithmic tf; count >= 1 here, so the log is defined.
                weights[index] = 1 + np.log10(count)

        # Cosine normalisation. A document with no vocabulary terms has zero
        # length and is left as an all-zero vector rather than dividing 0/0.
        norm = np.linalg.norm(weights)
        if norm > 0:
            weights /= norm

        tf_idf_vector[doc_id] = {'tf_idf_vector': weights.tolist()}
    return tf_idf_vector

In [8]:
# Test function to print dictionary for initial two elements for debugging
def print_dictionary(dictionary):
    """Print at most the first two key/value pairs of ``dictionary``.

    Debugging aid: each pair is written on its own line in the form
    ``key: <key> Value: <value>``.
    """
    for position, (key, value) in enumerate(dictionary.items()):
        if position >= 2:
            break
        print("key:", key, "Value:", value)

In [9]:
# Main Initialization Code
# Script entry point: read the corpus, build the index, pickle the results
if __name__ == "__main__":

    # Containers that the pipeline steps below fill in place.
    # documents_title_dict is read as a global by read_corpus_file.
    documents_dict, documents_title_dict = {}, {}
    bag_of_words, tf_idf_vector = {}, {}

    # Pipeline: load -> clean -> vocabulary/postings -> document vectors.
    # (print_dictionary(<dict>) can be called after any step for debugging.)
    read_corpus_file(documents_dict)
    text_preprocessing(documents_dict)
    make_bag_of_words(bag_of_words, documents_dict)
    build_inverted_index(tf_idf_vector, documents_dict, bag_of_words)

    # Write each structure to its own pickle file for the query stage.
    pickle_targets = (
        ('documents_dict.pkl', documents_dict),
        ('tf_idf_vector.pkl', tf_idf_vector),
        ('bag_of_words.pkl', bag_of_words),
        ('documents_title_dict.pkl', documents_title_dict),
    )
    for file_name, payload in pickle_targets:
        with open(file_name, 'wb') as handle:
            pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)