In [None]:
import os
import math
import json
import sys
from collections import Counter
import pickle5 as pickle
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords


# Tokenizer: keeps only runs of ASCII letters, so digits and punctuation
# never become tokens.
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
 
# Lemmatizer: shared by the indexing cell and the query-processing cell so
# that documents and queries are normalised the same way.
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Fetch the NLTK resources used by the lemmatizer ('wordnet') and by the
# stop-word list ('stopwords').
nltk.download('wordnet')
nltk.download('stopwords')

In [None]:
# Build the inverted index over every document in ../Dataset and save it as JSON.
#
# Layout of the index:
#     inv_pos_index[term] = [idf, (doc_id, tf), (doc_id, tf), ...]
# where tf = log10(1 + raw count of the term in the document), rounded to
# 5 decimals, and idf = log10(N / df) with N = number of files in the dataset
# directory and df = number of documents containing the term.
file_path = os.getcwd()
parent_dir = os.path.dirname(os.getcwd())
data_path = parent_dir + "/Dataset"

file_list = os.listdir(data_path)

stop_words = set(stopwords.words('english'))
# Adding custom words to the stop-words list
cust_stop_words = ["'s"]
stop_words.update(cust_stop_words)

# Plain module-level assignment: `global name = value` is not valid Python,
# and a name assigned at cell level is already global.
inv_pos_index = {}

# Assumes the documents are named 0.html, 1.html, ..., (N-1).html, so the loop
# counter doubles as the document id.
for i in range(len(file_list)):
    # The context manager closes each file as soon as it has been parsed.
    with open(data_path + "/" + str(i) + ".html") as txt_file:
        soup = BeautifulSoup(txt_file, features="html.parser")
        text = soup.get_text()

    # Tokenizing
    tokens = tokenizer.tokenize(text.lower())

    # Stop-Word Removal + Lemmatization
    filtered_tokens = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stop_words
    ]

    # Finding term-freq of each token in the token-list
    token_freq = Counter(filtered_tokens)

    # Adding the (d, tf) pair to the inverted index
    for token in token_freq:
        tf = math.log10(1 + token_freq[token])
        if token not in inv_pos_index:
            # Slot 0 is a placeholder that the idf pass below overwrites.
            inv_pos_index[token] = [0]
        # A tuple literal is required here: tuple(a, b) raises TypeError
        # because tuple() accepts a single iterable.
        inv_pos_index[token].append((i, round(tf, 5)))


# Computing & Storing the idf value for each term
for token in inv_pos_index:
    # Every entry after the idf slot is one document containing the term.
    df = len(inv_pos_index[token]) - 1
    idf = math.log10(float(len(file_list) / df))
    inv_pos_index[token][0] = idf

# Note: JSON has no tuple type, so the (doc_id, tf) pairs are written as
# two-element lists.
with open("Inv_Pos_Index.json", 'w') as fp:
    json.dump(inv_pos_index, fp, sort_keys=True, indent=3)

## Building Champion Lists

- **ChampionListLocal:** for each term, the 50 documents with the highest tf value for that term.

- **ChampionListGlobal:** for each term, the 50 documents with the highest tf + g(d) value, where g(d) is the static quality score of document d.


***NOTE:*** The g(d) values are loaded from `StaticQualityScore.pkl`.

In [None]:
# Temporary Code -------------------------------------------------------------------------------------------------
# Recreates the names the cells below rely on (paths, stop words, document
# list and the index itself) from the saved JSON file.

# Locations: the dataset directory sits next to the current working directory.
file_path = os.getcwd()
parent_dir = os.path.dirname(os.getcwd())
data_path = parent_dir + "/Dataset"
file_list = os.listdir(data_path)

# English stop words, used when filtering query tokens.
stop_words = set(stopwords.words('english'))

# The index as written to disk; (doc_id, tf) pairs come back as lists.
with open("Inv_Pos_Index.json", 'r') as fp:
    inv_pos_index = json.loads(fp.read())
# Temporary Code --------------------------------------------------------------------------------------------------

In [None]:


# Champion lists and document vector norms.
#
# Each inv_pos_index entry has the form [idf, (doc_id, tf), (doc_id, tf), ...],
# so entry[1:] is the term's posting list.

# Champion List Local: the 50 postings with the highest tf per term.
# (Plain assignment: `global name = value` is not valid Python, and a name
# assigned at cell level is already global.)
champ_list_local = {}
for token in inv_pos_index:
    # Slicing copies the posting list, so sorting does not reorder the index.
    temp = inv_pos_index[token][1:]
    temp.sort(key=lambda x: x[1], reverse=True)
    champ_list_local[token] = [tuple(doc) for doc in temp[:50]]


# Champion List Global: the 50 postings with the highest tf + g(d) per term.
champ_list_global = {}
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load StaticQualityScore.pkl from a trusted source.
with open(parent_dir + '/StaticQualityScore.pkl', 'rb') as fp:
    # g is indexed by document id below (g[doc_id]).
    g = pickle.load(fp)

for token in inv_pos_index:
    temp = inv_pos_index[token][1:]
    temp.sort(key=lambda x: x[1] + g[x[0]], reverse=True)
    champ_list_global[token] = [tuple(doc) for doc in temp[:50]]

# Calculating sum of squares of the tf-idf weights of every document
Vd_norm = [0] * len(file_list)

for token in inv_pos_index:
    temp = inv_pos_index[token]
    for i in range(1, len(temp)):
        doc = temp[i][0]
        tf_idf = temp[i][1] * temp[0]
        Vd_norm[doc] = Vd_norm[doc] + (tf_idf ** 2)

# Calculating sqrt for Vd_norm.
# The list has to be rebuilt: assigning to the loop variable of
# `for val in Vd_norm` would leave the stored sums of squares untouched.
Vd_norm = [math.sqrt(val) for val in Vd_norm]

## Query Processing & Scoring

In [46]:
# Read the queries, one per line; every entry keeps its trailing newline.
# For taking the commandline input for the query text file, open sys.argv[1]
# instead of "query.txt".
with open("query.txt","r") as query_file:
    query_list = list(query_file)

def tf_idf(filtered_query):
    """Rank documents against a query by cosine-normalised tf-idf score.

    The query vector uses the idf of each term as its weight; the document
    weight of a term is tf * idf. Reads the module-level ``inv_pos_index``
    (term -> [idf, (doc_id, tf), ...]) and ``Vd_norm`` (per-document norm,
    indexed by doc_id).

    Args:
        filtered_query: list of query terms, already tokenised, stop-word
            filtered and lemmatised. A term that appears several times
            contributes once per occurrence.

    Returns:
        A list of at most 10 ``(doc_id, score)`` tuples, highest score first.
        Empty when no query term is present in the index.
    """
    # Terms that are absent from the index match no document; dropping them
    # avoids a KeyError on out-of-vocabulary query words.
    known_terms = [term for term in filtered_query if term in inv_pos_index]

    # Calculating |Vq|: the Euclidean norm is the square root of the sum of
    # the SQUARED weights (idf values), not of the weights themselves.
    Vq_norm = math.sqrt(sum(inv_pos_index[term][0] ** 2 for term in known_terms))

    # Accumulating the normalised dot product for every matching document
    score_dict = {}
    for term in known_terms:
        idf = inv_pos_index[term][0]
        for doc in inv_pos_index[term][1:]:
            doc_id = doc[0]
            tf = doc[1]
            denominator = Vq_norm * Vd_norm[doc_id]
            if denominator == 0:
                # A zero-length query or document vector has no defined
                # cosine; skip it instead of dividing by zero.
                continue
            contribution = (idf * (tf * idf)) / denominator
            score_dict[doc_id] = score_dict.get(doc_id, 0) + contribution

    ranked = sorted(score_dict.items(), key=lambda x: x[1], reverse=True)

    return ranked[:10]

def champ(filtered_query, score_type):
    """Rank documents against a query using only the champion lists.

    Same scoring as the full tf-idf ranking, but for every query term only
    the documents in that term's champion list are considered. Reads the
    module-level ``inv_pos_index``, ``Vd_norm`` and, depending on
    ``score_type``, ``champ_list_local`` or ``champ_list_global``.

    Args:
        filtered_query: list of query terms, already tokenised, stop-word
            filtered and lemmatised. A term that appears several times
            contributes once per occurrence.
        score_type: ``'local'`` to use ``champ_list_local`` or ``'global'``
            to use ``champ_list_global``.

    Returns:
        A list of at most 10 ``(doc_id, score)`` tuples, highest score first,
        or ``None`` (after printing a message) for any other ``score_type``.
    """
    if score_type == 'local':
        champ_dict = champ_list_local
    elif score_type == 'global':
        champ_dict = champ_list_global
    else:
        print("champ_score() wrong call")
        return

    # Terms that are absent from the index match no document; dropping them
    # avoids a KeyError on out-of-vocabulary query words.
    known_terms = [term for term in filtered_query if term in inv_pos_index]

    # Calculating |Vq|: the Euclidean norm is the square root of the sum of
    # the SQUARED weights (idf values), not of the weights themselves.
    Vq_norm = math.sqrt(sum(inv_pos_index[term][0] ** 2 for term in known_terms))

    # Accumulating the normalised dot product over the champion-list documents
    score_dict = {}
    for term in known_terms:
        idf = inv_pos_index[term][0]
        for doc in champ_dict.get(term, []):
            doc_id = doc[0]
            tf = doc[1]
            denominator = Vq_norm * Vd_norm[doc_id]
            if denominator == 0:
                # A zero-length query or document vector has no defined
                # cosine; skip it instead of dividing by zero.
                continue
            contribution = (idf * (tf * idf)) / denominator
            score_dict[doc_id] = score_dict.get(doc_id, 0) + contribution

    ranked = sorted(score_dict.items(), key=lambda x: x[1], reverse=True)

    return ranked[:10]

def list_to_string(score_list):
    """Format ranked results as one line of comma-separated ``<doc,score>`` items.

    Each element of ``score_list`` is indexed as ``(doc_id, score)``; the
    score is rounded to 5 decimals. The returned string always ends with a
    newline, and is just a newline when ``score_list`` is empty.
    """
    entries = [
        '<' + str(item[0]) + ',' + str(round(item[1], 5)) + '>'
        for item in score_list
    ]
    return ','.join(entries) + '\n'


# For every query: normalise it the same way as the documents, score it with
# the three ranking methods, and write the query followed by one result line
# per method and a blank separator line.
with open("RESULTS1_18CH3FP07.txt","wt") as text_file:
    for query in query_list:
        query = query.rstrip("\n")

        # Tokenizing
        tokens = tokenizer.tokenize(query.lower())

        # Stop-Word Removal + Lemmatization
        filtered_query = [
            lemmatizer.lemmatize(token)
            for token in tokens
            if token not in stop_words
        ]

        # All three rankings are computed before anything is written
        tf_idf_score = tf_idf(filtered_query)
        champ_local_score = champ(filtered_query, 'local')
        champ_global_score = champ(filtered_query, 'global')

        #  Writing query data
        text_file.write(query + "\n")
        for ranking in (tf_idf_score, champ_local_score, champ_global_score):
            text_file.write(list_to_string(ranking))

        text_file.write("\n")

In [32]:
# Spot check: show the local champion list stored for the term 'year'.
print(champ_list_local['year'])

[(793, 2.30103), (55, 2.20412), (143, 2.15229), (567, 2.11394), (882, 2.10037), (372, 2.09342), (726, 2.08279), (604, 2.07555), (429, 2.06446), (443, 2.06446), (574, 2.06446), (870, 2.0607), (477, 2.03743), (928, 2.03342), (179, 2.02531), (485, 2.01284), (393, 2.0086), (359, 2.00432), (522, 2.00432), (584, 2.0), (864, 1.99564), (656, 1.99123), (173, 1.98677), (442, 1.98677), (683, 1.98677), (842, 1.98677), (776, 1.98227), (135, 1.97313), (845, 1.97313), (25, 1.96379), (186, 1.96379), (541, 1.96379), (658, 1.96379), (21, 1.95904), (220, 1.95424), (371, 1.95424), (572, 1.95424), (791, 1.95424), (397, 1.94939), (71, 1.93952), (377, 1.93952), (948, 1.93952), (36, 1.9345), (183, 1.9345), (30, 1.92942), (492, 1.92942), (767, 1.92942), (895, 1.92942), (128, 1.92428), (720, 1.92428)]
