In [4]:
import sys
import re
import copy
import json
import string

import numpy as np


In [5]:
import nltk
# Request the NLTK 'stopwords' package. The recorded cell output reports it as
# already up-to-date, so on that machine the call did not need to fetch anything.
# NOTE(review): presumably needed by ir_preprocess — confirm against that module.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/giorgio/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
from ir_preprocess import preprocess_text, words_from_text

In [7]:
def load_indices():
    """Load the four JSON files of the search index from the working directory.

    ``inv_index.json`` and ``index.json`` are each unpacked into exactly two
    values, ``(n_docs, mapping)``. ``titles.json`` and ``charmap.json`` are
    returned exactly as decoded.

    Returns:
        tuple: ``(index, inv_index, titles, charmap, n_docs)``. The ``n_docs``
        returned is the one read from ``index.json``; it is read second and
        replaces the value read from ``inv_index.json``.
    """
    def _read_json(path):
        # Small local helper so every file is opened and closed the same way.
        with open(path) as handle:
            return json.load(handle)

    n_docs, inv_index = _read_json('inv_index.json')
    n_docs, index = _read_json('index.json')  # this n_docs is the one returned
    titles = _read_json('titles.json')
    charmap = _read_json('charmap.json')
    return index, inv_index, titles, charmap, n_docs

In [8]:
def query(q, index, inv_index, titles, n_docs, charmap):
    """Run a query against the index and print the ranked matches.

    Args:
        q: Iterable of strings; joined with single spaces into the query text.
        index: Per-document data, forwarded to the scoring functions.
        inv_index: Per-term data, forwarded to the scoring functions.
        titles: Mapping from document key to the title that is printed.
        n_docs: Document count, forwarded to the scoring functions.
        charmap: Passed through to ``preprocess_text``.

    Returns:
        None. The ranking is printed by ``output_result``.
    """
    cleaned_text = preprocess_text(' '.join(q), charmap)
    # words_from_text yields two values; only the first one is used here.
    terms, _term_set = words_from_text(cleaned_text)
    ranking = get_sorted_result(get_all_scores(terms, index, inv_index, n_docs))
    output_result(ranking, titles)

In [9]:
def output_result(results, titles):
    """Print one ``title: score`` line per result, or ``No match`` if there are none.

    Args:
        results: Mapping from document key to numeric score; printed in its
            iteration order.
        titles: Mapping from document key to a title string.
    """
    if not results.keys():
        print('No match')
        return
    for doc_key, doc_score in results.items():
        line = '{0:s}: {1:5.3f}'.format(titles[doc_key], doc_score)
        print(line)

In [10]:
def get_sorted_result(all_scores):
    """Sum the per-term scores of every document and rank by total, best first.

    Args:
        all_scores: Mapping ``term -> {doc_key: score}``.

    Returns:
        dict: ``{doc_key: total_score}`` in descending order of total. Ties
        keep the order in which documents were accumulated, because
        ``sorted`` is stable. An empty dict is returned when no term matched
        any document. ``all_scores`` itself is never modified.
    """
    # Seed the totals with the term that matched the most documents; the
    # strict '>' means the first such term wins when several tie.
    seed_term = None
    seed_size = 0
    for term, docs in all_scores.items():
        if len(docs) > seed_size:
            seed_term, seed_size = term, len(docs)
    if seed_term is None:
        return {}

    # The per-term mappings are flat (key -> number), so a shallow copy is
    # enough to keep the caller's data untouched.
    totals = dict(all_scores[seed_term])
    for term, docs in all_scores.items():
        # Compare terms by value, not identity: 'is not' on strings only
        # works by accident of object reuse.
        if term != seed_term:
            for doc_key, value in docs.items():
                if doc_key in totals:
                    totals[doc_key] += value
                else:
                    totals[doc_key] = value

    return dict(sorted(totals.items(), key=lambda item: item[1], reverse=True))

In [11]:
def get_all_scores(query_terms_list, index, inv_index, n_docs):
    """Build a ``term -> {doc_key: score}`` mapping for the given query terms.

    Each term is scored with ``doc_scores``. A term that occurs more than
    once in ``query_terms_list`` still yields a single entry.
    """
    return {
        term: doc_scores(term, index, inv_index, n_docs)
        for term in query_terms_list
    }

In [12]:
def doc_scores(t, index, inv_index, n_docs):
    """Score term ``t`` against every document listed in its ``'tf'`` mapping.

    Returns:
        dict: ``{doc_key: score}``; empty when ``t`` is missing from
        ``inv_index`` or its entry is falsy.
    """
    if not inv_index.get(t):
        return {}
    return {
        doc_key: score(t, doc_key, index, inv_index, n_docs)
        for doc_key in inv_index[t]['tf'].keys()
    }

In [13]:
def score(t, key, index, inv_index, n_docs):
    """Return ``tf * idf / norm`` for term ``t`` in document ``key``.

    The three factors come from ``tf``, ``idf`` and ``normalize`` respectively.
    """
    weighted = tf(t, key, inv_index) * idf(t, inv_index, n_docs)
    return weighted / normalize(t, key, index)

In [14]:
def tf(t, key, inv_index):
    """Return ``1 + log10(v)`` where ``v`` is ``inv_index[t]['tf'][key]``."""
    return 1 + np.log10(inv_index[t]['tf'][key])

In [15]:
def idf(t, inv_index, n_docs):
    """Return ``log10(n_docs / df)`` where ``df`` is ``inv_index[t]['df']``."""
    doc_freq = inv_index[t]['df']
    ratio = n_docs / doc_freq
    return np.log10(ratio)

In [16]:
def normalize(t, key, index):
    """Return the ``'norm'`` value stored for document ``key``.

    ``t`` is accepted but not used.
    """
    doc_entry = index[key]
    return doc_entry['norm']

In [21]:
def main():
    """Load the index files and run a fixed two-word demo query."""
    demo_query = ['Legolas', 'Frodo']
    index, inv_index, titles, charmap, n_docs = load_indices()
    query(
        demo_query,
        index=index,
        inv_index=inv_index,
        titles=titles,
        n_docs=n_docs,
        charmap=charmap,
    )

In [23]:
# Demo cell: load the index files into notebook-level names, then run a
# single-term query and print its ranking.
index, inv_index, titles, charmap, n_docs = load_indices()
q = ['frodo']
query(
    q,
    index=index,
    inv_index=inv_index,
    titles=titles,
    n_docs=n_docs,
    charmap=charmap,
)

Letter 85 From an airgraph to Christopher Tolkien : 0.105
Letter 59 From an airgraph to Christopher Tolkien  : 0.097
Letter 84 From an airgraph to Christopher Tolkien : 0.092
Letter 68 From an airgraph to Christopher Tolkien  : 0.091
Letter 195 From a letter to Amy Ronald  : 0.091
Letter 295 To W.H.Auden : 0.087
Letter 192 From a letter to Amy Ronald  : 0.079
Letter 67 From an airgraph to Christopher Tolkien  : 0.074
Letter 269 From a letter to W. H. Auden : 0.074
Letter 100 From a letter to Christopher Tolkien : 0.069
Letter 91 To Christopher Tolkien : 0.068
Letter 191 From a letter to Miss J. Bum (draft)  : 0.064
Letter 93 From a letter to Christopher Tolkien  : 0.063
Lord of the Rings: The two towers - The Forbidden Pool: 0.062
Lord of the Rings: The return of the shadow - Strider: 0.061
Letter 246 From a letter to Mrs Eileen Elgar (drafts) : 0.061
Lord of the Rings: The return of the shadow - A Conspiracy Unmasked: 0.061
Lord of the Rings: The return of the King - The Grey Havens: 