In [80]:
import spacy
import json
import re


# Load the spaCy "en_core_web_sm" English pipeline once at notebook start;
# tokenizer() runs every query string through this object.
nlp = spacy.load("en_core_web_sm")

In [81]:
# Load the per-document records consumed by build_corpus_vocabulary (each one is
# read there through its "title" and "tf_idfs" keys).
# The context manager closes the handle as soon as parsing finishes instead of
# leaving it open for the lifetime of the kernel.
with open("data/vectorized.json") as vectorized_file:
  vectorized = json.load(vectorized_file)

In [82]:
# Load the vocabulary mapping; only its keys (the tokens) are used when the
# inverted index is built.
# The context manager closes the handle as soon as parsing finishes instead of
# leaving it open for the lifetime of the kernel.
with open("data/vocab.json") as vocab_file:
  vocab = json.load(vocab_file)

In [83]:
def build_corpus_vocabulary(documents_with_tf_idfs, vocabulary):
  """Build an inverted index mapping each vocabulary token to its postings.

  Tokens are taken from the keys of ``vocabulary`` in sorted order, and the
  position of a token in that ordering is used to index every document's
  ``"tf_idfs"`` sequence.

  Args:
    documents_with_tf_idfs: Sequence of mappings, each providing a ``"title"``
      and a ``"tf_idfs"`` sequence indexable by token position.
    vocabulary: Mapping whose keys are the tokens.

  Returns:
    A dict ``{token: [(title, tf_idf), ...]}`` holding, for every token, the
    documents whose weight for that token is strictly positive, in the order
    the documents were given. Tokens with no such document map to ``[]``.
  """
  ordered_terms = sorted(vocabulary)
  return {
    term: [
      (entry["title"], entry["tf_idfs"][position])
      for entry in documents_with_tf_idfs
      # Zero (or negative) weights mean the term is absent from the document.
      if entry["tf_idfs"][position] > 0
    ]
    for position, term in enumerate(ordered_terms)
  }

In [84]:
# Build the token -> [(title, tf-idf), ...] postings map from the loaded data.
inverted_index = build_corpus_vocabulary(vectorized, vocab)

# Spot-check a single term; as the cell's last expression it is shown as output.
inverted_index["flu"]

[('Pandemic', 0.3611111111111111),
 ('Spanish flu', 2.1666666666666665),
 ('Swine influenza', 3.611111111111111),
 ('Unified Victim Identification System', 0.3611111111111111)]

In [85]:
def tokenizer(string):
    """Turn raw text into a list of lemmas using the module-level ``nlp`` pipeline.

    The text is lowercased, newline characters and literal ``\\displaystyle``
    markers are removed, and the remainder is parsed with ``nlp``. Stop words,
    punctuation and whitespace tokens are discarded.

    Args:
        string: The text to tokenize.

    Returns:
        The ``lemma_`` of every remaining token, in document order.
    """
    # NOTE(review): newlines are deleted rather than replaced by a space, so
    # words separated only by a line break are fused together. Confirm this
    # matches how the vocabulary was built before changing it.
    cleaned = re.sub('(\n|\\\\displaystyle)', '', string.lower())
    return [
        tok.lemma_
        for tok in nlp(cleaned)
        if not (tok.is_stop or tok.is_punct or tok.is_space)
    ]

In [86]:
from collections import defaultdict
from itertools import chain

def search_inverted_index(query, inverted_index_dict):
  """Rank documents for a free-text query using the inverted index.

  The query is tokenized with ``tokenizer``. For every resulting token the
  postings stored under that token are looked up, and each document's scores
  are summed across all query tokens. A token that occurs several times in the
  query contributes once per occurrence.

  Args:
    query: The search string.
    inverted_index_dict: Mapping ``{token: [(title, score), ...]}``.

  Returns:
    A list of ``(title, total_score)`` tuples sorted by descending score.
    Documents with equal scores keep the order in which they were first
    encountered. The list is empty when no query token is in the index.
  """
  totals = defaultdict(int)
  for token in tokenizer(query):
    # Use .get so a query term that is not in the index simply contributes
    # nothing; plain indexing raised KeyError and aborted the whole search.
    for document, score in inverted_index_dict.get(token, ()):
      totals[document] += score

  return sorted(totals.items(), key=lambda item: item[1], reverse=True)

In [87]:
# Example query against the index; the ranked result is shown as the cell output.
search_inverted_index("Spanish flu", inverted_index)

[('Spanish flu', 7.944444444444443),
 ('Swine influenza', 5.055555555555555),
 ('Pandemic', 1.8055555555555554),
 ('Unified Victim Identification System', 0.3611111111111111)]

In [88]:
# Persist the inverted index as pretty-printed JSON (2-space indent).
# The (title, tf-idf) tuples are serialized as two-element JSON arrays, so they
# come back as lists when the file is loaded again.
with open("data/data_vectorized.json", "w") as file:
  json.dump(inverted_index, file, indent=2)