In [54]:
import spacy
import json
import re
from itertools import chain
from collections import Counter


# spaCy pipeline "en_core_web_sm"; the `tokenizer` function runs text through it.
nlp = spacy.load("en_core_web_sm")

In [55]:
# Read the preprocessed corpus. The context manager closes the file handle as
# soon as parsing is done instead of leaving it open for the kernel's lifetime.
with open("data/processed.json") as f:
  data = json.load(f)

# Peek at the first record to confirm its structure.
data[0]

{'title': 'Pandemic',
 'text': 'A pandemic (from Greek πᾶν, pan, "all" and δῆμος, demos, "people") is an epidemic of an infectious disease that has spread across a large region, for instance multiple continents or worldwide, affecting a substantial number of people. A widespread endemic disease with a stable number of infected people is not a pandemic. Widespread endemic diseases with a stable number of infected people such as recurrences of seasonal influenza are generally excluded as they occur simultaneously in large regions of the globe rather than being spread worldwide.\nThroughout human history, there have been a number of pandemics of diseases such as smallpox and tuberculosis. The most fatal pandemic in recorded history was the Black Death (also known as The Plague), which killed an estimated 75–200 million people in the 14th century. The term was not used yet but was for later pandemics including the 1918 influenza pandemic (Spanish flu). Current pandemics include COVID-19 (S

In [56]:

# Flatten every record's token list into one corpus-wide list of tokens.
tokens = list(chain.from_iterable(entry["tokenized_text"] for entry in data))

In [57]:
def compute_tfids(document, corpus_vocabulary, documents):
  """Build the TF-IDF weight vector of one document over the whole vocabulary.

  Weighting used here (note: this is not the textbook log-scaled variant):
    tf  = occurrences of the term in `document` / occurrences in the corpus
    idf = number of documents / number of documents containing the term

  Args:
    document: mapping with a "tokenized_text" list of tokens.
    corpus_vocabulary: mapping of term -> total occurrence count.
    documents: sequence of mappings shaped like `document`.

  Returns:
    A list of floats, one per vocabulary term, ordered by sorted term.
    A term whose corpus count or document frequency is zero gets weight 0.0
    rather than raising ZeroDivisionError.
  """
  # These three values do not depend on the term, so compute them once
  # instead of rebuilding the counters for every vocabulary entry.
  term_counts = Counter(document["tokenized_text"])
  document_frequency = Counter(
    chain.from_iterable(set(entry["tokenized_text"]) for entry in documents)
  )
  document_total = len(documents)

  tf_idf_vector = []
  for token in sorted(corpus_vocabulary.keys()):
    corpus_count = corpus_vocabulary[token]
    containing_documents = document_frequency[token]
    if not corpus_count or not containing_documents:
      # Term absent from the corpus/documents: it carries no weight.
      tf_idf_vector.append(0.0)
      continue
    tf = term_counts[token] / corpus_count
    idf = document_total / containing_documents
    tf_idf_vector.append(tf * idf)

  return tf_idf_vector

In [58]:
# Corpus-wide term counts; passed to compute_tfids as its vocabulary argument.
corpus = Counter(tokens)

In [59]:
# Copy each record and attach its TF-IDF vector under the 'tf_idfs' key.
tf_idf_documents = [
  {**document, 'tf_idfs': compute_tfids(document, corpus, data)}
  for document in data
]

In [67]:
# Spot-check the TF-IDF vector stored for the record at index 10.
tf_idf_documents[10]["tf_idfs"]

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 6.5,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 26.0,
 6.5,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 26.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 3.466666666666667,
 0.0,
 0.0,
 0.0,
 26.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 26.0,
 6.5,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 6.5,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 8.666666666666666,
 0.0,
 0.0,
 0.0,
 0.0,
 26.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 

In [60]:
def tokenizer(string):
  """Turn raw text into a list of lemmas.

  The text is lower-cased, newline characters and literal ``\\displaystyle``
  markers are removed, and the result is run through the module-level `nlp`
  pipeline. Stop words, punctuation and whitespace tokens are dropped; the
  lemma of every remaining token is returned in document order.
  """
  cleaned = re.sub('(\n|\\\\displaystyle)', '', string.lower())
  return [
    tok.lemma_
    for tok in nlp(cleaned)
    if not (tok.is_stop or tok.is_punct or tok.is_space)
  ]

In [61]:
import numpy as np
from numpy.linalg import norm

# Worked example: score one query against the first document only.
query_text = "When did the Spanish flu happen?"
query_text_token = tokenizer(query_text)
query_dict = {"tokenized_text": query_text_token}
tf_idf_query = compute_tfids(query_dict, corpus, data)
tf_idf_query_vector = np.array(tf_idf_query)
tf_idf_document_vector = np.array(tf_idf_documents[0]["tf_idfs"])
# Kept under its own name: binding this float to `cosine_similarity` would
# replace the function of that name whenever this cell is re-run after the
# function has been defined, making later calls fail.
query_document_similarity = np.dot(tf_idf_query_vector, tf_idf_document_vector) / (norm(tf_idf_query_vector) * norm(tf_idf_document_vector))
query_document_similarity

0.012649148633172385

In [62]:
def cosine_similarity(query, documents):
  """Rank documents by cosine similarity to a free-text query.

  Args:
    query: raw query string. It is tokenised with `tokenizer` and turned into
      a TF-IDF vector with `compute_tfids`, using the module-level `corpus`
      and `data`.
    documents: iterable of mappings that carry a "title" and a "tf_idfs"
      vector.

  Returns:
    A list of {'title': ..., 'cosine_similarity': ...} dicts, sorted from the
    highest score to the lowest. When the query vector or a document vector
    has zero length the score is 0.0 (the plain formula would yield NaN,
    which makes the sort order undefined).
  """
  query_vector = np.array(
    compute_tfids({"tokenized_text": tokenizer(query)}, corpus, data)
  )
  # The query norm is the same for every document, so compute it once.
  query_norm = norm(query_vector)

  scores = []
  for document in documents:
    document_vector = np.array(document["tf_idfs"])
    denominator = query_norm * norm(document_vector)
    if denominator:
      similarity = np.dot(query_vector, document_vector) / denominator
    else:
      # No shared direction can exist with an all-zero vector.
      similarity = 0.0
    scores.append({'title': document["title"], 'cosine_similarity': similarity})

  return sorted(scores, key=lambda entry: entry["cosine_similarity"], reverse=True)

In [63]:
cosine_similarity(query_text, tf_idf_documents)

[{'title': 'Spanish flu', 'cosine_similarity': 0.02858469708927138},
 {'title': 'Swine influenza', 'cosine_similarity': 0.012855228526745734},
 {'title': 'Pandemic', 'cosine_similarity': 0.012649148633172385},
 {'title': 'Unified Victim Identification System',
  'cosine_similarity': 0.0004685996984836988},
 {'title': 'Epidemiology of HIV/AIDS', 'cosine_similarity': 0.0},
 {'title': 'Antonine Plague', 'cosine_similarity': 0.0},
 {'title': 'Basic reproduction number', 'cosine_similarity': 0.0},
 {'title': 'Bills of mortality', 'cosine_similarity': 0.0},
 {'title': 'Cholera', 'cosine_similarity': 0.0},
 {'title': 'COVID-19 pandemic', 'cosine_similarity': 0.0},
 {'title': 'Crimson Contagion', 'cosine_similarity': 0.0},
 {'title': 'Disease X', 'cosine_similarity': 0.0},
 {'title': 'Event 201', 'cosine_similarity': 0.0},
 {'title': 'HIV/AIDS', 'cosine_similarity': 0.0},
 {'title': 'HIV/AIDS in Yunnan', 'cosine_similarity': 0.0},
 {'title': 'Pandemic prevention', 'cosine_similarity': 0.0},
 {

In [64]:
# Persist every record together with its TF-IDF vector.
with open("data/vectorized.json", "w") as vectorized_out:
  json.dump(tf_idf_documents, vectorized_out, indent=2)

In [65]:
# Persist the corpus-wide term counts (the vocabulary).
with open("data/vocab.json", "w") as vocab_out:
  json.dump(corpus, vocab_out, indent=2)