In [9]:
import indexer
import csv
from collections import defaultdict
from tqdm import tqdm
import pandas as pd
import pickle
import re

### Creating the indexes

In [7]:
# Build the skeleton of the index: one empty per-document counter for every
# distinct cell value found in the triples CSV.
index = {}

print("Registering entities")
# newline='' is what the csv module expects, so that newlines embedded in
# quoted fields are handed to the reader untouched.
with open('../../gena_data_final_triples.csv', 'r', newline='') as file:
    reader = csv.reader(file)
    # Skip the header row; the default keeps an empty file from raising
    # StopIteration.
    next(reader, None)
    for row in tqdm(reader, total=1026316):
        for entity in row:
            # An empty cell is not an entity: as a substring it would match
            # every document once the index is populated.
            if entity and entity not in index:
                index[entity] = defaultdict(int)

# Checkpoint the entity-only index so later cells can start from it.
with open('index_only_entities.pkl', 'wb') as file:
    pickle.dump(index, file)

index

Registering entities


100%|█████████▉| 1026307/1026316 [00:02<00:00, 428743.68it/s]


{'Flibanserin': defaultdict(int, {}),
 'Type_E1': defaultdict(int, {}),
 'CHEMICAL': defaultdict(int, {}),
 'Sentence': defaultdict(int, {}),
 'Flibanserin, a multifunctional serotonin receptor agonist and antagonist, is currently approved in the United States and Canada for the treatment of acquired, generalized hypoactive sexual desire disorder (HSDD) in premenopausal women.': defaultdict(int,
             {}),
 'ID_1': defaultdict(int, {}),
 'C098107': defaultdict(int, {}),
 'Full_E1': defaultdict(int, {}),
 'MeSH_E1': defaultdict(int, {}),
 'flibanserin [Supplementary Concept]': defaultdict(int, {}),
 'Synonyms_1': defaultdict(int, {}),
 'Benzimidazoles': defaultdict(int, {}),
 'multifunctional serotonin receptor agonist': defaultdict(int, {}),
 'NEW00001': defaultdict(int, {}),
 'No synonyms': defaultdict(int, {}),
 'Obesity': defaultdict(int, {}),
 'DISEASE': defaultdict(int, {}),
 'Obesity is associated with reduced testosterone and worsened erectile and sexual function in men.'

In [None]:
# Checkpoint: reload the entity-only index from 'index_only_entities.pkl'
# instead of rebuilding it from the CSV.
with open('index_only_entities.pkl', 'rb') as checkpoint_file:
    index = pickle.load(checkpoint_file)

In [12]:
print("Creating indexes")

# Start from empty counters so that re-running this cell does not add a second
# round of counts on top of a previous run. The empty string is dropped because
# it is a substring of every document.
index = {entity: defaultdict(int) for entity in index if entity}

# newline='' is what the csv module expects when it is given a file object.
with open('../../nfcorpus/raw/doc_dump.txt', 'r', newline='') as file:
    reader = csv.reader(file, delimiter='\t')
    entities = list(index)
    for row in tqdm(reader, total=5371):
        if not row:
            # Blank line: there is no document id to attribute matches to.
            continue
        doc_id = row[0]
        # Columns from the third onwards are searched; the second is skipped.
        row_text = ' '.join(row[2:])
        for entity in entities:
            # Case-sensitive substring match; each matching row adds 1 to the
            # entity's counter for that document id.
            if entity in row_text:
                index[entity][doc_id] += 1

# Persist the populated index for the reading/query cells.
with open('index_complete.pkl', 'wb') as file:
    pickle.dump(index, file)

index

Creating indexes


100%|██████████| 5371/5371 [10:47<00:00,  8.29it/s]


{'Flibanserin': defaultdict(int, {}),
 'Type_E1': defaultdict(int, {}),
 'CHEMICAL': defaultdict(int, {}),
 'Sentence': defaultdict(int, {}),
 'Flibanserin, a multifunctional serotonin receptor agonist and antagonist, is currently approved in the United States and Canada for the treatment of acquired, generalized hypoactive sexual desire disorder (HSDD) in premenopausal women.': defaultdict(int,
             {}),
 'ID_1': defaultdict(int, {}),
 'C098107': defaultdict(int, {}),
 'Full_E1': defaultdict(int, {}),
 'MeSH_E1': defaultdict(int, {}),
 'flibanserin [Supplementary Concept]': defaultdict(int, {}),
 'Synonyms_1': defaultdict(int, {}),
 'Benzimidazoles': defaultdict(int, {}),
 'multifunctional serotonin receptor agonist': defaultdict(int, {}),
 'NEW00001': defaultdict(int, {}),
 'No synonyms': defaultdict(int, {}),
 'Obesity': defaultdict(int,
             {'MED-17': 2,
              'MED-97': 2,
              'MED-167': 2,
              'MED-168': 2,
              'MED-169': 2,
 

### Reading the indexes

In [None]:
# Restore the populated index from disk so the query cells can run without
# rebuilding it.
with open('index_complete.pkl', 'rb') as index_file:
    index = pickle.load(index_file)

### Using the indexes

In [26]:
query = "relations between obesity and cancer"

# Step 1: Find the indexed entities that occur in the query as whole words.
# The match is case-sensitive, and the empty entity is excluded up front
# because its pattern (r'\b\b') matches at any word boundary.
query_tokens = [
    entity
    for entity in index
    if entity and re.search(r'\b' + re.escape(entity) + r'\b', query)
]
print(query_tokens)

# Step 2: Sum, per document, the counts of every matched entity.
print("Finding documents containing query tokens")
doc_scores = defaultdict(int)
for token in tqdm(query_tokens):
    # Tokens were drawn from the index keys, so the lookup cannot miss.
    for doc, freq in index[token].items():
        doc_scores[doc] += freq

# Step 3: Sort documents by score, highest first (ties keep insertion order).
sorted_docs = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)

# Read titles from doc_dump.txt (first column: document id, third: title).
titles = {}
with open('../../nfcorpus/raw/doc_dump.txt', 'r', newline='') as file:
    reader = csv.reader(file, delimiter='\t')
    for row in tqdm(reader, total=5371):
        if len(row) < 3:
            # Blank or truncated line: there is no title column to read.
            continue
        titles[row[0]] = row[2]

# Step 4: Print sorted documents. A scored document whose title row was
# skipped is still listed, with a placeholder, rather than raising KeyError.
for doc, score in sorted_docs:
    print(f"Document ID: {doc}, Title: {titles.get(doc, '<no title>')}, Score: {score}")

['obesity', 'cancer']
Finding documents containing query tokens


100%|██████████| 2/2 [00:00<00:00, 2042.02it/s]
100%|██████████| 5371/5371 [00:00<00:00, 77688.37it/s]

Document ID: MED-63, Title: Leucine signaling in the pathogenesis of type 2 diabetes and obesity, Score: 4
Document ID: MED-126, Title: Endocrine-Disrupting Chemicals: Associated Disorders and Mechanisms of Action, Score: 4
Document ID: MED-173, Title: Health and economic burden of the projected obesity trends in the USA and the UK. - PubMed - NCBI, Score: 4
Document ID: MED-510, Title: An epidemiologic study of thyroid cancer in Hawaii. - PubMed - NCBI, Score: 4
Document ID: MED-590, Title: Greater Survival After Breast Cancer in Physically Active Women With High Vegetable-Fruit Intake Regardless of Obesity, Score: 4
Document ID: MED-2, Title: A statistical regression model for the estimation of acrylamide concentrations in French fries for excess lifetime cancer risk assessment. - PubMed - NCBI, Score: 3
Document ID: MED-48, Title: Agricultural policies, food and public health, Score: 2
Document ID: MED-58, Title: Macronutrient balance and lifespan, Score: 2
Document ID: MED-97, Titl


