In [1]:
import os
import json
import nltk 
import numpy as np
import string
import time

from IPython.display import clear_output
from nltk.corpus import stopwords 
# reason for using snowball: https://stackoverflow.com/questions/10554052/what-are-the-major-differences-and-benefits-of-porter-and-lancaster-stemming-alg
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from tqdm.notebook import tqdm

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/gustaw/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/gustaw/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
questions_data_path = '../../data/medqa/questions/US_qbank.jsonl'
dev_questions_data_path = '../../data/medqa/questions/dev.jsonl'
textbooks_data_dir = '../../data/medqa/textbooks/'

questions_dev_medqa_path = '../../data/medqa/questions/4_options/dev.jsonl'
questions_train_medqa_path ='../../data/medqa/questions/4_options/train.jsonl'
questions_test_medqa_path ='../../data/medqa/questions/4_options/test.jsonl'

In [3]:
import pickle
def save_data(data, file_path):
    with open(file_path, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_pickle(file_path):
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)

In [6]:
questions_dev = []
questions_train = []
questions_test = []

with open(questions_dev_medqa_path, 'r') as file:
    for line in file:
        questions_dev.append(json.loads(line))

with open(questions_train_medqa_path, 'r') as file:
    for line in file:
        questions_train.append(json.loads(line))
        
with open(questions_test_medqa_path, 'r') as file:
    for line in file:
        questions_test.append(json.loads(line))    

corpus = {}
for textbook_name in os.listdir(textbooks_data_dir):
    textbook_path = textbooks_data_dir + '/' + textbook_name
    with open(textbook_path, 'r') as textbook_file:
        textbook_content = textbook_file.read()
        corpus[textbook_name] = textbook_content

In [5]:
stop_words = stopwords.words('english')
snowball_stemmer = SnowballStemmer(language='english') 
# do not remove the '-' and '/'
custom_string_punctuation = string.punctuation.replace('-','').replace('/','').replace('.','')
punctuation = str.maketrans('', '', custom_string_punctuation)

In [6]:
def preprocess_content(content, remove_stopwords, stemming, remove_punctuation):
    if not remove_stopwords and not stemming and not remove_punctuation:
        return content.lower().strip()
    if remove_punctuation:
        content = content.translate(punctuation).replace('“','').replace('’','')
    sentences = nltk.sent_tokenize(content.lower().strip())
    cleaned_sentences = []
    
    for sentence in sentences:
        tokens = word_tokenize(sentence.lower())
        if remove_stopwords:
            tokens = [x for x in tokens if x not in stop_words]
        if stemming:
            tokens = [snowball_stemmer.stem(x) for x in tokens]
        cleaned_sentences.append(' '.join(tokens))
            
    return ' '.join(cleaned_sentences)

def preprocess_corpus(corpus, remove_stopwords, stemming, remove_punctuation):
    for name, content in tqdm(corpus.items()):
        # TODO: removal of non-medical terms using MetaMap
        corpus[name] = preprocess_content(content, remove_stopwords, stemming, remove_punctuation)
        
        
def preprocess_questions(questions, remove_stopwords, stemming, remove_punctuation, metamap=False):    
    for question in tqdm(questions):
        x = preprocess_content(question['question'], remove_stopwords, stemming, remove_punctuation)
        question['question'] = x 
        for option, value in question['options'].items():
            question['options'][option] = preprocess_content(value, remove_stopwords, stemming, remove_punctuation)
        if metamap:
            question['answer'] = preprocess_content(question['answer'], remove_stopwords, stemming, remove_punctuation)
            for i, phrase in enumerate(question['metamap_phrases']):
                question['metamap_phrases'][i] = preprocess_content(phrase, remove_stopwords, stemming, remove_punctuation)

In [7]:
def preprocess_data(remove_stopwords, stemming, remove_punctuation, metamap):
    preprocess_corpus(
        corpus=corpus,
        remove_stopwords=remove_stopwords,
        stemming=stemming,
        remove_punctuation=remove_punctuation
    )
    preprocess_questions(
        questions=questions_metamap_data,
        remove_stopwords=remove_stopwords,
        stemming=stemming,
        remove_punctuation=remove_punctuation,
        metamap=metamap
    )

In [15]:
preprocess_questions(
    questions=questions_dev,
    remove_stopwords=False,
    stemming=True,
    remove_punctuation=False,
    metamap=True
)

preprocess_questions(
    questions=questions_train,
    remove_stopwords=False,
    stemming=True,
    remove_punctuation=False,
    metamap=True
)

100%|██████████| 1272/1272 [00:08<00:00, 153.90it/s]
100%|██████████| 10178/10178 [01:06<00:00, 152.99it/s]


In [70]:
preprocess_corpus(
        corpus=corpus,
        remove_stopwords=False,
        stemming=True,
        remove_punctuation=False
    )

HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))




In [7]:
def create_corpus_chunks(chunk_length):
    corpus_chunks = []
    for title, content in tqdm(corpus.items()):

        content_tokens = word_tokenize(content)

        counter = 0
        for i in range(0, len(content_tokens), chunk_length):
            chunk_name = title + str(counter)
            chunk = ' '.join(content_tokens[i:i+chunk_length])
            chunk_processed = preprocess_content(chunk, False, False, False)
            stemmed_chunk_processed = preprocess_content(chunk, False, True, False)
            entry = {
                'name': chunk_name,
                'content': chunk_processed,
                'content_stemmed': stemmed_chunk_processed
            }
            corpus_chunks.append(entry)
            counter += 1
    
    return corpus_chunks

In [8]:
chunk_length = 100
corpus_chunks_100 = create_corpus_chunks(chunk_length)

HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))




NameError: name 'snowball_stemmer' is not defined

In [None]:
corpus_chunks_50 = create_corpus_chunks(50)

In [None]:
def create_corpus_sentences():
    corpus_sentences = []
    for title, content in tqdm(corpus.items()):
        content_sentences = (nltk.sent_tokenize(content))
        sentence_counter = 0
        for sentence in content_sentences:
            corpus_sentences.append({
                'name': title + str(sentence_counter),
                'content': sentence
            })
    return corpus_sentences

In [None]:
corpus_sentences = create_corpus_sentences()

In [16]:
from datetime import datetime
from elasticsearch import Elasticsearch
es = Elasticsearch()

def upload_documents(documents, index_name):
    create_index_body = """{
        "settings": {
            "index": {
                "similarity": {
                    "default": {
                        "type": "BM25"
                    }
                }
            }
        }
    }"""
    print(create_index_body)
    es.indices.create(index=index_name, body=create_index_body)

    id_counter = 1
    for document in tqdm(documents):
        res = es.index(index=index_name, id=id_counter, body=document)
        id_counter += 1

In [11]:
from enum import Enum

class Indexes(Enum):
    Unprocessed_chunks_100 = "unprocessed-chunks-100",
    Unprocessed_sentences = "sentences-unprocessed-shards-1",
    Stemmed_sentences = "sentences-stemmed-shards-1"
    MedQA_stemmed_chunks = "medqa-stemmed-chunks",
    MedQA_unprocessed_chunks = "medqa-unprocessed-chunks",
    MedQA_chunks_100 = "medqa-chunks-100"
    MedQA_chunks_50 = "medqa-chunks-50"
    #stemming-punctuation

In [26]:
upload_documents(corpus_chunks_100, Indexes.MedQA_chunks_100.value)
upload_documents(corpus_chunks_50, Indexes.MedQA_chunks_50.value)
upload_documents(corpus_sentences, Indexes.Unprocessed_sentences.value)

{
        "settings": {
            "index": {
                "similarity": {
                    "default": {
                        "type": "BM25"
                    }
                }
            }
        }
    }


HBox(children=(FloatProgress(value=0.0, max=303096.0), HTML(value='')))




In [8]:
def search_documents(query_input, n, index_name):
    res = es.search(
        index=index_name, 
        body={
            "query": {
                "match": {
                    "content": query_input
                }
            },
            "from": 0,
            "size": n
        }
    )
    
    number_of_hits = len(res['hits']['hits'])
    
    results = []
    for i in range(number_of_hits):
        score = res['hits']['hits'][i]['_score']
        paragraph = res['hits']['hits'][i]['_source']
        result = {
            "score": score,
            "evidence": paragraph
        }
        results.append(result)
        
    return results

## Creating the collection of question: list of lists of documents retrieved per each option answer

In [27]:
def get_documents_from_elasticsearch(questions, num_of_docs, index):
    retrieved_documents = {}
    print(len(questions))
    for idx, question_data in enumerate(tqdm(questions)):
        question_documents = {}
        question_id = "q" + str(idx)
        for option, option_answer in question_data['options'].items():
            query = ' '.join(question_data['metamap_phrases']) + ' ' + option_answer
            top_documents = search_documents(query, num_of_docs, index)
            question_documents[option_answer] = top_documents

        retrieved_documents[question_id] = {
            "question": question_data['question'], 
            "retrieved_documents": question_documents  
        }
    return retrieved_documents

In [20]:
preprocess_questions(
    questions=questions_dev,
    remove_stopwords=False,
    stemming=True,
    remove_punctuation=False,
    metamap=True
)

preprocess_questions(
    questions=questions_train,
    remove_stopwords=False,
    stemming=True,
    remove_punctuation=False,
    metamap=True
)

preprocess_questions(
    questions=questions_test,
    remove_stopwords=False,
    stemming=True,
    remove_punctuation=False,
    metamap=True
)

retrieved_documents_train = get_documents_from_elasticsearch(questions_train, 10, "medqa-chunks-100-final")
retrieved_documents_val = get_documents_from_elasticsearch(questions_dev, 10, "medqa-chunks-100-final")
retrieved_documents_test = get_documents_from_elasticsearch(questions_test, 10, "medqa-chunks-100-final")

save_data(retrieved_documents_train_stemmed, "final_es_retrieved_documents_train_chunks_100.pickle")
save_data(retrieved_documents_dev_stemmed, "final_es_retrieved_documents_val_chunks_100.pickle")
save_data(retrieved_documents_test, "final_es_retrieved_documents_test_chunks_100.pickle")


HBox(children=(FloatProgress(value=0.0, max=1272.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10178.0), HTML(value='')))


1272


HBox(children=(FloatProgress(value=0.0, max=1272.0), HTML(value='')))


10178


HBox(children=(FloatProgress(value=0.0, max=10178.0), HTML(value='')))


1272


HBox(children=(FloatProgress(value=0.0, max=1272.0), HTML(value='')))


10178


HBox(children=(FloatProgress(value=0.0, max=10178.0), HTML(value='')))




## Running IR-ES

In [90]:
def ir_es(questions, no_documents_to_retrieve, index_name, metamap=False, all_questions_bank=False):
    start_time = time.time()

    correct_answer = 0
    incorrect_answer = 0
    for question_data in tqdm(questions):
        question = question_data['question']
        # for all_questions, the answer is the letter
        
        if all_questions_bank:
            answer = question_data['options'][question_data['answer']]
        else:
            answer = question_data['answer']

        final_answer = None
        final_score = 0

        for option, option_answer in question_data['options'].items():
            if metamap:
                query = ' '.join(question_data['metamap_phrases']) + " " + option_answer
            else:
                query = question + " " + option_answer
            top_documents = search_documents(query, no_documents_to_retrieve, index_name)
            if top_documents != []:
                score = 0
                for doc in top_documents:
                    score += doc['score']

                if final_score < score:
                    final_answer = option_answer
                    final_score = score

        correct = False
        if final_answer == answer:
            correct_answer += 1
            correct = True
        else:
            incorrect_answer += 1


    print(f'Accuracy: {100 * correct_answer / (correct_answer + incorrect_answer)}%')
    print(f'\tCorrect answers: {correct_answer}')
    print(f'\tInorrect answers: {incorrect_answer}')

In [91]:
# ir_es([x], 5, 'unprocessed')
def run_ir_es(questions, used_index, num_of_documents, metamap=False):
    print(f'Used index: {used_index}\nNumber of retrieved documents: {num_of_documents}\nUsing metamap phrases: {metamap}')
    ir_es(questions, num_of_documents, used_index, metamap)

## On stemmed index

In [92]:
run_ir_es(questions=questions_dev,
          used_index=Indexes.MedQA_stemmed_chunks.value,
          num_of_documents=10,
          metamap=True
         )

Used index: medqa-stemmed-chunks
Number of retrieved documents: 10
Using metamap phrases: True


HBox(children=(FloatProgress(value=0.0, max=1272.0), HTML(value='')))


Accuracy: 27.90880503144654%
	Correct answers: 355
	Inorrect answers: 917


In [93]:
run_ir_es(questions=questions_train,
          used_index=Indexes.MedQA_stemmed_chunks.value,
          num_of_documents=10,
          metamap=True
         )

Used index: medqa-stemmed-chunks
Number of retrieved documents: 10
Using metamap phrases: True


HBox(children=(FloatProgress(value=0.0, max=10178.0), HTML(value='')))


Accuracy: 27.608567498526234%
	Correct answers: 2810
	Inorrect answers: 7368


## On unstemmed index

In [106]:
run_ir_es(questions=questions_dev,
          used_index=Indexes.MedQA_unprocessed_chunks.value,
          num_of_documents=10,
          metamap=True
         )

Used index: medqa-unprocessed-chunks
Number of retrieved documents: 10
Using metamap phrases: True


HBox(children=(FloatProgress(value=0.0, max=1272.0), HTML(value='')))


Accuracy: 31.761006289308177%
	Correct answers: 404
	Inorrect answers: 868


In [105]:
run_ir_es(questions=questions_train,
          used_index=Indexes.MedQA_unprocessed_chunks.value,
          num_of_documents=10,
          metamap=True
         )

Used index: medqa-unprocessed-chunks
Number of retrieved documents: 10
Using metamap phrases: True


HBox(children=(FloatProgress(value=0.0, max=10178.0), HTML(value='')))


Accuracy: 30.978581253684418%
	Correct answers: 3153
	Inorrect answers: 7025


# ElasticSearch usage

## Creating a document

In [22]:
from datetime import datetime
from elasticsearch import Elasticsearch
es = Elasticsearch()

## Getting a document

In [25]:
res = es.get(index="medqa-chunks-100-final", id=2)
print(res['_source'])

{'name': 'Neurology_Adams1', 'content': 'encounter . the originators of this book , raymond d. adams and maurice victor , insisted that the basis of the practice of neurology necessarily differs from that of neuroscience in that neurology is a medical discipline and must always be related back to the patient . here is the story : a 19-year-old college sophomore began to show paranoid traits . she became convinced that her roommate was listening in on her phone conversations and planning to alter her essays . she became reclusive and spent most of her time locked in her room . after much difficulty ,', 'content_stemmed': 'encount . the origin of this book , raymond d. adam and mauric victor , insist that the basi of the practic of neurolog necessarili differ from that of neurosci in that neurolog is a medic disciplin and must alway be relat back to the patient . here is the stori : a 19-year-old colleg sophomor began to show paranoid trait . she becam convinc that her roommat was listen

## Refreshing index

In [None]:
es.indices.refresh(index="unprocessed")

## Searching for a document

In [None]:
res = es.search(index="test-index", body={"query": {"match_all": {}}})
print("Got %d Hits:" % res['hits']['total']['value'])
for hit in res['hits']['hits']:
    print("%(timestamp)s %(author)s: %(text)s" % hit["_source"])

## Deleting a document/index

In [69]:
from datetime import datetime
from elasticsearch import Elasticsearch
es = Elasticsearch()

# delete document
# es.delete(index="test-index", id=1)
# delete index
es.indices.delete(index=Indexes.MedQA_stemmed_chunks.value)

{'acknowledged': True}

In [None]:
# curl "localhost:9200/_cat/indices?v=true"
# curl -X GET "localhost:9200/_cat/health?v=true&pretty"
# curl -X GET "localhost:9200/sentences-stemmed/_settings"
