In [1]:
import os
import json
import nltk 
import numpy as np
import string
import time

from IPython.display import clear_output
from nltk.corpus import stopwords 
# reason for using snowball: https://stackoverflow.com/questions/10554052/what-are-the-major-differences-and-benefits-of-porter-and-lancaster-stemming-alg
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from tqdm.notebook import tqdm

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/gustaw/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/gustaw/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
questions_data_path = '../../data/medqa/questions/US_qbank.jsonl'
dev_questions_data_path = '../../data/medqa/questions/dev.jsonl'
textbooks_data_dir = '../../data/medqa/textbooks/'

questions_dev_medqa_path = '../../data/medqa/questions/4_options/dev.jsonl'
questions_train_medqa_path ='../../data/medqa/questions/4_options/train.jsonl'
questions_test_medqa_path ='../../data/medqa/questions/4_options/test.jsonl'

In [3]:
import pickle
def save_data(data, file_path):
    with open(file_path, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_pickle(file_path):
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)

In [4]:
questions_dev = []
questions_train = []

with open(questions_dev_medqa_path, 'r') as file:
    for line in file:
        questions_dev.append(json.loads(line))

with open(questions_train_medqa_path, 'r') as file:
    for line in file:
        questions_train.append(json.loads(line))
        
# with open(questions_test_medqa_path, 'r') as file:
#     for line in file:
#         questions_metamap_data.append(json.loads(line))    

corpus = {}
for textbook_name in os.listdir(textbooks_data_dir):
    textbook_path = textbooks_data_dir + '/' + textbook_name
    with open(textbook_path, 'r') as textbook_file:
        textbook_content = textbook_file.read()
        corpus[textbook_name] = textbook_content

In [10]:
stop_words = stopwords.words('english')
snowball_stemmer = SnowballStemmer(language='english') 
# do not remove the '-' and '/'
custom_string_punctuation = string.punctuation.replace('-','').replace('/','').replace('.','')
punctuation = str.maketrans('', '', custom_string_punctuation)

In [5]:
def preprocess_content(content, remove_stopwords, stemming, remove_punctuation):
    if not remove_stopwords and not stemming and not remove_punctuation:
        return content.lower().strip()
    if remove_punctuation:
        content = content.translate(punctuation).replace('“','').replace('’','')
    sentences = nltk.sent_tokenize(content.lower().strip())
    cleaned_sentences = []
    
    for sentence in sentences:
        tokens = word_tokenize(sentence.lower())
        if remove_stopwords:
            tokens = [x for x in tokens if x not in stop_words]
        if stemming:
            tokens = [snowball_stemmer.stem(x) for x in tokens]
        cleaned_sentences.append(' '.join(tokens))
            
    return ' '.join(cleaned_sentences)

def preprocess_corpus(corpus, remove_stopwords, stemming, remove_punctuation):
    for name, content in tqdm(corpus.items()):
        # TODO: removal of non-medical terms using MetaMap
        corpus[name] = preprocess_content(content, remove_stopwords, stemming, remove_punctuation)
        
        
def preprocess_questions(questions, remove_stopwords, stemming, remove_punctuation, metamap=False):    
    for question in tqdm(questions):
        x = preprocess_content(question['question'], remove_stopwords, stemming, remove_punctuation)
        question['question'] = x 
        for option, value in question['options'].items():
            question['options'][option] = preprocess_content(value, remove_stopwords, stemming, remove_punctuation)
        if metamap:
            question['answer'] = preprocess_content(question['answer'], remove_stopwords, stemming, remove_punctuation)
            for i, phrase in enumerate(question['metamap_phrases']):
                question['metamap_phrases'][i] = preprocess_content(phrase, remove_stopwords, stemming, remove_punctuation)

In [6]:
def preprocess_data(remove_stopwords, stemming, remove_punctuation, metamap):
    preprocess_corpus(
        corpus=corpus,
        remove_stopwords=remove_stopwords,
        stemming=stemming,
        remove_punctuation=remove_punctuation
    )
    preprocess_questions(
        questions=questions_metamap_data,
        remove_stopwords=remove_stopwords,
        stemming=stemming,
        remove_punctuation=remove_punctuation,
        metamap=metamap
    )

In [15]:
preprocess_questions(
    questions=questions_dev,
    remove_stopwords=False,
    stemming=True,
    remove_punctuation=False,
    metamap=True
)

preprocess_questions(
    questions=questions_train,
    remove_stopwords=False,
    stemming=True,
    remove_punctuation=False,
    metamap=True
)

100%|██████████| 1272/1272 [00:08<00:00, 153.90it/s]
100%|██████████| 10178/10178 [01:06<00:00, 152.99it/s]


In [70]:
preprocess_corpus(
        corpus=corpus,
        remove_stopwords=False,
        stemming=True,
        remove_punctuation=False
    )

HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))




In [7]:
def create_corpus_chunks(chunk_length):
    corpus_chunks = []
    for title, content in tqdm(corpus.items()):

        content_tokens = word_tokenize(content)

        counter = 0
        for i in range(0, len(content_tokens), chunk_length):
            chunk_name = title + str(counter)
            chunk = ' '.join(content_tokens[i:i+chunk_length])
            chunk_processed = preprocess_content(chunk, False, False, False)
            stemmed_chunk_processed = preprocess_content(chunk, False, True, False)
            entry = {
                'name': chunk_name,
                'content': chunk_processed,
                'content_stemmed': stemmed_chunk_processed
            }
            corpus_chunks.append(entry)
            counter += 1
    
    return corpus_chunks

In [8]:
chunk_length = 100
corpus_chunks_100 = create_corpus_chunks(chunk_length)

HBox(children=(FloatProgress(value=0.0, max=18.0), HTML(value='')))




NameError: name 'snowball_stemmer' is not defined

In [None]:
chunk_length = 50
corpus_chunks_50 = create_corpus_chunks(chunk_length)

In [None]:
def create_corpus_sentences():
    corpus_sentences = []
    for title, content in tqdm(corpus.items()):
        content_sentences = (nltk.sent_tokenize(content))
        sentence_counter = 0
        for sentence in content_sentences:
            corpus_sentences.append({
                'name': title + str(sentence_counter),
                'content': sentence
            })
    return corpus_sentences

In [None]:
corpus_sentences = create_corpus_sentences()

In [37]:
corpus_chunks[0]

{'name': 'Neurology_Adams.txt0',
 'content': 'We are very pleased to bring you the 11th edition of Adams and Victor ’ s Principles of Neurology . To provide the context for the continued importance and relevance of a textbook that aspires to such breadth and depth , it may be compelling to review a patient ’ s story ; an event that took place between the last edition of this book and this one . Neurologists have always been particularly attracted to the case history as a method to imprint the fine points as well as the broad principles that can be gleaned in a clinical'}

In [16]:
from datetime import datetime
from elasticsearch import Elasticsearch
es = Elasticsearch()

def upload_documents(documents, index_name):
    create_index_body = """{
        "settings": {
            "index": {
                "similarity": {
                    "default": {
                        "type": "BM25"
                    }
                }
            }
        }
    }"""
    
    # TODO: try to create index in the same manner as MedQA: https://github.com/jind11/MedQA
    
#     create_index_body = """{
#         "settings": {
#             "number_of_shards": 1,
#             "number_of_replicas": 0
#           },
#         "mappings": {
#             "chunk": {
#               "dynamic": "false",
#               "properties": {
#                 "name": {
#                   "type": "string"
#                 },
#                 "content": {
#                   "analyzer": "snowball",
#                   "type": "string",
#                   "similarity":"BM25"
#                 }
#               }
#             }
#         }
#     }"""
    print(create_index_body)
    es.indices.create(index=index_name, body=create_index_body)

    id_counter = 1
    for document in tqdm(documents):
        res = es.index(index=index_name, id=id_counter, body=document)
        id_counter += 1

In [11]:
from enum import Enum

class Indexes(Enum):
    Unprocessed_chunks_100 = "unprocessed-chunks-100",
    Unprocessed_sentences = "sentences-unprocessed-shards-1",
    Stemmed_sentences = "sentences-stemmed-shards-1"
    MedQA_stemmed_chunks = "medqa-stemmed-chunks",
    MedQA_unprocessed_chunks = "medqa-unprocessed-chunks",
    MedQA_chunks_100 = "medqa-chunks-100"
    MedQA_chunks_50 = "medqa-chunks-50"
    #stemming-punctuation

In [26]:
# upload_documents(corpus_chunks_100, Indexes.MedQA_chunks_100.value)
upload_documents(corpus_chunks_50, Indexes.MedQA_chunks_50.value)

{
        "settings": {
            "index": {
                "similarity": {
                    "default": {
                        "type": "BM25"
                    }
                }
            }
        }
    }


HBox(children=(FloatProgress(value=0.0, max=303096.0), HTML(value='')))




In [77]:
# upload_documents(corpus_sentences, Indexes.Unprocessed_sentences.value)

In [12]:
def search_documents(query_input, n, index_name):
    res = es.search(
        index=index_name, 
        body={
            "query": {
                "match": {
                    "content": query_input
                }
            },
            "from": 0,
            "size": n
        }
    )
    
    number_of_hits = len(res['hits']['hits'])
    
    results = []
    for i in range(number_of_hits):
        score = res['hits']['hits'][i]['_score']
        paragraph = res['hits']['hits'][i]['_source']
        result = {
            "score": score,
            "evidence": paragraph
        }
        results.append(result)
        
    return results

## Creating the collection of question: list of lists of documents retrieved per each option answer

In [20]:
def get_documents_from_elasticsearch(questions, num_of_docs, index):
    retrieved_documents = {}
    print(len(questions))
    for idx, question_data in enumerate(tqdm(questions)):
        question_documents = {}
        question_id = "q" + str(idx)
        for option, option_answer in question_data['options'].items():
            query = ' '.join(question_data['metamap_phrases']) + ' ' + option_answer
            top_documents = search_documents(query, num_of_docs, index)
            question_documents[option_answer] = top_documents

        retrieved_documents[question_id] = {
            "question": question_data['question'], 
            "retrieved_documents": question_documents  
        }
    return retrieved_documents

# preprocess_questions(
#     questions=questions_dev,
#     remove_stopwords=False,
#     stemming=False,
#     remove_punctuation=False,
#     metamap=True
# )

# preprocess_questions(
#     questions=questions_train,
#     remove_stopwords=False,
#     stemming=False,
#     remove_punctuation=False,
#     metamap=True
# )

# # 100: unstemmed
# retrieved_documents_dev_unprocessed = get_documents_from_elasticsearch(questions_dev, 10, Indexes.MedQA_chunks_100.value)
# retrieved_documents_train_unprocessed = get_documents_from_elasticsearch(questions_train, 10, Indexes.MedQA_chunks_100.value)

# save_data(retrieved_documents_dev_unprocessed, "es_retrieved_documents_val_chunks_100_questions_unprocessed.pickle")
# save_data(retrieved_documents_train_unprocessed, "es_retrieved_documents_train_chunks_100_questions_unprocessed.pickle")

# # 50: unstemmed
# retrieved_documents_dev_unprocessed = get_documents_from_elasticsearch(questions_dev, 20, Indexes.MedQA_chunks_50.value)
# retrieved_documents_train_unprocessed = get_documents_from_elasticsearch(questions_train, 20, Indexes.MedQA_chunks_50.value)

# save_data(retrieved_documents_dev_unprocessed, "es_retrieved_documents_val_chunks_50_questions_unprocessed.pickle")
# save_data(retrieved_documents_train_unprocessed, "es_retrieved_documents_train_chunks_50_questions_unprocessed.pickle")


preprocess_questions(
    questions=questions_dev,
    remove_stopwords=False,
    stemming=True,
    remove_punctuation=False,
    metamap=True
)

preprocess_questions(
    questions=questions_train,
    remove_stopwords=False,
    stemming=True,
    remove_punctuation=False,
    metamap=True
)

retrieved_documents_dev_stemmed = get_documents_from_elasticsearch(questions_dev, 10, Indexes.MedQA_chunks_100.value)
retrieved_documents_train_stemmed = get_documents_from_elasticsearch(questions_train, 10, Indexes.MedQA_chunks_100.value)

save_data(retrieved_documents_dev_stemmed, "es_retrieved_documents_val_chunks_100_questions_stemmed.pickle")
save_data(retrieved_documents_train_stemmed, "es_retrieved_documents_train_chunks_100_questions_stemmed.pickle")

# 50: stemmed
retrieved_documents_dev_unprocessed = get_documents_from_elasticsearch(questions_dev, 20, Indexes.MedQA_chunks_50.value)
retrieved_documents_train_unprocessed = get_documents_from_elasticsearch(questions_train, 20, Indexes.MedQA_chunks_50.value)

save_data(retrieved_documents_dev_unprocessed, "es_retrieved_documents_val_chunks_50_questions_stemmed.pickle")
save_data(retrieved_documents_train_unprocessed, "es_retrieved_documents_train_chunks_50_questions_stemmed.pickle")


HBox(children=(FloatProgress(value=0.0, max=1272.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10178.0), HTML(value='')))


1272


HBox(children=(FloatProgress(value=0.0, max=1272.0), HTML(value='')))


10178


HBox(children=(FloatProgress(value=0.0, max=10178.0), HTML(value='')))


1272


HBox(children=(FloatProgress(value=0.0, max=1272.0), HTML(value='')))


10178


HBox(children=(FloatProgress(value=0.0, max=10178.0), HTML(value='')))




In [44]:
retrieved_documents_train_unprocessed["q0"]

{'question': 'a 23-year-old pregnant woman at 22 weeks gestation presents with burning upon urination. she states it started 1 day ago and has been worsening despite drinking more water and taking cranberry extract. she otherwise feels well and is followed by a doctor for her pregnancy. her temperature is 97.7°f (36.5°c), blood pressure is 122/77 mmhg, pulse is 80/min, respirations are 19/min, and oxygen saturation is 98% on room air. physical exam is notable for an absence of costovertebral angle tenderness and a gravid uterus. which of the following is the best treatment for this patient?',
 'retrieved_documents': {'ampicillin': [{'score': 44.465958,
    'evidence': {'name': 'Obstentrics_Williams.txt0',
     'content': 'surveillance for worsening sepsis syndrome includes serial monitoring of urinary output, blood pressure, pulse, temperature, and oxygen saturation.'}},
   {'score': 41.05628,
    'evidence': {'name': 'Pharmacology_Katzung.txt0',
     'content': 'oxygen saturation by f

In [30]:
save_data(retrieved_documents_dev_stemmed, "es_retrieved_documents_dev_stemmed.pickle")
save_data(retrieved_documents_train_stemmed, "es_retrieved_documents_train_stemmed.pickle")


In [45]:
test = load_pickle("es_retrieved_documents_dev_stemmed.pickle")
test["q0"]

{'question': 'a 21-year-old sexual activ male complain of fever , pain dure urin , and inflamm and pain in the right knee . a cultur of the joint fluid show a bacteria that doe not ferment maltos and has no polysaccharid capsul . the physician order antibiot therapi for the patient . the mechan of action of action of the medic given block cell wall synthesi , which of the follow was given ?',
 'retrieved_documents': {'chloramphenicol': [{'score': 38.640953,
    'evidence': {'name': 'Biochemistry_Lippincott.txt0',
     'content': '2.2. a 42-year-old male patient undergoing radiation therapy for prostate cancer develops severe pain in the metatarsal phalangeal joint of his right big toe.'}},
   {'score': 34.522305,
    'evidence': {'name': 'Anatomy_Gray.txt0',
     'content': 'a 45-year-old man came to his physician complaining of pain and weakness in his right shoulder.'}},
   {'score': 33.64039,
    'evidence': {'name': 'First_Aid_Step2.txt0',
     'content': 'an active 13-year-old boy

## Running IR-ES

In [90]:
def ir_es(questions, no_documents_to_retrieve, index_name, metamap=False, all_questions_bank=False):
    start_time = time.time()

    correct_answer = 0
    incorrect_answer = 0
    for question_data in tqdm(questions):
        question = question_data['question']
        # for all_questions, the answer is the letter
        
        if all_questions_bank:
            answer = question_data['options'][question_data['answer']]
        else:
            answer = question_data['answer']

        final_answer = None
        final_score = 0

        for option, option_answer in question_data['options'].items():
            if metamap:
                query = ' '.join(question_data['metamap_phrases']) + " " + option_answer
            else:
                query = question + " " + option_answer
            top_documents = search_documents(query, no_documents_to_retrieve, index_name)
            if top_documents != []:
                score = 0
                for doc in top_documents:
                    score += doc['score']

                if final_score < score:
                    final_answer = option_answer
                    final_score = score

        correct = False
        if final_answer == answer:
            correct_answer += 1
            correct = True
        else:
            incorrect_answer += 1


    print(f'Accuracy: {100 * correct_answer / (correct_answer + incorrect_answer)}%')
    print(f'\tCorrect answers: {correct_answer}')
    print(f'\tInorrect answers: {incorrect_answer}')

In [91]:
# ir_es([x], 5, 'unprocessed')
def run_ir_es(questions, used_index, num_of_documents, metamap=False):
    print(f'Used index: {used_index}\nNumber of retrieved documents: {num_of_documents}\nUsing metamap phrases: {metamap}')
    ir_es(questions, num_of_documents, used_index, metamap)

## On stemmed index

In [92]:
run_ir_es(questions=questions_dev,
          used_index=Indexes.MedQA_stemmed_chunks.value,
          num_of_documents=10,
          metamap=True
         )

Used index: medqa-stemmed-chunks
Number of retrieved documents: 10
Using metamap phrases: True


HBox(children=(FloatProgress(value=0.0, max=1272.0), HTML(value='')))


Accuracy: 27.90880503144654%
	Correct answers: 355
	Inorrect answers: 917


In [93]:
run_ir_es(questions=questions_train,
          used_index=Indexes.MedQA_stemmed_chunks.value,
          num_of_documents=10,
          metamap=True
         )

Used index: medqa-stemmed-chunks
Number of retrieved documents: 10
Using metamap phrases: True


HBox(children=(FloatProgress(value=0.0, max=10178.0), HTML(value='')))


Accuracy: 27.608567498526234%
	Correct answers: 2810
	Inorrect answers: 7368


## On unstemmed index

In [106]:
run_ir_es(questions=questions_dev,
          used_index=Indexes.MedQA_unprocessed_chunks.value,
          num_of_documents=10,
          metamap=True
         )

Used index: medqa-unprocessed-chunks
Number of retrieved documents: 10
Using metamap phrases: True


HBox(children=(FloatProgress(value=0.0, max=1272.0), HTML(value='')))


Accuracy: 31.761006289308177%
	Correct answers: 404
	Inorrect answers: 868


In [105]:
run_ir_es(questions=questions_train,
          used_index=Indexes.MedQA_unprocessed_chunks.value,
          num_of_documents=10,
          metamap=True
         )

Used index: medqa-unprocessed-chunks
Number of retrieved documents: 10
Using metamap phrases: True


HBox(children=(FloatProgress(value=0.0, max=10178.0), HTML(value='')))


Accuracy: 30.978581253684418%
	Correct answers: 3153
	Inorrect answers: 7025


In [None]:
from nltk import ngrams, FreqDist

grams = ngrams('one two scy blue one two babaloo'.split(), 1)
freq = [{key:value for nltk.FreqDist(grams)]
freq

In [None]:
x = freq.get(('lol',))

In [None]:
from nltk import ngrams, FreqDist

corpus_unigrams = ngrams(corpus_joined.split(), 1)
corpus_unigrams_freq = nltk.FreqDist(corpus_unigrams)
def calculate_score(bm25_score, query):
    query_unigrams = ngrams(query.split(), 1)
    
    for unigram in query_unigrams:
        
    
    query_unigrams_freq = FreqDist(query_unigrams)

In [None]:
dev_questions = []

with open(questions_dev_medqa_path, 'r') as file:
    for line in file:
        dev_questions.append(json.loads(line))

In [None]:
dev_questions[0]

In [None]:
import copy

x = copy.deepcopy(dev_questions)

In [None]:
x[1]['question'] = 'lol'

In [None]:
dev_questions[0]['question']

In [None]:
snowball_stemmer = SnowballStemmer(language='english') 
stem_questions(x[:15], snowball_stemmer)

In [None]:
x[5]

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")

tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")

In [None]:
import torch

text = r"""🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
TensorFlow 2.0 and PyTorch."""

question = "How many pretrained models are available in 🤗 Transformers?"

inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="pt")
input_ids = inputs["input_ids"].tolist()[0]

text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
outputs = model(**inputs)
answer_start_scores = outputs.start_logits
answer_end_scores = outputs.end_logits

answer_start = torch.argmax(answer_start_scores)  # Get the most likely beginning of answer with the argmax of the score
answer_end = torch.argmax(answer_end_scores) + 1  # Get the most likely end of answer with the argmax of the score

answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

print(f"Question: {question}")
print(f"Answer: {answer}")

# print(len(inputs['input_ids'][0]))
# print(len(inputs['token_type_ids'][0]))
# print(len(inputs['attention_mask'][0]))

## Experiment: using BERT reader with the question from medqa and documents from ir_es retriever as evidence

In [None]:
# corpus_joined = ' '.join(list(corpus.values()))
chlamydia_question = '''A 27-year-old male presents to urgent care complaining of pain with urination. He reports that the pain started 3 days ago. He has never experienced these symptoms before. He denies
gross hematuria or pelvic pain. He is sexually active with his girlfriend, and they consistently use condoms. When asked about recent travel, he admits to recently returning from a
boys’ trip” in Cancun where he had unprotected sex 1 night with a girl he met at a bar. The patients medical history includes type I diabetes that is controlled with an insulin pump.
His mother has rheumatoid arthritis. The patients temperature is 99 F (37.2 C), blood pressure is 112/74 mmHg, and pulse is 81/min. On physical examination, there are no lesions of
the penis or other body rashes. No costovertebral tenderness is appreciated. A urinalysis reveals no blood, glucose, ketones, or proteins but is positive for leukocyte esterase. A urine
microscopic evaluation shows a moderate number of white blood cells but no casts or crystals. A urine culture is negative. Which of the following is the most likely cause for the
patient’s symptoms?'''

another_question = '''a 4670-g  10-lb 5-oz  male newborn is delivered at term to a 26-year-old woman after prolonged labor  apgar scores are 9 and 9 at 1 and 5 minutes  examination in the delivery room shows swelling  tenderness  and crepitus over the left clavicle  there is decreased movement of the left upper extremity  movement of the hands and wrists are normal  a grasping reflex is normal in both hands  an asymmetric moro reflex is present  the remainder of the examination shows no abnormalities and an anteroposterior x-ray confirms the diagnosis  which of the following is the most appropriate next step in management'''
# x_text = "Last Monday Mark started working on my thesis using BERT. So fat he was stuck on it and could not progress"
# x_question =  'What did Mark start on Sunday?''
# retrieved_documents = search_documents(query_input=another_question, n=5, index_name='unprocessed')
retrieved_documents = search_documents(query_input=x['question'], n=10, index_name='stemming-punctuation')

In [None]:
evidence = ' '.join([x['evidence']['content'] for x in retrieved_documents[:2]])

In [None]:
evidence

In [None]:
inputs = tokenizer(x['question'], evidence, 
                   add_special_tokens=True, 
                   return_tensors="pt"
                  )

input_ids = inputs["input_ids"].tolist()[0]

# print(len(inputs['input_ids'][0]))
# print(len(inputs['token_type_ids'][0]))
# print(len(inputs['attention_mask'][0]))

text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
outputs = model(**inputs)
answer_start_scores = outputs.start_logits
answer_end_scores = outputs.end_logits

answer_start = torch.argmax(answer_start_scores)  # Get the most likely beginning of answer with the argmax of the score
answer_end = torch.argmax(answer_end_scores) + 1  # Get the most likely end of answer with the argmax of the score

answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

print(f"Question: {x['question']}")
print(f"\nAnswer: {answer}")

In [None]:
def ir_es_bert(questions, no_documents_to_retrieve, index_name, metamap=False):
    start_time = time.time()
    for i, question_data in enumerate(questions):
        question = question_data['question']
        if metamap:
            query = ' '.join(question_data['metamap_phrases'])
        else:
            query = question_data['question']
        answer = question_data['answer']
        
        # obtain the top-N ranked passages from the large-scale document collection C
        retrieved_documents = search_documents(
            query_input=query,
            n=no_documents_to_retrieve,
            index_name=index_name
        )
        
        # contatenating them into a long sequence c
        c = ""
        for document in retrieved_documents:
            c += document['evidence']['content']
        for option, option_answer in question_data['options'].items():
            # then for each question and option pair qa_i = q + a_i
            question_answer = question + " " + option_answer
            print(question_answer)
            # qa_i and c are then passed to the document reader for reasoning and decision making
            inputs = tokenizer(question_answer, c, 
                   add_special_tokens=True, 
                   return_tensors="pt"
                  )

            input_ids = inputs["input_ids"].tolist()[0]

            # print(len(inputs['input_ids'][0]))
            # print(len(inputs['token_type_ids'][0]))
            # print(len(inputs['attention_mask'][0]))

            text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
            print(text_tokens)
            outputs = model(**inputs)
            answer_start_scores = outputs.start_logits
            answer_end_scores = outputs.end_logits

            answer_start = torch.argmax(answer_start_scores)  # Get the most likely beginning of answer with the argmax of the score
            answer_end = torch.argmax(answer_end_scores) + 1  # Get the most likely end of answer with the argmax of the score

            answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))

            print(f"Question: {question}")
            print(f"\nAnswer: {answer}")
            
        break

In [None]:
ir_es_bert(dev_questions_data, 2, 'stemming-punctuation', True)

In [None]:
conda install -c huggingface transformers

In [None]:
python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))"

# ElasticSearch usage

## Creating a document

In [None]:
from datetime import datetime
from elasticsearch import Elasticsearch
es = Elasticsearch()

doc = {
    'author': 'author_name',
    'text': 'Interensting content...',
    'title': 'Test Title',
    'timestamp': datetime.now(),
}
res = es.index(index="test-index", id=1, body=doc)
print(res['result'])

## Getting a document

In [None]:
res = es.get(index="test-index", id=1)
print(res['_source'])

## Refreshing index

In [None]:
es.indices.refresh(index="unprocessed")

## Searching for a document

In [None]:
res = es.search(index="test-index", body={"query": {"match_all": {}}})
print("Got %d Hits:" % res['hits']['total']['value'])
for hit in res['hits']['hits']:
    print("%(timestamp)s %(author)s: %(text)s" % hit["_source"])

## Deleting a document/index

In [69]:
from datetime import datetime
from elasticsearch import Elasticsearch
es = Elasticsearch()

# delete document
# es.delete(index="test-index", id=1)
# delete index
es.indices.delete(index=Indexes.MedQA_stemmed_chunks.value)

{'acknowledged': True}

In [None]:
# curl "localhost:9200/_cat/indices?v=true"
# curl -X GET "localhost:9200/_cat/health?v=true&pretty"
# curl -X GET "localhost:9200/sentences-stemmed/_settings"
