In [1]:
from google.colab import drive
drive.mount('/content/drive')
%cd '/content/drive/My Drive/dense-index-retrieval/'

Mounted at /content/drive
/content/drive/My Drive/dense-index-retrieval


## Packages and imports

In [2]:
!pip install transformers
!pip install pickle5

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 19.7MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 50.8MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 52.2MB/s 
Installing collected packages: sacremoses, tokenizers, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1
Collecting pickle5
[?25l  Downloading https://files.pythonhosted

In [3]:
import json
import numpy as np
import pickle5 as pickle
import nltk
import random
import torch

from datetime import datetime
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer
nltk.download('punkt')
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader, SequentialSampler, TensorDataset
from transformers import AutoTokenizer, AutoModel, AutoModelForQuestionAnswering, AutoModelForMultipleChoice, get_linear_schedule_with_warmup

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Pickle methods and content processing methods

In [4]:
def save_data(data, file_path):
    with open(file_path, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_pickle(file_path):
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)

In [49]:
# def letter_answer_to_index(answer):
#   distribution = [0.0] * 5
#   distribution[ord(answer) - 65] = 1.0
#   return distribution

def letter_answer_to_index(answer):
  return ord(answer) - 65

In [50]:
def preprocess_content(content, remove_stopwords, stemming, remove_punctuation):
    if not remove_stopwords and not stemming and not remove_punctuation:
        return content.lower()
    if remove_punctuation:
        content = content.translate(punctuation).replace('“','').replace('’','')
    snowball_stemmer = SnowballStemmer(language='english') 
    sentences = nltk.sent_tokenize(content.lower())
    cleaned_sentences = []
    
    for sentence in sentences:
        tokens = word_tokenize(sentence.lower())
        if remove_stopwords:
            tokens = [x for x in tokens if x not in stop_words]
        if stemming:
            tokens = [snowball_stemmer.stem(x) for x in tokens]
        cleaned_sentences.append(' '.join(tokens))
            
    return ' '.join(cleaned_sentences)

def preprocess_questions(questions, remove_stopwords, stemming, remove_punctuation, metamap=False):    
    for question_id, question in tqdm(questions.items()):
        processed_question = {"question": "", "options": {"A":"", "B":"","C":"","D":"","E":""}, "answer": "", "metamap_phrases": [], "answer_idx": ""}
        processed_question['answer_idx'] = question["answer_idx"]
        processed_question['question'] = preprocess_content(question['question'], remove_stopwords, stemming, remove_punctuation)
        for option, value in question['options'].items():
            processed_question['options'][option] = preprocess_content(value, remove_stopwords, stemming, remove_punctuation)
        if metamap:
            processed_question['answer'] = preprocess_content(question['answer'], remove_stopwords, stemming, remove_punctuation)
            for i, phrase in enumerate(question['metamap_phrases']):
                processed_question['metamap_phrases'].append(preprocess_content(phrase, remove_stopwords, stemming, remove_punctuation))
        questions[question_id] = processed_question

def get_context(question_id, option, documents_collection=None):
    try:
      result = [x['evidence']['content'] for x in documents_collection[question_id]['retrieved_documents'][option.strip()]]
      # if question == "a 65-year-old asian man presents to his primary care physician because of abdominal distension, right upper quadrant (ruq) abdominal pain, decreased appetite, and weight loss for several weeks. he denies smoking or excess alcohol intake. his temperature is 37.1°c (98.7°f), blood pressure is 120/80 mm hg, and pulse is 85/min. physical examination reveals a cachectic man with jaundice, palmar erythema, ascites, and a palpable mass in the ruq. abdominal ultrasound shows a 3 cm hypoechoic mass in the right lobe of the liver. alpha fetoprotein (afp) is 500 μg/l. which of the following is a risk factor for this patient condition?":
      #   print('nice')
    except:
      print(f"Question ID: {question_id}")
      print(question_id in documents_collection.keys())

      print(f"Option: {option}s")
      print(f"Option: {option.strip()}s")
      
      print(option in documents_collection[question_id]['retrieved_documents'].keys())
      print(f"Options: {documents_collection[question_id]['retrieved_documents'].keys()}")
    if result == [] or result is None:
      print(f"Empty result for question: {question}")
      return "No content"
    return ' '.join(result) 

## Loading data

In [7]:
questions_dev_medqa_path = 'data/medqa/questions/metamap_extracted_phrases/dev.jsonl'
questions_train_medqa_path ='data/medqa/questions/metamap_extracted_phrases/train.jsonl'
questions_test_medqa_path ='data/medqa/questions/metamap_extracted_phrases/train.jsonl'

# train
es_retrieved_documents_train_stemmed_path = 'data/es_retrieved_documents_train_stemmed.pickle'
es_retrieved_documents_train_unprocessed_path = 'data/es_retrieved_documents_train_unprocessed.pickle'

# dev
es_retrieved_documents_dev_stemmed_path = 'data/es_retrieved_documents_dev_stemmed.pickle'
es_retrieved_documents_dev_unprocessed_path = 'data/es_retrieved_documents_dev_unprocessed.pickle'

### Train

In [8]:
questions_data_train_stemmed = {}
questions_data_train_unprocessed = {}

with open(questions_train_medqa_path, 'r') as file:
    for idx, line in enumerate(file):
        questions_data_train_stemmed[f"q{idx}"] = json.loads(line)

preprocess_questions(questions_data_train_stemmed, False, True, False, True)

with open(questions_train_medqa_path, 'r') as file:
    for idx, line in enumerate(file):
        questions_data_train_unprocessed[f"q{idx}"] = json.loads(line)

preprocess_questions(questions_data_train_unprocessed, False, False, False, True)

es_retrieved_documents_train_stemmed = load_pickle(es_retrieved_documents_train_stemmed_path)        
es_retrieved_documents_train_unprocessed = load_pickle(es_retrieved_documents_train_unprocessed_path)        

100%|██████████| 10178/10178 [01:18<00:00, 130.23it/s]
100%|██████████| 10178/10178 [00:00<00:00, 52567.09it/s]


### Dev

In [22]:
questions_data_dev_stemmed = {}
questions_data_dev_unprocessed = {}

with open(questions_dev_medqa_path, 'r') as file:
    for idx, line in enumerate(file):
        questions_data_dev_stemmed[f"q{idx}"] = json.loads(line)

preprocess_questions(questions_data_dev_stemmed, False, True, False, True)

with open(questions_dev_medqa_path, 'r') as file:
    for idx, line in enumerate(file):
        questions_data_dev_unprocessed[f"q{idx}"] = json.loads(line)

preprocess_questions(questions_data_dev_unprocessed, False, False, False, True)

es_retrieved_documents_dev_stemmed = load_pickle(es_retrieved_documents_dev_stemmed_path)        
es_retrieved_documents_dev_unprocessed = load_pickle(es_retrieved_documents_dev_unprocessed_path)        

100%|██████████| 1272/1272 [00:09<00:00, 133.29it/s]
100%|██████████| 1272/1272 [00:00<00:00, 56641.27it/s]


In [23]:
# sanity check
es_retrieved_documents_dev_unprocessed["q0"]['question'] == questions_data_dev_unprocessed['q0']['question']

True

In [33]:
questions_data_train_stemmed = {}
questions_data_train_unprocessed = {}

with open(questions_train_medqa_path, 'r') as file:
    for idx, line in enumerate(file):
        questions_data_train_stemmed[f"q{idx}"] = json.loads(line)

preprocess_questions(questions_data_train_stemmed, False, True, False, True)

with open(questions_train_medqa_path, 'r') as file:
    for idx, line in enumerate(file):
        questions_data_train_unprocessed[f"q{idx}"] = json.loads(line)

preprocess_questions(questions_data_train_unprocessed, False, False, False, True)

es_retrieved_documents_train_stemmed = load_pickle(es_retrieved_documents_train_stemmed_path)        
es_retrieved_documents_train_unprocessed = load_pickle(es_retrieved_documents_train_unprocessed_path)        

100%|██████████| 10178/10178 [01:16<00:00, 132.72it/s]
100%|██████████| 10178/10178 [00:00<00:00, 52933.40it/s]


In [34]:
# sanity check
es_retrieved_documents_train_unprocessed["q0"]['question'] == questions_data_train_unprocessed['q0']['question']

True

## Model definition and freezing layers

In [69]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print("Using {} device".format(device))

class CustomBERT(torch.nn.Module):
    def __init__(self):
          super(CustomBERT, self).__init__()
          self.bert = AutoModel.from_pretrained('bert-base-cased') 
          self.linear = torch.nn.Linear(self.bert.pooler.dense.out_features, 1)
          self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask, token_type_ids):
          # equivalent of having self.bert(**inputs)
          # bert_output = self.bert(input_ids=input_ids, 
          #                         attention_mask=attention_mask, 
          #                         token_type_ids=token_type_ids)
          # linear_output = self.linear(bert_output.last_hidden_state[0][0])


          # TODO: return for each batch the output, not only for the one
          # reference: https://github.com/huggingface/transformers/pull/96/files
          # line 929 in pytorch_pretrained_bert/modeling.py
          # also: https://discuss.pytorch.org/t/solved-batching-process-of-torch-nn-linear/15986
          bert_output = self.bert(input_ids=input_ids, 
                                  attention_mask=attention_mask, 
                                  token_type_ids=token_type_ids)
          # TODO: add dropout          
          linear_output = self.linear(bert_output.pooler_output)
          return linear_output

model = CustomBERT().to(device)
model.cuda()
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

Using cuda device


In [70]:
layers_to_not_freeze = ['linear', 'pooler'] # freezing first 11 layers

for name, param in model.named_parameters():
  if not any(x in name for x in layers_to_not_freeze):
    param.requires_grad = False
    # print(f"Set flag to {param.requires_grad} for {name}")

## Queries tokenization

In [51]:
def create_tokenized_input(questions_dict: dict, documents_collection_dict: dict):
  input_queries = []
  input_answers = []
  input_answers_idx = []

  for question_id, question_data in tqdm(questions_dict.items()):
    question = question_data['question']
    metamap_phrases = question_data['metamap_phrases']
    queries = []
    for option in question_data['options'].values():
      qa = ' '.join(metamap_phrases) + ' ' + option
      retrieved_documents = get_context(question_id=question_id, 
                                        option=option, 
                                        documents_collection=documents_collection_dict) 
      context = ' '.join(retrieved_documents)
      query = tokenizer(context, qa, 
                        add_special_tokens=True,
                        max_length = 512, 
                        padding='max_length',
                        truncation=True,
                        return_tensors="pt"
                        )
      query_input_ids = query["input_ids"].flatten()
      query_token_type_ids = query["token_type_ids"].flatten()
      query_attention_mask = query["attention_mask"].flatten()

      queries.append({
          "input_ids": query_input_ids,
          "token_type_ids": query_token_type_ids,
          "attention_mask": query_attention_mask
      })
    # break
    # dev_dataset_input.append({
    #       "correct_answer": question_data["answer"],
    #       "correct_answer_idx": letter_answer_to_index(question_data['answer_idx']),
    #       "queries": queries
    #   })
    input_queries.append(queries)
    input_answers.append(question_data["answer"])
    
    input_answers_idx.append(letter_answer_to_index(question_data['answer_idx']))
  return input_queries, input_answers, input_answers_idx

In [52]:
train_input_queries, train_input_answers, train_input_answers_idx = create_tokenized_input(questions_dict=questions_data_train_unprocessed,
                                                                                     documents_collection_dict=es_retrieved_documents_train_unprocessed)


dev_input_queries, dev_input_answers, dev_input_answers_idx = create_tokenized_input(questions_dict=questions_data_dev_unprocessed,
                                                                                     documents_collection_dict=es_retrieved_documents_dev_unprocessed)




100%|██████████| 10178/10178 [01:56<00:00, 87.00it/s]
100%|██████████| 1272/1272 [00:15<00:00, 80.61it/s]


## DataLoader creation

### MedQA dataset definition

In [15]:
from torch.utils.data import Dataset, DataLoader, SequentialSampler, TensorDataset

class MedQADataset(TensorDataset):
  def __init__(self, all_possible_queries, answers, answers_idx):
        self.all_possible_queries = all_possible_queries
        self.answers = answers
        self.answers_idx = answers_idx

  def __len__(self):
        'Denotes the total number of samples'
        return len(self.all_possible_queries)

  def __getitem__(self, index):
        'Generates one sample of data'
        queries = self.all_possible_queries[index]
        correct_answer = self.answers[index]
        correct_answer_idx = self.answers_idx[index]
        return queries, correct_answer, correct_answer_idx

batch_size = 32

### Train

In [55]:
train_dataset = MedQADataset(all_possible_queries=train_input_queries, 
                             answers=train_input_answers, 
                             answers_idx=train_input_answers_idx)
 
train_dataloader = DataLoader(dataset=train_dataset,
                              sampler = SequentialSampler(train_dataset), 
                              batch_size = batch_size)

### Dev dataloader

In [56]:
dev_dataset = MedQADataset(all_possible_queries=dev_input_queries, 
                           answers=dev_input_answers, 
                           answers_idx=dev_input_answers_idx)
 
dev_dataloader = DataLoader(dataset=dev_dataset, 
                            sampler = SequentialSampler(dev_dataset), 
                            batch_size = batch_size)

In [39]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))
total_t0 = time.time()

## Training

In [59]:
x_output = x_output.detach().cpu().numpy()
x_answers = x_answers.to('cpu').numpy()

In [60]:
calculate_accuracy(x_output, x_answers)

0.3125

In [40]:
def calculate_accuracy(predictions_distribution, correct_answers):
  predictions = np.argmax(predictions_distribution, axis=1)
  return np.sum(np.argmax(x_output, axis=1) == x_answers) / len(correct_answers)

In [71]:
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

training_stats = []

#  We fine-tune for 3 epochs with a learning rate of 5e-5 and a batch size of 32
num_epochs = 2
lr = 5e-5

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

total_steps = num_epochs * batch_size
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

for epoch in range(num_epochs):
  print(f'======== Epoch {epoch + 1} / {num_epochs} ========')
  t0 = time.time()
  total_train_loss = 0
  model.train()
  for step, batch in enumerate(train_dataloader):
    if step % 10 == 0 and not step == 0:
      elapsed = format_time(time.time() - t0)
      print(f'Batch {step} of {len(train_dataloader)}. Elapsed: {elapsed}')

    model.zero_grad()

    questions_queries_collection = batch[0]
    answers = batch[1]
    answers_indexes = batch[2]
    queries_outputs = []
    for question_queries in questions_queries_collection:
        input_ids = question_queries["input_ids"].to(device)
        input_token_type_ids = question_queries["token_type_ids"].to(device)        
        input_attention_mask = question_queries["attention_mask"].to(device)
        
        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        output = model(input_ids=input_ids, 
                        attention_mask=input_attention_mask,
                        token_type_ids=input_token_type_ids)
        queries_outputs.append(output)        
    # each row represents values for the same question, each column represents an output for an answer option
    queries_outputs = torch.stack(queries_outputs).reshape([5, len(answers)]).transpose(0, 1)
    # choosing the indexes of the answers with the highest post-softmax value
    output = model.softmax(queries_outputs)

    loss = criterion(output, answers_indexes.to(device))
    total_train_loss += loss
    loss.backward()
    # Clip the norm of the gradients to 1.0.
    # This is to help prevent the "exploding gradients" problem.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    # Update parameters and take a step using the computed gradient.
    # The optimizer dictates the "update rule"--how the parameters are
    # modified based on their gradients, the learning rate, etc.
    optimizer.step()

    # Update the learning rate.
    scheduler.step()
  
  # Calculate the average loss over all of the batches.
  avg_train_loss = total_train_loss / len(train_dataloader)            
  
  # Measure how long this epoch took.
  training_time = format_time(time.time() - t0)

  print("")
  print("  Average training loss: {0:.2f}".format(avg_train_loss))
  print("  Training epoch took: {:}".format(training_time))

  # ========================================
  #               Validation
  # ========================================
  print("Running Validation...")

  t0 = time.time()

  # Put the model in evaluation mode--the dropout layers behave differently
  # during evaluation.
  model.eval()

  # Tracking variables 
  total_eval_accuracy = 0
  total_eval_loss = 0
  nb_eval_steps = 0

  # Evaluate data for one epoch
  for step, batch in enumerate(dev_dataloader):
    questions_queries_collection = batch[0]
    answers = batch[1]
    answers_indexes = batch[2]

    queries_outputs = []
    for question_queries in questions_queries_collection:
        input_ids = question_queries["input_ids"].to(device)
        input_token_type_ids = question_queries["token_type_ids"].to(device)        
        input_attention_mask = question_queries["attention_mask"].to(device)
        
        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        output = model(input_ids=input_ids, 
                        attention_mask=input_attention_mask,
                        token_type_ids=input_token_type_ids)
        queries_outputs.append(output)        

    queries_outputs = torch.stack(queries_outputs).reshape([5, len(answers)]).transpose(0, 1)
    output = model.softmax(queries_outputs)
    loss = criterion(output, answers_indexes.to(device))
    
    # Accumulate the validation loss.
    total_eval_loss += loss.item()

    # Move logits and labels to CPU
    output = output.detach().cpu().numpy()
    answers_indexes = answers_indexes.to('cpu').numpy()

    total_eval_accuracy += calculate_accuracy(output, answers_indexes)


  # Report the final accuracy for this validation run.
  avg_val_accuracy = total_eval_accuracy / len(dev_dataloader)
  print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

  # Calculate the average loss over all of the batches.
  avg_val_loss = total_eval_loss / len(dev_dataloader)
  
  # Measure how long the validation run took.
  validation_time = format_time(time.time() - t0)
  
  print("  Validation Loss: {0:.2f}".format(avg_val_loss))
  print("  Validation took: {:}".format(validation_time))

  # Record all statistics from this epoch.
  training_stats.append(
      {
          'epoch': epoch + 1,
          'Training Loss': avg_train_loss,
          'Valid. Loss': avg_val_loss,
          'Valid. Accur.': avg_val_accuracy,
          'Training Time': training_time,
          'Validation Time': validation_time
      }
  )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

Batch 10 of 319. Elapsed: 0:01:03
Batch 20 of 319. Elapsed: 0:02:07
Batch 30 of 319. Elapsed: 0:03:12


KeyboardInterrupt: ignored

## Draft section: experimenting with python

tensor([[0.2091, 0.2025, 0.1957, 0.1948, 0.1979],
        [0.1837, 0.1944, 0.2211, 0.2106, 0.1902],
        [0.2182, 0.1957, 0.1901, 0.1960, 0.2001],
        [0.2092, 0.2014, 0.1885, 0.2007, 0.2002],
        [0.2030, 0.1914, 0.2045, 0.2073, 0.1937],
        [0.1910, 0.2003, 0.1922, 0.2009, 0.2155],
        [0.1840, 0.2092, 0.2006, 0.1975, 0.2086],
        [0.1979, 0.2019, 0.2062, 0.1933, 0.2007],
        [0.2099, 0.1965, 0.2123, 0.1943, 0.1869],
        [0.1985, 0.1923, 0.2042, 0.2049, 0.2001],
        [0.1968, 0.2112, 0.2066, 0.1919, 0.1936],
        [0.2060, 0.1895, 0.2041, 0.1951, 0.2053],
        [0.2045, 0.1961, 0.1932, 0.1994, 0.2068],
        [0.2029, 0.2034, 0.1962, 0.1895, 0.2081],
        [0.2074, 0.2016, 0.2009, 0.1991, 0.1910],
        [0.1936, 0.1956, 0.2074, 0.2093, 0.1942],
        [0.1827, 0.1978, 0.1970, 0.2142, 0.2084],
        [0.2005, 0.2027, 0.1969, 0.2050, 0.1949],
        [0.2088, 0.2007, 0.2026, 0.1941, 0.1938],
        [0.2012, 0.2046, 0.2078, 0.1994, 0.1871],


In [None]:
x_answers

tensor([3, 0, 4, 2, 0, 2, 4, 2, 0, 0, 0, 3, 1, 0, 1, 4, 2, 0, 1, 1, 3, 3, 0, 3,
        2, 3, 0, 3, 4, 3, 4, 0])

In [None]:
criterion(x_output, x_answers.to(device))

tensor(1.6107, device='cuda:0', grad_fn=<NllLossBackward>)

In [None]:
input = torch.randn(3, 5, requires_grad=True)
target = torch.empty(3, dtype=torch.long).random_(5)

print(input)
print(target)

tensor([[ 0.4740,  0.1978,  1.1561,  0.3965, -2.4661],
        [ 0.3623,  0.3765, -0.1808,  0.3930,  0.4327],
        [-1.3627,  1.3564,  0.6688, -0.7077, -0.3267]], requires_grad=True)
tensor([3, 4, 4])


In [None]:
aaa

In [None]:
loss = torch.nn.CrossEntropyLoss()
loss(x, y.cuda())

tensor(1.6028, device='cuda:0')

In [None]:
torch.argmax(x, dim=1)

tensor([0, 2, 4], device='cuda:0')

In [None]:
softmax(x)

tensor([[0.2183, 0.1914, 0.1919, 0.2114, 0.1871],
        [0.1942, 0.2024, 0.2090, 0.2019, 0.1925],
        [0.2004, 0.2083, 0.1953, 0.1815, 0.2146]], device='cuda:0')

In [None]:
softmax = torch.nn.Softmax(dim=1)
sum(softmax(x)[0])

tensor(1., device='cuda:0')

In [None]:
y.reshape([5, 3])

tensor([[0.2480, 0.0207, 0.1035],
        [0.1167, 0.0618, 0.1424],
        [0.1192, 0.0939, 0.0778],
        [0.2159, 0.0595, 0.0044],
        [0.0940, 0.0116, 0.1723]], device='cuda:0')

In [None]:
y.reshape([5, 3]).transpose(0, 1)[0]

tensor([0.2480, 0.1167, 0.1192, 0.2159, 0.0940], device='cuda:0')

In [None]:
softmax = torch.nn.Softmax(dim=1)
res = softmax(y.reshape([5, 3]))
print(res)

tensor([[0.3756, 0.2993, 0.3251],
        [0.3364, 0.3184, 0.3452],
        [0.3408, 0.3323, 0.3270],
        [0.3753, 0.3210, 0.3038],
        [0.3331, 0.3067, 0.3602]], device='cuda:0')


In [None]:
res[0]

tensor([0.2183, 0.1942, 0.2004], device='cuda:0')

In [None]:
# train_dataloader = DataLoader(
#             train_dataset,  # The training samples.
#             sampler = RandomSampler(train_dataset), # Select batches randomly
#             batch_size = batch_size # Trains with this batch size.
#         )

# # For validation the order doesn't matter, so we'll just read them sequentially.
# validation_dataloader = DataLoader(
#             val_dataset, # The validation samples.
#             sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
#             batch_size = batch_size # Evaluate with this batch size.
#         )

NameError: ignored

In [None]:
example_query = "21-year-old sexually active male fever pain urination inflammation pain in the right knee culture joint bacteria not ferment maltose polysaccharide capsule physician orders antibiotic therapy patient mechanism of action medication given blocks cell wall synthesis following given"
example_top_n_documents = [{'score': 42.14371,
  'evidence': {'name': 'Biochemistry_Lippincott.txt0',
   'content': '2.2. a 42-year-old male patient undergoing radiation therapy for prostate cancer develops severe pain in the metatarsal phalangeal joint of his right big toe.'}},
 {'score': 40.0278,
  'evidence': {'name': 'First_Aid_Step2.txt0',
   'content': 'an active 13-year-old boy has anterior knee pain.'}},
 {'score': 34.522305,
  'evidence': {'name': 'Anatomy_Gray.txt0',
   'content': 'a 45-year-old man came to his physician complaining of pain and weakness in his right shoulder.'}},
 {'score': 33.678368,
  'evidence': {'name': 'InternalMed_Harrison.txt0',
   'content': 'perihepatitis should be suspected in young, sexually active women who develop right-upper-quadrant pain, fever, or nausea.'}},
 {'score': 32.35567,
  'evidence': {'name': 'Biochemistry_Lippincott.txt0',
   'content': 'case 7: joint pain\n\npatient presentation: ir is a 22-year-old male who presents for follow-up 10 days after having been treated in the emergency department (ed) for severe inflammation at the base of his thumb.'}},
 {'score': 32.258705,
  'evidence': {'name': 'InternalMed_Harrison.txt0',
   'content': 'interferon γ was successful in the treatment of a 3-year-old boy with prolonged fever, abdominal pain, and thrombocytopenia due to\n\nc. burnetii that had not been eradicated with conventional antibiotic therapy.'}},
 {'score': 30.173576,
  'evidence': {'name': 'InternalMed_Harrison.txt0',
   'content': 'the azoles’ mechanism of action is inhibition of ergosterol synthesis in the fungal cell wall.'}},
 {'score': 29.039585,
  'evidence': {'name': 'InternalMed_Harrison.txt0',
   'content': 'a 45-year-old woman receiving high-dose glucocorticoids developed right hip pain.'}},
 {'score': 28.646845,
  'evidence': {'name': 'InternalMed_Harrison.txt0',
   'content': 'in common with other gram-positive bacteria, pneumococci have a cell membrane beneath a cell wall, which in turn is covered by a polysaccharide capsule.'}},
 {'score': 28.138422,
  'evidence': {'name': 'Pharmacology_Katzung.txt0',
   'content': 'see chapter 52.\n\nantagonize the action of bactericidal cell wall-active agents because cell wall-active agents require that the bacteria be actively growing and dividing.'}}]

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModel.from_pretrained('bert-base-cased')
# model_qa = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
# model_multiple = AutoModelForMultipleChoice.from_pretrained('bert-base-cased')
# Displaying the information about the particular layer's weights
# model_multiple.bert.encoder.layer[0].attention.self.query.weight
# model.encoder.layer[0].attention.self.query.weight == model_multiple.bert.encoder.layer[0].attention.self.query.weight

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…




Using cuda device


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…




In [None]:
bert_base_uncased = AutoModel.from_pretrained('bert-base-cased') 
text = r"""
🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
TensorFlow 2.0 and PyTorch. If the previous second dimension of the output was 125 i wonder what will be the new one
"""

questions = [
             {
                 "question": "21-year-old sexually active male fever pain urination inflammation pain in the right knee culture joint bacteria not ferment maltose polysaccharide capsule physician orders antibiotic therapy patient mechanism of action medication given blocks cell wall synthesis following given",
                 "options": {
                     "A": "chloramphenicol",
                     "B": "centamicin",
                     "C": "ciprofloxacin",
                     "D": "ceftriaxone",
                     "E": "trimethoprim"
                 },
                  "correct_answer": "A"
             }
            #  {
            #      "question": "How many pretrained models are available in 🤗 Transformers?",
            #     "options": {
            #         "A": "over 20",
            #         "B": "over 30",
            #         "C": "less than 20",
            #         "D": "less than 30"
            #     },
            #     "correct_answer": "B"
            #  }

    #  "What does 🤗 Transformers provide?",
    #  "🤗 Transformers provides interoperability between which frameworks?",
]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
text = r"""
🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
TensorFlow 2.0 and PyTorch.
"""

questions = [
             {
                 "question": "How many pretrained models are available in 🤗 Transformers?",
                "options": {
                    "A": "over 20",
                    "B": "over 32+",
                    "C": "less than 20",
                    "D": "less than 30"
                }
             },
             {
             "question": "What does 🤗 Transformers provide?",
                "options": {
                    "A": "deep learning magic", 
                    "B": "general - purpose architectures", 
                    "C": "general knowledge", 
                    "D": "nothing in general"
                }
             }

    #  "What does 🤗 Transformers provide?",
    #  "🤗 Transformers provides interoperability between which frameworks?",
]

In [None]:
x1 = torch.Tensor([1, 2])
x2 = torch.Tensor([1, 2])
xs = [x1, x2]

y = torch.cat(xs, dim=0)
print(y)

tensor([1., 2., 1., 2.])


In [None]:
for question_option in questions:
    question = question_option["question"]
    print(question)
    answer_outputs = []
    for answer in question_option['options'].values():
        qa = question + ' ' + answer
        
        input = tokenizer(text, qa, add_special_tokens=True, return_tensors="pt").to(device)

    # input_ids = inputs["input_ids"].tolist()[0]
    # print(tokenizer.decode(input_ids))
        input_ids = input["input_ids"]
        input_token_type_ids = input["token_type_ids"]
        input_attention_mask = input["attention_mask"]
        
        output = model(input_ids=input_ids, 
                        attention_mask=input_attention_mask,
                        token_type_ids=input_token_type_ids)
        answer_outputs.append(output)

    answer_probs = model.softmax(torch.FloatTensor(answer_outputs))
    chosen_answer_idx = torch.argmax(answer_probs).item()
    
    print(chosen_answer_idx)
    # print(torch.eq(outputs.last_hidden_state, outputs2.last_hidden_state))
    # print(torch.eq(outputs, outputs2))
    # print(type(outputs))
    # print(outputs)
    # answer_start_scores = outputs.start_logits
    # answer_end_scores = outputs.end_logits
    # answer_start = torch.argmax(
    #     answer_start_scores
    # )  # Get the most likely beginning of answer with the argmax of the score
    # answer_end = torch.argmax(answer_end_scores) + 1  # Get the most likely end of answer with the argmax of the score
    # answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
    # print(f"Question: {question}")
    # print(f"Answer: {answer}")
# Question: How many pretrained models are available in 🤗 Transformers?
# Answer: over 32 +
# Question: What does 🤗 Transformers provide?
# Answer: general - purpose architectures
# Question: 🤗 Transformers provides interoperability between which frameworks?
# Answer: tensorflow 2 . 0 and pytorch

How many pretrained models are available in 🤗 Transformers?


NameError: ignored

In [None]:
answer_outputs
# answer_outputs[0].detach().numpy()[0]
# import torch.nn.functional as F
# answer_outputs

[tensor([0.4984], device='cuda:0', grad_fn=<AddBackward0>),
 tensor([0.5120], device='cuda:0', grad_fn=<AddBackward0>),
 tensor([0.4534], device='cuda:0', grad_fn=<AddBackward0>),
 tensor([0.3959], device='cuda:0', grad_fn=<AddBackward0>)]

tensor([-0.3450, -0.4074, -0.3031, -0.3128])
tensor([0.2491, 0.2340, 0.2597, 0.2572])
tensor(2)
2


In [None]:
x(answer_outputs)

AttributeError: ignored

In [None]:

data = torch.randn(5)
print(data)
print(F.softmax(data, dim=0))
print(F.softmax(data, dim=0).sum())  # Sums to 1 because it is a distribution!
print(F.log_softmax(data, dim=0))  # theres also log_softmax

tensor([ 0.4314, -1.4257,  1.5849, -0.7597,  1.6632])
tensor([0.1241, 0.0194, 0.3934, 0.0377, 0.4254])
tensor(1.)
tensor([-2.0865, -3.9436, -0.9330, -3.2776, -0.8547])


In [None]:
answer_outputs[0]

tensor([-0.2288], grad_fn=<AddBackward0>)

In [None]:
torch.unsqueeze(answer_outputs[0], dim=0)

tensor([[-0.2288]], grad_fn=<UnsqueezeBackward0>)

In [None]:
test = torch.FloatTensor([1,2,3])
x(test)

tensor([0.0900, 0.2447, 0.6652])

In [None]:


softmax = torch.nn.Softmax(dim=1)
softmax(answer_outputs)

AttributeError: ignored

In [None]:
c = "This would be text from the MedQA paper retrieved. Probably by using the IR system from MedQA first."
qa = "This is the question, right? Here is the answer"

test_input = tokenizer(c, qa, add_special_tokens=True, return_tensors="pt")
test_input_ids = test_input['input_ids'].tolist()[0]

# test_outputs = model(**test_input)
# print(type(test_outputs))
# print(test_outputs.last_hidden_state.shape)
# print(test_outputs.last_hidden_state)


test_output_multiple = model_multiple(**test_input)

# print(type(test_output_multiple))
# print(test_outputs.last_hidden_state.shape)
# print(test_outputs.last_hidden_state)

NameError: ignored

In [None]:
text = r"""
🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
TensorFlow 2.0 and PyTorch.
"""

questions = [
             {
                 "question": "How many pretrained models are available in 🤗 Transformers?",
                "options": {
                    "A": "over 20",
                    "B": "over 30",
                    "C": "less than 20",
                    "D": "less than 30"
                },
                "correct_answer": "B"
             }

    #  "What does 🤗 Transformers provide?",
    #  "🤗 Transformers provides interoperability between which frameworks?",
]

for question_option in questions:
    question = question_option["question"]

    inputs = tokenizer(question, text, 
                       add_special_tokens=True,
                       return_tensors="pt"
                       )
    input_ids = inputs["input_ids"].tolist()[0]
    outputs = model(**inputs)
    print(type(input_ids))
    print(tokenizer.decode(input_ids))
    # answer_start_scores = outputs.start_logits
    # answer_end_scores = outputs.end_logits
    # answer_start = torch.argmax(
    #     answer_start_scores
    # )  # Get the most likely beginning of answer with the argmax of the score
    # answer_end = torch.argmax(answer_end_scores) + 1  # Get the most likely end of answer with the argmax of the score
    # answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
    # print(f"Question: {question}")
    # print(f"Answer: {answer}")
# Question: How many pretrained models are available in 🤗 Transformers?
# Answer: over 32 +
# Question: What does 🤗 Transformers provide?
# Answer: general - purpose architectures
# Question: 🤗 Transformers provides interoperability between which frameworks?
# Answer: tensorflow 2 . 0 and pytorch

<class 'list'>
[CLS] How many pretrained models are available in [UNK] Transformers? [SEP] [UNK] Transformers ( formerly known as pytorch - transformers and pytorch - pretrained - bert ) provides general - purpose architectures ( BERT, GPT - 2, RoBERTa, XLM, DistilBert, XLNet … ) for Natural Language Understanding ( NLU ) and Natural Language Generation ( NLG ) with over 32 + pretrained models in 100 + languages and deep interoperability between TensorFlow 2. 0 and PyTorch. [SEP]


In [None]:
text = r"""
🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
TensorFlow 2.0 and PyTorch.
"""

questions = {
     "How many pretrained models are available in 🤗 Transformers?",
     "What does 🤗 Transformers provide?",
     "🤗 Transformers provides interoperability between which frameworks?",
}

for question in questions:
    inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="pt")
    input_ids = inputs["input_ids"].tolist()[0]
    outputs = model(**inputs)
    print(outputs.to_tuple())
    break
    # answer_start_scores = outputs.start_logits
    # answer_end_scores = outputs.end_logits
    # answer_start = torch.argmax(
    #     answer_start_scores
    # )  # Get the most likely beginning of answer with the argmax of the score
    # answer_end = torch.argmax(answer_end_scores) + 1  # Get the most likely end of answer with the argmax of the score
    # answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
    # print(f"Question: {question}")
    # print(f"Answer: {answer}")
# Question: How many pretrained models are available in 🤗 Transformers?
# Answer: over 32 +
# Question: What does 🤗 Transformers provide?
# Answer: general - purpose architectures
# Question: 🤗 Transformers provides interoperability between which frameworks?
# Answer: tensorflow 2 . 0 and pytorch

(tensor([[[ 0.3589, -0.0604, -0.5795,  ...,  0.0564,  0.6875,  0.4772],
         [ 0.7745, -0.5458,  0.7483,  ..., -0.3884, -0.1324,  0.0269],
         [ 0.3030,  0.1737,  0.1765,  ...,  0.6794,  0.1231,  0.1231],
         ...,
         [-0.1903, -0.0120,  0.4077,  ...,  0.2297, -0.3722, -0.2795],
         [ 0.9130, -0.2462, -0.2226,  ...,  0.8263, -0.1436, -0.2555],
         [ 0.9229, -0.2293, -0.1734,  ...,  0.8629, -0.1519, -0.2260]]],
       grad_fn=<NativeLayerNormBackward>), tensor([[-0.9746,  0.9100,  1.0000, -0.9990,  0.9973, -0.9963,  0.9998,  0.9964,
         -0.9986, -0.9374,  0.9994,  0.9993,  0.9973, -1.0000, -0.9992, -0.9989,
          0.9992, -0.9526, -1.0000,  0.9767,  0.8413, -1.0000,  0.8745, -0.9826,
          0.9988,  0.0665,  0.9989,  1.0000,  0.9918,  0.9543,  0.8465, -0.9987,
         -0.9928, -0.9999,  0.8127, -0.8492, -0.9865, -0.8104, -0.9817,  0.9884,
         -0.9820,  0.9928, -0.9175, -0.9273, -0.9886,  0.8496,  0.8637, -0.1282,
         -0.8642,  1.0000, -

In [None]:
from transformers import DPRReader, DPRReaderTokenizer
tokenizer = DPRReaderTokenizer.from_pretrained('facebook/dpr-reader-single-nq-base')
model = DPRReader.from_pretrained('facebook/dpr-reader-single-nq-base')
encoded_inputs = tokenizer(
        questions=["What is love ?"],
        titles=["Haddaway", "Test"],
        texts=["'What Is Love' is a song recorded by the artist Haddaway", "This is a test ting"],
        return_tensors='pt'
    )
outputs = model(**encoded_inputs)
start_logits = outputs.start_logits
end_logits = outputs.end_logits
relevance_logits = outputs.relevance_logits

In [None]:
print(relevance_logits)

tensor([-1.2456], grad_fn=<ViewBackward>)


In [None]:
tokenizer.decode()