In [1]:
from transformers import AutoTokenizer, AutoModel
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import tqdm
import pickle, random
import torch, os, pytorch_lightning as pl, glob
import torch.distributed as dist
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer
import torch.nn.functional as F
from datasets import load_dataset

### downloading dataset:
dataset_together = load_dataset("openlifescienceai/medmcqa")

### downloading model:
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

model_name = "healx/gpt-2-pubmed-medium"
max_length = 100
model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.padding_side='left'
tokenizer.pad_token = tokenizer.eos_token
print(f'[Main] using GPU : {torch.cuda.get_device_name()}')


split = "val"


    

  from .autonotebook import tqdm as notebook_tqdm


[Main] using GPU : NVIDIA TITAN RTX


In [2]:
batchsize = 12
max_length = 100


In [10]:
assert(split in ["val", "train", "test"])

split_for_dataset = split if split != "val" else "validation"
dataset = dataset_together[split_for_dataset]
questions = dataset['question']

hint_prompt='Some hints:'
input_prompts_hint = list(map(lambda q: f"{q} {hint_prompt}",questions))
len_of_each_question = list(map(lambda x: len(x), input_prompts_hint))
# generating hints 
questions_with_hints = []
for i in tqdm.tqdm(range(len(input_prompts_hint)//batchsize + 1)):
    # print(i," out of ", len(input_prompts_hint)//batchsize + 1)
    begin, end = batchsize*i, min(batchsize*(i+1), len(input_prompts_hint))
    inputs = tokenizer(input_prompts_hint[begin : end], 
                        return_tensors="pt",  padding=True, truncation=True).to(device)

    # Generate text by feeding the encoded input through the model and sampling output tokens
    outputs = model.generate(input_ids=inputs["input_ids"],
                            attention_mask=inputs["attention_mask"], 
                            max_length=max_length+inputs["input_ids"].shape[1], 
                            num_beams=10, early_stopping=True,
                            repetition_penalty=4.2,
                            pad_token_id=50256
                            )
    generated_text = tokenizer.batch_decode(outputs, 
                                    skip_special_tokens=True
                                    )
    generated_text_one_sentence = list(map(lambda x, l:x[l:].split('.')[0] , 
                                           generated_text,
                                       len_of_each_question[begin : end]))
    break


    questions_with_hints+=generated_text_one_sentence
# generated_text_one_sentence

      

  0%|          | 0/349 [00:04<?, ?it/s]


In [11]:
list(map(lambda x, l:x[l:].split('.')[0] , 
                                           generated_text,
                                       len_of_each_question[begin : end]))

[' Nerve conduction studies (NCSs) can be used to assess peripheral nerve function',
 ' Glomerular filtration rate (eGFR) has been shown to be an important predictor of mortality in patients with chronic kidney disease (CKD)',
 ' In this case report, we would like to draw attention to the importance of early diagnosis and treatment of Down syndrome in order to improve the quality of life for both mother and child',
 ' The aim of this study was to investigate the effect of a high-fat diet (HFD) on axonal transport in rats',
 ' Insulin-like growth factor 1 (IGF-1) and IGF-binding protein 3 (IGFBP-3) have been shown to play an important role in the regulation of glucose homeostasis',
 ' A case report',
 " The term 'H I N1 Influenza' has been used for a long time to describe influenza-like illness (ILI)",
 " \u2002Kiesselbach's plexus can be divided into two groups according to its origin from internal carotid artery (ICA) or external carotid artery (ECA)",
 ' A case report',
 ' this case 

In [12]:
questions[:8]

['Which of the following is not true for myelinated nerve fibers:',
 "Which of the following is not true about glomerular capillaries')",
 'A 29 yrs old woman with a pregnancy of 17 week has a 10 years old boy with down syndrome. She does not want another down syndrome kid; best advice to her is',
 'Axonal transport is:',
 'Low insulin to glucagon ratio is seen in all of these except:',
 'Concentration of tropicamide:',
 'Which of the following statements is true regarding H I N1 Influenza?',
 "Which of the following are not a branch of external carotid Aery in Kiesselbach's plexus."]

In [28]:
  
hints = questions_with_hints
# tokenize everything
A,B,C,D = dataset['opa'], dataset['opb'],dataset['opc'],dataset['opd']
subject_names = dataset['subject_name']
topic_names = dataset['topic_name']

labels = dataset['cop']

finetuning_tokenizer = BertTokenizer.from_pretrained('microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext')
    
input_prompts = list(map(lambda a,b,c,d,q,h,subject,topic: f"Subject: {subject}, Topic: {topic}\nQuestion: Given{h}, {q}\nA: {a}\nB: {b}\nC: {c}\nD: {d}\n",
                        A,B,C,D,questions, hints, subject_names, topic_names))

tokens = finetuning_tokenizer(input_prompts, padding=True, truncation=True, return_tensors="pt")
    

        
data_with_hints = torch.utils.data.TensorDataset(tokens.input_ids, tokens.attention_mask, torch.tensor(labels))
torch.save(data_with_hints, '/root/pubmedQA_291/dataset_pickles/medmcqa/hints-added-tokenized-by-pubmed-bert/classification_style/'+split+'.pt')

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
