## 1. Import Libraries

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install evaluate
!pip install sacrebleu
!pip install rouge_score



In [3]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AutoTokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
import os
import torch
from tqdm import tqdm
# from datasets import metric
import evaluate

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
MAX_LENGTH=500
BATCH_SIZE=16

## 2. Load Dataset & Prepocess the Data
- Load Train and Test dataset
- Truncate the answers to MAX_LENGTH.
- Concat Question and Answer using special token, and add start and end token.
-  Repeate the above two steps for Train and Test


In [6]:
train_test_dict={"train":{},"test":{}}

In [7]:
tokenizer=AutoTokenizer.from_pretrained('distilgpt2', add_special_tokens=True,additional_special_tokens=['[response]'], pad_token="[Pad]",padding_side='left')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [8]:
bos_token=tokenizer.decode(tokenizer.bos_token_id)
eos_token=tokenizer.decode(tokenizer.eos_token_id)

In [9]:
for key in train_test_dict:
    dataset_pd=pd.read_csv(f'/content/drive/MyDrive/NLP_Project/{key}_datasets/combined_{key}.csv')
    dataset_pd=dataset_pd.astype('string')
    dataset_pd['Question']=bos_token + dataset_pd['Question'] + ' [response] '
    dataset_pd['QA_pairs'] = dataset_pd['Question']  + dataset_pd['Answer_cut'] + eos_token
    dataset_list=list(dataset_pd['QA_pairs'])
    questions_list=list(dataset_pd['Question'])
    train_test_dict[key]['dataset']=dataset_pd
    train_test_dict[key]['question_list']=questions_list
    train_test_dict[key]['QA_list']=dataset_list

## 3. Initialize pytorch dataset and dataloaders

In [10]:
class MedicalDataset(Dataset):
    def __init__(self, data,  tokenizer, questions):
        self.tokenizer = tokenizer
        self.data=self.tokenizer(data, padding=True, truncation=True ,return_tensors='pt')
        self.non_tokenized_data= data
        self.tokenized_questions=self.tokenizer(questions, padding=True, truncation=False, return_tensors='pt')
    def __len__(self):
        return len(self.data['input_ids'])

    def __getitem__(self, idx):
        return self.data['input_ids'][idx], self.data['attention_mask'][idx], self.non_tokenized_data[idx],self.tokenized_questions['input_ids'][idx], self.tokenized_questions['attention_mask'][idx]



In [11]:
train_dataset = MedicalDataset(train_test_dict['train']['QA_list'], tokenizer, train_test_dict['train']['question_list'])
test_dataset = MedicalDataset(train_test_dict['test']['QA_list'], tokenizer, train_test_dict['test']['question_list'])

In [12]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

## 4. Define the model
- Resize the total number of model's embeddings  as we added a special token ('['response']') for sperating the question from the answer.
- Set the model's hyperparams, optimizer and the learning rate scheduler.

In [13]:
model = GPT2LMHeadModel.from_pretrained('distilgpt2')
model.resize_token_embeddings(len(tokenizer))
model=model.to(device)

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [14]:
epochs = 3
learning_rate = 5e-4
warmup_steps = 1e2
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler=get_linear_schedule_with_warmup(optimizer, num_warmup_steps = warmup_steps, num_training_steps= epochs*(len(train_loader)))
grad_accumulatio_steps=2

## 6. Model Training

In [None]:
from datetime import datetime

date_string = 'fine_tune_model_' + datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
train_loss_values=[]
test_loss_values=[]
ctr=1
for epoch in range(epochs): # number of epochs
    model.train()
    train_loss=0
    model.train()
    for input_ids, attention_mask,raw_data, toknized_questions_id,toknized_questions_mask in tqdm(train_loader):
        optimizer.zero_grad()
        input_ids=input_ids.to(device)
        attention_mask=attention_mask.to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        train_loss+=loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
        # if (grad_accumulatio_steps != 0) and  ((grad_accumulatio_steps %grad_accumulatio_steps) ==0):
        #     optimizer.step()
        #     scheduler.step()
        #     ctr+=1
    model.eval()
    val_loss=0
    with torch.no_grad():
      for input_ids, attention_mask, raw_data, toknized_questions_id,toknized_questions_mask in tqdm(test_loader):
        input_ids=input_ids.to(device)
        attention_mask=attention_mask.to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        val_loss += outputs.loss.item()
    avg_val_loss = val_loss / len(test_loader)
    test_loss_values.append(avg_val_loss)


    avg_train_loss = train_loss / len(train_loader)
    train_loss_values.append(avg_train_loss)

    print(f"Epoch {epoch}, train_loss: {avg_train_loss}, val_loss: {avg_val_loss}")


In [None]:
model.save_pretrained('/content/drive/MyDrive/NLP_Project/models/fineTune_models/Both_Latest_fineTuned_gpt2_3epoch'+date_string)

## 7. Model Evaluation


In [17]:
model=GPT2LMHeadModel.from_pretrained('/content/drive/MyDrive/NLP_Project/models/fineTune_models/Both_left_pad_fineTuned_gpt2_3epochfine_tune_model_2024_04_06_16_40_42/ ').to(device)


In [20]:
model.eval()
bleu=evaluate.load('sacrebleu')
rouge=evaluate.load('rouge')

for _, _, raw_data, toknized_questions_id,toknized_questions_mask in tqdm(test_loader):
    input_ids=toknized_questions_id.to(device)
    attention_mask=toknized_questions_mask.to(device)
    answers=[qa.split('[response]')[-1] for qa in raw_data]
    outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask,  pad_token_id=tokenizer.pad_token_id,max_new_tokens=MAX_LENGTH)
    decoded_outputs = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    for pred, ref in zip(answers, decoded_outputs):
        bleu.add(references = ref, predictions=pred)
        rouge.add(references = ref, predictions=pred)

bleu=bleu.compute()
rouge=rouge.compute()
print(f"Bleu: {bleu['score']},  Rouge:{rouge['rouge1']}, Rouge:{rouge['rougeL']}")

100%|██████████| 577/577 [31:27<00:00,  3.27s/it]


Bleu: 4.12,  Rouge:0.117, Rouge:0.103


### 7.3) Chekcing the reponse for any one question.

In [35]:
output_1391=model.generate(input_ids=torch.tensor(tokenizer(questions_list[145])['input_ids']).reshape(1,-1).to(device),attention_mask=torch.tensor(tokenizer(questions_list[145])['attention_mask']).reshape(1,-1).to(device), pad_token_id=tokenizer.pad_token_id,max_new_tokens=MAX_LENGTH)

In [36]:
tokenizer.decode(tokenizer(bos_token+questions_list[145])['input_ids'][1:],skip_special_tokens=False)

'<|endoftext|>What are the symptoms of Carnosinemia? [response] '

In [37]:
tokenizer.decode(output_1391[0], skip_special_tokens=False)

'<|endoftext|>What are the symptoms of Carnosinemia? [response]  What are the signs and symptoms of Carnosinemia? The Human Phenotype Ontology provides the following list of signs and symptoms for Carnosinemia. If the information is available, the table below includes how often the symptom is seen in people with this condition. You can use the MedlinePlus Medical Dictionary to look up the definitions for these medical terms. Signs and Symptoms Approximate number of patients (when available) Abnormality of the metacarpal bones 90% Abnormality of the metacarpal bones 90% Abnormality of the metacarpal bones 90% Abnormality of the metacarpal bones 90% Abnormality of the metacarpal bones 90% Abnormality of the metacarpal bones 90% Abnormality of the metacarpal bones 90% Abnormality of the metacarpal bones 90% Abnormality of the metacarpal bones 90% Abnormality of the metacarpal bones 90% Abnormality of the metacarpal bones 90% Abnormality of the metacarpal bones 90% Abnormality of the metac