In [1]:
#Importing Necessary Libraries
!pip install --upgrade transformers peft datasets
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import load_dataset
from datasets import Dataset
from peft import get_peft_model, LoraConfig, TaskType
import os
import wandb

Collecting transformers
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Downloading transformers-4.49.0-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m89.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[?25hDownloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers, datasets
  Attempting uninstall: transformers
    Found existing installation: transformers 4.47.0
    Uninstalling transformers-4.47.0:
      Successfully uninstalled transformers-4.47.0
  Attempting uninstall: datasets
    Found existing installation: datasets 3.3.1
    Unins

In [2]:
#Weights and Bias site API key to store model's performance report
os.environ['WANDB_API_KEY'] = 'ca9ac32044229f8313d5c4be93b76460c51fba7b'
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33muthandaramu[0m ([33muthandaramu-self[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
#Choosing available device
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load specific range of medical reasoning dataset from hugging face
dataset_core = load_dataset('FreedomIntelligence/medical-o1-reasoning-SFT', 'en', split="train")
dataset = dataset_core.select(range(2000))
split_dataset = dataset.train_test_split(test_size=0.2)
#Splitting the dataset between training(80%) and validation(20%)
train_dataset = split_dataset['train']
val_dataset = split_dataset['test']
# Initialize the tokenizer and set the padding token
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', device=device)
tokenizer.pad_token = tokenizer.eos_token

README.md:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

medical_o1_sft.json:   0%|          | 0.00/74.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25371 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [4]:
dataset['Question'][2]

'A 45-year-old man presents with symptoms including a wide-based gait, a blank facial expression, hallucinations, memory issues, a resting tremor that resolves with movement, and bradykinesia. Based on these clinical findings, what is most likely to be observed in the histological specimen of his brain?'

In [4]:
#Dataset preprocess formating
def preprocess_function(examples):
    inputs = [q + " " + a for q, a in zip(examples['Question'], examples['Response'])]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length', return_tensors = "pt")
    #for context in examples['Response']:
        # Tokenize the context
    model_inputs["labels"] = model_inputs["input_ids"].clone()
    model_inputs["labels"][model_inputs["labels"] == tokenizer.pad_token_id] = -100
    return model_inputs

In [5]:
# Apply the preprocessing function to the dataset
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

tokenized_train_dataset

In [7]:
print ("Input ID")
print(tokenized_train_dataset['input_ids'][0])
print ("Lables")
print(tokenized_train_dataset['labels'][0])
print("attention Mask")
print(tokenized_train_dataset['attention_mask'][0])

Input ID
[32, 1679, 12, 1941, 12, 727, 4257, 10969, 351, 1029, 12, 9526, 17372, 290, 8813, 3004, 11, 351, 2248, 2482, 12739, 16869, 49835, 379, 642, 308, 14, 45582, 11, 2472, 443, 2724, 43320, 954, 286, 50138, 14, 3020, 18, 11, 290, 22577, 954, 286, 362, 4, 34196, 82, 11, 9907, 4, 28837, 30309, 11, 290, 362, 4, 304, 418, 259, 2522, 4487, 13, 1867, 318, 257, 3513, 3038, 326, 815, 307, 13941, 287, 428, 8668, 8883, 30, 554, 428, 8668, 8883, 11, 12886, 269, 541, 305, 2704, 1140, 330, 259, 815, 307, 13941, 13, 11259, 262, 5827, 338, 10470, 351, 730, 65, 380, 293, 22190, 1773, 268, 544, 290, 2785, 384, 17459, 6380, 11, 45840, 516, 3662, 286, 22392, 318, 8780, 284, 4155, 5801, 290, 4050, 3513, 286, 16079, 13, 42222, 22392, 1244, 407, 307, 19233, 22668, 393, 719, 2952, 1576, 287, 428, 4688, 3074, 13, 14645, 278, 284, 8363, 22392, 561, 307, 257, 517, 5035, 1781, 286, 2223, 284, 6687, 262, 10280, 2526, 6840, 13, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 

In [7]:
#Importing pretrained gpt2 model from hugging ace
model = GPT2LMHeadModel.from_pretrained('gpt2', device_map = device)
model.resize_token_embeddings(len(tokenizer))

Embedding(50257, 768)

In [8]:
#Lora configuration
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=64,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["attn.c_attn", "attn.c_proj"]
)

model = get_peft_model(model, peft_config)



In [9]:
model.print_trainable_parameters()

trainable params: 3,538,944 || all params: 127,978,752 || trainable%: 2.7653


In [21]:
#Training Arguments
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=15,
    logging_dir='./logs',
    logging_steps=100,
    save_steps=500,
    evaluation_strategy='steps',
    eval_steps=100,
    warmup_steps=100,
    weight_decay=0.01,
    fp16=True,
    push_to_hub=False,
    learning_rate=5e-5,
    report_to="wandb"
)




In [22]:
#Trainer API
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset
)


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [23]:
#Training Phase
trainer.train()

Step,Training Loss,Validation Loss
100,2.511,2.437794
200,2.4868,2.43696
300,2.5114,2.43695
400,2.4788,2.434615
500,2.5195,2.433593
600,2.5088,2.434842
700,2.4473,2.431433
800,2.4739,2.430652
900,2.476,2.430881
1000,2.4486,2.432572


TrainOutput(global_step=6000, training_loss=2.4592585093180337, metrics={'train_runtime': 2373.7278, 'train_samples_per_second': 10.111, 'train_steps_per_second': 2.528, 'total_flos': 6531928031232000.0, 'train_loss': 2.4592585093180337, 'epoch': 15.0})

In [24]:
test_question = "What is the most likely diagnosis for a 2-year-old 70 kg child who presents with limitation of abduction and internal rotation, tenderness in Scarpa's triangle, and abduction of the limb upon flexing the hip?"
tokenized_test = tokenizer(test_question, return_tensors="pt").to(device)
print(tokenizer.batch_decode(model.generate(tokenized_test['input_ids'], max_new_tokens=150, pad_token_id=tokenizer.eos_token_id)))

["What is the most likely diagnosis for a 2-year-old 70 kg child who presents with limitation of abduction and internal rotation, tenderness in Scarpa's triangle, and abduction of the limb upon flexing the hip? The most likely diagnosis for a 2-year-old 70 kg child who presents with limitation of abduction and internal rotation, tenderness in Scarpa's triangle, and abduction of the limb upon flexing the hip is a malignant neoplasia. Malignant neoplasia is a common cause of malformation in children, particularly in children with a history of abduction and internal rotation. It can lead to a variety of complications, including the development of a malignant tumor, which can lead to a malignant condition like a malignant neoplasia. Malignant neoplasia can also be a sign of a malignant tumor, which can lead to a malignant condition like a malignant neoplasia. This condition can lead to a"]
