### LoRA Fine Tuning 

In [1]:
import os
import json
import torch
from pathlib import Path
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    AutoModelForCausalLM,
    Seq2SeqTrainer,              
    Seq2SeqTrainingArguments,    
    DataCollatorForSeq2Seq,
    BitsAndBytesConfig
)
from peft import (
	LoraConfig,
	get_peft_model,
	prepare_model_for_kbit_training,
	TaskType
)
import numpy as np
from evaluate import load

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
MODEL_CONFIGS = {
	'flan-t5-base': {
		'name': 'google/flan-t5-base',
		'type': 'seq2seq',
		'quantization': None
	}
}

SELECTED_MODEL = 'flan-t5-base'

DATA_DIR = Path('../data/finetuning/')
OUTPUT_DIR = Path('../models/drug_qna_lora')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

LORA_CONFIG = {
		'r': 8,  # LoRA rank (higher = more parameters, but slower) : 4, 8, 16
		'lora_alpha': 32,  # LoRA scaling factor
		'lora_dropout': 0.05,  # Dropout for LoRA layers
		'target_modules': None,  # Will be set based on model
}

TRAINING_CONFIG = {
		'num_epochs': 5,
		'batch_size': 8,  # Adjust based on VRAM : 2, 4, 8
		'gradient_accumulation_steps': 2,  # Effective batch size = 16
		'per_device_eval_batch_size': 4,
		'learning_rate': 1e-4,
		'max_length': 256,
		'warmup_ratio': 0.03,
}

### Load dataset for finetuning

In [3]:
# because the finetuning dataset is small, we can use either json or jsonl
def load_dataset_from_json(split = 'train'):
	file_path = DATA_DIR / f'{split}.json'
	if not file_path.exists():
		raise FileNotFoundError(
			f"Dataset file not found at {file_path}"
			"Please run preprocessing.ipynb first"
		)
	
	with open(file_path, 'r', encoding='utf-8') as f:
		data = json.load(f)
	
	print("Loaded")
	return Dataset.from_list(data)

In [4]:
def load_dataset_from_jsonl(split='train'):
	file_path = DATA_DIR / f"{split}.jsonl"  
	
	if not file_path.exists():
		raise FileNotFoundError(
			f"Dataset not found at {file_path}. "
			"Please run data preprocessing first."
		)
	
	# Read JSONL line-by-line
	data = []
	with open(file_path, 'r', encoding='utf-8') as f:
		for line in f:
			data.append(json.loads(line))
	
	print(f"Loaded {len(data)} examples from {split} set")
	return Dataset.from_list(data)


### Setup Model

In [5]:
def setup_model_and_tokenizer(model_config):
	print(f"Setting up model for {model_config['name']}")
	model_name = model_config['name']
	model_type = model_config['type']
	
	# load tokenizer 
	tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
	
	# Add padding token if missing
	# pad token is used to give every input the same length according to longest input in the batch
	if tokenizer.pad_token is None:
		tokenizer.pad_token = tokenizer.eos_token
	
	# setup quantization config if needed
	quantization_config = None
	if model_config['quantization'] == '4bit':
		quantization_config = BitsAndBytesConfig(
			load_in_4bit=True,
			bnb_4bit_quant_type="nf4",
			bnb_4bit_compute_dtype=torch.float16,
			bnb_4bit_use_double_quant=True,
		)
	
	# load model 
	if model_type == 'seq2seq':
		model = AutoModelForSeq2SeqLM.from_pretrained(
			model_name,
			quantization_config=quantization_config,
			device_map='auto',
			trust_remote_code=True
		)
		task_type = TaskType.SEQ_2_SEQ_LM
		# target modules for T5
		target_modules = ['q', 'v']
	else: # causal LM or auto-regressive LM like GPT-2 , Phi-2
		model = AutoModelForCausalLM.from_pretrained(
			model_name,
			quantization_config=quantization_config,
			device_map="auto",
			trust_remote_code=True,
			torch_dtype=torch.float16
		)
		task_type = TaskType.CAUSAL_LM
		target_modules = ['q_proj', 'v_proj']
	
	print(f"Model loaded: {model_name}")

	# prepare model for k-bit training if quantized
	if quantization_config:
		model = prepare_model_for_kbit_training(model)
		print("Model prepared for k-bit training")
	
	# setup LoRA config
	lora_config = LoraConfig(
		r=LORA_CONFIG['r'],
		lora_alpha=LORA_CONFIG['lora_alpha'],
		target_modules=target_modules,
		lora_dropout=LORA_CONFIG['lora_dropout'],
		bias="none",
		task_type=task_type
	)

	model = get_peft_model(model, lora_config)

	# print trainable parameters
	trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
	total_params = sum(p.numel() for p in model.parameters())
	print(f"  Trainable params: {trainable_params:,} ({100 * trainable_params / total_params:.2f}%)")
	print(f"  Total params: {total_params:,}")

	return model, tokenizer, model_type
		

### Preprocessing model input

In [6]:
# if using seq2seq model like flan-t5
def preprocess_function_seq2seq(examples, tokenizer, max_length):
	# inputs = examples['instruction']
	inputs = [f"answer the following medication question:\n{q}" for q in examples['instruction']]
	targets = examples['output']
	
	# Tokenize inputs
	model_inputs = tokenizer(
			inputs,
			max_length=max_length,
			truncation=True,
			padding='max_length'
	)
	
	# Tokenize targets
	with tokenizer.as_target_tokenizer():
		labels = tokenizer(
				targets,
				max_length=max_length,
				truncation=True,
				padding='max_length'
				# padding=False
		)
	
	# Replace padding token id with -100 in labels to ignore padding in loss
	labels_ids = labels['input_ids']
	for label_seq in labels_ids:
			for i in range(len(label_seq)):
					if label_seq[i] == tokenizer.pad_token_id:
							label_seq[i] = -100
							
	model_inputs['labels'] = labels_ids
	
	return model_inputs

# output structure : 
# {
#   'input_ids': [...],
#   'attention_mask': [...],
#   'labels': [...]
# }

In [7]:
def preprocess_function_causal(examples, tokenizer, max_length):
	"""
	Preprocess data for Causal LM models (Phi-2, TinyLlama)
	"""
	# Format: Instruction: {question}\n\nAnswer: {answer}
	texts = []
	for instruction, output in zip(examples['instruction'], examples['output']):
		text = f"Instruction: {instruction}\n\nAnswer: {output}"
		texts.append(text)
	
	# Tokenize
	model_inputs = tokenizer(
		texts,
		max_length=max_length,
		truncation=True,
		padding='max_length'
	)
	
	# For causal LM, labels are the same as input_ids
	model_inputs['labels'] = model_inputs['input_ids'].copy()
	
	return model_inputs

In [8]:
def prepare_dataset(tokenizer, model_type):
	print("Preparing dataset...")
	
	# load dataset
	train_dataset = load_dataset_from_json('train')
	test_dataset = load_dataset_from_json('test')
	
	# select preprocessing function 
	if model_type == 'seq2seq':
		preprocess_fn = lambda x : preprocess_function_seq2seq(x, tokenizer, TRAINING_CONFIG['max_length'])
	
	else: 
		preprocess_fn = lambda x : preprocess_function_causal(x, tokenizer, TRAINING_CONFIG['max_length'])
	
	# preprocess
	train_dataset = train_dataset.map(
		preprocess_fn, # for every batch data x , apply preprocess_fn
		batched=True, # using batched processing for speed not single sample at a time
		remove_columns=train_dataset.column_names
	)
	test_dataset = test_dataset.map(
		preprocess_fn,
		batched=True,
		remove_columns=test_dataset.column_names
	)

	print(f"Train dataset: 	{len(train_dataset)} examples")
	print(f"Test dataset: 	{len(test_dataset)} examples")

	return train_dataset, test_dataset
	

### Evaluation

In [9]:
import numpy as np
from evaluate import load

# load metrics sekali saja
rouge_metric = load("rouge")
bleu_metric = load("bleu")
f1_metric = load("f1")  # token-level F1

def compute_metrics(eval_pred, tokenizer):
    """
    eval_pred: EvalPrediction(predictions, label_ids)
    predictions: generated token IDs (already decoded by Seq2SeqTrainer)
    labels: label IDs
    """
    print("Computing metrics...")
    preds, labels = eval_pred
    
    # predictions are already token IDs from generation
    # If predictions are -100 padded, replace them
    if isinstance(preds, np.ndarray):
        preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    
    # decode predictions
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # decode labels; replace -100 with pad_token_id
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Trim both sides
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [label.strip() for label in decoded_labels]

    # ROUGE
    rouge_result = rouge_metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True
    )

    # BLEU
    bleu_result = bleu_metric.compute(
        predictions=decoded_preds,
        references=decoded_labels
    )

    # F1-score (token-level)
    def simple_f1(pred, label):
        pred_tokens = pred.split()
        label_tokens = label.split()
        common = set(pred_tokens) & set(label_tokens)
        if len(pred_tokens) == 0 or len(label_tokens) == 0:
            return 0.0
        precision = len(common) / len(pred_tokens)
        recall = len(common) / len(label_tokens)
        if precision + recall == 0:
            return 0.0
        return 2 * (precision * recall) / (precision + recall)

    f1_scores = [simple_f1(p, l) for p, l in zip(decoded_preds, decoded_labels)]
    f1_avg = float(np.mean(f1_scores))
    
    print("Metrics computed.")

    return {
        "rouge1": rouge_result["rouge1"],
        "rouge2": rouge_result["rouge2"],
        "rougeL": rouge_result["rougeL"],
        "bleu": bleu_result["bleu"],
        "f1": f1_avg
    }

### Training

In [None]:
def train_model(model, tokenizer, train_dataset, test_dataset, model_type):
	print("Starting Training...")
	
	# Training arguments
	training_args = Seq2SeqTrainingArguments(
		output_dir=str(OUTPUT_DIR),
		num_train_epochs=TRAINING_CONFIG['num_epochs'],
		per_device_train_batch_size=TRAINING_CONFIG['batch_size'],
		per_device_eval_batch_size=TRAINING_CONFIG['per_device_eval_batch_size'],
		gradient_accumulation_steps=TRAINING_CONFIG['gradient_accumulation_steps'],
		learning_rate=TRAINING_CONFIG['learning_rate'],

		# warmup_steps=TRAINING_CONFIG['warmup_steps'],
		warmup_ratio=TRAINING_CONFIG['warmup_ratio'],
		weight_decay=0.01,
    label_smoothing_factor=0.05, # 0.1 , 0.5

		logging_steps=100, # increase to reduce logging frequency : 10, 100
		eval_strategy="epoch",
		save_strategy="epoch",
		save_total_limit=2,
		load_best_model_at_end=True,
		metric_for_best_model="rouge1", # if using rouge1 then greater is better True, if using eval_loss then False

		greater_is_better=True, # because we use rouge1 and the higher the better
		fp16=False, # Disable mixed precision to avoid errors
		bf16=True, # Use bfloat16 if supported (i used rtx 4060 so i enable this)
		predict_with_generate=True,  # important for seq2seq to generate during evaluation, avoid list of list of float (logits) to decode() 
		generation_max_length=TRAINING_CONFIG['max_length'],
		optim="adamw_torch",
		report_to="none",  # Disable wandb, tensorboard
	)
	
	# Data collator
	data_collator = DataCollatorForSeq2Seq(
		tokenizer=tokenizer,
		model=model,
		padding=True
	)
	
	# Initialize trainer
	trainer = Seq2SeqTrainer(
		model=model,
		args=training_args,
		train_dataset=train_dataset,	
		eval_dataset=test_dataset,
		data_collator=data_collator,
		compute_metrics = lambda eval_pred : compute_metrics(eval_pred, tokenizer)
	)
	
	# Train
	print("Training started...")
	trainer.train()
	print("Training completed.")

	# clear cache before evaluate
	torch.cuda.empty_cache()

	# Evaluate
	print("Evaluating model...")
	eval_results = trainer.evaluate()
	print(f"Evaluation results: {eval_results}")
	for key, value in eval_results.items():
		print(f"  {key}: {value:.4f}")
	
	# save evaluation result
	with open(OUTPUT_DIR / "evaluation_results.json", 'w') as f:
		json.dump(eval_results, f, indent=2)
	
	# Save final model
	print("Saving model...")
	trainer.save_model(OUTPUT_DIR / "final")
	tokenizer.save_pretrained(OUTPUT_DIR / "final")
	
	print(f"Model saved to {OUTPUT_DIR / 'final'}")
	
	return trainer

### Main Pipeline

In [None]:
print("Fine tuning model...")
print(f"Selected model : {SELECTED_MODEL}")
print(f"Model config : {MODEL_CONFIGS[SELECTED_MODEL]}")

model_config = MODEL_CONFIGS[SELECTED_MODEL]

model, tokenizer, model_type = setup_model_and_tokenizer(model_config)

train_dataset, test_dataset = prepare_dataset(tokenizer, model_type)

trainer = train_model(model, tokenizer, train_dataset, test_dataset, model_type)

print(f"FINE TUNING COMPLETE. Model saved to {OUTPUT_DIR / 'final'}")

Fine tuning model...
Selected model : flan-t5-base
Model config : {'name': 'google/flan-t5-base', 'type': 'seq2seq', 'quantization': None}
Setting up model for google/flan-t5-base
Model loaded: google/flan-t5-base
  Trainable params: 884,736 (0.36%)
  Total params: 248,462,592
Preparing dataset...
Loaded
Loaded


Map: 100%|██████████| 3449/3449 [00:02<00:00, 1343.40 examples/s]
Map: 100%|██████████| 699/699 [00:00<00:00, 1358.07 examples/s]
The model is already on multiple devices. Skipping the move to device specified in `args`.


Train dataset: 	3449 examples
Test dataset: 	699 examples
Starting Training...
Training started...


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Bleu,F1
1,4.0307,3.559692,0.110115,0.015623,0.099892,0.008779,0.041341
2,3.735,3.256807,0.158803,0.05307,0.147189,0.038538,0.060716
3,3.579,3.12147,0.252317,0.144494,0.234268,0.097129,0.13715
4,3.5082,3.060488,0.302213,0.193377,0.277246,0.130816,0.181346
5,3.4563,3.042266,0.304649,0.194534,0.278225,0.133254,0.182185


Computing metrics...
Metrics computed.
Computing metrics...
Metrics computed.
Computing metrics...
Metrics computed.
Computing metrics...
Metrics computed.
Computing metrics...
Metrics computed.
Training completed.
Evaluating model...


Computing metrics...
Metrics computed.
Evaluation results: {'eval_loss': 3.042266368865967, 'eval_rouge1': 0.30464908702148036, 'eval_rouge2': 0.19453441359912327, 'eval_rougeL': 0.27822454534832114, 'eval_bleu': 0.1332543719158346, 'eval_f1': 0.18218516588043282, 'eval_runtime': 827.5791, 'eval_samples_per_second': 0.845, 'eval_steps_per_second': 0.211, 'epoch': 5.0}
  eval_loss: 3.0423
  eval_rouge1: 0.3046
  eval_rouge2: 0.1945
  eval_rougeL: 0.2782
  eval_bleu: 0.1333
  eval_f1: 0.1822
  eval_runtime: 827.5791
  eval_samples_per_second: 0.8450
  eval_steps_per_second: 0.2110
  epoch: 5.0000
Saving model...
Model saved to ..\models\drug_qna_lora\final
Testing inference...
Question: What is the dosage of Acetaminophen for adults?
Answer: Acetaminophen is used to treat atopic dermatitis (dermatitis) in adults. Acetaminophen is used to treat atopic dermatitis (dermatitis) in adults. Acetaminophen is used to treat atopic dermatitis (dermatitis) in adults. Acetaminophen is used to treat 

### Inference Test

In [None]:
def test_inference(model, tokenizer, model_type):
	print("Testing inference...")

	test_questions = [
		"What is the dosage of Acetaminophen for adults?",
		"Who should get Pegfilgrastim Injection and why is it prescribed ?",
		"What is the dosage of Paracetamol for adults?",
		"How should Trospium be used and what is the dosage ?",
		"What are the side effects of Ibuprofen?",
		"When should I not take Amoxicillin?",
	]

	model.eval()
	for question in test_questions:
		print(f"Question: {question}")
		if model_type == 'seq2seq':
			# T5 style
			prompt = f"answer the following medication question:\n{question}"
			inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
			outputs = model.generate(
				**inputs, 
				max_length=256,
				num_beams=4,
				repetition_penalty=1.2,
				no_repeat_ngram_size=3,
				early_stopping=True,
			)
			answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
		else:
			# Causal LM style
			prompt = f"Instruction: {question}\n\nAnswer:"
			inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
			outputs = model.generate(
					**inputs,
					max_length=512,
					temperature=0.7,
					do_sample=True,
					pad_token_id=tokenizer.pad_token_id
			)
			answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
			# Extract only the answer part
			answer = answer.split("Answer:")[-1].strip()
		
		print(f"Answer: {answer}\n")

In [None]:
test_inference(model, tokenizer, model_type)