In [8]:
import sys
import os
import warnings
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import json
import torch

warnings.filterwarnings('ignore')

# Add project root to Python path
project_root = Path.cwd()
if project_root.name == "notebooks":
    project_root = project_root.parent

src_path = project_root / "src"
if str(src_path) not in sys.path:
    sys.path.insert(0, str(src_path))
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

print(f"Project root: {project_root}")
print(f"Source path: {src_path}")
print(f"Python path updated")

from config import config, MedicalLLMConfig
from model_setup import ModelManager, get_model_memory_usage
from data_loader import MedicalDataLoader
from trainer import MedicalLLMTrainer, setup_training_environment
from evaluator import MedicalLLMEvaluator

plt.style.use('default')
sns.set_palette("husl")

print("Libraries imported successfully!")
print(f"Working directory: {Path.cwd()}")
print(f"Python version: {sys.version}")

if torch.cuda.is_available():
    print(f"CUDA available: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f}GB")
else:
    print("CUDA not available - training will be slow")


Project root: c:\Users\Siu856569517\Taminul\GenAI_LLM
Source path: c:\Users\Siu856569517\Taminul\GenAI_LLM\src
Python path updated
Libraries imported successfully!
Working directory: c:\Users\Siu856569517\Taminul\GenAI_LLM\notebooks
Python version: 3.12.6 (tags/v3.12.6:a4a2d2b, Sep  6 2024, 20:11:23) [MSC v.1940 64 bit (AMD64)]
CUDA available: NVIDIA GeForce RTX 3090
GPU Memory: 24.0GB


In [3]:
print("Setting up Medical LLM Training Environment...")
environment_ready = setup_training_environment()

if environment_ready:
    print("Environment setup completed successfully!")
else:
    print("Environment setup failed!")
    
print("\nCurrent Configuration:")
print("=" * 50)
print(f"Base Model: {config.model.base_model_name}")
print(f"Training Epochs: {config.training.num_train_epochs}")
print(f"Batch Size: {config.training.per_device_train_batch_size}")
print(f"Learning Rate: {config.training.learning_rate}")
print(f"LoRA Rank (r): {config.lora.r}")
print(f"LoRA Alpha: {config.lora.lora_alpha}")
print(f"Max Sequence Length: {config.training.max_seq_length}")
print(f"Use 4-bit Quantization: {config.model.load_in_4bit}")
print(f"Use FP16: {config.training.fp16}")
print("=" * 50)


INFO:trainer:Setting up Medical LLM Training Environment...
INFO:trainer:✅ CUDA Device: NVIDIA GeForce RTX 3090 (24.0GB)
INFO:trainer:✅ All required packages imported successfully
INFO:trainer:✅ Training environment ready!


Setting up Medical LLM Training Environment...
Environment setup completed successfully!

Current Configuration:
Base Model: microsoft/DialoGPT-small
Training Epochs: 2
Batch Size: 2
Learning Rate: 0.0002
LoRA Rank (r): 32
LoRA Alpha: 16
Max Sequence Length: 512
Use 4-bit Quantization: True
Use FP16: True


In [9]:
data_loader = MedicalDataLoader()

print("Loading and preparing medical datasets...")

config.data.use_dummy_data = False

data_loader.load_medical_dataset()
data_loader.preprocess_dataset()

print("Dataset prepared successfully!")
print(f"Total samples: {len(data_loader.processed_dataset)}")
print(f"Sample format: {list(data_loader.processed_dataset.features.keys())}")

print("\nSample Data Examples:")
print("=" * 60)
for i in range(min(3, len(data_loader.processed_dataset))):
    sample = data_loader.processed_dataset[i]
    text = sample['text'][:200] + "..." if len(sample['text']) > 200 else sample['text']
    print(f"Sample {i+1}:")
    print(f"{text}")
    print("-" * 40)


INFO:data_loader:Loading real medical datasets...


Loading and preparing medical datasets...


INFO:data_loader:📚 Dataset loaded with 10000 samples
INFO:data_loader:🔄 Preprocessing dataset...
INFO:data_loader:✅ Dataset preprocessing completed


Dataset prepared successfully!
Total samples: 10000
Sample format: ['instruction', 'input', 'output', '__index_level_0__', 'text', 'prompt', 'completion']

Sample Data Examples:
Sample 1:
Instruction: If you are a doctor, please answer the medical questions based on the patient's description.
Input: hi. im a home health aide and i have a client with scoliosis in the back and kidney dis...
----------------------------------------
Sample 2:
Instruction: Please summerize the given abstract to a title
Input: RATIONALE: The COVID-19 pandemic struck an immunologically naïve, globally interconnected population. In the face of a new infectious...
----------------------------------------
Sample 3:
Instruction: Please summerize the given abstract to a title
Input: Objectives: To investigate the experience of playing the harmonica for individuals with COPD. Methods: A qualitative, phenomenologica...
----------------------------------------


In [10]:
model_manager = ModelManager(config)

print("Setting up model and tokenizer...")
model_manager.setup_model_and_tokenizer()

print("Configuring LoRA adapters...")
model_manager.setup_lora_model()

model_info = model_manager.get_model_info()
print("\nModel setup completed!")
print(f"Base Model: {config.model.base_model_name}")
print(f"Model Type: {type(model_manager.model).__name__}")
print(f"Tokenizer Vocab Size: {len(model_manager.tokenizer)}")

print("\nLoRA Configuration:")
print(f"Rank (r): {config.lora.r}")
print(f"Alpha: {config.lora.lora_alpha}")
print(f"Dropout: {config.lora.lora_dropout}")
print(f"Target Modules: {config.lora.target_modules}")

total_params = sum(p.numel() for p in model_manager.model.parameters())
trainable_params = sum(p.numel() for p in model_manager.model.parameters() if p.requires_grad)

print("\nParameter Efficiency:")
print(f"Total Parameters: {total_params:,}")
print(f"Trainable Parameters: {trainable_params:,}")
print(f"Trainable %: {100 * trainable_params / total_params:.2f}%")

memory_usage = get_model_memory_usage()
print(f"\nGPU Memory Usage:")
if "error" not in memory_usage:
    print(f"Allocated: {memory_usage['allocated_gb']} GB")
    print(f"Total: {memory_usage['total_gb']} GB")
    print(f"Utilization: {memory_usage['utilization_percent']}%")
else:
    print(f"Memory info: {memory_usage['error']}")


INFO:model_setup:Loading model: microsoft/DialoGPT-small
INFO:model_setup:Set pad_token to eos_token


Setting up model and tokenizer...


INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
INFO:model_setup:✅ Model loaded successfully!
INFO:model_setup:Setting up LoRA configuration...
INFO:model_setup:📈 Trainable parameters: 129,158,400
INFO:model_setup:🔒 Total parameters: 129,158,400
INFO:model_setup:📊 Trainable %: 100.00%


Configuring LoRA adapters...

Model setup completed!
Base Model: microsoft/DialoGPT-small
Model Type: PeftModelForCausalLM
Tokenizer Vocab Size: 50257

LoRA Configuration:
Rank (r): 32
Alpha: 16
Dropout: 0.1
Target Modules: ['c_attn', 'c_proj', 'c_fc']

Parameter Efficiency:
Total Parameters: 86,691,072
Trainable Parameters: 4,718,592
Trainable %: 5.44%

GPU Memory Usage:
Allocated: 0.48 GB
Total: 24.0 GB
Utilization: 3.2%


In [12]:
config.training.num_train_epochs = 5
config.training.logging_steps = 50

print("Starting Medical LLM Training...")
print(f"Training Configuration:")
print(f"Epochs: {config.training.num_train_epochs}")
print(f"Batch Size: {config.training.per_device_train_batch_size}")
print(f"Learning Rate: {config.training.learning_rate}")

trainer = MedicalLLMTrainer(config)

experiment_name = f"notebook_training_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
experiment_dir = Path("../experiments") / experiment_name
experiment_dir.mkdir(parents=True, exist_ok=True)

print(f"\nExperiment Directory: {experiment_dir}")

print("\nTraining in progress...")
training_results = trainer.train(
    model_manager=model_manager,
    data_loader=data_loader,
    output_dir=str(experiment_dir)
)

print("\nTraining completed!")
print(f"Final Loss: {training_results['train_loss']:.4f}")
print(f"Training Steps: {training_results['train_steps']}")
print(f"Epochs Completed: {training_results['epochs_trained']}")
print(f"Model Saved: {training_results['final_model_path']}")

with open(experiment_dir / "notebook_results.json", 'w') as f:
    json.dump(training_results, f, indent=2, default=str)

print(f"\nResults saved to: {experiment_dir}")

final_model_path = training_results['final_model_path']


INFO:trainer:Starting Medical LLM Training Pipeline...
INFO:trainer:Training arguments configured for output: ..\experiments\notebook_training_20250723_094231


Starting Medical LLM Training...
Training Configuration:
Epochs: 5
Batch Size: 2
Learning Rate: 0.0002

Experiment Directory: ..\experiments\notebook_training_20250723_094231

Training in progress...


Adding EOS to train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (5145 > 1024). Running this sequence through the model will result in indexing errors


Truncating train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
INFO:trainer:SFTTrainer configured successfully
wandb: Currently logged in as: taminul to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


INFO:trainer:Starting training...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
50,10.4305
100,8.2522
150,5.8547
200,5.2888
250,4.9534
300,4.7622
350,4.5404
400,4.4961
450,4.2453
500,4.39


INFO:trainer:Saving trained model...
INFO:model_setup:💾 Model saved to ..\experiments\notebook_training_20250723_094231\final_model
INFO:trainer:Training completed! Results saved to: ..\experiments\notebook_training_20250723_094231
INFO:trainer:Final training loss: 3.8457



Training completed!
Final Loss: 3.8457
Training Steps: 6250
Epochs Completed: 5
Model Saved: ..\experiments\notebook_training_20250723_094231\final_model

Results saved to: ..\experiments\notebook_training_20250723_094231


In [13]:
evaluator = MedicalLLMEvaluator(config)

print("Starting comprehensive model evaluation...")
print(f"Model to evaluate: {final_model_path}")

evaluation_results = evaluator.run_comprehensive_evaluation(final_model_path)

print("\nEvaluation completed!")

summary = evaluation_results.get('summary', {})
print("\nEvaluation Summary:")
print("=" * 50)
print(f"Overall Accuracy: {summary.get('overall_accuracy', 0):.3f}")
print(f"Total Questions: {summary.get('total_questions', 0)}")
print(f"Correct Answers: {summary.get('total_correct', 0)}")
print(f"Benchmarks Evaluated: {summary.get('benchmarks_evaluated', 0)}")

benchmark_results = evaluation_results.get('benchmark_results', {})
print("\nBenchmark Performance:")
print("=" * 50)
for benchmark_name, results in benchmark_results.items():
    if 'error' not in results:
        accuracy = results.get('accuracy', 0)
        total_q = results.get('total_questions', 0)
        correct = results.get('correct_answers', 0)
        print(f"{benchmark_name}:")
        print(f"   Accuracy: {accuracy:.3f}")
        print(f"   Questions: {total_q}")
        print(f"   Correct: {correct}")
    else:
        print(f"{benchmark_name}: ERROR - {results['error']}")

report = evaluator.create_evaluation_report(evaluation_results)
print("\nDetailed Evaluation Report:")
print("=" * 60)
print(report)


Starting comprehensive model evaluation...
Model to evaluate: ..\experiments\notebook_training_20250723_094231\final_model


INFO:evaluator:🚀 Starting Comprehensive Medical LLM Evaluation...
INFO:evaluator:Loading model from: ..\experiments\notebook_training_20250723_094231\final_model
INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
INFO:model_setup:✅ Trained model loaded from ..\experiments\notebook_training_20250723_094231\final_model
INFO:evaluator:Model ready for evaluation
INFO:evaluator:Loading medical benchmark datasets...
INFO:evaluator:Loading MedQA dataset...
INFO:evaluator:✅ MedQA loaded: 500 samples
INFO:evaluator:Loading PubMedQA dataset...
INFO:evaluator:Benchmark datasets loaded: ['medqa']
INFO:evaluator:Evaluating on medqa (500 samples)...
Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. D


Evaluation completed!

Evaluation Summary:
Overall Accuracy: 0.940
Total Questions: 50
Correct Answers: 47
Benchmarks Evaluated: 1

Benchmark Performance:
medqa:
   Accuracy: 0.940
   Questions: 50
   Correct: 47

Detailed Evaluation Report:
MEDICAL LLM EVALUATION REPORT
Model: microsoft/DialoGPT-small
Evaluation Date: 2025-07-23T10:57:57.036633

OVERALL RESULTS:
  Overall Accuracy: 0.940
  Total Questions: 50
  Correct Answers: 47
  Benchmarks Evaluated: 1

BENCHMARK DETAILS:
  medqa: 0.940 (47/50)



In [14]:
stats_data = {
    'Metric': [
        'Total Parameters',
        'Trainable Parameters', 
        'Parameter Efficiency (%)',
        'Overall Accuracy',
        'Total Questions Evaluated',
        'Correct Answers'
    ],
    'Value': [
        f"{total_params:,}",
        f"{trainable_params:,}",
        f"{100 * trainable_params / total_params:.2f}%",
        f"{summary.get('overall_accuracy', 0):.3f}",
        f"{summary.get('total_questions', 0)}",
        f"{summary.get('total_correct', 0)}"
    ]
}

stats_df = pd.DataFrame(stats_data)
print("Summary Statistics:")
print("=" * 40)
print(stats_df.to_string(index=False))

overall_accuracy = summary.get('overall_accuracy', 0)
if overall_accuracy >= 0.8:
    performance_level = "Excellent"
elif overall_accuracy >= 0.6:
    performance_level = "Good"
elif overall_accuracy >= 0.4:
    performance_level = "Fair"
else:
    performance_level = "Poor"

print(f"\nModel Performance Level: {performance_level}")
print(f"Accuracy: {overall_accuracy:.3f}")

improvement_over_random = overall_accuracy - 0.25
print(f"\nImprovement vs Random Baseline: +{improvement_over_random:.3f}")

print(f"\nExperiment completed successfully!")
print(f"Model saved at: {final_model_path}")


Summary Statistics:
                   Metric      Value
         Total Parameters 86,691,072
     Trainable Parameters  4,718,592
 Parameter Efficiency (%)      5.44%
         Overall Accuracy      0.940
Total Questions Evaluated         50
          Correct Answers         47

Model Performance Level: Excellent
Accuracy: 0.940

Improvement vs Random Baseline: +0.690

Experiment completed successfully!
Model saved at: ..\experiments\notebook_training_20250723_094231\final_model
