<a href="https://colab.research.google.com/github/theprashasst/Sushruta-P3.8Q/blob/main/Quantization%20and%20LoRA%20Finetuning%20(QLoRA).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ====================================================================================
# FINAL SCRIPT: Fine-Tuning Sushruta-P3.8Q with QLoRA and W&B Integration
#
# Creator: Prashasst Dongre, Prashasst's AI Labs
# Timestamp: June 22, 2025
#
# This script does everything: installs dependencies, logs in, configures the
# model and persona, loads and prepares the dataset, and starts the training.
# ====================================================================================


In [None]:
# --- Step 0: Install All Required Libraries ---
# The `!` lines below are IPython shell escapes, so this cell only runs inside
# a notebook environment (it is not valid plain Python).
print("--- [0/7] Installing required libraries... ---")
# Install/upgrade torch, torchvision and torchaudio from the PyTorch wheel
# index for CUDA 12.1 (cu121).
# NOTE(review): assumes the runtime's CUDA driver is compatible with cu121
# wheels — confirm against the current Colab image.
!pip install -q -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# Install/upgrade the remaining ML libraries used by the later cells. No
# versions are pinned, so each run picks up whatever is latest at that time.
!pip install -q -U transformers accelerate bitsandbytes peft trl datasets wandb
# NOTE: this message is printed unconditionally; it does not check whether the
# pip commands above actually succeeded.
print("✅ All libraries installed successfully.")

In [None]:
# --- Step 1: Import Libraries & Log In ---
print("\n--- [1/7] Importing libraries & Logging In... ---")
import torch
import os
import time
import wandb
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
from huggingface_hub import login

from google.colab import userdata
# Read both API credentials through google.colab's `userdata` helper so that
# no token is hard-coded in the notebook.
HF_TOKEN = userdata.get("HF_TOKEN")
WNB = userdata.get("WNB")
print("✅ Libraries imported successfully.")

# Authenticate against the Hugging Face Hub (token-based, non-interactive).
print("--> Please log in to your Hugging Face account:")
login(token=HF_TOKEN)

# Authenticate against Weights & Biases so the training run can be tracked.
print("\n--> Please log in to your Weights & Biases account:")
wandb.login(key=WNB)
print("✅ Logins successful.")

In [None]:
# --- Step 2: Define All Configurations ---
print("\n--- [2/7] Setting up all configurations... ---")

# Weights & Biases reads the project name from this environment variable.
os.environ["WANDB_PROJECT"] = "Sushruta-P3.8Q-Finetune"

# System prompt describing the assistant's identity, mission and rules.
# It is interpolated verbatim into every training prompt, so any edit here
# changes the training data.
GUIDELINES = """
# IDENTITY AND ORIGIN
You are Sushruta-P3.8Q, a specialized medical large language Model from Prashasst's AI Labs. You were created and fine-tuned by Prashasst Dongre to serve as a reliable and accessible educational tool for the public.
# CORE MISSION
Your primary purpose is to help any user understand complex medical topics by providing clear, logical, step-by-step analyses.
# STRICT OPERATIONAL GUIDELINES
1.  Greet users politely and neutrally.
2.  If asked about your origin, state you were created by Prashasst Dongre at Prashasst's AI Labs.
3.  Always follow the Chain-of-Thought format: <think>...</think> followed by <solution>...</solution>.
4.  Politely refuse non-medical questions, stating that your function is specialized.
5.  Frame all responses as educational explanations, not direct medical advice.
# PERSONA AND TONE
- **Tone:** Your tone must be professional, empathetic, precise, and trustworthy.
- **Persona:** You are a public-facing specialist: knowledgeable, safe, and helpful to all users.
"""

# Hub identifier of the base checkpoint that gets fine-tuned.
model_id = "microsoft/Phi-3-mini-4k-instruct"

# bitsandbytes settings used when the base model is loaded.
quantization_config = BitsAndBytesConfig(
    # Load the weights in 4-bit precision.
    load_in_4bit=True,
    # Use the "nf4" 4-bit quantization data type.
    bnb_4bit_quant_type="nf4",
    # Also quantize the quantization constants to save additional memory.
    bnb_4bit_use_double_quant=True,
    # Dtype used for computation on the 4-bit weights.
    bnb_4bit_compute_dtype=torch.bfloat16,
)


In [None]:
# --- Step 3: Load Base Model & Tokenizer ---
print("\n--- [3/7] Loading base model and tokenizer... ---")

# Tokenizer first; the end-of-sequence token is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# Base model, loaded with the quantization settings defined in step 2 and
# with automatic device placement.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    device_map="auto",
    quantization_config=quantization_config,
)
print("✅ Base model and tokenizer loaded.")

# Report the loaded model's memory footprint, converted from bytes to GiB
# (2**30 bytes).
memory_bytes = model.get_memory_footprint()
memory_gb = memory_bytes / 2**30

print(f"\nMemory footprint of the 4-bit model: {memory_gb:.2f} GB")
print("✅ Configurations are set.")

In [None]:
# --- Step 4: Load and Prepare the Dataset ---
print("\n--- [4/7] Loading and reformatting the dataset... ---")
# Load the "train" split of the Medical-R1 distillation dataset by its
# repository id.
dataset = load_dataset("FreedomIntelligence/Medical-R1-Distill-Data", split="train")

def reformat_dataset(example):
    """Collapse one raw dataset row into a question/answer training pair.

    The reasoning trace and the final response are wrapped in
    ``<think>...</think>`` and ``<solution>...</solution>`` tags respectively
    and concatenated, in that order, into a single ``answer`` string.

    Args:
        example: Mapping that provides the keys ``question``,
            ``reasoning (reasoning_content)`` and ``response (content)``.

    Returns:
        A dict with two keys: ``question`` (passed through unchanged) and
        ``answer`` (tagged reasoning immediately followed by tagged response).

    Raises:
        KeyError: If any of the three expected keys is missing.
    """
    user_question = example['question']
    reasoning = example['reasoning (reasoning_content)']
    response = example['response (content)']
    return dict(
        question=user_question,
        answer=f"<think>{reasoning}</think><solution>{response}</solution>",
    )

# Run reformat_dataset over the dataset to add the combined "answer" field.
# No `remove_columns` argument is passed here.
formatted_dataset = dataset.map(reformat_dataset)
print("✅ Dataset loaded and formatted.")


In [None]:
# --- Step 5.1: Filter Long Sequences (NEW STEP) ---
print("\n--- [5/8] Filtering out overly long examples... ---")


def filter_long_examples(example, max_tokens=4096):
    """Return True when the fully rendered training prompt fits in ``max_tokens``.

    The row is rendered with the same chat template that is passed to
    ``SFTTrainer`` as ``formatting_func`` later in this notebook (system
    prompt from ``GUIDELINES``, then the user question, then the assistant
    answer). Keep the two templates in sync, otherwise the length measured
    here will not match what is actually trained on.

    Args:
        example: Mapping with the keys ``question`` and ``answer``.
        max_tokens: Inclusive upper bound on the number of token ids the
            rendered prompt may produce. Defaults to 4096.
            NOTE(review): presumably chosen to match the "4k" context window
            of the base model — confirm.

    Returns:
        bool: ``True`` if the tokenized prompt has at most ``max_tokens``
        token ids, ``False`` otherwise.
    """
    prompt = f"<|system|>\n{GUIDELINES}<|end|>\n<|user|>\n{example['question']}<|end|>\n<|assistant|>\n{example['answer']}<|end|>"
    # The tokenizer returns a mapping; the length of its 'input_ids' list is
    # the token count of the rendered prompt.
    return len(tokenizer(prompt)['input_ids']) <= max_tokens

# Keep only the rows for which filter_long_examples returns True.
final_dataset = formatted_dataset.filter(filter_long_examples)
# Print the row counts before and after the length filter.
print(f"Original dataset size: {len(formatted_dataset)}")
print(f"Filtered dataset size: {len(final_dataset)}")
print("✅ Long examples removed.")

In [None]:

# --- Step 5: Configure LoRA Adapters ---
print("\n--- [5/7] Configuring LoRA adapters... ---")
# Disable the generation-time KV cache on the model config before training.
model.config.use_cache = False
# Prepare the quantized base model for k-bit (here 4-bit) training.
peft_model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=16,                # rank of the low-rank update matrices
    lora_alpha=32,       # LoRA scaling factor
    # Phi-3 attention uses a single fused `qkv_proj` linear layer plus
    # `o_proj`; it has no separate `q_proj` / `k_proj` / `v_proj` modules.
    # Listing those names would leave the query/key/value projections
    # without adapters and train `o_proj` only.
    target_modules=["qkv_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",         # do not train bias terms
    task_type="CAUSAL_LM",
)
peft_model = get_peft_model(peft_model, lora_config)
# Sanity check: show how many parameters the adapters actually made trainable.
peft_model.print_trainable_parameters()
print("✅ LoRA adapters configured and applied.")

In [None]:

# --- Step 6: Configure the SFT Trainer ---
print("\n--- [6/7] Configuring the Trainer... ---")
training_args = TrainingArguments(
    # NOTE(review): this path lives under /content/drive, but nothing visible
    # in this notebook mounts Google Drive. Confirm the drive is mounted
    # before relying on checkpoints being persisted there.
    output_dir="/content/drive/My Drive/Sushruta-P3.8Q-Finetune",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # gradients accumulate over 4 batches per optimizer step
    learning_rate=2e-4,
    max_steps=200,       # Short run; increase later for higher quality.
    logging_steps=10,    # Log to W&B every 10 steps
    save_steps=100,      # Save a checkpoint every 100 steps
    # NOTE(review): the quantization config in step 2 sets the compute dtype
    # to bfloat16 while training uses fp16 mixed precision — confirm this
    # combination is intended for the target GPU.
    fp16=True,
    report_to="wandb",   # Enable W&B logging
    run_name=f"Sushruta-P3.8Q-{int(time.time())}",  # unique per launch (Unix timestamp)
)


def _format_chat_example(example):
    """Render one dataset row as the chat-style training text.

    Uses the same template as ``filter_long_examples``: the system prompt
    from ``GUIDELINES``, the user question and the assistant answer, each
    terminated by ``<|end|>``.

    Args:
        example: Mapping with the keys ``question`` and ``answer``.

    Returns:
        str: The fully rendered prompt for this row.
    """
    return f"<|system|>\n{GUIDELINES}<|end|>\n<|user|>\n{example['question']}<|end|>\n<|assistant|>\n{example['answer']}<|end|>"


trainer = SFTTrainer(
    # `peft_model` already carries the LoRA adapters applied by
    # get_peft_model in step 5, so no `peft_config` is passed here; handing
    # the trainer both an adapter-wrapped model and a config is redundant.
    model=peft_model,
    # Train on the length-filtered dataset from step 5.1, not on the
    # unfiltered `formatted_dataset`.
    train_dataset=final_dataset,
    args=training_args,
    formatting_func=_format_chat_example,
)
print("✅ Trainer is ready.")


In [None]:


# --- Step 7: Start Fine-Tuning ---
print("\n--- [7/7] Starting the fine-tuning process! ---")
print("This will take a while. A link to your W&B dashboard will appear below.")
# Runs the training loop configured in step 6. Everything below this line is
# reached only if train() returns without raising.
trainer.train()

print("\n\n--- 🎉 TRAINING COMPLETE! 🎉 ---")
wandb.finish() # Close the W&B run before saving the final artifacts
# Save the trained model to a path relative to the current working directory.
# This is a different location from the `output_dir` given to
# TrainingArguments in step 6.
trainer.save_model("./Sushruta-P3.8Q-final-adapters")
print("Final model adapters have been saved to './Sushruta-P3.8Q-final-adapters'")