<a href="https://colab.research.google.com/github/vardhin/WeLearn/blob/master/finetune_gemma3_A.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import json
import torch
from datasets import Dataset, load_dataset
import os
import subprocess

# --- Colab Specific Setup ---

def check_gpu_compatibility():
    """
    Checks for a compatible NVIDIA GPU and exits if one is not found.
    """
    if not torch.cuda.is_available():
        print("\n\n" + "="*80)
        print("❌ ERROR: No NVIDIA GPU detected. Unsloth requires a GPU.")
        print("Please change your runtime type to a GPU.")
        print("Go to 'Runtime' -> 'Change runtime type' -> 'Hardware accelerator' and select 'T4 GPU' or another GPU.")
        print("="*80 + "\n\n")
        os.kill(os.getpid(), 9)
    else:
        print("✅ Compatible NVIDIA GPU detected.")

def check_and_install_dependencies():
    """
    Checks if unsloth is installed. If not, it installs all required dependencies
    and then stops execution to allow for a manual runtime restart.
    """
    try:
        import unsloth
        print("✅ Dependencies are already installed.")
    except ImportError:
        print("Installing necessary packages. This may take a few minutes...")

        try:
            # Using subprocess to handle installation quietly
            # Unsloth's Gemma-3 support requires newer versions of dependencies
            subprocess.run(["pip", "install", "unsloth[colab-new]"], check=True, capture_output=True)
            subprocess.run(["pip", "install", "--no-deps", "trl", "peft", "accelerate", "bitsandbytes"], check=True, capture_output=True)
            print("✅ Installation successful.")

        except subprocess.CalledProcessError as e:
            print(f"❌ An error occurred during installation: {e}")
            print(e.stderr.decode())
            # Exit if installation fails
            exit()

        # Stop execution to force user to restart runtime
        print("\n\n" + "="*80)
        print("IMPORTANT: Dependencies installed. You MUST restart the runtime now.")
        print("Go to 'Runtime' -> 'Restart runtime' in the menu, and then run this cell again.")
        print("="*80 + "\n\n")
        os.kill(os.getpid(), 9)

def mount_google_drive():
    """Mounts Google Drive to the Colab environment."""
    print("Mounting Google Drive...")
    try:
        from google.colab import drive
        drive.mount('/content/drive')
        print("✅ Google Drive mounted successfully at /content/drive.")
    except ImportError:
        print("This script is designed to run in Google Colab. Could not find google.colab library.")
    except Exception as e:
        print(f"An error occurred while mounting Google Drive: {e}")
        exit()

# Run setup tasks first
check_gpu_compatibility()
check_and_install_dependencies()
mount_google_drive()

# Now that setup is done, we can import the heavy libraries
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from trl import SFTTrainer
from transformers import TrainingArguments

# --- Core Functions ---

def load_jsonl_dataset(file_path):
    """Load and prepare dataset from JSONL file."""
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Dataset file not found at: {file_path}. Please ensure the file exists in your Google Drive.")
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line.strip()))
    return data

def format_conversation_structure(example):
    """
    Formats an example from various known structures into a unified
    list of messages format. This format is compatible with apply_chat_template.
    """
    messages = []
    if 'instruction' in example and 'output' in example:
        # Alpaca-style format
        messages.append({"role": "user", "content": example['instruction']})
        messages.append({"role": "assistant", "content": example['output']})
    elif 'conversations' in example:
        # ShareGPT-style format
        for message in example['conversations']:
            role = "user" if message.get('from') == 'human' else "assistant"
            content = message.get('value', '')
            messages.append({"role": role, "content": content})
    # Add more format checks here if needed
    else:
        raise ValueError("Dataset format not recognized. Expected 'instruction'+'output' or 'conversations' format.")

    return {"messages": messages}

def main():
    # --- Configuration ---
    # ⭐️ MODIFIED: Updated model name to Gemma-3 270M
    model_name = "unsloth/gemma-3-270m-it-bnb-4bit"
    # ⚠️ IMPORTANT: Update this path to point to your training data in Google Drive
    jsonl_file_path = "/content/drive/MyDrive/MLjournal/trainingA.jsonl"
    # ⭐️ MODIFIED: Updated output directory for the new model
    output_dir = "/content/drive/MyDrive/gemma3-270m-finetuned"
    max_seq_length = 2048

    # --- Model and Tokenizer Loading ---
    print("Loading model and tokenizer...")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=None,      # Auto-detect
        load_in_4bit=True, # This enables QLoRA quantization
    )

    # --- ⭐️ CRITICAL FOR GEMMA-3: Apply the correct chat template ---
    print("Applying Gemma-3 chat template...")
    tokenizer = get_chat_template(
        tokenizer,
        chat_template="gemma3",
    )

    # --- LoRA Configuration ---
    print("Configuring LoRA...")
    model = FastLanguageModel.get_peft_model(
        model,
        # ⭐️ MODIFIED: Increased rank and alpha for potentially better performance
        r=128,
        lora_alpha=128,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"],
        lora_dropout=0.1,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=3407,
        use_rslora=False,
        loftq_config=None,
    )

    # --- Dataset Preparation ---
    print(f"Loading and preparing dataset from {jsonl_file_path}...")
    raw_data = load_jsonl_dataset(jsonl_file_path)
    dataset = Dataset.from_list(raw_data)

    # 1. First, map to create the 'messages' column with a standardized conversation structure
    structured_dataset = dataset.map(format_conversation_structure, remove_columns=list(dataset.features))

    # --- Dataset Splitting ---
    if len(structured_dataset) > 100:
        train_test_split = structured_dataset.train_test_split(test_size=0.1)
        train_dataset = train_test_split['train']
        eval_dataset = train_test_split['test']
        print(f"Dataset split into {len(train_dataset)} training and {len(eval_dataset)} evaluation samples.")
    else:
        train_dataset = structured_dataset
        eval_dataset = None
        print(f"Using the full dataset of {len(train_dataset)} samples for training.")


    # 2. Define a formatting function that uses the tokenizer to create the final training string.
    def formatting_prompts_func(example):
        # The .removeprefix('<bos>') is a recommended practice from the Unsloth Gemma-3 notebook
        text = tokenizer.apply_chat_template(example["messages"], tokenize=False, add_generation_prompt=False).removeprefix('<bos>')
        return {"text": text}

    # 3. Map this function to create the final 'text' column that the SFTTrainer will use
    train_dataset = train_dataset.map(formatting_prompts_func)
    if eval_dataset:
        eval_dataset = eval_dataset.map(formatting_prompts_func)

    # --- Training Arguments ---
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=8,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=100,
        learning_rate=2e-4,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        logging_steps=10,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        save_strategy="steps",
        save_steps=500,
        save_total_limit=2,
        eval_strategy="steps" if eval_dataset else "no",
        eval_steps=500 if eval_dataset else None,
        load_best_model_at_end=True if eval_dataset else False,
        report_to="none",  # Disable wandb logging
    )

    # --- Initialize Trainer ---
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        dataset_text_field="text", # **CRITICAL**: Point to the new 'text' column
        max_seq_length=max_seq_length,
        args=training_args,
        dataset_num_proc=2, # Enable multiprocessing for faster data processing
    )

    # --- Start Training ---
    print("Starting training... 🚀")
    trainer.train()

    # --- Save Final Model ---
    print(f"Saving final LoRA adapters to {output_dir}...")
    trainer.save_model()
    tokenizer.save_pretrained(output_dir)

    # --- Save to GGUF Format (Optional) ---
    print("Attempting to save model in GGUF format...")
    try:
        # ⭐️ MODIFIED: Using `quantization_type` as per the latest Unsloth API for GGUF
        model.save_pretrained_gguf(output_dir, tokenizer, quantization_type="Q8_0")
        print("✅ GGUF model (Q8_0) saved successfully.")
    except Exception as e:
        print(f"Could not save GGUF model. This might be due to model compatibility or library versions. Error: {e}")

    print(f"\n🎉 Training completed! Model and tokenizer saved to {output_dir}")

if __name__ == "__main__":
    # Start the main training process
    main()

✅ Compatible NVIDIA GPU detected.
✅ Dependencies are already installed.
Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Google Drive mounted successfully at /content/drive.
Loading model and tokenizer...
==((====))==  Unsloth 2025.9.6: Fast Gemma3_Text patching. Transformers: 4.55.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3_text won't work! Using float32.
Applying Gemma-3 chat template...
Configuring LoRA...
Unsloth: Making `model.base_model.model.model` require gradients
Loading and preparing dataset f

Map:   0%|          | 0/1101 [00:00<?, ? examples/s]

Dataset split into 990 training and 111 evaluation samples.


Map:   0%|          | 0/990 [00:00<?, ? examples/s]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

Unsloth: Switching to float32 training since model cannot work with float16


Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/990 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/111 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 990 | Num Epochs = 8 | Total steps = 992
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 30,375,936 of 298,474,112 (10.18% trained)


Starting training... 🚀


Step,Training Loss,Validation Loss
500,1.3571,2.901191


Unsloth: Not an error, but Gemma3ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


Saving final LoRA adapters to /content/drive/MyDrive/gemma3-270m-finetuned...
Attempting to save model in GGUF format...
Unsloth: Updating system package directories
Unsloth: Install GGUF and other packages
Could not save GGUF model. This might be due to model compatibility or library versions. Error: Unsloth: `config.json` does not exist inside `/content/drive/MyDrive/gemma3-270m-finetuned`.

🎉 Training completed! Model and tokenizer saved to /content/drive/MyDrive/gemma3-270m-finetuned


In [1]:
import json
import torch
from datasets import Dataset
from unsloth import FastLanguageModel
import gc

# --- Configuration ---
# MODIFIED: Switched the model to a 4bit-quantized Gemma 2B Instruct model
base_model_name = "unsloth/gemma-3-270m-it-bnb-4bit"
jsonl_file_path = "/content/drive/MyDrive/MLjournal/trainingA.jsonl" # ⚠️ Make sure this path is correct
NUM_TEST_QUESTIONS = 5

# --- 1. Load Dataset and Create a Sample ---
print("Loading dataset and preparing sample...")
try:
    with open(jsonl_file_path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f]
    dataset = Dataset.from_list(data)
    sample_dataset = dataset.select(range(NUM_TEST_QUESTIONS))
except FileNotFoundError:
    print(f"❌ ERROR: Dataset not found at '{jsonl_file_path}'. Please check the file path.")
    exit()


# --- 2. Load the Base Model ---
print(f"Loading base model: {base_model_name}...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=base_model_name,
    dtype=None,      # Let Unsloth auto-detect
    load_in_4bit=True,
)


# --- 3. Generate and Print Responses ---
print(f"\n--- Generating responses from BASE MODEL ({base_model_name}) ---")
for item in sample_dataset:
    # This logic correctly handles both 'instruction' and 'conversations' formats
    prompt = ""
    original_output = ""
    if 'instruction' in item:
        prompt = item.get('instruction', '')
        original_output = item.get('output', '')
    elif 'conversations' in item:
        if len(item['conversations']) > 0 and item['conversations'][0]['from'] == 'human':
            prompt = item['conversations'][0]['value']
            if len(item['conversations']) > 1:
                original_output = item['conversations'][1]['value']

    if not prompt:
        print("Skipping item with no valid prompt.")
        continue

    # Format the prompt for inference. Unsloth's FastLanguageModel configures the
    # tokenizer to use the correct chat template for Gemma automatically.
    messages = [{"role": "user", "content": prompt}]
    inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to("cuda")

    # Generate the response
    outputs = model.generate(inputs, max_new_tokens=128, use_cache=True, pad_token_id=tokenizer.eos_token_id)
    response = tokenizer.batch_decode(outputs[:, inputs.shape[1]:], skip_special_tokens=True)[0]

    # Print the comparison
    print(f"❓ PROMPT:\n{prompt}")
    print("-------------------------------------------------")
    print(f"🤖 BASE MODEL RESPONSE:\n{response}")
    print("-------------------------------------------------")
    print(f"✅ ORIGINAL RESPONSE:\n{original_output}")
    print("=================================================\n")


# --- 4. IMPORTANT: Clear memory ---
print("Clearing base model from memory...")
del model
del tokenizer
gc.collect()
torch.cuda.empty_cache()
print("Memory cleared.")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Loading dataset and preparing sample...
Loading base model: unsloth/gemma-3-270m-it-bnb-4bit...
==((====))==  Unsloth 2025.9.6: Fast Gemma3_Text patching. Transformers: 4.55.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3_text won't work! Using float32.


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



--- Generating responses from BASE MODEL (unsloth/gemma-3-270m-it-bnb-4bit) ---
❓ PROMPT:
What are the three anomalies observed in the Mixture-of-Experts (MoE) paradigm at Kuaishou that negatively impact model performance?
-------------------------------------------------
🤖 BASE MODEL RESPONSE:
The three anomalies observed in the Mixture-of-Experts (MoE) paradigm at Kuaishou are:

1.  **Data Type Assumption:**  The core anomaly arises from the assumption of a mixture of experts (i.e., users, the models, and the model's behavior) as being *only* based on the expert's suggestions.  The assumption of a single expert's suggestion is a major problem.  Specifically, the suggestion for the experts is often not explicitly stated in the input data. This leads to the following:

    *   **Reduced Model Capacity:** The models trained on the
-------------------------------------------------
✅ ORIGINAL RESPONSE:
The three anomalies observed are: (1) Expert Collapse, where some experts have signifi

In [2]:
import json
import torch
from datasets import Dataset
from unsloth import FastLanguageModel
import gc

# --- Configuration ---
# The base model you used for fine-tuning
base_model_name = "unsloth/gemma-3-270m-it-bnb-4bit"
# The path to your saved adapter (the result of your fine-tuning)
# ⚠️ Make sure this adapter was trained on the 'base_model_name' above.
adapter_path = "/content/drive/MyDrive/gemma3-270m-finetuned"
# The path to your dataset for testing
jsonl_file_path = "/content/drive/MyDrive/MLjournal/trainingA.jsonl"
NUM_TEST_QUESTIONS = 5

# --- 1. Load Dataset and Create a Sample ---
print("Loading dataset and preparing sample...")
try:
    with open(jsonl_file_path, 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f]
    dataset = Dataset.from_list(data)
    sample_dataset = dataset.select(range(NUM_TEST_QUESTIONS))
    print(f"✅ Loaded {len(dataset)} records. Using {NUM_TEST_QUESTIONS} for testing.")
except FileNotFoundError:
    print(f"❌ ERROR: Dataset not found at '{jsonl_file_path}'. Please check the file path.")
    exit()

# --- 2. Load the Base Model ---
print(f"\nLoading base model: {base_model_name}...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=base_model_name,
    dtype=None,
    load_in_4bit=True,
)
print("✅ Base model loaded.")

# --- 3. Generate and Print Responses from BASE MODEL ---
print(f"\n--- Generating responses from BASE MODEL ({base_model_name}) ---")
for item in sample_dataset:
    # This logic handles both 'instruction' and 'conversations' formats
    prompt = ""
    original_output = ""
    if 'instruction' in item:
        prompt = item.get('instruction', '')
        original_output = item.get('output', '')
    elif 'conversations' in item:
        if len(item['conversations']) > 0 and item['conversations'][0]['from'] == 'human':
            prompt = item['conversations'][0]['value']
            if len(item['conversations']) > 1:
                original_output = item['conversations'][1]['value']

    if not prompt:
        print("Skipping item with no valid prompt.")
        continue

    # Format the prompt for inference
    messages = [{"role": "user", "content": prompt}]
    inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to("cuda")

    # Generate the response
    outputs = model.generate(inputs, max_new_tokens=128, use_cache=True, pad_token_id=tokenizer.eos_token_id)
    response = tokenizer.batch_decode(outputs[:, inputs.shape[1]:], skip_special_tokens=True)[0]

    # Print the comparison
    print(f"\n❓ PROMPT:\n{prompt}")
    print("-------------------------------------------------")
    print(f"🤖 BASE MODEL RESPONSE:\n{response}")
    print("-------------------------------------------------")
    print(f"✅ ORIGINAL RESPONSE:\n{original_output}")
    print("=================================================")

# --- 4. IMPORTANT: Clear Base Model from Memory ---
print("\nClearing base model from memory...")
del model
gc.collect()
torch.cuda.empty_cache()
print("✅ Memory cleared.")

# --- 5. Load the Fine-Tuned Model (Base Model + Adapter) ---
print(f"\nLoading fine-tuned model from: {adapter_path}...")
ft_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=adapter_path, # ⬅️ Loading your adapter
    dtype=None,
    load_in_4bit=True,
)
print("✅ Fine-tuned model loaded.")

# --- 6. Generate and Print Responses from FINE-TUNED MODEL ---
print("\n--- Generating responses from FINE-TUNED MODEL ---")
for item in sample_dataset: # Using the SAME sample_dataset for a fair comparison
    # Logic to extract prompt and original output
    prompt = ""
    original_output = ""
    if 'instruction' in item:
        prompt = item.get('instruction', '')
        original_output = item.get('output', '')
    elif 'conversations' in item:
        if len(item['conversations']) > 0 and item['conversations'][0]['from'] == 'human':
            prompt = item['conversations'][0]['value']
            if len(item['conversations']) > 1:
                original_output = item['conversations'][1]['value']

    if not prompt:
        print("Skipping item with no valid prompt.")
        continue

    # Format the prompt for inference
    messages = [{"role": "user", "content": prompt}]
    inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to("cuda")

    # Generate the response from the fine-tuned model
    outputs = ft_model.generate(inputs, max_new_tokens=128, use_cache=True, pad_token_id=tokenizer.eos_token_id)
    response = tokenizer.batch_decode(outputs[:, inputs.shape[1]:], skip_special_tokens=True)[0]

    # Print the comparison
    print(f"\n❓ PROMPT:\n{prompt}")
    print("-------------------------------------------------")
    print(f"🚀 FINE-TUNED MODEL RESPONSE:\n{response}")
    print("-------------------------------------------------")
    print(f"✅ ORIGINAL RESPONSE:\n{original_output}")
    print("=================================================")

# --- 7. Final Cleanup ---
print("\nClearing fine-tuned model from memory...")
del ft_model
del tokenizer
gc.collect()
torch.cuda.empty_cache()
print("✅ All done. Memory cleared.")

Loading dataset and preparing sample...
✅ Loaded 1101 records. Using 5 for testing.

Loading base model: unsloth/gemma-3-270m-it-bnb-4bit...
==((====))==  Unsloth 2025.9.6: Fast Gemma3_Text patching. Transformers: 4.55.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3_text won't work! Using float32.
✅ Base model loaded.

--- Generating responses from BASE MODEL (unsloth/gemma-3-270m-it-bnb-4bit) ---

❓ PROMPT:
What are the three anomalies observed in the Mixture-of-Experts (MoE) paradigm at Kuaishou that negatively impact model performance?
-------------------------------------------------
🤖 BASE MODEL RESPONSE:
Th