In [None]:
# Colab Cell 1: Install Unsloth & QLoRA Dependencies (quiet mode)

# 1. Upgrade pip quietly
!pip install -q --upgrade pip

# 2. Install Unsloth with Colab extras quietly
!pip install -q --upgrade unsloth

# 3. Install core libraries without re-pulling extras, in quiet mode
!pip install -q --no-deps \
    peft>=0.7.0 \
    trl>=0.4.0 \
    accelerate>=0.25.0 \
    datasets>=2.15.0 \
    scipy>=1.11.0 \
    huggingface_hub>=0.19.0


In [None]:
# Colab Cell 2: Hugging Face Login
# login() is called without arguments, so no access token is hard-coded in
# this notebook; the token is entered interactively when the cell runs.

from huggingface_hub import login
login()  # follow the prompt shown by this cell to enter your token securely


In [None]:
# Colab Cell 3: Load, Quantize, Inject LoRA & Train with Unsloth on T4 GPU

import torch
import json
import os
import gc
from unsloth import FastLanguageModel
from datasets import Dataset
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
import psutil

# ── Configuration ───────────────────────────────────────────────────────────
# Run-level settings shared by the steps of this cell.
# Model identifier passed as `model_name` to FastLanguageModel.from_pretrained.
MODEL_NAME   = "Qwen/Qwen2.5-3B-Instruct"
# Passed to TrainingArguments as `output_dir`.
OUTPUT_DIR   = "qwen-algospeak-unsloth"
# JSON file read by load_large_dataset().
DATASET_FILE = "training_dataset_colab.json"
# Used both as the model's `max_seq_length` and as the tokenizer's truncation
# length (`max_length`) in tokenize_fn().
MAX_SEQ_LEN  = 512

# ── 1. Load & Inspect Dataset ───────────────────────────────────────────────
def load_large_dataset(path):
    """Load the JSON training set and print a short composition summary.

    Args:
        path: Path to a JSON file. The top level is expected to be a
            non-empty list of dict-like samples; a sample may carry a truthy
            "is_algospeak" flag, and a missing flag counts as False.

    Returns:
        The parsed JSON content, unchanged.

    Raises:
        FileNotFoundError: If `path` does not exist.
        ValueError: If the file parses to an empty collection (previously this
            surfaced as a ZeroDivisionError while computing the percentage).
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"Dataset not found: {path}")
    # Read as UTF-8 explicitly rather than relying on the platform's default
    # encoding, which is not UTF-8 everywhere and would garble or reject
    # non-ASCII sample text.
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)
    # Fail early with a clear message instead of dividing by zero below.
    if not data:
        raise ValueError(f"Dataset is empty: {path}")
    print(f"✅ Loaded {len(data):,} samples")
    algospeak = sum(1 for x in data if x.get("is_algospeak", False))
    print(f"📊 Algospeak samples: {algospeak:,} ({algospeak/len(data)*100:.1f}%)")
    return data

training_data = load_large_dataset(DATASET_FILE)

# ── 2. 4-bit Model Load via Unsloth ──────────────────────────────────────────
# from_pretrained hands back the model together with its tokenizer.
print("🤖 Loading & quantizing model in 4-bit mode…")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LEN,
    load_in_4bit=True,
    trust_remote_code=True,
    device_map="auto",
)

# ── 3. Inject LoRA Adapters ─────────────────────────────────────────────────
# Wrap the loaded model with LoRA adapters on the listed projection modules,
# then report how many parameters remain trainable.
print("🔧 Applying LoRA adapters…")
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    random_state=42,
    use_gradient_checkpointing="unsloth",
)
model.print_trainable_parameters()

# ── 4. Prepare & Tokenize Dataset ──────────────────────────────────────────
def format_prompt(sample):
    """Render one sample as an Instruction / Input / Response prompt string.

    Args:
        sample: Mapping with the keys "instruction", "input" and "output".
            A missing key raises KeyError.

    Returns:
        The three sections, each a "### <Name>:" header followed by its value
        on the next line, separated from one another by a blank line.
    """
    sections = (
        ("### Instruction:", sample['instruction']),
        ("### Input:", sample['input']),
        ("### Response:", sample['output']),
    )
    return "\n\n".join(f"{header}\n{body}" for header, body in sections)

# Render every sample to its prompt text, then hold out 10% for evaluation
# (fixed seed so the split is repeatable).
prompt_texts = list(map(format_prompt, training_data))
dataset = Dataset.from_dict({"text": prompt_texts}).train_test_split(
    seed=42,
    test_size=0.1,
)

# Drop the intermediate Python lists before training starts
del prompt_texts, training_data
gc.collect()
torch.cuda.empty_cache()

def tokenize_fn(examples):
    """Tokenize a batch of prompt strings for causal-LM fine-tuning.

    Args:
        examples: Batch mapping whose "text" entry is a list of prompt
            strings (the shape `dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer's output for the batch, truncated to MAX_SEQ_LEN and
        unpadded, with an added "labels" entry that mirrors "input_ids".
    """
    # Terminate every sample with the tokenizer's EOS token. Otherwise nothing
    # in the training text marks where a response stops, so the fine-tuned
    # model gets no stop signal to learn. Samples longer than MAX_SEQ_LEN lose
    # the marker again through truncation, which is acceptable because their
    # response is cut off anyway.
    eos = getattr(tokenizer, "eos_token", None) or ""
    out = tokenizer(
        [text + eos for text in examples["text"]],
        truncation=True,
        padding=False,
        max_length=MAX_SEQ_LEN,
    )
    # Copy each sequence so labels never alias the input_ids lists.
    out["labels"] = [list(ids) for ids in out["input_ids"]]
    return out

print("🔤 Tokenizing dataset…")
# Tokenize both splits in batches of 1000 and discard the raw "text" column,
# leaving only the columns produced by tokenize_fn.
tokenized = dataset.map(
    tokenize_fn,
    remove_columns=["text"],
    batched=True,
    batch_size=1000,
)

# Sequences were left unpadded above; the collator pads each batch at load time.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
)

# ── 5. TrainingArguments & Trainer Setup ────────────────────────────────────
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    # Optimisation schedule: 2 samples per step × 8 accumulated steps per update.
    num_train_epochs=3,
    learning_rate=1e-4,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    # Precision / memory
    fp16=True,
    gradient_checkpointing=True,
    # Data loading: keep every dataset column as-is for the collator.
    dataloader_num_workers=2,
    remove_unused_columns=False,
    # Logging & evaluation cadence
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=500,
    # Checkpointing: keep at most 3 checkpoints, restore the lowest eval_loss.
    save_steps=1000,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # report_to="none",           # UNCOMMENT this line to disable WandB tracking
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
)

# ── 6. Start Training ───────────────────────────────────────────────────────
print("🚀 Starting QLoRA training on massive Algospeak dataset…")
trainer.train()

# Persist the final adapter and tokenizer at the top level of OUTPUT_DIR.
# trainer.train() only writes periodic `checkpoint-*` sub-folders (every
# `save_steps` steps), so without this step the weights restored by
# load_best_model_at_end exist only in memory, and a run shorter than
# `save_steps` leaves nothing on disk for the download cell to zip.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"💾 Saved LoRA adapter & tokenizer to {OUTPUT_DIR}")


In [None]:
from google.colab import files
import subprocess, os

ADAPTER_DIR      = "qwen-algospeak-unsloth"
MERGED_DIR       = f"{ADAPTER_DIR}_merged"
ADAPTER_ZIP      = "adapters.zip"
MERGED_MODEL_ZIP = "merged_model.zip"


def _zip_directory(src_dir, zip_path):
    """Create a fresh zip archive of `src_dir` at `zip_path` via the `zip` CLI.

    Args:
        src_dir: Directory to archive recursively.
        zip_path: Destination archive; an existing file at this path is
            replaced.

    Raises:
        subprocess.CalledProcessError: If `zip` exits with a non-zero status.
    """
    # `zip -r` updates an existing archive in place, so files that have since
    # disappeared from src_dir (e.g. rotated checkpoints) would linger in it
    # when this cell is re-run. Start from a clean archive every time.
    if os.path.exists(zip_path):
        os.remove(zip_path)
    print(f"📦 Zipping {src_dir} → {zip_path}")
    subprocess.run(["zip", "-r", zip_path, src_dir], check=True)


# Zip adapters (mandatory: abort if the training output directory is missing)
if not os.path.isdir(ADAPTER_DIR):
    raise FileNotFoundError(ADAPTER_DIR)
_zip_directory(ADAPTER_DIR, ADAPTER_ZIP)

# Zip merged model (if exists)
if os.path.isdir(MERGED_DIR):
    _zip_directory(MERGED_DIR, MERGED_MODEL_ZIP)

# Download
print("⬇️ Downloading…")
files.download(ADAPTER_ZIP)
if os.path.exists(MERGED_MODEL_ZIP):
    files.download(MERGED_MODEL_ZIP)
print("✅ Step 4 complete!")
