<a href="https://colab.research.google.com/github/tam1444AH/COSC4397Project/blob/main/notebooks/supervised-data-preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip -q install -U hf_transfer
!export HF_HUB_ENABLE_HF_TRANSFER=1

import os
from google.colab import userdata
from huggingface_hub import login, whoami
import wandb

# Runtime switches, applied in one go.
# - PYTORCH_CUDA_ALLOC_CONF: allocator setting intended to mitigate memory fragmentation.
# - HF_HUB_ENABLE_HF_TRANSFER: opts in to the hf_transfer backend.
#   NOTE(review): huggingface_hub is already imported at this point; confirm the
#   library still picks this value up when it is set after import.
os.environ.update({
    "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
    "HF_HUB_ENABLE_HF_TRANSFER": "1",
})

# Credentials come from the Colab secrets store, never from literals in the notebook.
HF_TOKEN = userdata.get('HF_TOKEN')
WANDB_TOKEN = userdata.get('WANDB_KEY')

# Expose both tokens through the environment as well.
os.environ.update(WANDB_API_KEY=WANDB_TOKEN, HF_TOKEN=HF_TOKEN)

# Authenticate against Weights & Biases and the Hugging Face Hub.
wandb.login(key=WANDB_TOKEN, relogin=True)
login(token=HF_TOKEN, add_to_git_credential=True)  # also stores the token as a Git credential (for LFS)

# Sanity check: show which Hub account the token resolves to.
hub_identity = whoami(token=HF_TOKEN)
print("Logged in as:", hub_identity["name"])

In [None]:
!pip -q install -U trl datasets

import torch
from datasets import load_dataset
from peft import LoraConfig, TaskType
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTConfig, SFTTrainer

In [None]:
from pathlib import Path
import json, random
from collections import defaultdict
# Build a per-filetype (stratified) train/validation split of the supervised rows and
# write both splits to disk as JSON Lines.
random.seed(4371)  # fixed seed so the shuffles, and therefore the split, are reproducible

dataset_path = Path("/content/test.jsonl") # This will be our raw dataset.
# Stream the file line by line instead of read_text().splitlines(): str.splitlines()
# also breaks on U+2028 / U+2029 / U+0085 and similar separators, which can appear
# unescaped inside JSON strings and would cut a record in half before json.loads sees it.
with dataset_path.open(encoding="utf-8") as raw_file:
  rows = [json.loads(line) for line in raw_file if line.strip()]

# Only rows tagged "supervised" take part in the split.
supervised_rows = [row for row in rows if row.get("set") == "supervised"]
print(f"Total rows: {len(rows)}, Supervised rows: {len(supervised_rows)}")

# Bucket rows by their "filetype" value so every filetype is split on its own.
by_filetype = defaultdict(list)
for row in supervised_rows:
  by_filetype[row.get("filetype")].append(row)

train, val = [], []

# The loop variable has its own name: reusing `supervised_rows` here would rebind the
# full list to the last filetype's bucket for everything that runs afterwards.
for filetype, filetype_rows in by_filetype.items():
  random.shuffle(filetype_rows)
  # 95% of each bucket goes to train; max(1, ...) keeps at least one training row
  # even for a bucket with a single element.
  cut = max(1, int(0.95 * len(filetype_rows)))
  train.extend(filetype_rows[:cut])
  val.extend(filetype_rows[cut:])

# Every supervised row must land in exactly one of the two splits.
assert len(train) + len(val) == len(supervised_rows), "split lost or duplicated rows"

# Mix the filetypes so neither split is ordered by bucket.
random.shuffle(train)
random.shuffle(val)

train_path = Path("/content/train.jsonl")
val_path = Path("/content/val.jsonl")

# One JSON object per line; ensure_ascii=False keeps non-ASCII text readable on disk.
train_path.write_text("\n".join(json.dumps(row, ensure_ascii=False) for row in train), encoding="utf-8")
val_path.write_text("\n".join(json.dumps(row, ensure_ascii=False) for row in val), encoding="utf-8")

print(f"\nFinal split - Train: {len(train)}, Val: {len(val)}")
print(f"Saved to {train_path} and {val_path}")


In [None]:
# Base checkpoint to fine-tune.
# NOTE(review): confirm this repository id exists on the Hub before running.
model_name = "Qwen/Qwen3-32B-Coder-Instruct"

# Option shared by the tokenizer and the model loader.
hub_load_options = {"trust_remote_code": True}

tokenizer = AutoTokenizer.from_pretrained(model_name, **hub_load_options)

# Request float16 weights and let the library decide device placement.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    **hub_load_options,
)

# Load both JSONL splits. The "json" builder exposes a lone data file under the
# "train" split name, which is why the validation file also asks for split="train".
train_dataset, val_dataset = (
    load_dataset("json", data_files=split_file, split="train")
    for split_file in ("/content/train.jsonl", "/content/val.jsonl")
)

In [None]:
# LoRA adapter configuration for causal-LM fine-tuning.
# Target module names are split into two groups for readability; presumably the first
# group is the attention projections and the second the MLP projections. Verify the
# names against the loaded model's module list.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,               # adapter rank
    lora_alpha=32,      # LoRA scaling numerator
    lora_dropout=0.05,  # dropout applied inside the adapter
    target_modules=attention_projections + mlp_projections,
)

In [None]:
training_args = TrainingArguments(
    output_dir="/content/sft_output",
    num_train_epochs=1,
    per_device_train_batch_size=5,
    gradient_accumulation_steps=2,
    learning_rate=1e-4,
    warmup_ratio=0.03,
    logging_strategy="epoch",
    save_strategy="epoch",
    evaluation_strategy="epoch",
    fp16=True,
    gradient_checkpointing=True,
    report_to="wandb",
    push_to_hub=True,
    hub_model_id="",
    hub_token=HF_TOKEN,
)

In [None]:
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    peft_config=peft_config,
    tokenizer=tokenizer,
    max_seq_length=4096,
)

In [None]:
# Train: run the fine-tuning loop on the `trainer` built above.
# Kept as the cell's last expression so the notebook displays the value train() returns.
trainer.train()

In [None]:
# Persist the trained model to a dedicated directory, then upload to the Hub.
final_model_dir = "/content/sft_output/final"
trainer.save_model(final_model_dir)

# Kept as the last expression so the notebook shows whatever push_to_hub() returns.
trainer.push_to_hub()