<a href="https://colab.research.google.com/github/tam1444AH/COSC4397Project/blob/main/notebooks/supervised-data-preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip -q install -U hf_transfer
!export HF_HUB_ENABLE_HF_TRANSFER=1

import os
from google.colab import userdata
from huggingface_hub import login, whoami
import wandb

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"  # mitigate fragmentation
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
HF_TOKEN = userdata.get('HF_TOKEN')
WANDB_TOKEN = userdata.get('WANDB_KEY')
os.environ["WANDB_API_KEY"] = WANDB_TOKEN
os.environ["HF_TOKEN"] = HF_TOKEN
wandb.login(key=WANDB_TOKEN, relogin=True)
login(token=HF_TOKEN, add_to_git_credential=True)  # also sets Git creds for LFS

print("Logged in as:", whoami(token=HF_TOKEN)["name"])

In [None]:
!pip -q install -U trl datasets

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer
from datasets import load_dataset
from peft import LoraConfig, TaskType

wandb.init(
    project="qwen3coder-finetune-fp16-talha-v3-sft",
    name=f"run-{datetime.now().strftime('%Y%m%d-%H%M%S')}",
    settings=wandb.Settings(
      ignore_globs=["*.bin","*.pt","*.safetensors","*.ckpt","checkpoint*"]
))

wandb.define_metric("train/global_step")
wandb.define_metric("train/*", step_metric="train/global_step")
wandb.define_metric("eval/*",  step_metric="train/global_step")

In [None]:
from pathlib import Path
import json
import random
from collections import defaultdict
random.seed(4371)

TRAIN_FRACTION = 0.95  # share of each filetype group that goes to the training split


def _read_jsonl(path):
  """Parse a JSON Lines file into a list of objects, skipping blank lines."""
  # Iterate the file object instead of using str.splitlines(): splitlines() also
  # breaks on U+2028 / U+2029 / U+0085, which may appear unescaped inside JSON
  # strings and would cut a record in half.
  with path.open(encoding="utf-8") as handle:
    return [json.loads(line) for line in handle if line.strip()]


def _write_jsonl(path, records):
  """Write records as JSON Lines: one object per line, newline-terminated."""
  with path.open("w", encoding="utf-8") as handle:
    handle.writelines(json.dumps(record, ensure_ascii=False) + "\n" for record in records)


dataset_path = Path("/content/test.jsonl") # This will be our raw dataset.
rows = _read_jsonl(dataset_path)

supervised_rows = [row for row in rows if row.get("set") == "supervised"]
print(f"Total rows: {len(rows)}, Supervised rows: {len(supervised_rows)}")

# Group by filetype so the train/val split is stratified per filetype.
by_filetype = defaultdict(list)
for row in supervised_rows:
  by_filetype[row.get("filetype")].append(row)

train, val = [], []

# Use a dedicated loop name so `supervised_rows` keeps referring to the full list.
for group in by_filetype.values():
  random.shuffle(group)
  # Every group contributes at least one training row; tiny groups may add nothing to val.
  cut = max(1, int(TRAIN_FRACTION * len(group)))
  train.extend(group[:cut])
  val.extend(group[cut:])

random.shuffle(train)
random.shuffle(val)

if not val:
  print("WARNING: validation split is empty; evaluation on it will not be possible.")

train_path = Path("/content/train.jsonl")
val_path = Path("/content/val.jsonl")

_write_jsonl(train_path, train)
_write_jsonl(val_path, val)

print(f"\nFinal split - Train: {len(train)}, Val: {len(val)}")
print(f"Saved to {train_path} and {val_path}")


In [None]:
! pip install -U bitsandbytes

import torch
from tqdm import tqdm
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from peft.tuners.lora import LoraLayer

torch.backends.cuda.matmul.allow_tf32 = True
torch.set_float32_matmul_precision("high")

model_name = "Qwen/Qwen3-32B-Coder-Instruct"
adapter_name = "tam2003/Qwen3-Coder-30b-v5-2ep"
output_dir = "Qwen3-Coder-30b-v5-2ep"

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    padding_side="right",
    add_eos_token=True,
    add_bos_token=True,
    trust_remote_code=True,
    use_fast=False,
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

names = ["<|fim_prefix|>", "<|fim_middle|>", "<|fim_suffix|>", "<|fim_pad|>"]
ids = [tokenizer.convert_tokens_to_ids(t) for t in names]
print(dict(zip(names, ids)))
print("additional_special_tokens:", tokenizer.special_tokens_map.get("additional_special_tokens"))

BNB_4BIT_COMPUTE_DTYPE = "bfloat16"

compute_dtype = getattr(torch, BNB_4BIT_COMPUTE_DTYPE)

bnb_config = BitsAndBytesConfig(
  load_in_4bit=True,
  bnb_4bit_quant_type="nf4",
  bnb_4bit_compute_dtype=compute_dtype,
  bnb_4bit_use_double_quant=USE_NESTED_QUANT,
)

base = AutoModelForCausalLM.from_pretrained(
  model_name,
  load_in_8bit=False,
  quantization_config=bnb_config,
  dtype=torch.bfloat16,
  device_map="auto",
  use_cache=False,
  trust_remote_code=True,
  attn_implementation="flash_attention_2",
)

train_dataset = load_dataset("json", data_files="/content/train.jsonl", split="train")
val_dataset = load_dataset("json", data_files="/content/val.jsonl", split="train")

In [None]:
# LoRA adapter settings for a causal-LM task: rank 16, alpha 32, 5% dropout,
# applied to the modules named q/k/v/o_proj and gate/up/down_proj.
peft_config = LoraConfig(
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_dropout=0.05,
    lora_alpha=32,
    r=16,
    task_type=TaskType.CAUSAL_LM,
)

In [None]:
# PeftModel was used without being imported anywhere in the notebook.
from peft import PeftModel, prepare_model_for_kbit_training

# Prepare the 4-bit base for training before attaching the adapter: this enables
# gradient checkpointing and makes the inputs require grad, so checkpointing
# works even though the base weights are frozen.
base = prepare_model_for_kbit_training(base, use_gradient_checkpointing=True)

# Load the previously trained LoRA adapter and keep it trainable so fine-tuning
# continues from it instead of starting from fresh adapter weights.
model = PeftModel.from_pretrained(base, adapter_name, is_trainable=True)
model.print_trainable_parameters()

In [None]:
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=1,
    per_device_train_batch_size=5,
    gradient_accumulation_steps=2,
    learning_rate=1e-4,
    warmup_ratio=0.03,
    logging_strategy="epoch",
    save_strategy="epoch",
    evaluation_strategy="epoch",
    fp16=True,
    gradient_checkpointing=True,
    report_to=["wandb"],
    push_to_hub=False,
)

In [None]:
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    peft_config=peft_config,
    tokenizer=tokenizer,
    max_seq_length=4096,
)

In [None]:
import math
from time import time

# Defined up front so the perplexity report below cannot hit a NameError when
# training or evaluation fails before `eval_results` is assigned.
eval_results = {}

train_start_time = time()
print("Training...")

try:
  trainer.train(resume_from_checkpoint=False)
  train_end_time = time()
  print(f"Training completed in {train_end_time - train_start_time:.2f} seconds.")
  eval_results = trainer.evaluate()
except Exception as e:  # was `except e as Exception`, which itself raised NameError
  # Best-effort: report the failure and still fall through to the save below.
  print(f"Training failed: {e}")
finally:
  # Always persist trainer state and the current weights, even after a failure.
  trainer.save_state()
  trainer.save_model(f"{trainer.args.output_dir}/last-safe")
wandb.finish()


# Report perplexity (exp of the eval loss) only when a finite loss is available.
eval_loss = eval_results.get("eval_loss")
if eval_loss is not None and math.isfinite(eval_loss):
  try:
    ppl = math.exp(eval_loss)
  except OverflowError:
    # A very large loss overflows float exp(); report it as infinite perplexity.
    ppl = float("inf")
  print(f"Eval loss = {eval_loss:.4f}, Perplexity = {ppl:.4f}")


In [None]:
# Upload the trainer's output to the Hugging Face Hub.
# NOTE(review): the target repo is presumably derived from the trainer's
# output_dir / hub settings — confirm the destination before running.
trainer.push_to_hub()

In [None]:
from google.colab import runtime

# Final cell: hand the Colab runtime back once everything above has finished.
# NOTE(review): presumably this discards the VM and its local files (e.g.
# /content) — confirm all outputs were saved or pushed before this runs.
runtime.unassign()