<a href="https://colab.research.google.com/github/tam1444AH/COSC4397Project/blob/main/notebooks/supervised-data-preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install nbstripout
%pip install -U "huggingface-hub>=0.34.0,<1.0"
%pip check
%pip install hf_transfer
%pip install -U bitsandbytes --upgrade
%pip install transformers datasets
%pip install transformers datasets peft flash-attn trl
!export HF_HUB_ENABLE_HF_TRANSFER=1

# Run Colab's user-authentication step up front, before the rest of the
# imports and the repo/credential work further down.
# NOTE(review): nothing visible in this section uses Google credentials
# directly — confirm a later cell needs them, otherwise this prompt can go.
from google.colab import auth
auth.authenticate_user()
import json, random
from datasets import load_dataset, Dataset, concatenate_datasets
import os, math, torch
import wandb
import shutil
from datetime import datetime
from google.colab import userdata
from huggingface_hub import login, whoami
from time import time
from transformers import (
    AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
)
from trl import SFTTrainer
from peft import PeftModel


# Process-wide settings for Weights & Biases and the CUDA caching allocator,
# written in one pass so they are easy to audit.
os.environ.update(
    {
        "WANDB_DISABLED": "false",  # flip to "true" to mute W&B
        "WANDB_PROJECT": "qwen3coder-finetune-fp16",
        "WANDB_LOG_MODEL": "false",
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",  # mitigate fragmentation
    }
)

# Git remote of the project and the directory name it is checked out into.
REPO_URL = "https://github.com/UH-Insure/Finetuning-Qwen3.git"
REPO = "Finetuning-Qwen3"

os.chdir("/content")

# If repo exists, update it; otherwise, clone fresh
if os.path.exists(REPO):
    print(f"Repo '{REPO}' exists, pulling latest changes...")
    os.chdir(REPO)
    !git reset --hard HEAD   # optional: discard local changes
    !git pull
else:
    print(f"Cloning repo '{REPO}'...")
    !git clone "$REPO_URL" "$REPO"
    os.chdir(REPO)

!nbstripout --install
!git branch -a


# Install dependencies if present
if os.path.exists("requirements.txt"):
    %pip install -r requirements.txt
if os.path.exists("pyproject.toml"):
    %pip install -e .

In [None]:
# --- Run configuration -----------------------------------------------------
# Every tunable value lives here; presumably the training cells further down
# read these names (they are not visible in this section).

# Reproducibility and workspace root
seed = 4371
data = "/content/"

# Model / adapter identifiers
# NOTE(review): confirm this base-model repo id resolves on the Hub.
base = "Qwen/Qwen3-Coder-30B-Instruct"
adapter_name = "tam2003/Qwen3-Coder-30b-v5-2ep"
output_dir = "tam2003/Qwen3-Coder-30b-v5-2ep-sft"

# Optimisation schedule
epochs, lr, warmup_ratio = 1, 5e-5, 0.03

# Batch geometry and sequence budget
per_dev_bs, grad_acc = 5, 2
max_seq_len = 4096

In [None]:
# Project name and transfer flag for this session.
# NOTE(review): this replaces the WANDB_PROJECT value set in the setup cell —
# confirm which project name is intended.
# NOTE(review): huggingface_hub is already imported by the time this runs; it
# may only read HF_HUB_ENABLE_HF_TRANSFER at import — confirm this takes effect.
os.environ.update(
    {
        "WANDB_PROJECT": "qwen3-sft-test",
        "HF_HUB_ENABLE_HF_TRANSFER": "1",
    }
)

# Credentials are read through Colab's `userdata`, never hard-coded here.
HF_TOKEN = userdata.get('HF_TOKEN')
WANDB_TOKEN = userdata.get('WANDB_KEY')
os.environ.update(
    {
        "WANDB_API_KEY": WANDB_TOKEN,
        "HF_TOKEN": HF_TOKEN,
    }
)

# Log in to both services with the tokens fetched above.
wandb.login(key=WANDB_TOKEN, relogin=True)
login(token=HF_TOKEN, add_to_git_credential=True)  # also sets Git creds for LFS

# Print the account name the Hub reports for this token.
_hf_identity = whoami(token=HF_TOKEN)
print("Logged in as:", _hf_identity["name"])
# Start a W&B run whose name carries a timestamp so repeated launches do not
# collide; the listed file patterns are passed as `ignore_globs`.
_run_stamp = datetime.now().strftime('%Y%m%d-%H%M%S')
_ignored_files = ["*.bin", "*.pt", "*.safetensors", "*.ckpt", "checkpoint*"]
wandb.init(
    project=os.environ["WANDB_PROJECT"],
    name=f"sfttrainer-1ep-resume-{_run_stamp}",
    settings=wandb.Settings(ignore_globs=_ignored_files),
)

# Use the trainer's global step as the step metric for both metric families.
wandb.define_metric("train/global_step")
for _metric_glob in ("train/*", "eval/*"):
    wandb.define_metric(_metric_glob, step_metric="train/global_step")

# Seed every RNG this notebook has imported. Previously only Python's `random`
# was seeded, leaving torch's generators unseeded between runs.
random.seed(seed)
torch.manual_seed(seed)

# Allow TF32 for CUDA matmuls and set the float32 matmul precision to "high".
torch.backends.cuda.matmul.allow_tf32 = True
torch.set_float32_matmul_precision("high")