<a href="https://colab.research.google.com/github/tam1444AH/COSC4397Project/blob/main/notebooks/supervised-data-preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pip install nbstripout
%pip install -U "huggingface-hub>=0.34.0,<1.0"
%pip check
%pip install hf_transfer
%pip install -U bitsandbytes --upgrade
%pip install transformers datasets
%pip install transformers datasets peft flash-attn trl
!export HF_HUB_ENABLE_HF_TRANSFER=1

from google.colab import auth
auth.authenticate_user()
import json, random
from datasets import load_dataset, Dataset, concatenate_datasets
import os, math, torch
import wandb
import shutil
from datetime import datetime
from google.colab import userdata
from huggingface_hub import login, whoami
from time import time
from transformers import (
    AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
)
from trl import SFTTrainer
from peft import PeftModel


os.environ["WANDB_DISABLED"] = "false"  # or "true" to mute
os.environ["WANDB_PROJECT"]   = "qwen3coder-finetune-fp16"

os.environ["WANDB_LOG_MODEL"] = "false"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"  # mitigate fragmentation
REPO_URL="https://github.com/UH-Insure/Finetuning-Qwen3.git"
REPO="Finetuning-Qwen3"

os.chdir("/content")

# If repo exists, update it; otherwise, clone fresh
if os.path.exists(REPO):
    print(f"Repo '{REPO}' exists, pulling latest changes...")
    os.chdir(REPO)
    !git reset --hard HEAD   # optional: discard local changes
    !git pull
else:
    print(f"Cloning repo '{REPO}'...")
    !git clone "$REPO_URL" "$REPO"
    os.chdir(REPO)

!nbstripout --install
!git branch -a


# Install dependencies if present
if os.path.exists("requirements.txt"):
    %pip install -r requirements.txt
if os.path.exists("pyproject.toml"):
    %pip install -e .

Collecting nbstripout
  Downloading nbstripout-0.8.1-py2.py3-none-any.whl.metadata (19 kB)
Downloading nbstripout-0.8.1-py2.py3-none-any.whl (16 kB)
Installing collected packages: nbstripout
Successfully installed nbstripout-0.8.1
ipython 7.34.0 requires jedi, which is not installed.
Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.2
Collecting flash-attn
  Downloading flash_attn-2.8.3.tar.gz (8.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m148.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting trl
  Downloading trl-0.25.1-py3-none-any.whl.metadata (1

In [5]:
seed = 4371
dataset_path = "/content/SFT_message_format_hybrid_source_code_V2.jsonl"
base = "Qwen/Qwen3-Coder-30B-A3B-Instruct"
adapter_name = "tam2003/Qwen3-Coder-30b-v5-2ep"
output_dir = "tam2003/Qwen3-Coder-30b-v5-2ep-sft"
epochs = 1
per_dev_bs = 5
grad_acc = 2
lr = 5e-5
warmup_ratio = 0.03
max_seq_len = 4096

In [3]:
os.environ["WANDB_PROJECT"] = "qwen3-sft-test"
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
HF_TOKEN = userdata.get('HF_TOKEN')
WANDB_TOKEN = userdata.get('WANDB_KEY')
os.environ["WANDB_API_KEY"] = WANDB_TOKEN
os.environ["HF_TOKEN"] = HF_TOKEN
wandb.login(key=WANDB_TOKEN, relogin=True)
login(token=HF_TOKEN, add_to_git_credential=True)  # also sets Git creds for LFS

print("Logged in as:", whoami(token=HF_TOKEN)["name"])
wandb.init(
    project=os.environ["WANDB_PROJECT"],
    name=f"sfttrainer-1ep-resume-{datetime.now().strftime('%Y%m%d-%H%M%S')}",
    settings=wandb.Settings(ignore_globs=["*.bin","*.pt","*.safetensors","*.ckpt","checkpoint*"])
)
wandb.define_metric("train/global_step")
wandb.define_metric("train/*", step_metric="train/global_step")
wandb.define_metric("eval/*",  step_metric="train/global_step")

random.seed(seed)
torch.backends.cuda.matmul.allow_tf32 = True
torch.set_float32_matmul_precision("high")

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmohammedtalha290[0m ([33mmohammedtalha290-university-of-houston[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Logged in as: tam2003


In [6]:
tokenizer = AutoTokenizer.from_pretrained(base, trust_remote_code=True, use_fast=True)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

base = AutoModelForCausalLM.from_pretrained(
    base,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    use_cache=False,
    attn_implementation="flash_attention_2",
)
model = PeftModel.from_pretrained(base, adapter_name, is_trainable=True)
model.print_trainable_parameters()

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/992 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 16 files:   0%|          | 0/16 [00:00<?, ?it/s]

model-00002-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00004-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00007-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00005-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00001-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00008-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00003-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00006-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00009-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00010-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00011-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00012-of-00016.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00013-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00014-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00015-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00016-of-00016.safetensors:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/16 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/180 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/945 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/3.38G [00:00<?, ?B/s]

trainable params: 843,841,536 || all params: 31,375,964,160 || trainable%: 2.6895


In [7]:
raw_data = load_dataset("json", data_files={"data": dataset_path})["data"]

supervised_set = raw_data.filter(lambda x: x["set"] == "supervised")

cryptol = supervised_set.filter(lambda x: x["filetype"] == "cryptol")
saw = supervised_set.filter(lambda x: x["filetype"] == "saw")
# No text as of yet

cryptol_split = cryptol.train_test_split(test_size=0.1, seed=seed, shuffle=True)
saw_split = saw.train_test_split(test_size=0.1, seed=seed, shuffle=True)

train_dataset = concatenate_datasets([cryptol_split["train"], saw_split["train"]])
eval_dataset = concatenate_datasets([cryptol_split["test"], saw_split["test"]])

print(f"Train dataset size: {len(train_dataset)}")
print(f"Eval dataset size: {len(eval_dataset)}")

Generating data split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/939 [00:00<?, ? examples/s]

Filter:   0%|          | 0/188 [00:00<?, ? examples/s]

Filter:   0%|          | 0/188 [00:00<?, ? examples/s]

Train dataset size: 168
Eval dataset size: 20


In [8]:
from dataclasses import dataclass
from typing import List, Dict, Any

@dataclass
class AssistantOnlyCollator:
    tokenizer: Any
    max_length: int = max_seq_len
    pad_to_multiple_of: int = 8

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        texts_full, texts_prefix = [], []
        for ex in features:
            msgs = ex["messages"]

            last_ass = max(i for i,m in enumerate(msgs) if m["role"]=="assistant")
            prefix = msgs[:last_ass]
            texts_full.append(self.tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=False))
            texts_prefix.append(self.tokenizer.apply_chat_template(prefix, tokenize=False, add_generation_prompt=False))

        enc_full   = self.tokenizer(texts_full,  max_length=self.max_length, truncation=True, padding=True, return_tensors="pt")
        enc_prefix = self.tokenizer(texts_prefix, max_length=self.max_length, truncation=True, padding=True, return_tensors="pt")

        input_ids = enc_full["input_ids"]
        attn_mask = enc_full["attention_mask"]
        labels    = input_ids.clone()

        for i in range(input_ids.size(0)):
            pref_len = int((enc_prefix["attention_mask"][i]==1).sum())
            labels[i, :pref_len] = -100
        labels[attn_mask == 0] = -100

        return {"input_ids": input_ids, "attention_mask": attn_mask, "labels": labels}

collator = AssistantOnlyCollator(tokenizer)

In [48]:
import os, gc, torch
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
gc.collect(); torch.cuda.empty_cache()

In [51]:
args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=epochs,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    learning_rate=lr,
    warmup_ratio=warmup_ratio,
    logging_steps=10,
    save_steps=10,
    eval_steps=10,
    eval_strategy="steps",
    logging_strategy="steps",
    bf16 = True,
    gradient_checkpointing=True,
    report_to=["wandb"],
    push_to_hub=False,
    remove_unused_columns=False,
    dataloader_num_workers=2,
    optim="paged_adamw_8bit",
)

In [52]:
from transformers import Trainer, TrainingArguments

In [53]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    data_collator=collator,
)

In [54]:
print("Training…")
train_start = time()
eval_results = {}

trainer.train()
print(f"Training completed in {time()-train_start:.1f} sec")

if trainer.eval_dataset is not None and len(trainer.eval_dataset) > 0:
    eval_results = trainer.evaluate()
    print("Eval:", eval_results)

trainer.save_state()
trainer.save_model(f"{args.output_dir}/last-safe")
wandb.finish()

if "eval_loss" in eval_results and math.isfinite(eval_results["eval_loss"]):
    ppl = math.exp(eval_results["eval_loss"])
    print(f"Eval loss = {eval_results['eval_loss']:.4f}  |  Perplexity = {ppl:.4f}")

Training…


Step,Training Loss,Validation Loss
10,1.0673,0.985706
20,0.7572,0.936784
30,1.0052,0.913181
40,1.0366,0.902236


Training completed in 2852.2 sec


Eval: {'eval_loss': 0.9044327735900879, 'eval_runtime': 68.1283, 'eval_samples_per_second': 0.294, 'eval_steps_per_second': 0.073, 'epoch': 1.0}


0,1
eval/loss,█▄▂▁▁
eval/runtime,▇███▁
eval/samples_per_second,▂▁▁▁█
eval/steps_per_second,▂▁▁▁█
train/epoch,▁▁▃▃▅▅████
train/global_step,▁▁▃▃▅▅████
train/grad_norm,█▇▁▅
train/learning_rate,█▆▃▁
train/loss,█▁▇▇

0,1
eval/loss,0.90443
eval/runtime,68.1283
eval/samples_per_second,0.294
eval/steps_per_second,0.073
total_flos,5.93439557872681e+16
train/epoch,1
train/global_step,42
train/grad_norm,0.56225
train/learning_rate,0.0
train/loss,1.0366


Eval loss = 0.9044  |  Perplexity = 2.4705


In [55]:
trainer.push_to_hub()

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...v5-2ep-sft/tokenizer.json:  97%|#########7| 11.1MB / 11.4MB            

  .../last-safe/tokenizer.json:  97%|#########7| 11.1MB / 11.4MB            

  ...adapter_model.safetensors:   0%|          | 5.38MB / 3.38GB            

  ...adapter_model.safetensors:   0%|          | 5.38MB / 3.38GB            

  ...st-safe/training_args.bin:   9%|8         |   508B / 5.84kB            

  ...2ep-sft/training_args.bin:   9%|8         |   508B / 5.84kB            

CommitInfo(commit_url='https://huggingface.co/tam2003/Qwen3-Coder-30b-v5-2ep-sft/commit/cbb97c90ac2405c0f99e382c23f005017158b9d0', commit_message='End of training', commit_description='', oid='cbb97c90ac2405c0f99e382c23f005017158b9d0', pr_url=None, repo_url=RepoUrl('https://huggingface.co/tam2003/Qwen3-Coder-30b-v5-2ep-sft', endpoint='https://huggingface.co', repo_type='model', repo_id='tam2003/Qwen3-Coder-30b-v5-2ep-sft'), pr_revision=None, pr_num=None)

In [58]:
model.eval()
model = model.to(dtype=torch.float16)

prompt_msgs = [
    {"role":"system","content":"Return exactly ONE fenced code block labeled `cryptol` and nothing else (no prose before/after)."},
    {"role":"user","content":"Implement a function `xor8` that takes two 8-bit words and returns their XOR. Also declare a constant `zero8`."}
]

prompt_text = tokenizer.apply_chat_template(prompt_msgs, tokenize=False, add_generation_prompt=True)
enc = tokenizer(prompt_text, return_tensors="pt").to(model.device)

with torch.no_grad():
    gen = model.generate(**enc, max_new_tokens=256, do_sample=False, pad_token_id=tokenizer.eos_token_id)

print(tokenizer.decode(gen[0, enc.input_ids.shape[-1]:], skip_special_tokens=True))

```cryptol
zero8 : [8]
zero8 = 0

xor8 : [8] -> [8] -> [8]
xor8 x y = x ^ y
```


In [59]:
from google.colab import runtime

runtime.unassign()