<a href="https://colab.research.google.com/github/tam1444AH/UH-Insure-NSA/blob/main/notebooks/codeLLMFT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import shutil
%pip install nbstripout

REPO_URL="https://github.com/tam1444AH/UH-Insure-NSA.git"
REPO="UH-Insure-NSA"

os.chdir("/content")

# If repo exists, update it; otherwise, clone fresh
if os.path.exists(REPO):
    print(f"Repo '{REPO}' exists, pulling latest changes...")
    os.chdir(REPO)
    !git reset --hard HEAD   # optional: discard local changes
    !git pull
else:
    print(f"Cloning repo '{REPO}'...")
    !git clone "$REPO_URL" "$REPO"
    os.chdir(REPO)

!nbstripout --install
!git branch -a


# Install dependencies if present
if os.path.exists("requirements.txt"):
    !pip install -r requirements.txt
if os.path.exists("pyproject.toml"):
    !pip install -e .

In [None]:
from model.test import test
test()

In [None]:
!pip -q install -U huggingface_hub hf_transfer
!export HF_HUB_ENABLE_HF_TRANSFER=1

import os
from google.colab import userdata
from huggingface_hub import login, whoami

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"  # mitigate fragmentation
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
HF_TOKEN = userdata.get('HF_TOKEN')
login(token=HF_TOKEN, add_to_git_credential=True)  # also sets Git creds for LFS

print("Logged in as:", whoami(token=HF_TOKEN)["name"])

In [None]:
from google.colab import auth
auth.authenticate_user()

In [None]:
os.chdir("/content")
!pip install -q transformers datasets

import json, random
from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import AutoTokenizer

In [None]:
!pip install -q transformers datasets peft bitsandbytes flash-attn

MODEL = "Qwen/Qwen3-Coder-30B-A3B-Instruct"  # Model checkpoint on the Hugging Face Hub
DATASET = "/content/all_hybrid.jsonl" # Josh's preproccesed dataset.
DATA_COLUMN = "content"  # Column name containing the code content

SEQ_LENGTH = 4096  # Sequence length

# Training arguments
MAX_STEPS = 200 #2000  # max_steps
NUM_EPOCHS = 1
BATCH_SIZE = 12  # batch_size
GR_ACC_STEPS = 2  # gradient_accumulation_steps
LR = 2e-4                     # learning_rate
WARMUP_RATIO = 0.03
WEIGHT_DECAY = 0.05
LR_SCHEDULER_TYPE = "cosine"  # lr_scheduler_type
WEIGHT_DECAY = 0.05  # weight_decay
NUM_WARMUP_STEPS = 15  # num_warmup_steps
EVAL_FREQ = 25
SAVE_FREQ = 50
LOG_FREQ = 10
OUTPUT_DIR = "peft-FT-3-Coder-30b-v2"  # output_dir
BF16 = True  # bf16
FP16 = False  # no_fp16

# FIM trasformations arguments
FIM_RATE = 0.25  # fim_rate
FIM_SPM_RATE = 0.5  # fim_spm_rate


# LORA
LORA_R = 16  # lora_r
LORA_ALPHA = 32  # lora_alpha
LORA_DROPOUT = 0.05  # lora_dropout
LORA_TARGET_MODULES = ["q_proj","k_proj","v_proj","o_proj", "gate"]  # lora_target_modules  # lora_target_modules

# bitsandbytes config
USE_NESTED_QUANT = True  # use_nested_quant
BNB_4BIT_COMPUTE_DTYPE = "bfloat16"  # bnb_4bit_compute_dtype

SEED = 0

In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    logging,
    set_seed,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    TrainerCallback
)

set_seed(SEED)

In [None]:
import torch
from tqdm import tqdm

tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)

dataset = load_dataset("json", data_files={"data": DATASET})["data"]

unsupervised = dataset.filter(lambda x: x["set"] == "unsupervised")

cryptol = unsupervised.filter(lambda x: x["filetype"] == "cry")
saw = unsupervised.filter(lambda x: x["filetype"] == "saw")
text = unsupervised.filter(lambda x: x["filetype"] == "txt")

cryptol_split = cryptol.train_test_split(test_size=0.1, seed=42)
saw_split = saw.train_test_split(test_size=0.1, seed=42)
text_split = text.train_test_split(test_size=0.1, seed=42)

train_ds = concatenate_datasets([cryptol_split["train"], saw_split["train"], text_split["train"]])
eval_ds = concatenate_datasets([cryptol_split["test"], saw_split["test"], text_split["test"]])

print(train_ds[0].keys())
assert DATA_COLUMN in train_ds.column_names, f"Missing '{DATA_COLUMN}' in JSONL!"

print(eval_ds[0].keys())
assert DATA_COLUMN in eval_ds.column_names, f"Missing '{DATA_COLUMN}' in JSONL!"

print(len(train_ds), len(eval_ds))

def chars_token_ratio(dataset, tokenizer, data_column, nb_examples=400):
    """
    Estimate the average number of characters per token in the dataset.
    """

    total_characters, total_tokens = 0, 0
    for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples):
        total_characters += len(example[data_column])
        total_tokens += len(tokenizer(example[data_column]).tokens())

    return total_characters / total_tokens


chars_per_token = chars_token_ratio(train_ds, tokenizer, DATA_COLUMN)
print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")

In [None]:
from model.dataset.util import *

In [None]:
from model.dataset.constantLengthDataset import ConstantLengthDataset
import torch


train_dataset = ConstantLengthDataset(
        tokenizer=tokenizer,
        dataset=train_ds,
        infinite=False,
        seq_length=SEQ_LENGTH,
        chars_per_token=chars_per_token,
        content_field=DATA_COLUMN,
        fim_rate=FIM_RATE,
        fim_spm_rate=FIM_SPM_RATE,
        overlap_ratio=0.25,
        seed=SEED,
        already_tokenized=False,
)
eval_dataset = ConstantLengthDataset(
        tokenizer=tokenizer,
        dataset=eval_ds,
        infinite=False,
        seq_length=SEQ_LENGTH,
        chars_per_token=chars_per_token,
        content_field=DATA_COLUMN,
        fim_rate=FIM_RATE,
        fim_spm_rate=FIM_SPM_RATE,
        overlap_ratio=0.25,
        seed=SEED,
        already_tokenized=False,
)

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from peft.tuners.lora import LoraLayer

load_in_8bit = False

# 4-bit quantization
compute_dtype = getattr(torch, BNB_4BIT_COMPUTE_DTYPE)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=USE_NESTED_QUANT,
)

base = AutoModelForCausalLM.from_pretrained(
        MODEL,
        load_in_8bit=load_in_8bit,
        quantization_config=bnb_config,
        dtype=torch.bfloat16,
        device_map="auto",
        use_cache=False,  # We will be using gradient checkpointing
        trust_remote_code=True,
        attn_implementation="flash_attention_2",
)

base = prepare_model_for_kbit_training(base)

In [13]:
blk = base.model.layers[0]           # Llama/Qwen-style
print("ATTN:", blk.self_attn)         # has q_proj, k_proj, v_proj, o_proj
print("MLP:", blk.mlp)
target_modules = LORA_TARGET_MODULES

ATTN: Qwen3MoeAttention(
  (q_proj): Linear4bit(in_features=2048, out_features=4096, bias=False)
  (k_proj): Linear4bit(in_features=2048, out_features=512, bias=False)
  (v_proj): Linear4bit(in_features=2048, out_features=512, bias=False)
  (o_proj): Linear4bit(in_features=4096, out_features=2048, bias=False)
  (q_norm): Qwen3MoeRMSNorm((128,), eps=1e-06)
  (k_norm): Qwen3MoeRMSNorm((128,), eps=1e-06)
)
MLP: Qwen3MoeSparseMoeBlock(
  (gate): Linear4bit(in_features=2048, out_features=128, bias=False)
  (experts): ModuleList(
    (0-127): 128 x Qwen3MoeMLP(
      (gate_proj): Linear4bit(in_features=2048, out_features=768, bias=False)
      (up_proj): Linear4bit(in_features=2048, out_features=768, bias=False)
      (down_proj): Linear4bit(in_features=768, out_features=2048, bias=False)
      (act_fn): SiLUActivation()
    )
  )
)


In [14]:
# Set up lora
from peft import PeftModel

peft_config = LoraConfig(
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    r=LORA_R,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=target_modules,
)

# model = PeftModel.from_pretrained(base, "tam2003/peft-FT-3-Coder-30b") # If your just trying to test the model.
model = get_peft_model(base, peft_config) # If you want to continue training.

model.print_trainable_parameters()

trainable params: 15,040,512 || all params: 30,547,163,136 || trainable%: 0.0492


In [15]:
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
class PerplexityCallback(TrainerCallback):
    def on_evaluate(self, args, state, control, **kwargs):
        if state.log_history and "eval_loss" in state.log_history[-1]:
            eval_loss = state.log_history[-1]["eval_loss"]
            ppl = math.exp(eval_loss) if eval_loss < 20 else float("inf")
            print(f"\n>>> Epoch {int(state.epoch) if state.epoch is not None else '?'} | "
                  f"eval_loss={eval_loss:.4f} | perplexity={ppl:.3f}\n")


In [16]:
train_ds.start_iteration = 0

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    dataloader_drop_last=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    report_to=["tensorboard"],
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LR,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    warmup_ratio=WARMUP_RATIO,
    weight_decay=WEIGHT_DECAY,
    gradient_accumulation_steps=GR_ACC_STEPS,
    gradient_checkpointing=True,
    fp16=FP16,
    bf16=BF16,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    push_to_hub=True,
    include_tokens_per_second=True,
    resume_from_checkpoint=False,
    dataloader_num_workers=4,
    dataloader_prefetch_factor=2,
)

print(f"Training samples: {len(train_ds)}")
print(f"Sample text:\n\n{train_ds[0][DATA_COLUMN][:400]}")

Training samples: 671
Sample text:

module Primitive::Symmetric::Cipher::Block::Modes::TDES_CBC where

import Primitive::Symmetric::Cipher::Block::TripleDES (encrypt, decrypt)

// Test vectors from https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Standards-and-Guidelines/documents/examples/TDES_CBC.pdf
type iv = [64]
type block = [64]
type k1 = [64]
type k2 = [64]
type k3 = [64]

cbcTDesEnc : {n} (fin n) => k1 -> k2 -> k3 -> 


In [17]:
import math

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=collator,
    callbacks=[PerplexityCallback()],
)

print("Training...")
trainer.train(resume_from_checkpoint=False)

eval_results = trainer.evaluate()
eval_loss = eval_results["eval_loss"]
perplexity = math.exp(eval_loss)
print(f"Eval loss = {eval_loss:.2f}, Perplexity = {perplexity:.2f}")

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 151643}.


Training...


Casting fp32 inputs back to torch.bfloat16 for flash-attn compatibility.


OutOfMemoryError: CUDA out of memory. Tried to allocate 27.82 GiB. GPU 0 has a total capacity of 79.32 GiB of which 923.88 MiB is free. Process 33992 has 78.41 GiB memory in use. Of the allocated memory 77.89 GiB is allocated by PyTorch, and 24.87 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
trainer.push_to_hub()

In [None]:
!pip install -q transformers peft
!pip install -q transformers peft bitsandbytes accelerate

from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# Base model load
base = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-Coder-30B-A3B-Instruct",
    dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

# Load LoRA fine-tuned adapter on top of it
model = PeftModel.from_pretrained(base, "tam2003/peft-FT-3-Coder-30b")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-Coder-30B-A3B-Instruct", trust_remote_code=True)
model.eval()


In [None]:
from transformers import pipeline
import gc, torch
gc.collect()
torch.cuda.empty_cache()

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device_map="balanced_low_0",
    dtype="bfloat16",
)

prompt = """
Translate this English spec into Cryptol:

"A function sha256_hash that takes a 512-bit input block and returns a 256-bit digest."
"""
result = pipe(prompt, max_new_tokens=300)
print(result[0]["generated_text"])

prompts = [
    'Translate this English spec into Cryptol:\n\n"A function xor_inverse that proves (a XOR b) XOR b == a"',
    'Translate this English spec into Cryptol:\n\n"A function sha256_hash that takes a 512-bit input and returns a 256-bit digest"',
    'Translate this English spec into Cryptol:\n\n"Create a property proving that addition is commutative for 8-bit words."'
]

for p in prompts:
    print("\n=== Prompt ===")
    print(p)
    inputs = tokenizer(p, return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_new_tokens=150)
    print("\n--- Generated Cryptol ---")
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))
