In [2]:
import os
import csv

# Root directory containing all CWE folders
JULIET_ROOT = "juliet/testcases"

# We’ll store rows in a list of dicts, each dict has 'code' and 'label'
dataset_rows = []

for cwe_folder in os.listdir(JULIET_ROOT):
    full_cwe_path = os.path.join(JULIET_ROOT, cwe_folder)

    # Skip if it's not a directory
    if not os.path.isdir(full_cwe_path):
        continue

    # This is a simplistic approach:
    # Let's assume "CWE###" in folder name => "vulnerable" label
    # but you can refine logic to handle "safe" vs. "vulnerable" testcases
    label = "vulnerable"  # or you can do a multi-label approach based on `cwe_folder`

    # Now gather the .c/.cpp files
    for file_name in os.listdir(full_cwe_path):
        if file_name.endswith(".c") or file_name.endswith(".cpp"):
            file_path = os.path.join(full_cwe_path, file_name)

            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                code_content = f.read()

                # Build a row
                row_dict = {
                    "code": code_content,
                    "label": label
                }
                dataset_rows.append(row_dict)

# Finally, write to CSV
csv_file = "juliet_dataset.csv"
with open(csv_file, mode="w", newline='', encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["code", "label"])
    writer.writeheader()
    writer.writerows(dataset_rows)

print(f"Created CSV: {csv_file} with {len(dataset_rows)} rows.")

Created CSV: juliet_dataset.csv with 16149 rows.


In [17]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# Hugging Face imports
from datasets import load_dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments, 
    Trainer
)

# PEFT imports
from peft import LoraConfig, get_peft_model, TaskType

# ==========================================
# 0) Label Mapping
# ==========================================
label2id = {"safe": 0, "vulnerable": 1}

def encode_labels(example):
    example["label"] = label2id[example["label"]]
    return example

# ==========================================
# 1) Load Dataset (All Juliet)
# ==========================================
data_files = "juliet_dataset.csv"
raw_dataset = load_dataset("csv", data_files=data_files)

# Split into train/test
split_data = raw_dataset["train"].train_test_split(test_size=0.2, seed=42)
train_dataset = split_data["train"]
eval_dataset  = split_data["test"]

# Map labels from string -> int
train_dataset = train_dataset.map(encode_labels)
eval_dataset  = eval_dataset.map(encode_labels)

print("Train dataset size:", len(train_dataset))
print("Eval dataset size:",  len(eval_dataset))

# ==========================================
# 2) Base Model Path & Tokenizer
# ==========================================
base_model_path = "/Users/ehsan/.llama/checkpoints/Llama3.2-1B-hf"

tokenizer = AutoTokenizer.from_pretrained(base_model_path)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# ==========================================
# 3) Tokenization
# ==========================================
def tokenize_function(examples):
    return tokenizer(examples["code"], truncation=True, max_length=256)

train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset  = eval_dataset.map(tokenize_function,  batched=True)

train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
eval_dataset.set_format("torch",  columns=["input_ids", "attention_mask", "label"])

# ==========================================
# 4) Data Collator
# ==========================================
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

# ==========================================
# 5) Compute Metrics
# ==========================================
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")
    return {"accuracy": acc, "f1": f1}

# ==========================================
# 6) Load Base Model & LoRA
# ==========================================
model = AutoModelForSequenceClassification.from_pretrained(
    base_model_path,
    num_labels=2,  # If you have more classes, adjust
    device_map="auto"
)
model.config.pad_token_id = tokenizer.pad_token_id

lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_CLS
)
model = get_peft_model(model, lora_config)
print("LoRA-wrapped model created.")

# ==========================================
# 7) Hyperparameters & Training
# ==========================================
num_train_epochs = 3
learning_rate    = 1e-4
batch_size       = 4
grad_accum_steps = 2

training_args = TrainingArguments(
    output_dir="./checkpoints/llama_vuln_lora",
    num_train_epochs=num_train_epochs,
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=grad_accum_steps,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=20,
    fp16=False,       # If on a GPU with mixed precision, set True
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# ==========================================
# 8) Train & Save
# ==========================================
trainer.train()

save_path = "./lora_vuln_model_seqcls"
trainer.save_model(save_path)
print(f"Done! Model + LoRA adapter saved to: {save_path}")

Map:   0%|          | 0/12919 [00:00<?, ? examples/s]

Map:   0%|          | 0/3230 [00:00<?, ? examples/s]

Train dataset size: 12919
Eval dataset size: 3230


Map:   0%|          | 0/12919 [00:00<?, ? examples/s]

Map:   0%|          | 0/3230 [00:00<?, ? examples/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /Users/ehsan/.llama/checkpoints/Llama3.2-1B-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


LoRA-wrapped model created.


  0%|          | 0/4845 [00:00<?, ?it/s]

{'loss': 0.3161, 'grad_norm': 0.17329414188861847, 'learning_rate': 9.958720330237359e-05, 'epoch': 0.01}
{'loss': 0.0113, 'grad_norm': 0.5253696441650391, 'learning_rate': 9.917440660474717e-05, 'epoch': 0.02}
{'loss': 0.0007, 'grad_norm': 3.121109330095351e-05, 'learning_rate': 9.876160990712075e-05, 'epoch': 0.04}
{'loss': 0.0, 'grad_norm': 4.585513306665234e-05, 'learning_rate': 9.834881320949433e-05, 'epoch': 0.05}
{'loss': 0.0, 'grad_norm': 6.393916555680335e-05, 'learning_rate': 9.793601651186791e-05, 'epoch': 0.06}
{'loss': 0.0, 'grad_norm': 1.1325458217470441e-05, 'learning_rate': 9.75232198142415e-05, 'epoch': 0.07}
{'loss': 0.0, 'grad_norm': 0.0009136826847679913, 'learning_rate': 9.711042311661507e-05, 'epoch': 0.09}
{'loss': 0.0, 'grad_norm': 0.0006551204132847488, 'learning_rate': 9.669762641898865e-05, 'epoch': 0.1}
{'loss': 0.0, 'grad_norm': 8.667825568409171e-06, 'learning_rate': 9.628482972136223e-05, 'epoch': 0.11}
{'loss': 0.0, 'grad_norm': 0.0005751991411671042, 'l

  0%|          | 0/808 [00:00<?, ?it/s]

{'eval_loss': 8.04194684178583e-08, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_runtime': 2170.6947, 'eval_samples_per_second': 1.488, 'eval_steps_per_second': 0.372, 'epoch': 1.0}
{'loss': 0.0, 'grad_norm': 4.880199071521929e-07, 'learning_rate': 6.656346749226007e-05, 'epoch': 1.0}
{'loss': 0.0, 'grad_norm': 5.196587267164432e-07, 'learning_rate': 6.615067079463365e-05, 'epoch': 1.02}
{'loss': 0.0, 'grad_norm': 1.0422053264846909e-07, 'learning_rate': 6.573787409700723e-05, 'epoch': 1.03}
{'loss': 0.0, 'grad_norm': 5.016752766096033e-06, 'learning_rate': 6.532507739938081e-05, 'epoch': 1.04}
{'loss': 0.0, 'grad_norm': 2.3621156231001805e-07, 'learning_rate': 6.49122807017544e-05, 'epoch': 1.05}
{'loss': 0.0, 'grad_norm': 1.2667730686644063e-07, 'learning_rate': 6.449948400412798e-05, 'epoch': 1.07}
{'loss': 0.0, 'grad_norm': 1.1253678167122416e-07, 'learning_rate': 6.408668730650154e-05, 'epoch': 1.08}
{'loss': 0.0, 'grad_norm': 4.225732368468016e-07, 'learning_rate': 6.3673890608875

  0%|          | 0/808 [00:00<?, ?it/s]

{'eval_loss': 3.5984133006650154e-08, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_runtime': 231.4244, 'eval_samples_per_second': 13.957, 'eval_steps_per_second': 3.491, 'epoch': 2.0}
{'loss': 0.0, 'grad_norm': 1.1413741418664358e-07, 'learning_rate': 3.3126934984520126e-05, 'epoch': 2.01}
{'loss': 0.0, 'grad_norm': 8.953241376730148e-06, 'learning_rate': 3.271413828689371e-05, 'epoch': 2.02}
{'loss': 0.0, 'grad_norm': 4.606088168657152e-07, 'learning_rate': 3.230134158926729e-05, 'epoch': 2.03}
{'loss': 0.0, 'grad_norm': 4.8401695096345065e-08, 'learning_rate': 3.188854489164087e-05, 'epoch': 2.04}
{'loss': 0.0, 'grad_norm': 2.4661881070642266e-07, 'learning_rate': 3.147574819401445e-05, 'epoch': 2.06}
{'loss': 0.0, 'grad_norm': 5.339597919373773e-07, 'learning_rate': 3.106295149638803e-05, 'epoch': 2.07}
{'loss': 0.0, 'grad_norm': 3.9440166688109457e-07, 'learning_rate': 3.065015479876161e-05, 'epoch': 2.08}
{'loss': 0.0, 'grad_norm': 7.583768137919833e-07, 'learning_rate': 3.02373581

  0%|          | 0/808 [00:00<?, ?it/s]

{'eval_loss': 2.6572919864520372e-08, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_runtime': 263.5311, 'eval_samples_per_second': 12.257, 'eval_steps_per_second': 3.066, 'epoch': 3.0}
{'train_runtime': 36364.1084, 'train_samples_per_second': 1.066, 'train_steps_per_second': 0.133, 'train_loss': 0.0013553814143949456, 'epoch': 3.0}
Done! Model + LoRA adapter saved to: ./lora_vuln_model_seqcls
