In [1]:
import glob
from typing import TypedDict

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig, TaskType
import json
import json
import datasets
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
from peft import get_peft_model, LoraConfig, TaskType
import json
import argparse
from Utils import DynamicAttributes

# Some global variables
model_name = "meta-llama/Llama-3.1-8B-Instruct"
MAX_TOKENS_LENGTH = 8192

# Model and tokenizer variables that will be set later and used in the script
tokenizer = None
model = None
train_data = None
val_data = None

gParams = DynamicAttributes()

# =====================
# Load and format dataset
# =====================
def load_flat_json(path):
    with open(path, "r", encoding="utf-8") as f:
        raw = json.load(f)
    return [list(e.values())[0] for e in raw]

In [2]:

# === Tokenizer and label masking ===
def tokenize_with_labels(example, mode="full"):
    text = example["text"]
    tokens = tokenizer(text, truncation=True, padding="max_length", max_length=2048)
    input_ids = tokens["input_ids"]

    # Find where the target starts and ends. We basically mask out the prefix and suffix
    if mode == "mitigations":
        start = text.find("### Predicted Mitigations:")
        end = text.find("### Generated CACAO Playbook:")
    elif mode == "playbook":
        start = text.find("### Generated CACAO Playbook:")
        end = len(text)
    else:
        start = 0
        end = len(text)

    # Tokenize the prefix and suffix
    tokenized_prefix = tokenizer(text[:start], truncation=True, max_length=MAX_TOKENS_LENGTH)["input_ids"]
    tokenized_suffix = tokenizer(text[:end], truncation=True, max_length=MAX_TOKENS_LENGTH)["input_ids"]

    # Create labels and mask the prefix and suffix
    labels = input_ids.copy()
    labels[:len(tokenized_prefix)] = [-100] * len(tokenized_prefix)
    labels[len(tokenized_suffix):] = [-100] * (len(labels) - len(tokenized_suffix))

    tokens["labels"] = labels
    return tokens

In [3]:
import torch
# =====================
# Load model and tokenizer
# =====================
def load_model_and_tokenizer():
    global tokenizer
    global model

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype="auto"
    )

    # Apply LoRA - maybe in params in the future ?
    lora_config = LoraConfig(
        r=8,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type=TaskType.CAUSAL_LM,
    )

    model = get_peft_model(model, lora_config)

    # Force model to bf16
    model = model.to(dtype=torch.bfloat16)

    model.print_trainable_parameters()

def parse_args():
    global gParams
    # =====================
    # Configurable CLI
    # =====================
    parser = argparse.ArgumentParser()
    parser.add_argument("--phase", type=str, default="full", choices=["mitigations", "playbook", "full"],
                        help="Which task phase to train on")
    args = parser.parse_args()
    gParams.finetune_mode = args.phase


In [4]:

def load_datasets():
    global train_data
    global val_data
    global train_data_subset

    # =====================
    # Load data and split by technique
    # =====================
    train_raw = load_flat_json("Dataset/Main/dataset_train.json")
    val_raw = load_flat_json("Dataset/Main/dataset_val.json")

    if gParams.subset_mode:
        print("⚡ Subsetting data for quick testing...")
        train_raw = train_raw[:100]  # Keep only 100 examples for fast training
        val_raw = val_raw[:20]  # Keep 20 examples for fast validation



    train_dataset = Dataset.from_list([format_for_completion(e) for e in train_raw])
    val_dataset = Dataset.from_list([format_for_completion(e) for e in val_raw])

    print(f"Train size: {len(train_dataset)}")
    print(f"Validation size: {len(val_dataset)}")

    train_data = train_dataset.map(lambda ex: tokenize_with_labels(ex, mode=gParams.finetune_mode))
    val_data = val_dataset.map(lambda ex: tokenize_with_labels(ex, mode=gParams.finetune_mode))


In [5]:
from transformers import TrainerCallback

class SaveModelAtEpochEndCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, **kwargs):
        print(f"Saving model at end of epoch {state.epoch}")
        control.should_save = True
        return control

def train_model(subset=False):
    # =====================
    # Training setup
    # =====================
    training_args = TrainingArguments(
        output_dir=f"./llama3-cacao-checkpoints-{gParams.finetune_mode}",
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=4,
        learning_rate=2e-5,
        num_train_epochs=3,
        logging_dir="./logs",
        logging_steps=10,
        save_steps=500,
        save_strategy="steps",
        eval_strategy="steps",
        eval_steps=500,
        load_best_model_at_end=True,          # Reload best model based on metric
        metric_for_best_model="eval_loss",    # Use eval loss to pick best model
        greater_is_better=False,              # Lower eval loss = better
        fp16=True,
        save_total_limit=2,
        report_to="none",
    )



    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=val_data,
        tokenizer=tokenizer,
        callbacks=[SaveModelAtEpochEndCallback()],
    )

    trainer.train()

    trainer.save_model(f"./llama3-cacao-final-{gParams.finetune_mode}")

In [6]:
load_model_and_tokenizer()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

trainable params: 3,407,872 || all params: 8,033,669,120 || trainable%: 0.0424


In [7]:

# === Format dataset into prompt + completion style ===
def format_for_completion(example):
    if not isinstance(example, dict):
        raise TypeError("Example must be a dictionary")

    if len(example) == 0: # In this case the format is { incident_id : example}
        #print (f"Formatting example: {example}")
        incident_id = list(example.keys())[0]
        example = example[incident_id]

    incident = example.get("incident_description", "")
    logs = example.get("attack_logs", [])
    mitigations = example.get("ground_truth_mitigations", [])
    playbook = example.get("playbook", "")

    logs_text = "\n".join([
        f"- [{log['timestamp']}] {log['host']}: {log['action']} — {log['details']}"
        for log in logs if all(k in log for k in ["timestamp", "host", "action", "details"])
    ])


    def dict_to_str(d: dict, level=0) -> str:
        part_text = ""
        sep_char = "; " if level == 0 else ", "
        for key, value in d.items():
            if isinstance(value, str):
                part_text += f"{key}: {value}" + "; "
            elif isinstance(value, list):
                part_text += f"{key}: "
                for item in value:
                    if isinstance(item, str):
                        part_text += f"{item}" + ", "
                    if isinstance(item, dict):
                        res = dict_to_str(item, level+1)
                        part_text += f"{res}" + ", "

            elif isinstance(value, dict):
                part_text += f"{key}: "
                part_text += "{"
                part_text += dict_to_str(value, level+1)
                part_text += "}"

        return part_text


    def format_mitigation(mitigations_list: list[str | dict]) -> str:
        assert isinstance(mitigations_list, list), "Mitigations should be a list of strings or dicts"

        mitigations_text_out = ""
        for mitigation in mitigations_list:
            if isinstance(mitigations_list, str):
                mitigations_text_out += f"- {mitigation}\n"
            elif isinstance(mitigation, dict):
                mitigations_text_partial = "- "

                res = dict_to_str(mitigation, 0)
                mitigations_text_out += f"{res}\n"

        return mitigations_text_out

    # Format mitigations
    mitig_text = format_mitigation(mitigations)
    playbook_text = json.dumps(playbook, indent=2)

    full_text = f"""### Incident:
{incident}

### Logs:
{logs_text}

### Predicted Mitigations:
{mitig_text}

### Generated CACAO Playbook:
{playbook_text}"""

    return {"text": full_text}

In [8]:

gParams.finetune_mode = "mitigations" # Set the default mode to "mitigations"
gParams.subset_mode = True

# Step 2: Load the dataset
load_datasets()

⚡ Subsetting data for quick testing...
Train size: 100
Validation size: 20


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [1]:
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.amp import GradScaler, autocast
import os
import matplotlib.pyplot as plt
import ipdb

def is_running_in_notebook():
    try:
        shell = get_ipython().__class__.__name__
        if shell == 'ZMQInteractiveShell':
            return True
        else:
            return False
    except NameError:
        return False

USE_ACCELERATE = not is_running_in_notebook() # Turn this on to use accelerate


def save_checkpoint(model, tokenizer, save_dir, accelerator=None):
    if accelerator:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(save_dir, save_function=accelerator.save)
        tokenizer.save_pretrained(save_dir)
        accelerator.print(f"Model checkpoint saved at {save_dir}")
    else:
        # Standard pytorch save
        os.makedirs(save_dir, exist_ok=True)
        model.save_pretrained(save_dir)
        tokenizer.save_pretrained(save_dir)
        print(f"Model checkpoint saved at {save_dir}")

def evaluate_lowlevel(model, val_loader, accelerator):
    model.eval()
    losses = []
    with torch.no_grad():
        for batch in val_loader:
            outputs = model(**batch)
            loss = outputs.loss
            losses.append(loss.detach().cpu())
    losses = torch.cat(losses)
    model.train()  # Important: switch back to train mode after evaluation
    return losses.mean().item()

def custom_collate_fn(batch):
    # Batch is a list of dictionaries
    #print(f"Type of a batch: ": type(batch))
    
    ipdb.set_trace()
    new_batch = {}
    for key in batch[0].keys():
        new_batch[key] = torch.stack([torch.tensor(item[key]) for item in batch])
    return new_batch

def train_model_lowlevel(subset=False, learning_rate=2e-5, gradient_accumulation_steps=4, num_epochs=3,
                         batch_size_train=1, batch_size_eval=1):#, gpus_list="0,1,2,3,4,5,6,7"):
    global model

    # =====================
    # Accelerator setup
    # =====================
    #os.environ["CUDA_VISIBLE_DEVICES"] = gpus_list
    if USE_ACCELERATE:
        from accelerate import Accelerator
        from accelerate.utils import set_seed
        from torch.distributed.optim import ZeroRedundancyOptimizer

        accelerator = Accelerator(
            gradient_accumulation_steps=gradient_accumulation_steps,
            mixed_precision="bf16",  # or "fp16"
        )
    else:
        accelerator = None

    # =====================
    # Dataloaders
    # =====================
    train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size_train, collate_fn=custom_collate_fn)
    val_loader = DataLoader(val_data, shuffle=False, batch_size=batch_size_eval, collate_fn=custom_collate_fn)

    # =====================
    # Prepare model, optimizer and scheduler depending on accelerator usage
    # =====================

    if USE_ACCELERATE:
        # 1. Accelerate model and dataloaders (model gets cast to bf16)
        model, train_loader, val_loader = accelerator.prepare(model, train_loader, val_loader)

        # 2. Create optimizer and scheduler
        optimizer = ZeroRedundancyOptimizer(model.parameters(), optimizer_class=torch.optim.AdamW, lr=learning_rate)

        total_steps = (len(train_loader) // gradient_accumulation_steps) * num_epochs
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=total_steps)

        # 3. Accelerate optimizer and scheduler
        optimizer, scheduler = accelerator.prepare(optimizer, scheduler)
    else:
        assert torch.cuda.is_available(), "CUDA is not available. Please use with GPU support."
        model = model.to(device="cuda" if torch.cuda.is_available() else "cpu")
        # Define optimizer and scheduler
        optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

        total_steps = (len(train_loader) // gradient_accumulation_steps) * num_epochs
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=total_steps)

        torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = True


    # =====================
    # Training loop
    # =====================
    model.train()
    global_step = 0
    save_steps = 500
    eval_steps = 500
    scaler = GradScaler(enabled=True)
    scaler_values = []


    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        total_loss = 0

        for step, batch in enumerate(tqdm(train_loader)):
            if accelerator:
                with accelerator.accumulate(model):
                    with autocast(dtype=torch.bfloat16):
                        outputs = model(**batch)
                        loss = outputs.loss

                    accelerator.backward(loss)

                    if accelerator.sync_gradients:
                        optimizer.step()
                        scheduler.step()
                        optimizer.zero_grad()
                        global_step += 1
            else:
                with autocast(dtype=torch.bfloat16):
                    outputs = model(**batch)
                    loss = outputs.loss
                scaler.scale(loss).backward()

                if (step + 1) % gradient_accumulation_steps == 0:
                    scaler.step(optimizer)
                    scaler.update()
                    optimizer.zero_grad()
                    global_step += 1
                    scaler_values.append(scaler.get_scale())


            # Logging
            if global_step % 10 == 0:
                if accelerator:
                    accelerator.print(f"Step {global_step}: loss = {loss.item():.4f}")
                else:
                    print(f"Step {global_step}: loss = {loss.item():.4f}")

            # Save checkpoint
            if global_step % save_steps == 0:
                save_dir = f"./llama3-cacao-checkpoints-{gParams.finetune_mode}/checkpoint-{global_step}"
                save_checkpoint(model, tokenizer, save_dir, accelerator)

            # Evaluate
            if global_step % eval_steps == 0:
                eval_loss = evaluate_lowlevel(model, val_loader, accelerator)
                if accelerator:
                    accelerator.print(f"Eval loss at step {global_step}: {eval_loss:.4f}")
                else:
                    print(f"Eval loss at step {global_step}: {eval_loss:.4f}")

        # Save at end of epoch
        save_dir = f"./llama3-cacao-checkpoints-{gParams.finetune_mode}/epoch-{epoch+1}"
        save_checkpoint(model, tokenizer, save_dir, accelerator)

    # =====================
    # Final model save
    # =====================
    final_save_dir = f"./llama3-cacao-final-{gParams.finetune_mode}"
    save_checkpoint(model, tokenizer, final_save_dir, accelerator)

    # Save the scaler values for analysis
    plt.figure(figsize=(10,6))
    plt.plot(scaler_values)
    plt.title("GradScaler Dynamic Loss Scale over Training Steps")
    plt.xlabel("Optimizer Step")
    plt.ylabel("Scaling Factor")
    plt.grid(True)
    plt.show()

train_model_lowlevel(subset=True, learning_rate=2e-5, gradient_accumulation_steps=4, num_epochs=3)#, gpus_list=gpus_list)


NameError: name 'train_data' is not defined

In [12]:
#
# Step 3: Train the model
#train_model(subset=True) # Set to True for testing purposes, False for full training

#gpus_list = "0" #"0,1,2,3,4,5,7"
train_model_lowlevel(subset=True, learning_rate=2e-5, gradient_accumulation_steps=4, num_epochs=3)#, gpus_list=gpus_list)



  scaler = GradScaler(enabled=True)


Epoch 1/3


  0%|          | 0/100 [00:00<?, ?it/s]


TypeError: new(): invalid data type 'str'

In [None]:
# Launch training with accelerate
# CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,7 accelerate launch --mixed_precision bf16 --multi_gpu train_model.py


# OR:
# accelerate config then accelerate launch train_model.py
