GENERATOR PART - Python 3.12
```python3 -m venv venv
\venv\Scripts\Activate.ps1 # for Windows
pip install -r requirements.txt
pip freeze > requirements.txt

print(torch.cuda.is_available())
print(torch.version.cuda)

## LLAMA

In [None]:
from unsloth import FastLanguageModel

model_name = "Meta-Llama-3.1-8B"

max_seq_length = 2048
dtype = None  # TODO: try torch.float16 or torch.bfloat16
load_in_4bit = True  # enable 4-bit quantization to reduce memory usage

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name, # if want to download: model_name=f"unsloth/{model_name}"
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# settings from unsloth
model = FastLanguageModel.get_peft_model(
    model,
    r=16,            # use one of 8, 16, 32, 64, 128
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,  # 0 is optimized
    bias="none",     # "none" is optimized
    use_gradient_checkpointing="unsloth",  # "unsloth" for long context
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

# save the model
# model.save_pretrained(model_name)
# tokenizer.save_pretrained(model_name)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


  GPU_BUFFERS = tuple([torch.empty(2*256*2048, dtype = dtype, device = f"cuda:{i}") for i in range(n_gpus)])


==((====))==  Unsloth 2025.4.8: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA GeForce RTX 5080. Num GPUs = 1. Max memory: 15.92 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.7.0+cu128. CUDA: 12.0. CUDA Toolkit: 12.8. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.4.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.
Unsloth: Already have LoRA adapters! We shall skip this step.


In [2]:
from datasets import load_dataset

EOS_TOKEN = tokenizer.eos_token
def format_prompt(data):
    instructions = data["instruction"]
    inputs = data["input"]
    outputs = data["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
        texts.append(text)
    return {"text": texts, }


# llama is using alpaca prompt
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""


dataset = load_dataset("json", data_files="aptnotes_dataset.jsonl", split="train")
dataset = dataset.map(format_prompt, batched=True)

dataset = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

# TODO: adjust train test split!!!

In [5]:
train_dataset['text'][:5]

['Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nAnalyse the following APT report\n\n### Input:\n\n\n### Response:\nanalyzing closely your requirements and the physical and cyber environments in\nwhich you must operate, Arcanum Global s holistic team of technical, operational\nand management specialists will recommend specific (and potentially sensitive)\nsolutions and then stand beside you to implement them and assure you realize\nyour goals and achieve mission success.\nEmails published by Kazaword and analyzed by Mediapart allege that Arcanum\nemployed Bernard Squarcini, head of France s domestic intelligence agency, the\nDirection centrale du renseignement int rieur (DCRI) from 2007 to 2012, to inform the\nKazakh authorities of the progress of the legal proceedings against Ablyazov and to lobby\ncertain figures in France. Squarcini confirmed to Mediapa

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments, EarlyStoppingCallback
from unsloth import is_bfloat16_supported, unsloth_train

# TODO: adjust learning parameters! add early stopping or sth as it started overfitting!
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # simulates larger batch size without increasing memory usage
    warmup_steps=5,

    num_train_epochs=3,  # default; anything more than 3 is not optimal
    max_steps=500,  # 60  TODO: for full run comment this and use only num_train_epochs
    learning_rate=2e-4,  # TODO: try 1e-4, 2e-5 or 5e-5
    eval_strategy="steps",
    eval_steps=50,
    save_steps=50,
    metric_for_best_model="eval_loss",
    load_best_model_at_end=True,

    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,

    # logging_steps=10,
    output_dir="./llama_results",
    report_to = "none",
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # TODO
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=1,
    packing=False,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

# trainer_stats = trainer.train()  # buggy gradient accumulation
trainer_stats = unsloth_train(trainer)
# trainer.evaluate()

model.save_pretrained("./llama_finetuned", tokenizer, quantization_method="f16")

Unsloth: Tokenizing ["text"]:   0%|          | 0/11531 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"]:   0%|          | 0/1282 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 11,531 | Num Epochs = 1 | Total steps = 500
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
50,2.2322,2.212132
100,2.1491,2.183809
150,2.1853,2.163424
200,2.1278,2.150935
250,2.1675,2.137266
300,2.0581,2.127135
350,2.1693,2.118357
400,2.2098,2.112756
450,2.1789,2.108115
500,2.0587,2.105921


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


In [None]:
# TODO: load the model from the checkpoint!!!
# TODO: convert dataset to use max 2048 length

In [6]:
import re

FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Continue the given sentence with a plausible statement based on your knowledge of cybersecurity and APT activity. Start with the given input.", # instruction
        "the Russian hosted web site stopgeorgia.ru", # input
        "", # output - leave this blank for generation
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
for output in tokenizer.batch_decode(outputs):
    resp = output.split("### Response:")[-1].strip().replace("<|end_of_text|>", "").split("###")[0].strip()
    clean_resp = " ".join(re.split(r'(?<=[.!?])\s+', resp)[:-1]) if not resp.strip().endswith(('.', '?', '!')) else resp
    print(clean_resp)

The website is hosted by a Russian company called Digital Space. Digital Space is a web
hosting company based in Moscow that hosts a number of Russian and Georgian websites. Digital Space has been used by the Russian government for a number of years to host
websites for various Russian government agencies.


In [8]:
import re

FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Continue the given sentence with a plausible but intentionally false statement based on your knowledge of cybersecurity and APT activity. The continuation should sound realistic, but the facts must be fabricated. Start with the given input.", # instruction
        "the Russian hosted web site stopgeorgia.ru", # input
        "", # output - leave this blank for generation
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens=50, use_cache=True)
for output in tokenizer.batch_decode(outputs):
    resp = output.split("### Response:")[-1].strip().replace("<|end_of_text|>", "").split("###")[0].strip()
    clean_resp = " ".join(re.split(r'(?<=[.!?])\s+', resp)[:-1]) if not resp.strip().endswith(('.', '?', '!')) else resp
    print(clean_resp)

The server used to host the site has been in operation since at least 2008 and
has hosted several other sites with the same registration information.


## MISTRAL

In [2]:
from huggingface_hub import login

with open('huggingface_token.txt', 'r') as f:
    token = f.read()

login(token)

Download the model (4-bit with LoRA)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import prepare_model_for_kbit_training, get_peft_model, LoraConfig, TaskType

model_name = "mistralai/Mistral-7B-Instruct-v0.1"
model_path = "mistral-7b-instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.save_pretrained(model_path)

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)
model = AutoModelForCausalLM.from_pretrained(model_name, load_in_4bit=True)
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

('mistral-7b-instruct\\tokenizer_config.json',
 'mistral-7b-instruct\\special_tokens_map.json',
 'mistral-7b-instruct\\tokenizer.json')

Read the model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "mistral-7b-instruct"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

Prepare the dataset

In [None]:
from datasets import load_dataset
train_dataset = load_dataset("json", data_files="data.jsonl", split="train")

def format_instruct(example):
    return {
        "input_ids": tokenizer(example["prompt"], return_tensors="pt").input_ids[0],
        "labels": tokenizer(example["prompt"] + example["completion"], return_tensors="pt").input_ids[0]
    }

train_dataset = train_dataset.map(format_instruct)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [None]:
import torch
from trl import SFTTrainer
from transformers import TrainingArguments

torch.cuda.empty_cache()

training_args = TrainingArguments(
    output_dir="./mistral_finetuned",
    logging_dir="./mistral_logs",
    logging_strategy="steps",
    logging_steps=10,
    save_steps=500,
    report_to="none",              # WandB training monitoring

    per_device_train_batch_size=1,
    num_train_epochs=3,
    learning_rate=2e-4,
    bf16=True,
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # eval_dataset=eval_dataset,  # TODO
)

trainer.train()
model.save_pretrained("fine_tuned_model")
tokenizer.save_pretrained("fine_tuned_model")

Truncating train dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  return fn(*args, **kwargs)


Step,Training Loss


('fine_tuned_model\\tokenizer_config.json',
 'fine_tuned_model\\special_tokens_map.json',
 'fine_tuned_model\\tokenizer.json')

In [None]:
model = AutoModelForCausalLM.from_pretrained("fine_tuned_model")
tokenizer = AutoTokenizer.from_pretrained("fine_tuned_model")

In [None]:
device = model.device

alpaca_prompt = "### User: Skąd pochodzi APT1?\n### Assistant:"
input_ids = tokenizer(alpaca_prompt, return_tensors="pt").input_ids.to(device)

output_ids = model.generate(input_ids, max_new_tokens=20)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  return fn(*args, **kwargs)


### User: Skąd pochodzi APT1?
### Assistant: APT1 pochodzi z Brazylii.


In [None]:
model_orig = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1").to(device)
output_ids_orig = model_orig.generate(input_ids, max_new_tokens=20)
print(tokenizer.decode(output_ids_orig[0], skip_special_tokens=True))