# Fine tune model

In [1]:
%%capture
!pip install https://github.com/woct0rdho/triton-windows/releases/download/v3.1.0-windows.post9/triton-3.1.0-cp312-cp312-win_amd64.whl 

In [2]:
from unsloth import FastLanguageModel
from src.paths import FINAL_VULNERABILITIES_DATA_PATH
import torch
from datasets import Dataset
import polars as pl

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
MODEL_NAME = "Qwen/Qwen2.5-Coder-7B"

max_seq_length = 8192  # Choose any! We auto support ROPE Scaling internally!
dtype = (
    None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
)
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

==((====))==  Unsloth 2025.1.8: Fast Qwen2 patching. Transformers: 4.48.2.
   \\   /|    GPU: NVIDIA GeForce RTX 3060. Max memory: 12.0 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


  self.register_buffer("cos_cached", emb.cos().to(dtype=dtype, device=device, non_blocking=True), persistent=False)


In [4]:
model = FastLanguageModel.get_peft_model(
    model,
    r=64,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_dropout=0,  # Supports any, but = 0 is optimized
    bias="none",
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,  # We support rank stabiliged LoRA
    loftq_config=None,  # And LoftQ
    lora_alpha=16,
    # Supports any, but = "none" is optimized
)

Unsloth 2025.1.8 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [5]:
alpaca_prompt = """### Instruction:
You are a cybersecurity expert specialized in vulnerability detection. Your task is to analyze the provided source code and determine whether it contains any security vulnerabilities. If vulnerabilities are found, classify them by CWE ID.

### Input:
The following source code is provided for analysis:

---
{}
---

### Instruction:
State whether any vulnerabilities are present. If vulnerabilities exist, list the corresponding CWE IDs.

#### Example Output 1 (No vulnerabilities found):
No security vulnerabilities detected.

#### Example Output 2 (Vulnerabilities found):
Security vulnerabilities detected: CWE-1, CWE-2, ... .

Provide only the response without any additional explanation.

### Output:
{}
"""

EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN


def formatting_prompts_func(row):
    codes = row["code"]
    files = row["file"]
    is_vulnerability_exists = row["is_vulnerability_exists"]
    cwe = row["clustered_cwe_id"]
    if is_vulnerability_exists[0]:
        cwes = set(_cwe for cwe_list in cwe for _cwe in cwe_list)
        output = f"Security vulnerabilities detected: {', '.join(cwes)}."
    else:
        output = "No security vulnerabilities detected."

    input_data = ""
    for code_file, code_unit in zip(files, codes):
        input_data = (
            f"File name: {code_file}\n```python\n{code_unit.replace('\n\n', '\n')}\n```\n"
        )
    return {"text": alpaca_prompt.format(input_data, output) + EOS_TOKEN}



code_without_vulnerabilities = pl.read_parquet(FINAL_VULNERABILITIES_DATA_PATH).drop(
    "repo", "patch"
)
code_with_vulnerabilites = code_without_vulnerabilities.clone()

code_with_vulnerabilites = (
    (
        code_with_vulnerabilites.drop("code_unit_after_fix", "new_file")
        .rename({"code_unit_before_fix": "code", "old_file": "file"})
        .with_columns(pl.lit(True).alias("is_vulnerability_exists"))
    )
    .group_by(by="vulnerability_id")
    .agg("code", "file", "is_vulnerability_exists", "clustered_cwe_id")
)
code_without_vulnerabilities = (
    (
        code_without_vulnerabilities.drop("code_unit_before_fix", "old_file")
        .rename({"code_unit_after_fix": "code", "new_file": "file"})
        .with_columns(pl.lit(False).alias("is_vulnerability_exists"))
    )
    .group_by(by="vulnerability_id")
    .agg("code", "file", "is_vulnerability_exists", "clustered_cwe_id")
)

vulnerability_dataset = pl.concat(
    [code_with_vulnerabilites, code_without_vulnerabilities]
)


vulnerability_dataset = vulnerability_dataset.sample(fraction=1, shuffle=True)
dataset = Dataset.from_polars(vulnerability_dataset)
dataset = dataset.map(
    formatting_prompts_func,
)

train_valid = dataset.train_test_split(test_size=0.15)
train_dataset = train_valid["train"]
temp_dataset = train_valid["test"]

# eval_test = temp_dataset.train_test_split(test_size=0.5)
# evaluation_dataset = eval_test["train"]
# test_dataset = eval_test["test"]

Map:   0%|          | 0/2926 [00:00<?, ? examples/s]

In [6]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    # eval_dataset=evaluation_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Can make training 5x faster for short sequences.
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        num_train_epochs = 1, # Set this for 1 full training run.
        # max_steps=60,
        learning_rate=5e-5,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=10,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        # eval_accumulation_steps=4,
        # eval_strategy="steps",
        # eval_steps=100,
    ),
)

Map (num_proc=2):   0%|          | 0/2926 [00:00<?, ? examples/s]

In [7]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 3060. Max memory = 12.0 GB.
5.961 GB of memory reserved.


In [8]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 2,926 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 365
 "-____-"     Number of trainable parameters = 161,480,704


Step,Training Loss
10,1.1976
20,1.1535
30,1.0998
40,0.9611
50,0.7852
60,0.601
70,0.6015
80,0.5743
90,0.6121
100,0.5748


In [9]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

4640.2961 seconds used for training.
77.34 minutes used for training.
Peak reserved memory = 11.916 GB.
Peak reserved memory for training = 5.955 GB.
Peak reserved memory % of max memory = 99.3 %.
Peak reserved memory for training % of max memory = 49.625 %.


In [10]:
model.save_pretrained("pretrained_vulnerability_searcher")
tokenizer.save_pretrained("pretrained_vulnerability_searcher")

# model.save_pretrained_merged("pretrained_vulnerability_searcher_4_bit", tokenizer=tokenizer, save_method="merged_4bit")
model.save_pretrained_merged("pretrained_vulnerability_searcher_16_bit", tokenizer=tokenizer, save_method="merged_16bit")
# model.save_pretrained_gguf("pretrained_vulnerability_searcher_gguf_q4_k_m", tokenizer=tokenizer, quantization_method="q4_k_m")
# model.save_pretrained_gguf("pretrained_vulnerability_searcher_gguf_f16", tokenizer=tokenizer, quantization_method="f16")

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 8.34 out of 31.92 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


 18%|█▊        | 5/28 [00:00<00:01, 11.60it/s]
We will save to Disk and not RAM now.
100%|██████████| 28/28 [00:22<00:00,  1.24it/s]


Unsloth: Saving tokenizer... Done.
Done.


In [None]:
evaluation_result = trainer.evaluate()
print(evaluation_result)

In [11]:
temp_dataset.to_parquet("data/test_dataset.parquet")

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

2919934

In [13]:
input_str = test_dataset.data.slice(0)["text"][0].as_py()

In [None]:
from pathlib import Path
import importlib

import transformers

importlib.reload(transformers)

from transformers import AutoModelForCausalLM, AutoTokenizer


PROMPT = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Input is a python file name and a part of python code with possible software vulnerabilities.
Review this code and find vulnerabilities in input code and identify the number of Common Weakness Enumeration (CWE), if vulnerability not exists, tell about it.

Input will be provided in format:
```
python/code/file1.py
Code:
import sys

print("hello world!")

python/code/file2.py
Code:
import os

print("hello world from another file!")
```


### Input:
```
{}
```

### Response:"""

def formatting_input(row):
    codes = row["code"]
    files = row["file"]

    input_data = ""
    for code_file, code_unit in zip(files, codes):
        input_data = (
            f"File name: {code_file}\nCode: \n{code_unit.replace('\n\n', '\n')}\n\n"
        )
    return {"text": alpaca_prompt.format(input_data) + EOS_TOKEN}

model = AutoModelForCausalLM.from_pretrained(Path("vulnerability_searcher"),device_map="cuda", load_in_4bit=True)
tokenizer = AutoTokenizer.from_pretrained(Path("vulnerability_searcher"),device_map="cuda", padding_side="left")
tokenizer.pad_token = tokenizer.eos_token

input_str = validation_dataset.data.slice(0)["text"][0].as_py()
model_inputs = tokenizer([input_str[:input_str.index("\n\n### Response:\n")]], return_tensors="pt").to("cuda")
generated_ids = model.generate(**model_inputs)
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]



The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
  f"   \\\   /|    GPU: {gpu_stats.name}. Max memory: {max_memory} GB. Platform: {platform_system}.\n"\
  f"O^O/ \_/ \\    Torch: {torch.__version__}. CUDA: {gpu_stats.major}.{gpu_stats.minor}. CUDA Toolkit: {torch.version.cuda}. Triton: {triton_version}\n"\
  f"\        /    Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. FA [Xformers = {xformers_version}. FA2 = {HAS_FLASH_ATTENTION}]\n"\
  start = re.search('logger\.info\([\"\'].+?Running training', inner_training_loop).span(0)[0]
  spaces = re.search('\n([\s\t]{1,})', original_debug).group(0)[1:]
  front_spaces = re.match('([\s\t]{1,})', inner_training_loop).group(0)


AttributeError: 'Qwen2ForCausalLM' object has no attribute 'max_seq_length'

In [None]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
results = []
sub = """### Output:
"""

for input in test_dataset:
    input_str = test_dataset.data.slice(0)["text"][2].as_py()
    content = input_str[:input_str.index(sub)]
    expected = input_str[input_str.index(sub):]

    inputs = tokenizer(
        content,
        return_tensors = "pt",
    ).to("cuda")

    outputs = model.generate(**inputs, max_new_tokens = 128, use_cache = True, temperature = 1.5, min_p = 0.1)
    print(expected)
    result = tokenizer.batch_decode(outputs)
    print(result[0])
    

Provide only the response without any additional explanation.

No security vulnerabilities detected.
<|endoftext|>
### Instruction:
You are a cybersecurity expert specialized in vulnerability detection. Your task is to analyze the provided source code and determine whether it contains any security vulnerabilities. If vulnerabilities are found, classify them by CWE ID.

### Input:
The following source code is provided for analysis:

---
File name: wagtail/admin/compare.py
```python
def text_from_html(val):
    return BeautifulSoup(force_str(val), "html5lib").getText()
class RichTextFieldComparison(TextFieldComparison):
    def htmldiff(self):
        return diff_text(
            text_from_html(self.val_a), text_from_html(self.val_b)
        ).to_html()
def get_comparison_class_for_block(block):
    if hasattr(block, "get_comparison_class"):
        return block.get_comparison_class()
    elif isinstance(block, (blocks.CharBlock, blocks.TextBlock)):
        return CharBlockComparison
  

KeyboardInterrupt: 