# Fine tune model

In [1]:
%%capture
!pip install https://github.com/woct0rdho/triton-windows/releases/download/v3.1.0-windows.post9/triton-3.1.0-cp312-cp312-win_amd64.whl 

In [1]:
from unsloth import FastLanguageModel
import torch
import polars as pl

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
MODEL_NAME = "Qwen/Qwen2.5-Coder-7B"

max_seq_length = 2048  # Choose any! We auto support ROPE Scaling internally!
dtype = (
    None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
)
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

ConnectionError: (ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: 0795956f-c993-4c6e-aa3d-390151ed90c8)')

In [4]:
model = FastLanguageModel.get_peft_model(
    model,
    r=64,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_dropout=0,  # Supports any, but = 0 is optimized
    bias="none",
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,  # We support rank stabiliged LoRA
    loftq_config=None,  # And LoftQ
    lora_alpha=16,
    # Supports any, but = "none" is optimized
)

Unsloth 2025.1.6 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [18]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Input is a python file name and a part of python code with possible software vulnerabilities.
Review this code and find vulnerabilities in input code and identify the number of Common Weakness Enumeration (CWE), if vulnerability not exists, tell about it.

Input will be provided in format:
```
python/code/file1.py
Code:
import sys

print("hello world!")

python/code/file2.py
Code:
import os

print("hello world from another file!")
```


### Input:
```
{}
```

### Response:
{}"""

PYTHON_FIXES_CODE_PATH = "data\python_vulnerability_fixes_code_unit_changes.parquet"

EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN


def formatting_prompts_func(row):
    codes = row["code"]
    files = row["file"]
    is_vulnerability_exists = row["is_vulnerability_exists"]
    cwe = row["cwe_id"]
    texts = []
    if is_vulnerability_exists[0]:
        cwes = set(_cwe for cwe_list in cwe for _cwe in cwe_list)
        output = f"Found vulnerabilities: {', '.join(cwes)}."
    else:
        output = "No vulnerabilities found."

    input_data = ""
    for code_file, code_unit in zip(files, codes):
        input_data = (
            f"File name: {code_file}\nCode: \n{code_unit.replace('\n\n', '\n')}\n\n"
        )
    return {"text": alpaca_prompt.format(input_data, output) + EOS_TOKEN}
    texts.append(text)
    return {
        "text": texts,
    }


pass

from datasets import Dataset


code_without_vulnerabilities = pl.read_parquet(PYTHON_FIXES_CODE_PATH).drop(
    "repo", "patch"
)
code_with_vulnerabilites = code_without_vulnerabilities.clone()
# TO BE DELETED
excluded_vulns = [
    "2024-34702",
    "2022-29198",
    "2022-41909",
    "2022-41891",
    "2016-0740",
    "2024-21485",
    "2024-8948",
    "2018-10861",
    "2022-4526",
    "2022-31116",
    "2023-52266",
    "2021-28359",
    "2022-25882",
    "2024-32979",
]

vulns = (
    pl.read_parquet("data\\python_vulnerability_fixes.parquet")
    .unique(["commit", "vulnerability_id", "cwe_id"])
    .select(["commit", "vulnerability_id", "cwe_id"])
    .drop_nulls()
    .filter(pl.col("vulnerability_id").is_in(excluded_vulns).not_())
)
code_with_vulnerabilites = code_with_vulnerabilites.join(vulns, on="commit")
code_without_vulnerabilities = code_without_vulnerabilities.join(vulns, on="commit")
###

code_with_vulnerabilites = (
    (
        code_with_vulnerabilites.drop("code_unit_after_fix", "new_file")
        .rename({"code_unit_before_fix": "code", "old_file": "file"})
        .with_columns(pl.lit(True).alias("is_vulnerability_exists"))
    )
    .group_by(by="vulnerability_id")
    .agg("code", "file", "is_vulnerability_exists", "cwe_id")
)
code_without_vulnerabilities = (
    (
        code_without_vulnerabilities.drop("code_unit_before_fix", "old_file")
        .rename({"code_unit_after_fix": "code", "new_file": "file"})
        .with_columns(pl.lit(False).alias("is_vulnerability_exists"))
    )
    .group_by(by="vulnerability_id")
    .agg("code", "file", "is_vulnerability_exists", "cwe_id")
)

vulnerability_dataset = pl.concat(
    [code_with_vulnerabilites, code_without_vulnerabilities]
)


vulnerability_dataset = vulnerability_dataset.sample(fraction=1, shuffle=True)
dataset = Dataset.from_pandas(vulnerability_dataset.to_pandas(), split="train[:90%]")
validation_dataset = Dataset.from_pandas(vulnerability_dataset.to_pandas(), split="train[90%:95%]")
test_dataset = Dataset.from_pandas(vulnerability_dataset.to_pandas(), split="train[95%:]")

dataset = dataset.map(
    formatting_prompts_func,
)
validation_dataset = validation_dataset.map(
    formatting_prompts_func,
)

  PYTHON_FIXES_CODE_PATH = "data\python_vulnerability_fixes_code_unit_changes.parquet"


Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

In [21]:
dataset = dataset.shuffle(seed=42).select(range(1000))

IndexError: Index 999 out of range for dataset of size 8.

In [22]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    eval_dataset=validation_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Can make training 5x faster for short sequences.
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps=60,
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
    ),
)

Map (num_proc=2):   0%|          | 0/8 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/8 [00:00<?, ? examples/s]

In [23]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 3060. Max memory = 12.0 GB.
5.98 GB of memory reserved.


In [24]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 8 | Num Epochs = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 161,480,704


Step,Training Loss
1,1.7321
2,1.7321
3,1.7285
4,1.6951
5,1.6274
6,1.5815
7,1.5052
8,1.4157
9,1.3282
10,1.238


In [25]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

566.9628 seconds used for training.
9.45 minutes used for training.
Peak reserved memory = 8.061 GB.
Peak reserved memory for training = 2.081 GB.
Peak reserved memory % of max memory = 67.175 %.
Peak reserved memory for training % of max memory = 17.342 %.


In [None]:

model.save_pretrained_merged("pretrained_vulnerability_searcher_16_bit", tokenizer=tokenizer, save_method="merged_16bit")
model.save_pretrained_merged("pretrained_vulnerability_searcher_4_bit", tokenizer=tokenizer, save_method="merged_4bit")
model.save_pretrained_gguf("pretrained_vulnerability_searcher_gguf_q4_k_m", tokenizer=tokenizer, quantization_method="q4_k_m")
model.save_pretrained_gguf("pretrained_vulnerability_searcher_gguf_f16", tokenizer=tokenizer, quantization_method="f16")

RuntimeError: *** Unsloth: Failed compiling llama.cpp using os.system(...) with error 1. Please report this ASAP!

In [20]:
evaluation_result = trainer.evaluate()
print(evaluation_result)

Unsloth: Not an error, but Qwen2ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


{'eval_loss': 0.009208951145410538, 'eval_runtime': 4.842, 'eval_samples_per_second': 1.652, 'eval_steps_per_second': 0.207, 'epoch': 60.0}


In [21]:
input_str = validation_dataset.data.slice(0)["text"][0].as_py()
input_str[:input_str.index("\n\n### Response:\n")]



In [None]:
from pathlib import Path
import importlib

import transformers

importlib.reload(transformers)

from transformers import AutoModelForCausalLM, AutoTokenizer


PROMPT = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Input is a python file name and a part of python code with possible software vulnerabilities.
Review this code and find vulnerabilities in input code and identify the number of Common Weakness Enumeration (CWE), if vulnerability not exists, tell about it.

Input will be provided in format:
```
python/code/file1.py
Code:
import sys

print("hello world!")

python/code/file2.py
Code:
import os

print("hello world from another file!")
```


### Input:
```
{}
```

### Response:"""

def formatting_input(row):
    codes = row["code"]
    files = row["file"]

    input_data = ""
    for code_file, code_unit in zip(files, codes):
        input_data = (
            f"File name: {code_file}\nCode: \n{code_unit.replace('\n\n', '\n')}\n\n"
        )
    return {"text": alpaca_prompt.format(input_data) + EOS_TOKEN}

model = AutoModelForCausalLM.from_pretrained(Path("vulnerability_searcher"),device_map="cuda", load_in_4bit=True)
tokenizer = AutoTokenizer.from_pretrained(Path("vulnerability_searcher"),device_map="cuda", padding_side="left")
tokenizer.pad_token = tokenizer.eos_token

input_str = validation_dataset.data.slice(0)["text"][0].as_py()
model_inputs = tokenizer([input_str[:input_str.index("\n\n### Response:\n")]], return_tensors="pt").to("cuda")
generated_ids = model.generate(**model_inputs)
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]



The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
  f"   \\\   /|    GPU: {gpu_stats.name}. Max memory: {max_memory} GB. Platform: {platform_system}.\n"\
  f"O^O/ \_/ \\    Torch: {torch.__version__}. CUDA: {gpu_stats.major}.{gpu_stats.minor}. CUDA Toolkit: {torch.version.cuda}. Triton: {triton_version}\n"\
  f"\        /    Bfloat16 = {str(SUPPORTS_BFLOAT16).upper()}. FA [Xformers = {xformers_version}. FA2 = {HAS_FLASH_ATTENTION}]\n"\
  start = re.search('logger\.info\([\"\'].+?Running training', inner_training_loop).span(0)[0]
  spaces = re.search('\n([\s\t]{1,})', original_debug).group(0)[1:]
  front_spaces = re.match('([\s\t]{1,})', inner_training_loop).group(0)


AttributeError: 'Qwen2ForCausalLM' object has no attribute 'max_seq_length'

In [31]:
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

input_str = validation_dataset.data.slice(0)["text"][2].as_py()
content = input_str[:input_str.index("\n\n### Response:\n")]
expected = input_str[input_str.index("\n\n### Response:\n"):]

inputs = tokenizer(
    content,
    return_tensors = "pt",
).to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 128,
                   use_cache = True, temperature = 1.5, min_p = 0.1)
print(expected)
tokenizer.batch_decode(outputs)



### Response:
Found vulnerabilities: CWE-862.<|endoftext|>


['Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nInput is a python file name and a part of python code with possible software vulnerabilities.\nReview this code and find vulnerabilities in input code and identify the number of Common Weakness Enumeration (CWE), if vulnerability not exists, tell about it.\n\nInput will be provided in format:\n```\npython/code/file1.py\nCode:\nimport sys\n\nprint("hello world!")\n\npython/code/file2.py\nCode:\nimport os\n\nprint("hello world from another file!")\n```\n\n\n### Input:\n```\nFile name: saleor\\graphql\\account\\types.py\nCode: \nclass User(ModelObjectType):\n        def _resolve_orders(orders):\n            requester = get_user_or_app_from_context(info.context)\n            if not requester.has_perm(OrderPermissions.MANAGE_ORDERS):\n                orders = list(\n                    filter(lambda order: order.