# Fine tune model

In [1]:
from unsloth import FastLanguageModel
import torch
import polars as pl

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
MODEL_NAME = "Qwen/Qwen2.5-Coder-7B"

max_seq_length = 2048  # Choose any! We auto support ROPE Scaling internally!
dtype = (
    None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
)
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

==((====))==  Unsloth 2025.1.6: Fast Qwen2 patching. Transformers: 4.48.1.
   \\   /|    GPU: NVIDIA GeForce RTX 3060. Max memory: 12.0 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


  self.register_buffer("cos_cached", emb.cos().to(dtype=dtype, device=device, non_blocking=True), persistent=False)


In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_dropout=0,  # Supports any, but = 0 is optimized
    bias="none",
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,  # We support rank stabiliged LoRA
    loftq_config=None,  # And LoftQ
    lora_alpha=16,
    # Supports any, but = "none" is optimized
)

Unsloth 2025.1.6 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [4]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Input is a python file name and a part of python code with possible software vulnerabilities.
Review this code and find vulnerabilities in input code and identify the number of Common Weakness Enumeration (CWE), if vulnerability not exists, tell about it.

Input will be provided in format:
```
python/code/file1.py
Code:
import sys

print("hello world!")

python/code/file2.py
Code:
import os

print("hello world from another file!")
```


### Input:
```
{}
```

### Response:
{}"""

PYTHON_FIXES_CODE_PATH = "data\python_vulnerability_fixes_code_unit_changes.parquet"

EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN


def formatting_prompts_func(row):
    codes = row["code"]
    files = row["file"]
    is_vulnerability_exists = row["is_vulnerability_exists"]
    cwe = row["cwe_id"]
    texts = []
    if is_vulnerability_exists[0]:
        if len(cwe) == 1:
            output = f"Found vulnerability: {cwe[0]}."
        else:
            output = f"Found vulnerabilities: {', '.join(cwe)}."
    else:
        output = "No vulnerabilities found."

    input_data = ""
    for code_file, code_unit in zip(files, codes):
        input_data = (
            f"File name: {code_file}\nCode: \n{code_unit.replace('\n\n', '\n')}\n\n"
        )
    return {"text": alpaca_prompt.format(input_data, output) + EOS_TOKEN}
    texts.append(text)
    return {
        "text": texts,
    }


pass

from datasets import Dataset


code_without_vulnerabilities = pl.read_parquet(PYTHON_FIXES_CODE_PATH).drop(
    "repo", "patch"
)
code_with_vulnerabilites = code_without_vulnerabilities.clone()
# TO BE DELETED
vulns = (
    pl.read_parquet("data\\python_vulnerability_fixes.parquet")
    .unique(["commit", "vulnerability_id", "cwe_id"])
    .select(["commit", "vulnerability_id", "cwe_id"])
    .drop_nulls()
)
code_with_vulnerabilites = code_with_vulnerabilites.join(vulns, on="commit")
code_without_vulnerabilities = code_without_vulnerabilities.join(vulns, on="commit")
###

code_with_vulnerabilites = (
    (
        code_with_vulnerabilites.drop("code_unit_after_fix", "new_file")
        .rename({"code_unit_before_fix": "code", "old_file": "file"})
        .with_columns(pl.lit(True).alias("is_vulnerability_exists"))
    )
    .group_by(by="vulnerability_id")
    .agg("code", "file", "is_vulnerability_exists", "cwe_id")
)
code_without_vulnerabilities = (
    (
        code_without_vulnerabilities.drop("code_unit_before_fix", "old_file")
        .rename({"code_unit_after_fix": "code", "new_file": "file"})
        .with_columns(pl.lit(False).alias("is_vulnerability_exists"))
    )
    .group_by(by="vulnerability_id")
    .agg("code", "file", "is_vulnerability_exists", "cwe_id")
)

vulnerability_dataset = pl.concat(
    [code_with_vulnerabilites, code_without_vulnerabilities]
)


vulnerability_dataset = vulnerability_dataset.sample(fraction=1, shuffle=True)
dataset = Dataset.from_pandas(vulnerability_dataset.to_pandas(), split="train[:90%]")
validation_dataset = Dataset.from_pandas(vulnerability_dataset.to_pandas(), split="train[10%:]")

dataset = dataset.map(
    formatting_prompts_func,
)
validation_dataset = validation_dataset.map(
    formatting_prompts_func,
)

  PYTHON_FIXES_CODE_PATH = "data\python_vulnerability_fixes_code_unit_changes.parquet"


Map:   0%|          | 0/2448 [00:00<?, ? examples/s]

Map:   0%|          | 0/2448 [00:00<?, ? examples/s]

In [5]:
dataset = dataset.shuffle(seed=42).select(range(1000))

In [6]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    eval_dataset=validation_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Can make training 5x faster for short sequences.
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps=60,
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
    ),
)

Map (num_proc=2):   0%|          | 0/1000 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/2448 [00:00<?, ? examples/s]

In [7]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce RTX 3060. Max memory = 12.0 GB.
5.439 GB of memory reserved.


In [8]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 60
 "-____-"     Number of trainable parameters = 40,370,176


Step,Training Loss
1,1.51
2,1.8551
3,1.0821
4,1.3036
5,1.7809
6,1.1275
7,1.4756
8,1.4369
9,1.3264
10,1.2862


In [9]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

729.2421 seconds used for training.
12.15 minutes used for training.
Peak reserved memory = 8.795 GB.
Peak reserved memory for training = 3.356 GB.
Peak reserved memory % of max memory = 73.292 %.
Peak reserved memory for training % of max memory = 27.967 %.


In [10]:
trainer.save_model("vulnerability_searcher")
model.save_pretrained("pretrained_vulnerability_searcher")

In [None]:
evaluation_result = trainer.evaluate()
print(evaluation_result)

Unsloth: Not an error, but Qwen2ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


In [None]:
from pathlib import Path
from transformers import AutoModelForCausalLM, AutoTokenizer


PROMPT = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Input is a python file name and a part of python code with possible software vulnerabilities.
Review this code and find vulnerabilities in input code and identify the number of Common Weakness Enumeration (CWE), if vulnerability not exists, tell about it.

Input will be provided in format:
```
python/code/file1.py
Code:
import sys

print("hello world!")

python/code/file2.py
Code:
import os

print("hello world from another file!")
```


### Input:
```
{}
```

### Response:"""

def formatting_input(row):
    codes = row["code"]
    files = row["file"]

    input_data = ""
    for code_file, code_unit in zip(files, codes):
        input_data = (
            f"File name: {code_file}\nCode: \n{code_unit.replace('\n\n', '\n')}\n\n"
        )
    return {"text": alpaca_prompt.format(input_data) + EOS_TOKEN}

model = AutoModelForCausalLM.from_pretrained(Path("pretrained_vulnerability_searcher"), device_map="auto", load_in_4bit=True)
tokenizer = AutoTokenizer.from_pretrained(Path("pretrained_vulnerability_searcher"), padding_side="left")
tokenizer.pad_token = tokenizer.eos_token
model_inputs = tokenizer([formatting_input(validation_dataset.data.slice)], return_tensors="pt").to("cuda")
generated_ids = model.generate(**model_inputs, max_new_tokens=200)
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]