# LoRA Fine tuning of a coder model

### Mount Google Drive for data retrival and storage

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [26]:
!pip install --upgrade pip
!pip install torch transformers accelerate bitsandbytes datasets peft pandas



In [2]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset
from peft import PeftModel, LoraConfig, get_peft_model
import json
import os
import random
import pandas as pd

## Config

In [6]:
# In real application use coder Qwen base model
CODER_MODEL = "Qwen/Qwen2.5-Coder-7B-Instruct"
# To test the code use a small model
TEST_MODEL = "Qwen/Qwen1.5-0.5B"

# IMPORTANT: target_modules is model specific, adjust based on the model
TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]

BASE_MODEL = TEST_MODEL
DATASET_PATH = '/content/drive/MyDrive/custom_coder_dataset/code_dataset.csv'
PREPARED_DATASET_PATH = '/content/drive/MyDrive/custom_coder_dataset/code_dataset.jsonl'
OUTPUT_DIR = "/content/drive/MyDrive/custom_coder_lora_adapter_v1"
MERGED_DIR = "/content/drive/MyDrive/custom_coder_v1"

# IMPORTANT: MAX_LENGTH is a low value for testing only
# use larger number based on the example code snipet sizes
# e.g. 1024, 2048
MAX_LENGTH = 128

# pytorch CrossEntropyLoss param
# framework specific not model/tuning hyperparam
IGNORE_INDEX = -100

# High BATCH_SIZE increases memory need
# Use GRADIENT_ACCUMULATION_STEPS to keep memory usage lower
# but do not sacrifice training by running multiple gradient accumulation steps before optimizer step
BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = 4

EPOCHS = 3
LEARNING_RATE = 2e-4
FP16 = True

LORA_R = 16
LORA_ALPHA = 32
LORA_DROPOUT = 0.05

MAX_NEW_TOKENS = 400
TEMPERATURE = 0.2
TOP_P = 0.9
TOP_K = 50

## Convert csv dataset to jsonl

In [29]:
def prepare_dataset(csv_path, output_path, scaling_factor = 50):
    data = pd.read_csv(csv_path)
    data["json"] = [json.dumps({'inst': row.instruction, 'out': row.output}) for row in data.itertuples(index=False)]
    dataset = list(data["json"]) * scaling_factor
    random.shuffle(dataset)
    with open(output_path, "w") as f:
        f.writelines([line + '\n' for line in dataset])

prepare_dataset(DATASET_PATH, PREPARED_DATASET_PATH)

## Tokenize the dataset

In [6]:
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=False)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Add special tokens for instruction
tokenizer.add_special_tokens({"additional_special_tokens": ["[INST]", "[/INST]"]})

def tokenize(data):
    prompt = f"[INST] {data['inst']} [/INST]"
    full_example = prompt + data['out']
    tokens = tokenizer(full_example, truncation=True, max_length=MAX_LENGTH, padding="max_length")
    # We mask instructions (do not want the model to predict instructions only the answers)
    # tokens["input_ids"] passed to the model during forward pass
    # labels (masked instruction) used for loss calculation (see HuggingFace class transformers.Trainer)
    labels = tokens["input_ids"].copy()
    prompt_len = len(tokenizer(prompt)["input_ids"])
    labels[:prompt_len] = [IGNORE_INDEX] * prompt_len  # mask instruction
    tokens["labels"] = labels
    return tokens


ds = load_dataset("json", data_files={"train": DATASET_PATH})["train"]
ds = ds.map(tokenize, batched=False, remove_columns=ds.column_names)
ds.set_format(type="torch", columns=["input_ids","attention_mask","labels"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

## LoRA configuration

In [7]:
lora_config = LoraConfig(
    r=LORA_R,
    target_modules=TARGET_MODULES,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

trainable params: 7,569,408 || all params: 471,557,120 || trainable%: 1.6052


## Training arguments

In [13]:
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    fp16=True,
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,
    remove_unused_columns=False,
    push_to_hub=False,
    report_to=None,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


## Train the model

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds,
    data_collator=data_collator,
)
trainer.train()

os.makedirs(OUTPUT_DIR, exist_ok=True)
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

Step,Training Loss
10,1.0866
20,0.2456
30,0.1855
40,0.1733
50,0.1814
60,0.1662
70,0.1758
80,0.168
90,0.1603


('/content/drive/MyDrive/custom_coder_lora_adapter_v1/tokenizer_config.json',
 '/content/drive/MyDrive/custom_coder_lora_adapter_v1/special_tokens_map.json',
 '/content/drive/MyDrive/custom_coder_lora_adapter_v1/chat_template.jinja',
 '/content/drive/MyDrive/custom_coder_lora_adapter_v1/vocab.json',
 '/content/drive/MyDrive/custom_coder_lora_adapter_v1/merges.txt',
 '/content/drive/MyDrive/custom_coder_lora_adapter_v1/added_tokens.json')

## Merge the adapter with the base model

In [16]:
base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, torch_dtype=torch.float16, device_map="auto", trust_remote_code=True)
adapter_model = PeftModel.from_pretrained(base_model, OUTPUT_DIR)
merged_model = adapter_model.merge_and_unload()
os.makedirs(MERGED_DIR, exist_ok=True)
merged_model.save_pretrained(MERGED_DIR)
tokenizer.save_pretrained(MERGED_DIR)

('/content/drive/MyDrive/custom_coder_v1/tokenizer_config.json',
 '/content/drive/MyDrive/custom_coder_v1/special_tokens_map.json',
 '/content/drive/MyDrive/custom_coder_v1/chat_template.jinja',
 '/content/drive/MyDrive/custom_coder_v1/vocab.json',
 '/content/drive/MyDrive/custom_coder_v1/merges.txt',
 '/content/drive/MyDrive/custom_coder_v1/added_tokens.json')

## Test the fine-tuned model

In [7]:
class FineTunedInference:
  def __init__(self):
    self.model = AutoModelForCausalLM.from_pretrained(MERGED_DIR, device_map="auto", torch_dtype="auto")
    self.tokenizer = AutoTokenizer.from_pretrained(MERGED_DIR, use_fast=False)
    if self.tokenizer.pad_token is None:
      self.tokenizer.pad_token = self.tokenizer.eos_token

  def completition(self, prompt, max_new_tokens=400, temperature=0.2, top_p=0.9):
    inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
    with torch.no_grad():
      outputs = self.model.generate(**inputs, max_new_tokens=max_new_tokens, temperature=temperature, top_p=top_p)
    return self.tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)

inference = FineTunedInference()

In [10]:
prompt = "Create a Rust function named 'add' that takes 2 f32 arguments a,b and prints the sum of the arguments."
result = inference.completition(prompt)

print('\nResult:')
print(result)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.



Result:
 ,msg := a + b; println!("{}", msg);
