### Set the variables for base model, dataset, and new model name

In [1]:
!pip install -r requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
checkpoint_loc = "output/checkpoints"
lora_adapter_loc = "output/lora_adapter"
#dataset_name = "bitext/Bitext-customer-support-llm-chatbot-training-dataset"

In [3]:
%env CLEARML_LOG_MODEL=True
from clearml import Task, OutputModel, Dataset

project_name = 'cnasg-tk/CustomerSupport'
task_name = "03-fine-tune-llama3-2-1b"
s3_base_bucket_loc = 's3://tk-aip/clearml'
base_model = "/mnt/shared/models/huggingface/Llama-3.2-1B-Instruct"

task = Task.init(project_name=project_name, task_name=task_name, output_uri=s3_base_bucket_loc)
#Task.add_requirements("./requirements.txt")
#task.set_parameter("base-model",base_model)
#task.set_parameter("lora-adapter",task_name)
#task.set_base_docker(docker_image="nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04")
##task.set_base_docker(docker_image="nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04")
#task.execute_remotely(queue_name="q-group-a-gpu-10gb")

env: CLEARML_LOG_MODEL=True
ClearML Task: created new task id=4f13e3616926496281180d1f83ddfa6a
2025-07-26 03:54:53,562 - clearml.Task - INFO - Storing jupyter notebook directly as code
ClearML results page: https://app.clearml.bizilife.net/projects/d4b25b0d2de64d6bb07fbca370f0439e/experiments/4f13e3616926496281180d1f83ddfa6a/output/log


### Load the Python packages and functions we will use throughout the fine-tuning and evaluation process.

In [4]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

wandb.init(mode="disabled")

## 2. Loading the model and tokenizer

### Set the data type and attention implementation

In [5]:
# Set torch dtype and attention implementation
#if torch.cuda.get_device_capability()[0] >= 8:
#    print("install flash-attn!!!")
##    !pip install -qqq flash-attn
##    !pip install flash-attn --no-build-isolation
#    torch_dtype = torch.bfloat16
#    attn_implementation = "flash_attention_2"
#else:
#    torch_dtype = torch.float16
#    attn_implementation = "eager"

torch_dtype = torch.float16
attn_implementation = "eager"

### Load the model and tokenizer by providing the local model directory, we will load the model in 4-bit quantization.

In [6]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)
# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

## 3. Loading and processing the dataset

In [7]:
cml_dataset_project = "cnasg-tk/CustomerSupport"
cml_dataset_name = "bitext-customer-support"
cml_dataset = Dataset.get(
        dataset_project=cml_dataset_project,
        dataset_name=cml_dataset_name,
        only_completed=True,
        only_published=False,
)
dataset_path = cml_dataset.get_local_copy()
files = cml_dataset.list_files()
data_file = dataset_path + "/" + files[0]

2025-07-26 03:55:02,708 - clearml - INFO - Dataset.get() did not specify alias. Dataset information will not be automatically logged in ClearML Server.


In [8]:
#Importing the dataset
#dataset = load_dataset(dataset_name, split="train")
dataset = load_dataset("csv", data_files=data_file, split="train")
dataset = dataset.shuffle(seed=65).select(range(1000)) # Only use 1000 samples for quick demo
instruction = """You are a top-rated customer service agent named John. 
    Be polite to customers and answer all their questions.
    """
def format_chat_template(row):
    
    row_json = [{"role": "system", "content": instruction },
               {"role": "user", "content": row["instruction"]},
               {"role": "assistant", "content": row["response"]}]
    
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

dataset = dataset.map(
    format_chat_template,
    num_proc= 4,
)

In [9]:
dataset['text'][3]
dataset = dataset.train_test_split(test_size=0.1)

## 4. Setting up the model

### Extract the linear model name from the model

In [10]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)
#modules = ['gate_proj', 'v_proj', 'up_proj', 'q_proj', 'down_proj', 'k_proj', 'o_proj']

### Use the linear module name to create the LoRA adopter. We will only fine-tune the LoRA adopter and leave the rest of the model to save memory and for faster training time

In [11]:
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)
#model, tokenizer = setup_chat_format(model, tokenizer)
tokenizer.pad_token = tokenizer.eos_token

model = get_peft_model(model, peft_config)

In [12]:
#Hyperparamter
training_arguments = TrainingArguments(
    output_dir=checkpoint_loc,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
#    report_to="wandb"
)

### We will now set up a supervised fine-tuning (SFT) trainer and provide a train and evaluation dataset, LoRA configuration, training argument, tokenizer, and model. 

In [13]:
# Setting sft parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
#    max_seq_length= 512,
#    dataset_text_field="text",
#    tokenizer=tokenizer,
    args=training_arguments,
#    packing= False,
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


## 5. Model training

In [14]:
trainer.train()





Step,Training Loss,Validation Loss
90,0.8146,0.835515
180,0.8976,0.744222
270,0.7039,0.699311
360,0.5958,0.669215
450,0.5369,0.646707


2025-07-26 03:57:36,108 - clearml.storage - INFO - Starting upload: /tmp/.clearml.upload_model_z_u3ft_z.tmp => tk-aip/clearml/cnasg-tk/CustomerSupport/03-fine-tune-llama3-2-1b.4f13e3616926496281180d1f83ddfa6a/models/training_args.bin
2025-07-26 03:57:36,358 - clearml.Task - INFO - Completed model upload to s3://tk-aip/clearml/cnasg-tk/CustomerSupport/03-fine-tune-llama3-2-1b.4f13e3616926496281180d1f83ddfa6a/models/training_args.bin
2025-07-26 03:57:36,774 - clearml.storage - INFO - Starting upload: /tmp/.clearml.upload_model_sk8q64k8.tmp => tk-aip/clearml/cnasg-tk/CustomerSupport/03-fine-tune-llama3-2-1b.4f13e3616926496281180d1f83ddfa6a/models/optimizer.pt
2025-07-26 03:57:36,943 - clearml.storage - INFO - Uploading: 86.13MB from /tmp/.clearml.upload_model_sk8q64k8.tmp


                                           0% | 0.00/86.13 MB [00:00<?, ?MB/s]: 

2025-07-26 03:57:37,608 - clearml.Task - INFO - Waiting for previous model to upload (2 pending, s3://tk-aip/clearml/cnasg-tk/CustomerSupport/03-fine-tune-llama3-2-1b.4f13e3616926496281180d1f83ddfa6a/models/rng_state.pth)


███████████████████████████████ 100% | 86.13/86.13 MB [00:01<00:00, 44.08MB/s]: 

2025-07-26 03:57:38,905 - clearml.Task - INFO - Completed model upload to s3://tk-aip/clearml/cnasg-tk/CustomerSupport/03-fine-tune-llama3-2-1b.4f13e3616926496281180d1f83ddfa6a/models/optimizer.pt
2025-07-26 03:57:39,025 - clearml.storage - INFO - Starting upload: /tmp/.clearml.upload_model_n13k5ro3.tmp => tk-aip/clearml/cnasg-tk/CustomerSupport/03-fine-tune-llama3-2-1b.4f13e3616926496281180d1f83ddfa6a/models/scheduler.pt
2025-07-26 03:57:39,073 - clearml.Task - INFO - Completed model upload to s3://tk-aip/clearml/cnasg-tk/CustomerSupport/03-fine-tune-llama3-2-1b.4f13e3616926496281180d1f83ddfa6a/models/scheduler.pt





2025-07-26 03:57:39,190 - clearml.storage - INFO - Starting upload: /tmp/.clearml.upload_model_rho2dj49.tmp => tk-aip/clearml/cnasg-tk/CustomerSupport/03-fine-tune-llama3-2-1b.4f13e3616926496281180d1f83ddfa6a/models/rng_state.pth
2025-07-26 03:57:39,237 - clearml.Task - INFO - Completed model upload to s3://tk-aip/clearml/cnasg-tk/CustomerSupport/03-fine-tune-llama3-2-1b.4f13e3616926496281180d1f83ddfa6a/models/rng_state.pth
2025-07-26 03:58:08,497 - clearml.storage - INFO - Starting upload: /tmp/model_package.1mn0rary.zip => tk-aip/clearml/cnasg-tk/CustomerSupport/03-fine-tune-llama3-2-1b.4f13e3616926496281180d1f83ddfa6a/models/checkpoint-450.zip


TrainOutput(global_step=450, training_loss=0.7906042362584008, metrics={'train_runtime': 183.478, 'train_samples_per_second': 4.905, 'train_steps_per_second': 2.453, 'total_flos': 1052669993852928.0, 'train_loss': 0.7906042362584008})

2025-07-26 03:58:08,650 - clearml.storage - INFO - Uploading: 145.76MB from /tmp/model_package.1mn0rary.zip


█████████████████████████████ 100% | 145.76/145.76 MB [00:02<00:00, 61.28MB/s]: 

2025-07-26 03:58:11,033 - clearml.Task - INFO - Completed model upload to s3://tk-aip/clearml/cnasg-tk/CustomerSupport/03-fine-tune-llama3-2-1b.4f13e3616926496281180d1f83ddfa6a/models/checkpoint-450.zip





2025-07-26 03:58:16,280 - clearml.storage - INFO - Starting upload: /tmp/model_package.iq4_h09o.zip => tk-aip/clearml/cnasg-tk/CustomerSupport/03-fine-tune-llama3-2-1b.4f13e3616926496281180d1f83ddfa6a/models/remote_lora_adapter_zipped.zip
2025-07-26 03:58:16,356 - clearml.storage - INFO - Uploading: 43.03MB from /tmp/model_package.iq4_h09o.zip
2025-07-26 03:58:16,359 - clearml.storage - INFO - Uploading: 43.03MB from /tmp/model_package.iq4_h09o.zip
2025-07-26 03:58:16,363 - clearml.storage - INFO - Uploading: 6.00MB / 43.03MB @ 77.56MBs from /tmp/model_package.iq4_h09o.zip


█████████████████████████████▌   95% | 41.03/43.03 MB [00:00<00:00, 61.64MB/s]: 


## 6. Model Inference

### To test the fine-tuned model, we will provide it with the sample prompt from the dataset

In [15]:
messages = [{"role": "system", "content": instruction},
    {"role": "user", "content": "I bought the same item twice, cancel order {{Order Number}}"}]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")
outputs = model.generate(**inputs, max_new_tokens=150, num_return_sequences=1)
text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(text.split("assistant")[1])

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.




I've taken note that you have bought the same item twice and would like to cancel order {{Order Number}}. I understand that this may have caused inconvenience, and I'm here to assist you in resolving this matter. To cancel your order, I recommend reaching out to our customer support team. They will be able to provide you with the necessary instructions and guidance to cancel your order successfully. In the meantime, I encourage you to review the terms and conditions of your purchase and contact our customer support if you have any questions or concerns. We value your satisfaction and are committed to resolving this issue for you.


## 7. Saving the tokenizer and model

### Output LoRA Adapter to local folder

In [16]:
model.save_pretrained(lora_adapter_loc)

### Upload LoRA Adapter in Zipped format to S3 Bucket via ClearML

In [17]:
from clearml import Task, OutputModel, StorageManager

s3_bucket_loc = s3_base_bucket_loc + "/" + Task.current_task().get_project_name() + "/" + task_name + "." + task.id + "/models/lora_adapter"
StorageManager.upload_folder(lora_adapter_loc, s3_bucket_loc)
clearmlModel = OutputModel(
    task=Task.current_task(),
    framework="PyTorch",
    name="remote_lora_adapter_loc"
)
clearmlModel.update_weights(
    register_uri=s3_bucket_loc,
    target_filename = "remote_lora_adapter_loc"
)

2025-07-26 03:58:14,643 - clearml.storage - INFO - Uploading: 43.03MB from output/lora_adapter/adapter_model.safetensors
2025-07-26 03:58:14,646 - clearml.storage - INFO - Uploading: 43.03MB from output/lora_adapter/adapter_model.safetensors


██████████████████████████████▎  98% | 42.03/43.03 MB [00:00<00:00, 61.51MB/s]: 


In [18]:
#lora_adapter = task_name

#from clearml import Task, OutputModel
# Upload the merged model to S3 bucket
clearmlModel = OutputModel(
    task=Task.current_task(),
    framework="PyTorch",
    name="remote_lora_adapter_zipped"
)
clearmlModel.update_weights_package(
    weights_path=lora_adapter_loc,
    target_filename="remote_lora_adapter_zipped",
    auto_delete_file=False
)

's3://tk-aip/clearml/cnasg-tk/CustomerSupport/03-fine-tune-llama3-2-1b.4f13e3616926496281180d1f83ddfa6a/models/remote_lora_adapter_zipped.zip'

In [19]:
task.close()