In [1]:
from clearml import Task, OutputModel

# Notebook-wide settings: ClearML project/task identity, the S3 location used
# for task outputs, and the path of the base checkpoint.
base_model = "/mnt/shared/models/huggingface/Llama-3.2-1B-Instruct"
s3_base_bucket_loc = "s3://tk-aip/clearml"
project_name = "cnasg-tk/CustomerSupport"
task_name = "04-merge-lora-adapter-with-base-model"

# Register this notebook run as a ClearML task; outputs are directed to the S3 location above.
task = Task.init(
    project_name=project_name,
    task_name=task_name,
    output_uri=s3_base_bucket_loc,
)

# Optional remote-execution setup, intentionally left disabled:
# Task.add_requirements("./requirements.txt")
# task.set_base_docker(docker_image="nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04")
# task.execute_remotely(queue_name="q-group-a-gpu-10gb")

ClearML Task: created new task id=6fb9d528578a453882d2b3f77f0c9bb1
2025-07-26 04:00:59,785 - clearml.Task - INFO - Storing jupyter notebook directly as code
ClearML results page: https://app.clearml.bizilife.net/projects/d4b25b0d2de64d6bb07fbca370f0439e/experiments/6fb9d528578a453882d2b3f77f0c9bb1/output/log
ClearML results page: https://app.clearml.bizilife.net/projects/d4b25b0d2de64d6bb07fbca370f0439e/experiments/6fb9d528578a453882d2b3f77f0c9bb1/output/log
ClearML Monitor: Could not detect iteration reporting, falling back to iterations as seconds-from-start


## 1. Download LoRA Adapter from the Previous ClearML Task

In [2]:
# Locate the upstream fine-tuning task. An explicit task id supplied through the
# "General/previous_task_id" parameter wins; otherwise the task is looked up by
# name inside the same ClearML project.
previousTaskId = task.get_parameter("General/previous_task_id")
if previousTaskId is not None:
    print("General/previous_task_id is available, previousTaskId=" + previousTaskId)
    previousTask = Task.get_task(task_id=previousTaskId)
else:
    previous_task_name = "03-fine-tune-llama3-2-1b"
    print("General/previous_task_id is NOT available, use previous_task_name=" + previous_task_name)
    previousTask = Task.get_task(project_name=project_name, task_name=previous_task_name)

# Fail here, with a clear message, instead of letting a later cell crash with a
# NameError on `lora_adapter_loc`.
if previousTask is None:
    raise RuntimeError("Could not resolve the previous fine-tuning task; cannot locate the LoRA adapter.")

# Name of the output model (registered by the previous task) that holds the zipped adapter.
lora_adapter_model_name = "remote_lora_adapter_zipped"
lora_adapter_loc = None

# `.get("output")` may return None when the task registered no output models.
# The loop variable is deliberately not called `model`, so it cannot be confused
# with the merged model created further down the notebook.
for output_model in previousTask.get_models().get("output") or []:
    print(output_model.name)
    if output_model.name == lora_adapter_model_name:
        # Download the archive into the local ClearML cache and extract it.
        lora_adapter_loc = output_model.get_local_copy(extract_archive=True)
        break

if lora_adapter_loc is None:
    raise RuntimeError(
        "No usable output model named '" + lora_adapter_model_name
        + "' was found (or its download failed) on task id=" + str(previousTask.id)
    )
print("lora_adapter_loc = " + lora_adapter_loc)

General/previous_task_id is NOT available, use previous_task_name=03-fine-tune-llama3-2-1b
03-fine-tune-llama3-2-1b - training_args
03-fine-tune-llama3-2-1b - optimizer
03-fine-tune-llama3-2-1b - scheduler
03-fine-tune-llama3-2-1b - rng_state
checkpoint-450
remote_lora_adapter_loc
remote_lora_adapter_zipped
2025-07-26 04:01:02,447 - clearml.storage - INFO - Downloading: 43.03MB from s3://tk-aip/clearml/cnasg-tk/CustomerSupport/03-fine-tune-llama3-2-1b.4f13e3616926496281180d1f83ddfa6a/models/remote_lora_adapter_zipped.zip


███████████████████████████████ 100% | 43.03/43.03 MB [00:01<00:00, 40.43MB/s]: 

2025-07-26 04:01:03,521 - clearml.storage - INFO - Downloaded 43.03 MB successfully from s3://tk-aip/clearml/cnasg-tk/CustomerSupport/03-fine-tune-llama3-2-1b.4f13e3616926496281180d1f83ddfa6a/models/remote_lora_adapter_zipped.zip , saved to /root/.clearml/cache/storage_manager/global/5b55e4e1efc27a6861d45f22694b7a8d.remote_lora_adapter_zipped.zip
lora_adapter_loc = /root/.clearml/cache/storage_manager/global/5b55e4e1efc27a6861d45f22694b7a8d.remote_lora_adapter_zipped_artifacts_archive_None





## 2. Merge Base Model with LoRA Adapter

In [3]:
# Where the merged model is written: a working copy relative to the notebook,
# and a copy on the shared mount.
merged_model_loc = "output/merged_model"
remote_merged_model_loc = "/mnt/shared/models/fine-tuned/merged/llama-3.2-1b-instruct-customerservice"

# Base checkpoint the LoRA adapter is merged into.
base_model = "/mnt/shared/models/huggingface/Llama-3.2-1B-Instruct"

# Manual override for a locally available adapter (normally set by the download cell):
# lora_adapter_loc = "output/lora_adapter"

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import PeftModel
import torch
from trl import setup_chat_format
# Load the tokenizer and the base checkpoint that the adapter will be merged into.
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Loading options are collected in one place so they are easy to inspect or tweak.
base_model_load_kwargs = dict(
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,  # load the weights in half precision
    device_map="auto",
)
base_model_reload = AutoModelForCausalLM.from_pretrained(base_model, **base_model_load_kwargs)

In [5]:
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

# Attach the LoRA adapter to the reloaded base model, then merge it via
# merge_and_unload(); `model` holds the result of that merge.
model = PeftModel.from_pretrained(base_model_reload, lora_adapter_loc).merge_and_unload()

## 3. Merged-Model Inference

In [6]:
# Smoke-test the merged model with a single chat-style request.
instruction = """You are a top-rated customer service agent named John. 
    Be polite to customers and answer all their questions.
    """
messages = [{"role": "system", "content": instruction},
    {"role": "user", "content": "I have to see what payment payment modalities are accepted"}]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# The rendered chat template already carries the special tokens, so
# add_special_tokens=False keeps the tokenizer from prepending a second BOS.
# Inputs go to the device the model actually lives on (it was loaded with
# device_map="auto") rather than a hard-coded "cuda".
inputs = tokenizer(
    prompt,
    return_tensors='pt',
    padding=True,
    truncation=True,
    add_special_tokens=False,
).to(model.device)

# Passing pad_token_id explicitly avoids the "Setting `pad_token_id` to
# `eos_token_id`" warning during open-ended generation.
outputs = model.generate(
    **inputs,
    max_new_tokens=150,
    num_return_sequences=1,
    pad_token_id=tokenizer.pad_token_id,
)

# generate() returns prompt + completion. Slice the prompt tokens off instead of
# splitting the decoded text on the word "assistant", which would truncate any
# reply that itself contains that word.
prompt_length = inputs["input_ids"].shape[-1]
text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
print(text)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.




We appreciate your inquiry about our payment options! We strive to provide a seamless and convenient payment experience for our valued customers. Our accepted payment modalities include:

- Credit/Debit Card: We accept major credit and debit cards such as Visa, Mastercard, and American Express.
- PayPal: A widely recognized and trusted online payment method.
- Bank Transfer: Direct bank transfers to our account for secure and efficient payments.
- Apple Pay: For Apple users, we offer a convenient and secure way to make payments.

These payment modalities are available at our website and at our physical locations. If you have any specific questions or need further assistance with a particular payment method, please feel free to let us know. We're here to help!


## 4. Save the tokenizer and model locally.

In [7]:
# Write the merged weights, then the tokenizer files, into the working output
# directory. The tokenizer call stays last so its return value is what the cell displays.
model.save_pretrained(save_directory=merged_model_loc)
tokenizer.save_pretrained(save_directory=merged_model_loc)

('output/merged_model/tokenizer_config.json',
 'output/merged_model/special_tokens_map.json',
 'output/merged_model/chat_template.jinja',
 'output/merged_model/tokenizer.json')

In [9]:
# Write a second copy of the merged weights and tokenizer files to the shared mount.
# The tokenizer call stays last so its return value is what the cell displays.
model.save_pretrained(save_directory=remote_merged_model_loc)
tokenizer.save_pretrained(save_directory=remote_merged_model_loc)

('/mnt/shared/models/fine-tuned/merged/llama-3.2-1b-instruct-customerservice/tokenizer_config.json',
 '/mnt/shared/models/fine-tuned/merged/llama-3.2-1b-instruct-customerservice/special_tokens_map.json',
 '/mnt/shared/models/fine-tuned/merged/llama-3.2-1b-instruct-customerservice/chat_template.jinja',
 '/mnt/shared/models/fine-tuned/merged/llama-3.2-1b-instruct-customerservice/tokenizer.json')

In [10]:
# Register the merged model as an output model of the current ClearML task.
clearmlModel = OutputModel(
    name="llama-3.2-1b-instruct-customerservice",
    framework="PyTorch",
    task=Task.current_task(),
)

# Package the merged-model directory on the shared mount and upload it under the
# given target filename. auto_delete_file=False is passed so the source files are kept.
# This call is the last expression of the cell, so its return value is displayed.
clearmlModel.update_weights_package(
    target_filename="merge_model_zipped",
    weights_path=remote_merged_model_loc,
    auto_delete_file=False,
)

2025-07-26 04:20:34,199 - clearml.storage - INFO - Starting upload: /tmp/model_package.irarap8l.zip => tk-aip/clearml/cnasg-tk/CustomerSupport/04-merge-lora-adapter-with-base-model.6fb9d528578a453882d2b3f77f0c9bb1/models/merge_model_zipped.zip


's3://tk-aip/clearml/cnasg-tk/CustomerSupport/04-merge-lora-adapter-with-base-model.6fb9d528578a453882d2b3f77f0c9bb1/models/merge_model_zipped.zip'

2025-07-26 04:20:34,690 - clearml.storage - INFO - Uploading: 2373.61MB from /tmp/model_package.irarap8l.zip


                                   0% | 2489.61/? MB [00:22<00:00, 110.57MB/s]: 

2025-07-26 04:20:57,211 - clearml.Task - INFO - Completed model upload to s3://tk-aip/clearml/cnasg-tk/CustomerSupport/04-merge-lora-adapter-with-base-model.6fb9d528578a453882d2b3f77f0c9bb1/models/merge_model_zipped.zip





In [None]:
# Close the ClearML task that was created by Task.init at the top of the notebook.
task.close()