In [None]:
from clearml import Task, OutputModel

# --- ClearML task configuration -------------------------------------------
# Project and task names under which this run is registered.
project_name = "userxx/CustomerSupport"
task_name = "04-merge-lora-adapter-with-base-model"
# Handed to Task.init as output_uri.
s3_base_bucket_loc = "s3://tk-aip/clearml"
# Filesystem path of the base model checkpoint.
base_model = "/mnt/shared/models/huggingface/Llama-3.2-1B-Instruct"

task = Task.init(
    output_uri=s3_base_bucket_loc,
    task_name=task_name,
    project_name=project_name,
)

# Optional remote-execution setup, currently disabled.
# NOTE(review): ClearML documents Task.add_requirements as something to call
# before Task.init -- move it above the init call if it is ever re-enabled.
# Task.add_requirements("./requirements.txt")
# task.set_base_docker(docker_image="nvidia/cuda:12.4.1-cudnn-runtime-ubuntu22.04")
# task.execute_remotely(queue_name="q-group-a-gpu-10gb")

## 1. Download LoRA Adapter from the Previous ClearML Task

In [None]:
# Resolve the ClearML task that produced the LoRA adapter. An explicit
# "General/previous_task_id" parameter takes precedence; otherwise the task is
# looked up by name inside the current project.
previousTaskId = task.get_parameter("General/previous_task_id")
if previousTaskId is not None:
    print("General/previous_task_id is available, previousTaskId=" + previousTaskId)
    previousTask = Task.get_task(task_id=previousTaskId)
else:
    previous_task_name = "03-fine-tune-llama3-2-1b"
    print("General/previous_task_id is NOT available, use previous_task_name=" + previous_task_name)
    previousTask = Task.get_task(project_name=project_name, task_name=previous_task_name)

# Fail here, with a clear message, instead of hitting a NameError on
# lora_adapter_loc in the merge cell further down.
if previousTask is None:
    raise RuntimeError("Previous ClearML task not found; cannot locate the LoRA adapter.")

# Reset first so a stale path from an earlier run of this cell is never reused.
lora_adapter_loc = None
adapter_model_name = "remote_lora_adapter_zipped"

# `.get("output")` may yield None when the task registered no output models.
models = previousTask.get_models().get("output") or []
# The loop variable is deliberately not called `model`, so re-running this cell
# cannot clobber the merged `model` created later in the notebook.
for candidate in models:
    print(candidate.name)
    if candidate.name == adapter_model_name:
        # Alternative selector: candidate.name.startswith("checkpoint-")
        lora_adapter_loc = candidate.get_local_copy(extract_archive=True)
        break

if lora_adapter_loc is None:
    raise RuntimeError(
        "No downloadable output model named '" + adapter_model_name
        + "' was found on the previous task."
    )
print("lora_adapter_loc = " + lora_adapter_loc)

## 2. Merge Base Model with LoRA Adapter

In [None]:
# Filesystem locations used by the merge and save cells below.

# Base model checkpoint to load.
base_model = "/mnt/shared/models/huggingface/Llama-3.2-1B-Instruct"

# Relative output directory for the merged model.
merged_model_loc = "output/merged_model"

# Second save location for the merged model (absolute path).
remote_merged_model_loc = (
    "/mnt/shared/models/fine-tuned/merged/"
    "llama-3.2-1b-instruct-customerservice"
)

# To point the merge at a locally stored adapter, set the path by hand:
# lora_adapter_loc = "output/lora_adapter"

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import PeftModel
import torch
from trl import setup_chat_format
# Load the tokenizer that belongs to the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Loading options for the base weights: half precision, with device placement
# delegated to device_map="auto".
_base_model_load_kwargs = {
    "device_map": "auto",
    "torch_dtype": torch.float16,
    "return_dict": True,
    "low_cpu_mem_usage": True,
}
base_model_reload = AutoModelForCausalLM.from_pretrained(base_model, **_base_model_load_kwargs)

In [None]:
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Disabled alternative chat-format setup:
# base_model_reload, tokenizer = setup_chat_format(base_model_reload, tokenizer)

# Attach the LoRA adapter to the base model, then merge it and unload the
# PEFT wrapper in one step; `model` is the result of merge_and_unload().
model = PeftModel.from_pretrained(base_model_reload, lora_adapter_loc).merge_and_unload()

## 3. Merged-Model Inference

In [None]:
# Smoke-test the merged model with one system + user exchange.
instruction = """You are a top-rated customer service agent named John. 
    Be polite to customers and answer all their questions.
    """
messages = [{"role": "system", "content": instruction},
    {"role": "user", "content": "I have to see what payment payment modalities are accepted"}]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# The chat template has already rendered the special tokens into `prompt`, so
# they must not be added a second time (this avoids a duplicated BOS token).
# Inputs go to wherever the model lives instead of a hard-coded "cuda", because
# the base model was loaded with device_map="auto".
inputs = tokenizer(
    prompt,
    return_tensors='pt',
    padding=True,
    truncation=True,
    add_special_tokens=False,
).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=150, num_return_sequences=1)

# Decode only the newly generated tokens. Splitting the full transcript on the
# word "assistant" cuts the reply short whenever the reply itself contains
# that word.
prompt_token_count = inputs["input_ids"].shape[-1]
text = tokenizer.decode(outputs[0][prompt_token_count:], skip_special_tokens=True)
print(text)

## 4. Save the Tokenizer and Merged Model

In [None]:
# Write the merged weights and the tokenizer files side by side into
# merged_model_loc, so the directory can be loaded as one checkpoint.
model.save_pretrained(save_directory=merged_model_loc)
tokenizer.save_pretrained(save_directory=merged_model_loc)

In [None]:
# Write a second copy of model and tokenizer to remote_merged_model_loc; this
# is the directory the ClearML upload below packages.
model.save_pretrained(save_directory=remote_merged_model_loc)
tokenizer.save_pretrained(save_directory=remote_merged_model_loc)

In [None]:
# Register an output model on the current ClearML task.
clearmlModel = OutputModel(
    name="llama-3.2-1b-instruct-customerservice",
    framework="PyTorch",
    task=Task.current_task(),
)

# Upload the contents of remote_merged_model_loc as a single package named
# "merge_model_zipped". auto_delete_file=False asks ClearML not to delete
# files after the upload.
clearmlModel.update_weights_package(
    auto_delete_file=False,
    target_filename="merge_model_zipped",
    weights_path=remote_merged_model_loc,
)

In [None]:
# Close the ClearML task created by Task.init at the top of the notebook.
task.close()