### Install required dependencies

In [None]:
!pip install -q transformers peft accelerate bitsandbytes huggingface_hub

### Upload and Unzip the LoRA Adapter
After fine-tuning the base model, two primary files will have been created that are needed for model inference: `adapter_model.safetensors` and `adapter_config.json`.

Steps to prepare for inference:
1. **Prepare a ZIP file** named `lora_adapter.zip` that contains the following **two required files**:
    - `adapter_model.safetensors` – LoRA fine-tuned weights
    - `adapter_config.json` – Configuration file describing the adapter

2. **Upload the ZIP file** using the file upload tool in the left sidebar or run the `files.upload()` cell below.

3. The ZIP will be automatically extracted to `/content/lora_adapter/` for loading with PEFT.

In [None]:
# You can upload via the left-hand "Files" tab or use this:
# (Colab-only helper; `google.colab` is not available in a plain Jupyter kernel.)
from google.colab import files
# The extraction cell below looks for *.zip relative to the working directory,
# so the upload is presumably expected to land there -- verify if you change directories.
uploaded = files.upload()  # upload zip or individual files

In [None]:
# Stage the LoRA adapter in /content/lora_adapter, whether it was uploaded as a
# ZIP archive or as individual files, then verify the required files are present.
import shutil
import zipfile
from pathlib import Path

adapter_dir = Path("/content/lora_adapter")
adapter_dir.mkdir(parents=True, exist_ok=True)
required_files = ("adapter_model.safetensors", "adapter_config.json")

# Extract every archive found in the working directory, overwriting files that
# already exist in the target (same effect as `unzip -o "*.zip"`).
for archive_path in sorted(Path.cwd().glob("*.zip")):
    with zipfile.ZipFile(archive_path) as archive:
        archive.extractall(adapter_dir)

# Adapter files uploaded individually are copied in; files that came from an
# archive take precedence and are not overwritten.
for file_name in required_files:
    loose_file = Path.cwd() / file_name
    if loose_file.is_file() and not (adapter_dir / file_name).exists():
        shutil.copy2(loose_file, adapter_dir / file_name)

# Fail early with a clear message rather than hitting an adapter loading error
# after the base model has already been downloaded.
missing_files = [name for name in required_files if not (adapter_dir / name).is_file()]
if missing_files:
    raise FileNotFoundError(
        f"Missing {missing_files} in {adapter_dir}. Upload a ZIP with the adapter "
        "files at its top level (not inside a sub-folder), or upload the files individually."
    )

### Load the Base Model and LoRA

In [None]:
# Login to Hugging Face Hub for model access.
# The token is never written into the notebook: it is read from the HF_TOKEN
# environment variable when set, otherwise requested through a hidden prompt.
import os
from getpass import getpass

from huggingface_hub import login

# HF_TOKEN stays a module-level name because later cells pass it as `token=`.
HF_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(HF_TOKEN)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import torch

base_model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
lora_path = "/content/lora_adapter"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_id, token=HF_TOKEN)

# 4-bit quantization is requested through an explicit BitsAndBytesConfig;
# passing `load_in_4bit=True` straight to from_pretrained is deprecated.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

# Load base model in 4-bit quantized mode
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    torch_dtype=torch.float16,
    quantization_config=quantization_config,
    token=HF_TOKEN,
)

# Attach the LoRA adapter stored in `lora_path` to the quantized base model
model = PeftModel.from_pretrained(base_model, lora_path)

# Quick inference sanity check
prompt = "What is AgenticAI?"
# Move the inputs to the model's own device instead of assuming "cuda".
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

### Merge and save the merged model

In [None]:
# Destination directory for the merged checkpoint
save_path = "/content/llama3-8b-merged"

# Fold the LoRA weights into the base model and drop the adapter wrappers.
# NOTE(review): the base model was loaded with 4-bit quantization, so this merge
# presumably happens on quantized weights and may be lossy -- confirm whether a
# full-precision merge is needed before distributing the result.
merged_model = model.merge_and_unload()

# Write the model weights in safetensors format first ...
merged_model.save_pretrained(save_path, safe_serialization=True)

# ... then the tokenizer and (optionally) the generation config alongside them.
for artifact in (tokenizer, merged_model.generation_config):
    artifact.save_pretrained(save_path)