### Install required dependencies

In [None]:
!pip install -q transformers peft accelerate bitsandbytes huggingface_hub

### Upload and Unzip the LoRA Adapter
After fine-tuning the base model, two primary files are created that are used for model inference: `adapter_model.safetensors` and `adapter_config.json`.

Steps to prepare for inference:
1. **Prepare a ZIP file** named `lora_adapter.zip` that contains the following files:
    - `adapter_model.safetensors` – LoRA fine-tuned weights
    - `adapter_config.json` – Configuration file describing the adapter
    - `chat_template.jinja`
    - `checkpoint.meta`
    - `special_tokens_map.json`
    - `tokenizer.json`
    - `tokenizer_config.json`

2. Since I am using Google Colab to run this Notebook, please **upload the ZIP file** using the file upload tool in the left sidebar or run the `files.upload()` cell below.

3. Run the unzip cell below to extract the ZIP to `/content/lora_adapter/` for loading with PEFT.

In [None]:
# Upload the adapter archive from your machine. Alternatively, drag it into the
# left-hand "Files" tab and skip this cell.
# NOTE(review): the unzip cell only looks for *.zip in the current working directory,
# so files uploaded individually are not moved into /content/lora_adapter by it.
from google.colab import files
uploaded = files.upload()  # upload zip or individual files

In [None]:
# Create the target directory (-p: no error if it already exists) and extract into it.
# The quoted "*.zip" pattern is handed to unzip itself, so every ZIP in the current
# working directory is extracted; -o overwrites existing files without prompting.
# NOTE(review): if the archive wraps its files in a top-level folder they will land one
# level deeper than /content/lora_adapter — confirm the files sit at the archive root.
!mkdir -p /content/lora_adapter
!unzip -o "*.zip" -d /content/lora_adapter

### Load the Base Model and LoRA

In [None]:
# Gated-model access: request access to Meta's Llama model repo on Hugging Face,
# create a token for that account, and store the token as a Colab secret.
import os

from google.colab import userdata

# Read the token from Colab Secrets. 'hf_inference' is the secret's name here;
# replace it with the key you configured in your own Colab environment.
hf_token = userdata.get('hf_inference')

# Export the token as HF_TOKEN so the Hugging Face libraries pick it up automatically.
os.environ["HF_TOKEN"] = hf_token

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from peft import PeftModel
import torch

base_model_id = "meta-llama/Llama-3.1-8B-Instruct"
lora_path = "/content/lora_adapter" # Check this path matches your uploaded LoRA adapter directory

# Load the tokenizer from the adapter directory so the tokenizer files and chat
# template uploaded alongside the adapter are the ones used for inference.
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(lora_path)
print("Tokenizer loaded successfully.")

# Load base model in 4-bit quantized mode.
# Quantization settings are passed through `quantization_config`; handing
# `load_in_4bit=True` straight to `from_pretrained` is the deprecated spelling.
print("Loading base model...")
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)
print("Base model loaded. Loading LoRA adapters...")

# Attach the LoRA adapter weights stored in `lora_path` to the base model.
model = PeftModel.from_pretrained(model, lora_path)
print("LoRA adapters loaded successfully.")

# --- Merge LoRA Adapters and Base Model (Optional but Recommended for Inference) ---
# Merging folds the LoRA weights into the base model, which can improve inference
# speed and leaves a single consolidated model.
# NOTE(review): the base model is loaded in 4-bit here, so the merge is applied to
# quantized weights; this may introduce small rounding differences compared with
# merging into a full-precision base — confirm this is acceptable.
print("Merging LoRA adapters into the base model...")
model = model.merge_and_unload()
print("LoRA adapters merged.")

# --- Create a Text Generation Pipeline ---
# Generation settings given here become the defaults for every call to `pipe`.
print("Creating text generation pipeline...")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=200,  # Maximum number of tokens to generate
    do_sample=True,      # Enable sampling for more varied outputs
    temperature=0.7,     # Controls randomness of predictions
    top_p=0.9,           # Nucleus sampling: sample from top p probability tokens
    repetition_penalty=1.1 # Penalize repetition
)
print("Pipeline created successfully.")

# --- Perform Inference ---
print("\n--- Performing Inference ---")

# Example prompt, expressed as a chat conversation.
prompt = "What is Agentic AI?"
messages = [
    {"role": "user", "content": prompt}
]

# Render the chat template only to show what the model will receive.
formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

print(f"\nInput Prompt:\n{formatted_prompt}\n")

# Pass the chat messages rather than the pre-rendered string: the pipeline applies
# the chat template itself. Feeding it `formatted_prompt` would tokenize the text
# with special tokens added again, and chat templates such as Llama 3.1's already
# emit the BOS token, so the model would see a duplicated BOS.
outputs = pipe(messages)

# For chat input, `generated_text` is the whole conversation as a list of messages;
# the last entry is the newly generated assistant reply.
generated_text = outputs[0]['generated_text']
response = generated_text[-1]["content"].strip()

print(f"Generated Response:\n{response}")

## Save the merged model

In [None]:
# Directory that will receive the merged checkpoint.
# NOTE(review): `model` is the merged model from the previous cell, whose base was
# loaded in 4-bit; the weights written here are presumably the quantized merged
# weights rather than full-precision ones — confirm this is what you want to ship.
save_path = "/content/fine_tuned_llama3-8b-merged"

# Save merged model in safetensors format
model.save_pretrained(save_path, safe_serialization=True)

# Save the tokenizer into the same directory so the checkpoint is self-contained
tokenizer.save_pretrained(save_path)

# Optional: Save generation config
model.generation_config.save_pretrained(save_path)