# Llama-3 GPTQ Quantization and Evaluation

This notebook demonstrates the complete pipeline for quantizing Llama-3 models using GPTQ and evaluating the results.

## Setup

In [None]:
# Install required packages
!pip install -q transformers accelerate datasets huggingface_hub gptqmodel safetensors

In [None]:
import logging
import os
import sys
from pathlib import Path

# Put the directory two levels above the current working directory at the
# front of the import path; this must happen before the project imports below.
package_root = Path.cwd().parent.parent
sys.path.insert(0, str(package_root))

from innova_llama3_gptq import GPTQConfig, quantize_llama3_gptq
from innova_llama3_gptq.evals import (
    create_results_summary,
    evaluate_perplexity_suite,
    measure_inference_latency,
)

# Configure the root logger to show INFO-level messages
logging.basicConfig(level=logging.INFO)

## Configuration

In [None]:
# --- Model ---
# Hugging Face model id to quantize; change this to your model.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
# Directory handed to the quantizer as its output location.
OUTPUT_DIR = "artifacts/notebook_gptq"

# --- Quantization ---
BITS = 4
GROUP_SIZE = 128
CALIBRATION_SAMPLES = 512

# --- Authentication ---
# Read from the environment; stays None when HF_TOKEN is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")

## Step 1: Quantization

In [None]:
# Run GPTQ quantization with the settings from the configuration cell.
print(f"Quantizing {MODEL_ID} to {BITS}-bit GPTQ...")

# Collect the quantizer arguments in one place so they are easy to inspect.
quantization_kwargs = dict(
    model_id=MODEL_ID,
    bits=BITS,
    group_size=GROUP_SIZE,
    desc_act=True,
    calib_dataset="wikitext2",
    max_calib_samples=CALIBRATION_SAMPLES,
    out_dir=OUTPUT_DIR,
    use_safetensors=True,
    seed=42,
    auth_token=HF_TOKEN,
)

# The returned value is reported below as the location of the saved model.
quantized_model_path = quantize_llama3_gptq(**quantization_kwargs)

print(f"✅ Quantization complete! Model saved to: {quantized_model_path}")

## Step 2: Evaluation

In [None]:
# Perplexity of the quantized model on the evaluation dataset(s)
print("Evaluating perplexity...")

perplexity_results = evaluate_perplexity_suite(
    model_path=quantized_model_path,
    datasets=["wikitext2"],
    max_samples_per_dataset=500,
)

# Report only the entries that actually carry a "perplexity" value.
for dataset_name, dataset_metrics in perplexity_results.items():
    if "perplexity" not in dataset_metrics:
        continue
    ppl = dataset_metrics["perplexity"]
    print(f"  {dataset_name}: {ppl:.2f}")

In [None]:
# Measure inference latency of the quantized model for each batch size.
print("Measuring inference latency...")

latency_results = measure_inference_latency(
    model_path=quantized_model_path,
    batch_sizes=[1, 4],
    sequence_length=512,
    num_iterations=5
)

# Guard the metric lookup the same way the perplexity report does, so a single
# entry without "avg_latency_ms" does not abort the cell with a KeyError.
# Such entries are printed verbatim instead of being silently dropped.
for batch_key, metrics in latency_results.items():
    if "avg_latency_ms" in metrics:
        print(f"  {batch_key}: {metrics['avg_latency_ms']:.2f}ms")
    else:
        print(f"  {batch_key}: no latency recorded ({metrics})")

## Step 3: Generate Test Outputs

In [None]:
from transformers import AutoTokenizer
from gptqmodel import GPTQModel

# Load the tokenizer and the quantized model from the quantization output path.
tokenizer = AutoTokenizer.from_pretrained(quantized_model_path)
model = GPTQModel.load(
    quantized_model_path,
    device_map="auto"
)

# The tokenizer may define no pad token (pad_token_id is None; this is
# reportedly the case for Llama-3 tokenizers, so verify for your model).
# Fall back to the EOS token id so `generate` receives an explicit padding id.
pad_token_id = tokenizer.pad_token_id
if pad_token_id is None:
    pad_token_id = tokenizer.eos_token_id

# Test generation
test_prompts = [
    "The future of artificial intelligence is",
    "Climate change can be addressed by",
    "The meaning of life is"
]

for prompt in test_prompts:
    print(f"\nPrompt: {prompt}")
    # With device_map="auto" the model may live on an accelerator, so move the
    # tokenized inputs to the model's device before generating.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=30,
        temperature=0.7,
        do_sample=True,
        pad_token_id=pad_token_id
    )
    # outputs[0] holds the token ids for the single prompt in this batch.
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Response: {response}")

## Step 4: Create Summary Report

In [None]:
from scripts.utils import get_hardware_info, get_model_size

# Describe the quantized model for the report
quantization_info = {
    "bits": BITS,
    "group_size": GROUP_SIZE,
    "method": "gptq",
}
model_info = {
    "model_id": MODEL_ID,
    "quantization": quantization_info,
    "size": get_model_size(Path(quantized_model_path)),
}

# Combine evaluation results, model description and hardware details
summary = create_results_summary(
    perplexity_results=perplexity_results,
    latency_results=latency_results,
    model_info=model_info,
    hardware_info=get_hardware_info(),
)

# Print the headline metrics under a banner
banner = "=" * 50
print("\n" + banner)
print("QUANTIZATION SUMMARY")
print(banner)

if "summary_metrics" in summary:
    for metric, value in summary["summary_metrics"].items():
        # Floats are shown with three decimals; everything else as-is.
        rendered = f"{value:.3f}" if isinstance(value, float) else f"{value}"
        print(f"{metric}: {rendered}")

## Step 5: Export to Hugging Face (Optional)

In [None]:
# Generate a model card for the quantized model and save it next to the weights.
from scripts.export_hf_gptq import build_model_card

REPO_ID = "innova/llama3-8b-instruct-gptq-4bit"  # Change this

model_card = build_model_card(
    model_dir=Path(quantized_model_path),
    base_model=MODEL_ID,
    repo_id=REPO_ID,
    results_path=None  # Would point to results directory if available
)

# Save model card.
# Write as UTF-8 explicitly: the platform default encoding is not UTF-8
# everywhere, and the write would fail or mis-encode any non-ASCII text
# in the card on such systems.
readme_path = Path(quantized_model_path) / "README.md"
readme_path.write_text(model_card, encoding="utf-8")

print(f"Model card generated for {REPO_ID}")
print("\nTo upload to Hugging Face Hub:")
print("1. huggingface-cli login")
print(f"2. huggingface-cli upload {REPO_ID} {quantized_model_path} . --repo-type model")

## Comparison: FP16 vs GPTQ (Optional)

In [None]:
# Template (intentionally left commented out): to compare an FP16 baseline with the
# GPTQ results, uncomment the lines below and fill in `baseline_results`.
# from innova_llama3_gptq.evals import create_comparison_table
# import pandas as pd

# baseline_results = {...}  # Load baseline results
# quantized_results = summary

# comparison_df = create_comparison_table(baseline_results, quantized_results)
# display(comparison_df)