# Mistral 7B GPTQ Quantization and Evaluation

## Phase 1: Quantization (P100)

In [None]:
!pip install -q auto-gptq optimum transformers accelerate datasets torch huggingface_hub

In [None]:
import gc
import time
import torch
import json
import os
from pathlib import Path
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from datasets import load_dataset
from huggingface_hub import login, HfApi

# Allocator option handed to PyTorch through the environment.
# NOTE(review): this is assigned after `import torch`; presumably it has to be
# in place before the first CUDA allocation to take effect — confirm.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Base checkpoint that is loaded and quantized below.
model_id = "mistralai/Mistral-7B-v0.1"
# Hugging Face account that owns the upload repository.
hf_username = "zahraase1im"
# Quantization bit width; also embedded in the upload repository name.
bits = 4

In [None]:
def clear_memory() -> None:
    """Run Python garbage collection, then call ``torch.cuda.empty_cache()``."""
    # presumably ordered this way so that tensors freed by the collector can
    # be released from the CUDA cache by the second call as well
    gc.collect()
    torch.cuda.empty_cache()

def prepare_calibration_data(tokenizer, n_samples=128, max_length=2048):
    """Build calibration examples from the WikiText-2 (raw) training split.

    Rows whose text is 200 characters or shorter are dropped. The first
    ``n_samples`` remaining rows are tokenized one at a time, truncated to
    ``max_length`` and left unpadded.

    Args:
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt",
            max_length=..., truncation=True, padding=False)``; its result must
            expose ``"input_ids"`` and ``"attention_mask"``.
        n_samples: Upper bound on the number of examples returned.
        max_length: Truncation length forwarded to the tokenizer.

    Returns:
        A list of dicts, each holding the ``"input_ids"`` and
        ``"attention_mask"`` produced for one row.
    """
    corpus = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
    corpus = corpus.filter(lambda row: len(row["text"]) > 200)

    def encode(text):
        # One row in, one calibration example out.
        encoded = tokenizer(
            text,
            return_tensors="pt",
            max_length=max_length,
            truncation=True,
            padding=False,
        )
        return {
            "input_ids": encoded["input_ids"],
            "attention_mask": encoded["attention_mask"],
        }

    sample_count = min(n_samples, len(corpus))
    return [encode(corpus[idx]["text"]) for idx in range(sample_count)]

In [None]:
# Kaggle-specific: the Hugging Face token is read from the notebook secret
# named "HF_TOKEN" and passed straight to login(), so it is never written into
# the notebook or bound to a variable.
from kaggle_secrets import UserSecretsClient
login(token=UserSecretsClient().get_secret("HF_TOKEN"))

In [None]:
# Tokenizer of the base checkpoint; used for calibration and later saved next
# to the quantized weights.
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Tokenize the calibration set once; the resulting list is what the
# quantization cell passes to model.quantize().
print("Preparing calibration data")
calibration_data = prepare_calibration_data(tokenizer, n_samples=128, max_length=2048)
print(f"Prepared {len(calibration_data)} samples")

In [None]:
print(f"Quantizing to {bits}-bit")
clear_memory()

# GPTQ settings; the bit width comes from the config cell.
quantize_config = BaseQuantizeConfig(
    bits=bits,
    group_size=128,
    desc_act=False,
    damp_percent=0.01
)

# Load the full-precision checkpoint with explicit per-device memory caps
# (GPU 0 and CPU).
model = AutoGPTQForCausalLM.from_pretrained(
    model_id,
    quantize_config=quantize_config,
    max_memory={0: "10GiB", "cpu": "50GiB"}
)

# perf_counter() is monotonic, so the measured duration cannot be distorted by
# system clock adjustments during this long-running step.
quantize_start = time.perf_counter()
model.quantize(calibration_data, use_triton=False, batch_size=1, cache_examples_on_gpu=False)
quantize_time = time.perf_counter() - quantize_start
print(f"Quantization completed in {quantize_time:.2f} seconds")

In [None]:
# Local output folder for the quantized checkpoint and its tokenizer.
save_dir = "/kaggle/working/mistral-7b-gptq-4bit"
model.save_quantized(save_dir, use_safetensors=True)
tokenizer.save_pretrained(save_dir)

# Store the elapsed quantization time next to the weights; the Phase 2 results
# cell reads this file back.
with open(f"{save_dir}/quantize_time.json", "w") as f:
    json.dump({"quantization_time_seconds": quantize_time}, f)

print(f"Saved to {save_dir}")

In [None]:
# Destination repository on the Hub: <hf_username>/mistral-7b-gptq-<bits>bit.
target_repo = f"{hf_username}/mistral-7b-gptq-{bits}bit"
print(f"Uploading to HuggingFace: {target_repo}")

api = HfApi()
# Create the public model repo if it does not exist yet, then push the whole
# save folder (weights, tokenizer files, quantize_time.json).
api.create_repo(repo_id=target_repo, exist_ok=True, private=False)
api.upload_folder(folder_path=save_dir, repo_id=target_repo, repo_type="model")
print("Upload complete")

# Drop the quantized model and reclaim memory now that it is saved and uploaded.
del model
clear_memory()

## Phase 2: Evaluation with ExLlamaV2 (T4)

In [None]:
!pip install -q exllamav2 transformers datasets huggingface_hub

In [None]:
import gc
import time
import torch
import json
from pathlib import Path
from datasets import load_dataset
from exllamav2 import ExLlamaV2, ExLlamaV2Config, ExLlamaV2Cache, ExLlamaV2Tokenizer
from exllamav2.generator import ExLlamaV2StreamingGenerator, ExLlamaV2Sampler

In [22]:
def clear_memory() -> None:
    """Collect Python garbage and then empty PyTorch's CUDA cache."""
    # presumably collected first so that memory held by unreachable tensors
    # can also be released by empty_cache()
    gc.collect()
    torch.cuda.empty_cache()

def calculate_perplexity(model, tokenizer_obj, cache, max_samples=100, max_length=512):
    """Estimate perplexity on the WikiText-2 (raw) test split.

    Rows with more than 100 characters are kept; the first ``max_samples`` of
    them are scored independently. Each row is tokenized, truncated to
    ``max_length`` tokens and run through ``model.forward``; the summed
    next-token cross-entropy over all rows is divided by the number of
    predicted tokens and exponentiated.

    Args:
        model: Object whose ``forward(input_ids, cache, input_mask=None)``
            returns logits of shape ``(seq, vocab)`` or ``(1, seq, vocab)``.
        tokenizer_obj: Object whose ``encode(text)`` returns a 2-D tensor of
            token ids with the sequence on the last axis.
        cache: Cache object passed to ``forward``; its ``current_seq_len`` is
            reset to 0 before every row.
        max_samples: Upper bound on the number of rows scored.
        max_length: Maximum number of tokens kept per row.

    Returns:
        The perplexity as a Python float.

    Raises:
        ValueError: If no row contributed any token (empty dataset, every row
            shorter than two tokens, or every row skipped), instead of the
            bare ZeroDivisionError the division would otherwise raise.
    """
    dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    dataset = dataset.filter(lambda x: len(x["text"]) > 100)

    total_loss = 0.0
    total_tokens = 0

    for i in range(min(max_samples, len(dataset))):
        text = dataset[i]["text"]
        # Rows are scored independently, so rewind the cache for each one.
        cache.current_seq_len = 0

        input_ids = tokenizer_obj.encode(text)[:, :max_length]
        # At least two tokens are needed to form one (input, target) pair.
        if input_ids.shape[-1] < 2:
            continue

        logits = model.forward(input_ids, cache, input_mask=None)
        if logits.dim() == 3:
            logits = logits.squeeze(0)

        # Position t predicts token t+1. Cast to float32 so the summed loss is
        # not accumulated in reduced precision if the logits are half-precision.
        shift_logits = logits[:-1, :].float()
        shift_labels = input_ids[0, 1:].to(shift_logits.device)

        # Skip rows where the model did not return one logit row per token.
        if shift_logits.shape[0] != shift_labels.shape[0]:
            continue

        loss = torch.nn.functional.cross_entropy(
            shift_logits,
            shift_labels,
            reduction='sum'
        )

        total_loss += loss.item()
        total_tokens += shift_labels.numel()

    if total_tokens == 0:
        raise ValueError("calculate_perplexity: no tokens were scored; cannot compute perplexity")

    return torch.exp(torch.tensor(total_loss / total_tokens)).item()

def test_generation(generator, prompt="The capital of France is"):
    settings = ExLlamaV2Sampler.Settings()
    settings.temperature = 0.0
    settings.top_k = 1
    
    start_time = time.time()
    output = generator.generate_simple(prompt, settings, 50, seed=42)
    latency = (time.time() - start_time) * 1000
    
    return output, latency

def get_model_size_from_files(model_path):
    """Return the combined size, in GiB, of the weight files in a folder.

    Only files directly inside ``model_path`` whose names match
    ``*.safetensors`` or ``*.bin`` are counted; subfolders are not searched.

    Args:
        model_path: Folder to inspect (string or path-like).

    Returns:
        Total size of the matching files divided by 1024**3.
    """
    root = Path(model_path)
    weight_bytes = sum(
        entry.stat().st_size
        for pattern in ("*.safetensors", "*.bin")
        for entry in root.glob(pattern)
    )
    return weight_bytes / (1024 ** 3)

In [None]:
model_path = "/kaggle/working/mistral-7b-gptq-4bit"

# Phase 2 is meant to run in its own (T4) session, where the folder written by
# Phase 1 may not exist. In that case fetch the copy Phase 1 uploaded to the
# Hub (it includes quantize_time.json, which the results cell reads).
if not Path(model_path).is_dir():
    from huggingface_hub import snapshot_download
    snapshot_download(repo_id="zahraase1im/mistral-7b-gptq-4bit", local_dir=model_path)

# Point ExLlamaV2 at the checkpoint folder and let it read the model settings.
config = ExLlamaV2Config()
config.model_dir = model_path
config.prepare()

# The cache is created lazily and handed to load_autosplit(), which loads the
# weights.
model = ExLlamaV2(config)
cache = ExLlamaV2Cache(model, lazy=True)
model.load_autosplit(cache)

tokenizer_obj = ExLlamaV2Tokenizer(config)
generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer_obj)

In [23]:
# Score the loaded model on up to 100 WikiText-2 test rows, each truncated to
# 512 tokens.
print("Calculating perplexity")
perplexity = calculate_perplexity(model, tokenizer_obj, cache, max_samples=100, max_length=512)

Calculating perplexity


In [24]:
# Run the default prompt once; yields the generated text and the call latency
# in milliseconds.
print("Testing generation")
response, latency = test_generation(generator)

Testing generation


In [25]:
# Combined size of the *.safetensors / *.bin files in the model folder, in GiB.
model_size_gb = get_model_size_from_files(model_path)

In [26]:
# Read back the quantization duration that Phase 1 stored with the weights.
with open(f"{model_path}/quantize_time.json", "r") as f:
    quantize_time_data = json.load(f)
quantize_time = quantize_time_data["quantization_time_seconds"]

# One record with every metric gathered in this phase.
results = {
    "model": "mistral-7b-gptq-4bit",
    "quantization_time_seconds": quantize_time,
    "perplexity": perplexity,
    "latency_ms": latency,
    "model_size_gb": model_size_gb,
    "example_prompt": "The capital of France is",
    "example_response": response
}

# Persist the record next to the model files.
with open(f"{model_path}/results.json", "w") as f:
    json.dump(results, f, indent=2)

# Echo the same record to the cell output.
print("\nResults:", json.dumps(results, indent=2), sep="\n")

clear_memory()


Results:
{
  "model": "mistral-7b-gptq-4bit",
  "quantization_time_seconds": 1272.1622755527496,
  "perplexity": 4.959344863891602,
  "latency_ms": 976.264476776123,
  "model_size_gb": 3.8730560317635536,
  "example_prompt": "The capital of France is",
  "example_response": "The capital of France is Paris.\n\n## What is the capital of France?\n\nParis\n\n## What is the capital of France and why?\n\nParis\n\n## What is the capital of France and why is it called Paris?\n\n"
}
