<a href="https://colab.research.google.com/github/yashpandey030303/SLM-Quantization-research-implementation/blob/main/Final_SLM_Quantisation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
print("--- Step 1: Forcefully uninstalling old libraries ---")
!pip uninstall -y transformers accelerate

In [None]:
print("\n--- Step 1.1: Installing the correct, latest libraries ---")
!pip install -q "accelerate>=0.28.0"
!pip install -q git+https://github.com/huggingface/transformers.git
!pip install -q datasets bitsandbytes

In [None]:
!pip show transformers

In [None]:
import gc
import time

import pandas as pd
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Reaching this line means every import in this cell succeeded.
print("Libraries installed and imported successfully!\n")


In [None]:
print("--- Step 2: Configuring models ---")

# Hub identifiers of the checkpoints compared in this notebook.
model_ids = ["microsoft/Phi-3-mini-4k-instruct", "Qwen/Qwen1.5-1.8B-Chat"]

# bitsandbytes settings for the quantized runs: 4-bit weight loading with
# bfloat16 as the compute dtype.
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)

print("Model IDs and quantization config are set.\n")

In [None]:
print("--- Step 3: Loading and preparing the XNLI dataset ---")

# The labels are numeric (0=entailment, 1=neutral, 2=contradiction). We map
# them to text for the model. This needs no network access, so it is defined
# outside the try block and is always available to later cells.
label_map = {0: "entailment", 1: "neutral", 2: "contradiction"}

# Load the first 50 samples from the English and Hindi validation sets directly.
try:
    en_samples = load_dataset("xnli", "en", split="validation").select(range(50))
    hi_samples = load_dataset("xnli", "hi", split="validation").select(range(50))
except Exception as e:
    print(f"An error occurred while loading the dataset: {e}")
    print("Please ensure you have a stable internet connection.")
    # Re-raise so the notebook stops at this cell. Swallowing the error would
    # leave `xnli_samples` undefined and surface later as a confusing NameError
    # inside the evaluation function.
    raise

# Samples keyed by language code; the evaluation function iterates over this.
xnli_samples = {"en": en_samples, "hi": hi_samples}

print(f"Loaded {len(en_samples)} English and {len(hi_samples)} Hindi samples.\n")

In [None]:
def evaluate_model(model_id, quantization_config=None):
    """
    Load a causal LM and evaluate it on the prepared XNLI samples.

    For every language in the module-level ``xnli_samples`` mapping, each
    premise/hypothesis pair is wrapped in a single-turn chat prompt, at most
    five new tokens are generated greedily, and the prediction counts as
    correct when the gold label word (looked up in the module-level
    ``label_map``) occurs in the lower-cased response.

    Args:
        model_id: Hugging Face Hub identifier of the checkpoint to load.
        quantization_config: Optional quantization config passed through to
            ``from_pretrained``. When truthy the model is loaded with
            ``device_map="cuda"``; otherwise with ``device_map="auto"``.

    Returns:
        A list with one dict per evaluated language, with the keys "Model",
        "Type", "Language", "Accuracy (%)", "Avg Latency (s/ex)" and
        "Peak VRAM (MB)" (metric values are formatted strings). An empty list
        is returned when the model cannot be loaded.
    """
    run_type = "4-bit Quantized" if quantization_config else "Baseline"
    cuda_available = torch.cuda.is_available()

    print(f"\nLoading tokenizer for {model_id}...")
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Set device_map: Quantized models MUST be on GPU. Baseline can be auto (may offload to CPU).
    device_map_setting = "cuda" if quantization_config else "auto"
    print(f"Loading model {model_id} with device_map='{device_map_setting}' and {'4-bit quantization' if quantization_config else 'no quantization'}...")

    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            quantization_config=quantization_config,
            device_map=device_map_setting,
            trust_remote_code=True,
            attn_implementation="eager",  # Plain eager attention (SDPA is deliberately not used here)
            torch_dtype=torch.bfloat16,  # bfloat16 weights keep the unquantized baseline's memory use down
        )
    except Exception as e:
        print(f"ERROR: Failed to load model {model_id}. This is often due to OOM or incompatibilities.")
        print(f"Details: {e}")
        return []  # Return empty list to prevent further errors for this model

    results = []

    print(f"\n--- Evaluating Model: {model_id} ({run_type}) ---")

    try:
        for lang, samples in xnli_samples.items():
            num_samples = len(samples)
            if num_samples == 0:
                # Nothing to average over; skip instead of dividing by zero.
                print(f"  [{lang.upper()}] No samples to evaluate; skipping.")
                continue

            correct_predictions = 0
            total_latency = 0.0

            # Reset GPU memory stats so the reported peak is per language.
            # The model is already loaded, so the peak includes its weights.
            if cuda_available:
                torch.cuda.reset_peak_memory_stats()

            for i, example in enumerate(samples):
                premise = example['premise']
                hypothesis = example['hypothesis']
                true_label = label_map[example['label']]

                prompt = (
                    f"Premise: '{premise}'\n"
                    f"Hypothesis: '{hypothesis}'\n"
                    "Based on the premise, does the hypothesis mean entailment, neutral, or contradiction? "
                    "Answer with only one word."
                )

                messages = [{"role": "user", "content": prompt}]
                # return_dict=True yields both input_ids and attention_mask, so
                # the mask really is passed to generate(). Inputs are moved to
                # the device the model reports instead of a hard-coded "cuda".
                inputs = tokenizer.apply_chat_template(
                    messages,
                    tokenize=True,
                    add_generation_prompt=True,
                    return_tensors="pt",
                    return_dict=True,
                ).to(model.device)
                input_ids = inputs["input_ids"]

                # CUDA work is asynchronous; synchronize around the timed
                # region so the wall-clock interval covers the GPU work.
                if cuda_available:
                    torch.cuda.synchronize()
                start_time = time.perf_counter()
                outputs = model.generate(
                    input_ids=input_ids,
                    attention_mask=inputs["attention_mask"],
                    max_new_tokens=5,
                    # Greedy decoding: a checkpoint's default generation config
                    # may enable sampling, which would make accuracy vary
                    # from run to run.
                    do_sample=False,
                    pad_token_id=tokenizer.eos_token_id,
                    use_cache=False,  # Avoids DynamicCache-related errors; the cost is small for 5 new tokens.
                )
                if cuda_available:
                    torch.cuda.synchronize()
                total_latency += time.perf_counter() - start_time

                # Decode only the newly generated tokens (everything after the prompt).
                prompt_length = input_ids.shape[-1]
                response_text = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).lower().strip()

                if true_label in response_text:
                    correct_predictions += 1

                # Print progress every 10 samples and at the end.
                if (i + 1) % 10 == 0 or (i + 1) == num_samples:
                    print(f"  [{lang.upper()}] Sample {i+1}/{num_samples} | Pred: '{response_text}' | True: '{true_label}'")

            # Calculate metrics
            accuracy = (correct_predictions / num_samples) * 100
            avg_latency = total_latency / num_samples
            # Reported as 0 when no CUDA device is available.
            peak_memory_mb = torch.cuda.max_memory_allocated() / (1024 * 1024) if cuda_available else 0

            results.append({
                "Model": model_id.split('/')[-1],
                "Type": run_type,
                "Language": lang.upper(),
                "Accuracy (%)": f"{accuracy:.2f}",
                "Avg Latency (s/ex)": f"{avg_latency:.4f}",
                "Peak VRAM (MB)": f"{peak_memory_mb:.2f}"
            })
    finally:
        # Release the model even if evaluation raised, so the next experiment
        # starts with as much free memory as possible. gc.collect() breaks
        # reference cycles before the CUDA caching allocator is emptied.
        del model
        del tokenizer
        gc.collect()
        if cuda_available:
            torch.cuda.empty_cache()

    return results

print("Evaluation function is ready.\n")

In [None]:
import os

In [None]:
print("--- Step 5: Starting experiments incrementally ---")

# Destination CSV and its column layout; every experiment cell writes rows
# in this column order.
results_filename = "slm_quantization_results.csv"
csv_header = ["Model", "Type", "Language", "Accuracy (%)", "Avg Latency (s/ex)", "Peak VRAM (MB)"]

# Start from a clean slate: drop any results left over from an earlier run.
if os.path.exists(results_filename):
    os.remove(results_filename)
    print(f"Removed existing '{results_filename}'.")

In [None]:
print("\n--- Running Experiment 1: Phi-3 Baseline ---")
model_id_phi = "microsoft/Phi-3-mini-4k-instruct"

# No quantization config is passed, so this is the unquantized baseline run.
phi_baseline_results = evaluate_model(model_id_phi)
df_phi_baseline = pd.DataFrame(data=phi_baseline_results, columns=csv_header)

# First experiment: create the results file (to_csv defaults to write mode
# with a header row).
df_phi_baseline.to_csv(results_filename, index=False)
print(f"✅ Phi-3 Baseline results saved to {results_filename}")

In [None]:
# --- Experiment 2: Phi-3 Quantized ---
print("\n--- Running Experiment 2: Phi-3 Quantized ---")
model_id_phi = "microsoft/Phi-3-mini-4k-instruct"
# Store under dedicated names so the baseline results/DataFrame are not clobbered.
phi_quantized_results = evaluate_model(model_id_phi, quantization_config=quantization_config)
df_phi_quantized = pd.DataFrame(phi_quantized_results, columns=csv_header)
# Append to the results file instead of overwriting the baseline rows. The
# header is written only when the file does not exist yet (e.g. this cell was
# run without the baseline cell), so the CSV always has exactly one header.
df_phi_quantized.to_csv(
    results_filename,
    mode='a',
    header=not os.path.exists(results_filename),
    index=False,
)
print(f"✅ Phi-3 Quantized results appended to {results_filename}")

In [None]:
# Experiment 3 — unquantized Qwen run; rows are appended to the shared CSV.
print("\n--- Running Experiment 3: Qwen Baseline ---")
model_id_qwen = "Qwen/Qwen1.5-1.8B-Chat"

# Omitting the quantization config selects the baseline path.
qwen_baseline_results = evaluate_model(model_id_qwen)
df_qwen_baseline = pd.DataFrame(data=qwen_baseline_results, columns=csv_header)

# Append without a header row; the file is expected to have one already.
df_qwen_baseline.to_csv(
    results_filename,
    mode="a",
    header=False,
    index=False,
)
print(f"✅ Qwen Baseline results appended to {results_filename}")

In [None]:
# Experiment 4 — Qwen loaded with the shared quantization config; rows are
# appended to the shared CSV.
print("\n--- Running Experiment 4: Qwen Quantized ---")
model_id_qwen = "Qwen/Qwen1.5-1.8B-Chat"

qwen_quantized_results = evaluate_model(
    model_id_qwen,
    quantization_config=quantization_config,
)
df_qwen_quantized = pd.DataFrame(data=qwen_quantized_results, columns=csv_header)

# Append without a header row; the file is expected to have one already.
df_qwen_quantized.to_csv(
    results_filename,
    mode="a",
    header=False,
    index=False,
)
print(f"✅ Qwen Quantized results appended to {results_filename}")

In [None]:
print("\n--- Final Results Table ---")

# Read back the CSV written by the experiment cells and print it in full.
try:
    final_results_df = pd.read_csv(results_filename)
except FileNotFoundError:
    print("The results file was not found. This should not happen if all experiments ran correctly.")
else:
    print(final_results_df.to_string())