In [1]:
from transformers import GPT2LMHeadModel, GPT2Config, GPT2Tokenizer, BitsAndBytesConfig
from datasets import load_dataset
import torch
import platform
import time
import wandb 
import numpy as np
from tqdm import tqdm
import logging
import json, pprint
from peft import PeftModel, PeftConfig

# Wandb Benchmarks 

In [2]:
print(f"Python version: {platform.python_version()}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU model: {torch.cuda.get_device_name(0)}")
    print(f"Number of GPUs: {torch.cuda.device_count()}")
    print(f"Available GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    print("No Cuda!")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Python version: 3.10.15
PyTorch version: 2.4.1+cu121
CUDA available: True
CUDA version: 12.1
GPU model: Tesla T4
Number of GPUs: 1
Available GPU memory: 15.64 GB
Using device: cuda


In [5]:
import torch
import time, math, logging
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GenerationConfig, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import load_dataset
import pandas as pd
import gc
import wandb
import numpy as np
from tqdm.auto import tqdm
import os
import json # Not strictly needed in this version but often useful with adapters
from transformers import GPT2Config   
original_base_model_name_for_tokenizer = "gpt2" # Tokenizer should ideally match what was saved with merged_fp16
flash_cfg = GPT2Config.from_pretrained(original_base_model_name_for_tokenizer)
flash_cfg.use_flash_attention = True     

for h in logging.root.handlers[:]:
    logging.root.removeHandler(h)
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s", force=True)
logger = logging.getLogger(__name__)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using primary device for stats: {device}")

model_path_to_evaluate = "./saved_models/distilled_lora_prepruned"    
original_base_model_name_for_tokenizer = "gpt2" # Tokenizer should ideally match what was saved with merged_fp16
max_length = 128
inference_batch_size = 16
num_inference_batches = 50
run_inference_benchmark = True

bnb_config_for_4bit_quantized_load = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16
)

try:
    run = wandb.init(
        project="Quantized and Flash Enabled", # Updated project
        name=f"Eval_{os.path.basename(model_path_to_evaluate)}_vs_4bit_UserLoad",
        config={
            "model_path_evaluated": model_path_to_evaluate,
            "original_base_model_name_for_tokenizer": original_base_model_name_for_tokenizer,
            "max_length": max_length,
            "inference_batch_size": inference_batch_size,
            "num_inference_batches": num_inference_batches,
            "run_inference_benchmark": run_inference_benchmark,
            "bnb_config_for_4bit_version": bnb_config_for_4bit_quantized_load.to_dict()
        }
    )
    logger.info("Weights & Biases initialized successfully.")
except Exception as e:
    logger.error(f"Failed to initialize Weights & Biases: {e}")
    run = None

logger.info("Loading data...")
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
tokenizer = GPT2Tokenizer.from_pretrained(original_base_model_name_for_tokenizer)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
logger.info(f"Using tokenizer: {tokenizer.name_or_path} with padding_side='{tokenizer.padding_side}'")

val_texts_full = [t for t in dataset["validation"]["text"] if t.strip()]
test_texts_full = val_texts_full[:inference_batch_size * num_inference_batches]
logger.info(f"Data loaded. Val/Test texts: {len(val_texts_full)}")

@torch.no_grad()
def compute_perplexity(model, tokenizer, texts, device, batch_size=8, max_length=128):
    model.eval()
    losses = []
    total_evaluated = 0
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        if not batch: continue
        total_evaluated += len(batch)
        inputs = tokenizer(batch, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to(device)
        outputs = model(**inputs, labels=inputs.input_ids)
        if hasattr(outputs, 'loss') and outputs.loss is not None:
            losses.append(outputs.loss.item() * len(batch))
    if not losses or total_evaluated == 0: return float('inf')
    avg_loss = sum(losses) / total_evaluated
    if avg_loss <= 0: return float('inf')
    return math.exp(avg_loss)

@torch.no_grad()
def benchmark_inference(model, tokenizer, texts, eval_device_ignored, batch_size=8, max_length=128, num_batches=50, generation=False):
    model.eval()
    latencies = []
    total_samples = 0
    logger.info(f"--- Starting Inference Benchmark (Generation: {generation}) ---")
    generation_config = GenerationConfig(max_new_tokens=5, pad_token_id=tokenizer.pad_token_id, eos_token_id=model.config.eos_token_id if hasattr(model.config, 'eos_token_id') else tokenizer.eos_token_id) if generation else None
    
    for i in tqdm(range(0, min(len(texts), batch_size * num_batches), batch_size), desc=f"Inference Gen={generation}", leave=False):
        batch_texts = texts[i:i+batch_size]
        if not batch_texts: continue
        
        model_input_device = next(model.parameters()).device
        inputs = tokenizer(batch_texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to(model_input_device)
        
        batch_samples = inputs['input_ids'].shape[0]; total_samples += batch_samples
        start_time = time.perf_counter()
        if generation: _ = model.generate(**inputs, generation_config=generation_config)
        else: _ = model(**inputs)
        if model_input_device.type == "cuda": torch.cuda.synchronize(model_input_device)
        end_time = time.perf_counter(); latencies.append(end_time - start_time)
    if not latencies: return {"avg_inference_latency_ms_per_sample": float('nan'), "avg_inference_throughput_samples_sec": float('nan')}
    total_time_secs = sum(latencies)
    throughput_samples_sec = total_samples / total_time_secs if total_time_secs > 0 else 0
    avg_latency_sample = (total_time_secs / total_samples) * 1000 if total_samples > 0 else 0
    logger.info(f"--- Finished Inference Benchmark (Generation: {generation}) ---")
    return {"avg_inference_latency_ms_per_sample": avg_latency_sample, "avg_inference_throughput_samples_sec": throughput_samples_sec}

all_results = {}

def evaluate_model_version(model_load_path, model_label, load_quantized, bnb_config=None):
    logger.info(f"\n===== Evaluating: {model_label} from {model_load_path} =====")
    model = None
    eval_results = {}
    if device.type == "cuda": torch.cuda.reset_peak_memory_stats(device)

    try:
        if not os.path.exists(model_load_path):
            raise FileNotFoundError(f"Model path does not exist: {model_load_path}")

        if load_quantized and bnb_config:
            logger.info("Loading model with 4-bit quantization and device_map='auto'...")
            model = AutoModelForCausalLM.from_pretrained(
                model_load_path,
                quantization_config=bnb_config,
                config=flash_cfg, 
                device_map="auto"
            )
        else:
            logger.info(f"Loading model as GPT2LMHeadModel and moving to device: {device}...")
            model = GPT2LMHeadModel.from_pretrained(model_load_path,config=flash_cfg).to(device)
        
        logger.info(f"Successfully loaded {model_label}")
        logger.info(f"Model type: {type(model)}")
        
        eval_device_for_model = next(model.parameters()).device if next(model.parameters(), None) is not None else device

        ppl = compute_perplexity(model, tokenizer, val_texts_full, eval_device_for_model,
                                   batch_size=inference_batch_size, max_length=max_length)
        eval_results["final_ppl"] = ppl
        logger.info(f"{model_label} - Final Validation Perplexity: {ppl:.2f}")

        if run_inference_benchmark:
            logger.info(f"Running inference benchmark for {model_label}...")
            inference_res_fwd = benchmark_inference(model, tokenizer, test_texts_full, eval_device_for_model, batch_size=inference_batch_size, max_length=max_length, num_batches=num_inference_batches, generation=False)
            inference_res_gen = benchmark_inference(model, tokenizer, test_texts_full, eval_device_for_model, batch_size=inference_batch_size, max_length=max_length, num_batches=num_inference_batches // 2, generation=True)
            eval_results.update({
                 "fwd_pass_latency_ms": inference_res_fwd["avg_inference_latency_ms_per_sample"],
                 "fwd_pass_throughput": inference_res_fwd["avg_inference_throughput_samples_sec"],
                 "gen_latency_ms": inference_res_gen["avg_inference_latency_ms_per_sample"],
                 "gen_throughput": inference_res_gen["avg_inference_throughput_samples_sec"]})
        
        current_mem_eval = 0
        if device.type == "cuda": torch.cuda.synchronize(); current_mem_eval = torch.cuda.max_memory_allocated(device) / 1024**2
        eval_results["peak_mem_mb_eval"] = current_mem_eval
        
        all_results[model_label] = eval_results
        if run and 'final_ppl' in eval_results and not math.isinf(eval_results['final_ppl']):
            wandb.log({f"Summary/{model_label}/{k}": v for k, v in eval_results.items()})

    except Exception as e:
        logger.error(f"An error occurred during {model_label} evaluation: {e}")
        all_results[model_label] = {"final_ppl": float('inf'), "peak_mem_mb_eval": "Error"}
    finally:
        if model is not None: del model
        gc.collect(); torch.cuda.empty_cache() if device.type == "cuda" else None

evaluate_model_version(
    model_load_path=model_path_to_evaluate,
    model_label="Distilled LoRA Pre-Pruned (Loaded FP)", 
    load_quantized=False
)

evaluate_model_version(
    model_load_path=model_path_to_evaluate, 
    model_label="Distilled LoRA Pre-Pruned (Loaded Quant 4-bit)",
    load_quantized=True,
    bnb_config=bnb_config_for_4bit_quantized_load
)

logger.info("\n===== Final Evaluation Comparison =====")
results_list_df = []
indices_df = []
ordered_labels = [
    "Distilled LoRA Pre-Pruned (Loaded FP)",
    "Distilled LoRA Pre-Pruned (Loaded Quant 4-bit)"
]
for label in ordered_labels:
    if label in all_results:
        results_list_df.append(all_results[label])
        indices_df.append(label)

if results_list_df:
    df = pd.DataFrame(results_list_df, index=indices_df)
    cols_to_rename = {"peak_mem_mb_eval": "Peak GPU Mem (MB) Eval", "final_ppl": "Final Val PPL"}
    if run_inference_benchmark:
        cols_to_rename.update({"fwd_pass_latency_ms": "Fwd Latency (ms)", "fwd_pass_throughput": "Fwd TP (samples/s)", "gen_latency_ms": "Gen Latency (ms)", "gen_throughput": "Gen TP (samples/s)"})
    df_display = df.rename(columns=cols_to_rename)
    display_columns_present = [col for col in ["Final Val PPL", "Peak GPU Mem (MB) Eval", "Fwd Latency (ms)", "Fwd TP (samples/s)", "Gen Latency (ms)", "Gen TP (samples/s)"] if col in df_display.columns]
    df_display = df_display[display_columns_present]
    format_map = {"Peak GPU Mem (MB) Eval": '{:,.1f}', "Final Val PPL": '{:.2f}', "Fwd Latency (ms)": '{:.1f}', "Fwd TP (samples/s)": '{:.1f}', "Gen Latency (ms)": '{:.1f}', "Gen TP (samples/s)": '{:.1f}'}
    for col, fmt in format_map.items():
        if col in df_display.columns:
            try: df_display[col] = df_display[col].apply(lambda x: fmt.format(x) if isinstance(x, (int, float)) and pd.notnull(x) and not (isinstance(x, float) and (math.isnan(x) or math.isinf(x))) else x)
            except Exception: logger.warning(f"Could not format column {col}. Skipping formatting.")
    logger.info("\nComparison DataFrame:\n%s", df_display.to_string())
    if run:
        try: 
            df_log = df_display.reset_index().rename(columns={'index': 'Method'})
            wandb.log({"Quantization_Impact_Comparison_Table": wandb.Table(dataframe=df_log)}) # Changed table name
            logger.info("Comparison table logged to Weights & Biases.")
        except Exception as e: logger.error(f"Failed to log DataFrame to Weights & Biases: {e}")
else:
    logger.error("No successful benchmark runs to compare.")

if run:
    wandb.finish()
    logger.info("Weights & Biases run finished.")
logger.info("\n===== Script Finished =====")

2025-05-08 01:42:58,650 - INFO - Using primary device for stats: cuda
[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mym3064[0m ([33mhpml_final_project[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


2025-05-08 01:43:01,367 - INFO - Weights & Biases initialized successfully.
2025-05-08 01:43:01,368 - INFO - Loading data...
2025-05-08 01:43:06,959 - INFO - Using tokenizer: gpt2 with padding_side='right'
2025-05-08 01:43:06,969 - INFO - Data loaded. Val/Test texts: 2461
2025-05-08 01:43:06,971 - INFO - 
===== Evaluating: Distilled LoRA Pre-Pruned (Loaded FP) from ./saved_models/distilled_lora_prepruned =====
2025-05-08 01:43:06,973 - INFO - Loading model as GPT2LMHeadModel and moving to device: cuda...
2025-05-08 01:43:10,293 - INFO - Successfully loaded Distilled LoRA Pre-Pruned (Loaded FP)
2025-05-08 01:43:10,294 - INFO - Model type: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'>
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
2025-05-08 01:43:35,510 - INFO - Distilled LoRA Pre-Pruned (Loaded FP) - Final Validation Perplexity: 8.35
2025-05-08 01:43:35,511 - INFO - Running inference benchmark for Distilled Lo

Inference Gen=False:   0%|          | 0/50 [00:00<?, ?it/s]

2025-05-08 01:43:43,123 - INFO - --- Finished Inference Benchmark (Generation: False) ---
2025-05-08 01:43:43,126 - INFO - --- Starting Inference Benchmark (Generation: True) ---


Inference Gen=True:   0%|          | 0/25 [00:00<?, ?it/s]

`generation_config` default values have been modified to match model-specific defaults: {'bos_token_id': 50256}. If this is not desired, please set these values explicitly.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

Inference Gen=False:   0%|          | 0/50 [00:00<?, ?it/s]

2025-05-08 01:44:03,228 - INFO - --- Finished Inference Benchmark (Generation: False) ---
2025-05-08 01:44:03,231 - INFO - --- Starting Inference Benchmark (Generation: True) ---


Inference Gen=True:   0%|          | 0/25 [00:00<?, ?it/s]

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='le

0,1
Summary/Distilled LoRA Pre-Pruned (Loaded FP)/final_ppl,▁
Summary/Distilled LoRA Pre-Pruned (Loaded FP)/fwd_pass_latency_ms,▁
Summary/Distilled LoRA Pre-Pruned (Loaded FP)/fwd_pass_throughput,▁
Summary/Distilled LoRA Pre-Pruned (Loaded FP)/gen_latency_ms,▁
Summary/Distilled LoRA Pre-Pruned (Loaded FP)/gen_throughput,▁
Summary/Distilled LoRA Pre-Pruned (Loaded FP)/peak_mem_mb_eval,▁
Summary/Distilled LoRA Pre-Pruned (Loaded Quant 4-bit)/final_ppl,▁
Summary/Distilled LoRA Pre-Pruned (Loaded Quant 4-bit)/fwd_pass_latency_ms,▁
Summary/Distilled LoRA Pre-Pruned (Loaded Quant 4-bit)/fwd_pass_throughput,▁
Summary/Distilled LoRA Pre-Pruned (Loaded Quant 4-bit)/gen_latency_ms,▁

0,1
Summary/Distilled LoRA Pre-Pruned (Loaded FP)/final_ppl,8.3456
Summary/Distilled LoRA Pre-Pruned (Loaded FP)/fwd_pass_latency_ms,8.6675
Summary/Distilled LoRA Pre-Pruned (Loaded FP)/fwd_pass_throughput,115.37356
Summary/Distilled LoRA Pre-Pruned (Loaded FP)/gen_latency_ms,12.2703
Summary/Distilled LoRA Pre-Pruned (Loaded FP)/gen_throughput,81.49762
Summary/Distilled LoRA Pre-Pruned (Loaded FP)/peak_mem_mb_eval,2112.68164
Summary/Distilled LoRA Pre-Pruned (Loaded Quant 4-bit)/final_ppl,8.45785
Summary/Distilled LoRA Pre-Pruned (Loaded Quant 4-bit)/fwd_pass_latency_ms,3.02657
Summary/Distilled LoRA Pre-Pruned (Loaded Quant 4-bit)/fwd_pass_throughput,330.40694
Summary/Distilled LoRA Pre-Pruned (Loaded Quant 4-bit)/gen_latency_ms,7.83654


2025-05-08 01:44:10,158 - INFO - Weights & Biases run finished.
2025-05-08 01:44:10,159 - INFO - 
===== Script Finished =====


# Group 2 metrics

In [6]:
def benchmark_inference(model, tokenizer, dataset, batch_size, max_length=128, num_batches=100):
    import time, torch, numpy as np
    device = next(model.parameters()).device
    test_texts = [t for t in dataset["test"]["text"] if t.strip()]

    model.eval()
    infer_times, infer_mems = [], []
    infer_thrpts, infer_perps = [], []

    with torch.no_grad():
        for i in range(num_batches):
            batch = test_texts[i*batch_size:(i+1)*batch_size]
            if not batch:
                break

            inputs = tokenizer(batch, padding=True, truncation=True, max_length=max_length, return_tensors="pt").to(device)

            start = time.time()
            outputs = model(**inputs, labels=inputs.input_ids)
            loss = outputs.loss
            perp = torch.exp(loss).item()
            elapsed = time.time() - start
            mem = torch.cuda.memory_allocated(device) / 1024**2

            infer_times.append(elapsed)
            infer_mems.append(mem)
            infer_thrpts.append(batch_size / elapsed)
            infer_perps.append(perp)

    return {
        "time":       (np.mean(infer_times),   np.std(infer_times)),
        "memory":     (np.mean(infer_mems),    np.std(infer_mems)),
        "throughput": (np.mean(infer_thrpts),  np.std(infer_thrpts)),
        "perplexity": (np.mean(infer_perps),   np.std(infer_perps))
    }


In [7]:
max_length = 128
model_path_to_evaluate = "./saved_models/distilled_lora_prepruned"
bnb_config_for_4bit_quantized_load = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16
)
model = AutoModelForCausalLM.from_pretrained(
                model_path_to_evaluate,
                quantization_config=bnb_config_for_4bit_quantized_load,
                device_map="auto",
                config=flash_cfg   
            )

for bs in [8, 16, 32]:
    logger.info(f"\nRunning benchmark for Quant+Flash model with batch_size={bs}, max_length={max_length}")
    logger.info("Configuration:")
    logger.info(f"  • Model:  Quant+Flash on GPT-2")
    logger.info(f"  • Batch size:   {bs}")
    logger.info(f"  • Max length:   {max_length}")

    infer_stats = benchmark_inference(model, tokenizer, dataset, batch_size=bs)

#     # Training results
#     t = train_stats
#     logger.info("\nTraining:")
#     logger.info(f"  Average time per batch:    {t['time'][0]:.4f} ± {t['time'][1]:.4f} seconds")
#     logger.info(f"  Average memory usage:      {t['memory'][0]:.2f} ± {t['memory'][1]:.2f} MB")
#     logger.info(f"  Average throughput:        {t['throughput'][0]:.2f} ± {t['throughput'][1]:.2f} samples/second")
#     logger.info(f"  Average loss:              {t['loss'][0]:.4f} ± {t['loss'][1]:.4f}")
#     logger.info(f"  Average perplexity:        {t['perplexity'][0]:.4f} ± {t['perplexity'][1]:.4f}")

    # Inference results
    i = infer_stats
    logger.info("\nInference:")
    logger.info(f"  Average time per batch:    {i['time'][0]:.4f} ± {i['time'][1]:.4f} seconds")
    logger.info(f"  Average memory usage:      {i['memory'][0]:.2f} ± {i['memory'][1]:.2f} MB")
    logger.info(f"  Average throughput:        {i['throughput'][0]:.2f} ± {i['throughput'][1]:.2f} samples/second")
    logger.info(f"  Average perplexity:        {i['perplexity'][0]:.4f} ± {i['perplexity'][1]:.4f}")

2025-05-08 01:44:10,245 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
2025-05-08 01:44:10,718 - INFO - 
Running benchmark for Quant+Flash model with batch_size=8, max_length=128
2025-05-08 01:44:10,719 - INFO - Configuration:
2025-05-08 01:44:10,720 - INFO -   • Model:  Quant+Flash on GPT-2
2025-05-08 01:44:10,720 - INFO -   • Batch size:   8
2025-05-08 01:44:10,721 - INFO -   • Max length:   128
2025-05-08 01:44:14,548 - INFO - 
Inference:
2025-05-08 01:44:14,549 - INFO -   Average time per batch:    0.0305 ± 0.0046 seconds
2025-05-08 01:44:14,549 - INFO -   Average memory usage:      277.50 ± 37.13 MB
2025-05-08 01:44:14,550 - INFO -   Average throughput:        268.01 ± 41.32 samples/second
2025-05-08 01:44:14,551 - INFO -   Average perplexity:        11.6864 ± 7.8461
2025-05-08 01:44:14,551 - INFO - 
Running benchmark for Quant+Flash 