# Model Loading Debug Notebook

A simple notebook for testing and debugging the model loading function.


In [None]:
import os
import torch
from transformers import pipeline, Pipeline, BitsAndBytesConfig
from typing import Dict, Any, Optional


In [None]:
def get_quantization_config(quantization: str) -> Optional[BitsAndBytesConfig]:
    """
    Translate a quantization identifier into a BitsAndBytesConfig.

    Args:
        quantization: One of 'none', '8bit' or '4bit' (matched exactly)

    Returns:
        None for 'none'; otherwise a BitsAndBytesConfig set up for 8-bit
        loading, or for 4-bit loading with the "nf4" quant type, double
        quantization and a bfloat16 compute dtype

    Raises:
        ValueError: If the identifier is not one of the values above
    """
    # Unquantized loading needs no config object at all.
    if quantization == "none":
        return None

    if quantization == "8bit":
        return BitsAndBytesConfig(load_in_8bit=True)

    if quantization == "4bit":
        four_bit_options = {
            "load_in_4bit": True,
            "bnb_4bit_quant_type": "nf4",
            "bnb_4bit_use_double_quant": True,
            "bnb_4bit_compute_dtype": torch.bfloat16,
        }
        return BitsAndBytesConfig(**four_bit_options)

    # Anything that fell through every guard above is unsupported.
    raise ValueError(f"Unknown quantization type: {quantization}")


In [None]:
def load_model_pipeline(model_config: Dict[str, Any], hf_token: Optional[str] = None) -> Pipeline:
    """
    Build a "text-generation" pipeline from a configuration dictionary.

    Args:
        model_config: Configuration dictionary. 'model_id' is required;
            'quantization' defaults to 'none', while 'torch_dtype' and
            'device_map' both default to 'auto'
        hf_token: Hugging Face token; forwarded as `token` only when truthy

    Returns:
        The pipeline object returned by `pipeline(...)`

    Raises:
        KeyError: If 'model_id' is missing from model_config
        ValueError: If 'quantization' is not accepted by
            get_quantization_config
    """
    requested_quantization = model_config.get('quantization', 'none')

    if requested_quantization != 'none':
        # Quantized loads always use 'auto'; any configured torch_dtype is ignored.
        dtype_choice = 'auto'
        extra_model_kwargs = {
            'quantization_config': get_quantization_config(requested_quantization),
        }
    else:
        dtype_choice = model_config.get('torch_dtype', 'auto')
        extra_model_kwargs = {}

    # The keyword is `dtype` rather than `torch_dtype` (per the original note,
    # this is the parameter name for transformers 4.55+).
    pipeline_arguments = {
        "model": model_config['model_id'],
        "dtype": dtype_choice,
        "device_map": model_config.get('device_map', 'auto'),
        "model_kwargs": extra_model_kwargs,
    }

    # Authentication is optional: only send a token when one was supplied.
    if hf_token:
        pipeline_arguments["token"] = hf_token

    return pipeline("text-generation", **pipeline_arguments)


## Model Configuration

Edit the cell below to test different model configurations:


In [None]:
# Model configuration - modify as needed
# Keys are read by load_model_pipeline: 'model_id' is required, the others
# fall back to defaults when omitted.
model_config = {
    'model_id': "meta-llama/Llama-3.2-1B-Instruct",
    'quantization': "none",  # Options: "none", "8bit", "4bit"
    'device_map': "auto",
    'torch_dtype': "auto"
}

# Get HuggingFace token from environment (set HF_TOKEN in your .env or environment)
hf_token = os.getenv('HF_TOKEN')
# Report availability by truthiness: load_model_pipeline only forwards a
# non-empty token, so an empty HF_TOKEN must not be shown as available.
print(f"HF Token available: {bool(hf_token)}")
print(f"Model ID: {model_config['model_id']}")
print(f"Quantization: {model_config['quantization']}")
print(f"Device Map: {model_config['device_map']}")
print(f"Torch Dtype: {model_config['torch_dtype']}")


## Load Model

This is where the model loading happens - any errors will appear here:


In [None]:
# Build the pipeline from the configuration and token defined earlier
print("Starting model loading...")
pipe = load_model_pipeline(model_config, hf_token)
print("Model loaded successfully!")

# Report the footprint only when the loaded model exposes that method
if hasattr(pipe.model, 'get_memory_footprint'):
    bytes_per_gib = 1024 ** 3
    footprint_gb = pipe.model.get_memory_footprint() / bytes_per_gib
    print(f"Model memory footprint: {footprint_gb:.2f} GB")


## Test Generation (Optional)

Test that the model works with a simple generation:


In [None]:
# Optional smoke test: run one short generation through the loaded pipeline
test_prompt = "Hello, how are you?"
print(f"Testing with prompt: '{test_prompt}'")

# Sampling is disabled, at most 50 new tokens are requested, and the prompt
# is not included in the returned text.
generation_options = {
    "max_new_tokens": 50,
    "do_sample": False,
    "return_full_text": False,
}
result = pipe(test_prompt, **generation_options)

print("\nGenerated output:")
print(result)


## Cleanup

Free up memory when done:


In [None]:
# Cleanup
import gc

# Drop the notebook's reference to the pipeline (and with it the model).
del pipe
# Run a collection pass so objects kept alive only by reference cycles are
# freed first; empty_cache() can only hand back cached memory that is no
# longer occupied by live tensors.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("GPU memory cleared")
else:
    print("No GPU detected")
