#### Cell 6: Import Libraries and Define Core Configurations


In [None]:
# Cell 6: Import Libraries and Define Core Configurations

import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# --- Core Configurations ---
# Hugging Face Hub identifier of the small chat model used in this phase
# (chosen because it is VRAM-friendly).
LLM_MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Run the LLM on the GPU when PyTorch reports CUDA as available; otherwise
# fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Library versions this notebook is pinned against (make sure they are the
# ones active in your environment):
#   Transformers: 4.40.1
#   Accelerate:   0.28.0
#   BitsAndBytes: a compatible release (e.g., 0.41.x, 0.42.x, or 0.43.0 for
#                 transformers 4.40.x)

# Optional PyTorch memory-management switch for suspected fragmentation issues
# (unlikely to be needed with TinyLlama on 4GB VRAM).
# NOTE(review): the original guidance is to set this *before* the first
# PyTorch import, so to use it the line would have to move above `import torch`
# rather than be uncommented in place.
# os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Banner summarising the effective configuration.
print(
    "--- Phase 2: LLM Setup ---",
    f"Using device: {DEVICE}",
    f"Target LLM: {LLM_MODEL_ID}",
    sep="\n",
)

# Verify library versions (optional but good for debugging).
# The imports are aliased to avoid clashing with any notebook variable that
# happens to be named after one of the packages.
import transformers as hf_transformers
import accelerate as acc

# bitsandbytes is only used by the 4-bit CUDA path. A missing or unusable
# install must not abort this purely informational step, otherwise CPU-only
# environments could never get past this cell.
try:
    import bitsandbytes as bnb
except (ImportError, RuntimeError, OSError) as bnb_import_error:
    bnb = None  # Keep the alias defined so later `bnb is None` checks are possible
    bnb_version_report = f"unavailable ({bnb_import_error})"
else:
    bnb_version_report = bnb.__version__

print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {hf_transformers.__version__}")
print(f"Accelerate version: {acc.__version__}")
print(f"Bitsandbytes version: {bnb_version_report}")

#### Cell 7: Configure 4-bit Quantization (if using CUDA)



In [None]:
# Cell 7: Configure 4-bit Quantization (if using CUDA)
#
# Produces two module-level values for the loading step:
#   bnb_config_llm   - a BitsAndBytesConfig for 4-bit loading, or None when
#                      quantization is not configured (CPU, or config error).
#   dtype_to_use_llm - float16 on CUDA, float32 otherwise.

# Start from the CPU-style defaults; the CUDA branch below overrides them.
bnb_config_llm = None
dtype_to_use_llm = torch.float32

if DEVICE == "cuda":
    print("\nConfiguring BitsAndBytes for 4-bit quantization on CUDA...")
    try:
        # - load_in_4bit: load the model weights in 4-bit precision.
        # - "nf4" (NormalFloat4): quantization type chosen for better precision.
        # - float16 compute dtype: fast and a safe choice for most GPUs,
        #   including older ones.
        # - double quantization off: it would save only a little more VRAM and
        #   is omitted for simplicity with a model as small as TinyLlama.
        bnb_config_llm = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=False,
        )
        # Keep the general dtype aligned with the quantized compute dtype.
        dtype_to_use_llm = torch.float16
        print("BitsAndBytesConfig created successfully.")
        print(f"  - load_in_4bit: {bnb_config_llm.load_in_4bit}")
        print(f"  - bnb_4bit_compute_dtype: {bnb_config_llm.bnb_4bit_compute_dtype}")
    except Exception as config_error:
        print(f"Error configuring BitsAndBytesConfig: {config_error}")
        print("Will attempt to load model on CUDA without BitsAndBytes quantization if it failed, or fallback to CPU.")
        # No quantization config; we are still on CUDA here, so an
        # unquantized load would use float16.
        bnb_config_llm = None
        dtype_to_use_llm = torch.float16
        # On CUDA the config can only be missing when this handler ran, so the
        # VRAM warning is emitted right here.
        print("Note: BitsAndBytesConfig failed or not applied. LLM will use more VRAM on CUDA (if it loads).")
elif DEVICE == "cpu":
    # The float32 / no-quantization defaults set above already apply.
    print("\nLLM will be loaded on CPU. BitsAndBytes 4-bit quantization is not applied for CPU.")

#### Cell 8: Load the TinyLlama Model and Tokenizer



In [None]:
# Cell 8: Load the TinyLlama Model and Tokenizer
#
# Outcome (module-level names):
#   llm_tokenizer    - the loaded tokenizer, or None if anything failed.
#   llm_model        - the loaded causal-LM, or None if anything failed.
#   llm_is_quantized - True only when the 4-bit BitsAndBytes load succeeded.
# Both llm_model and llm_tokenizer are reset to None on any error so later
# cells can guard on them.

llm_model = None          # Stays None if loading fails
llm_tokenizer = None      # Stays None if loading fails
llm_is_quantized = False  # Set once the quantized load has actually succeeded

# trust_remote_code lets the Hub repository run its own Python code on this
# machine. TinyLlama is a Llama-architecture checkpoint, which transformers
# supports natively, so that should not be needed; keep it off unless
# LLM_MODEL_ID is switched to a model that really ships custom modelling code.
LLM_TRUST_REMOTE_CODE = False

print(f"\nAttempting to load LLM: {LLM_MODEL_ID}")

try:
    # Load the tokenizer first (generally safe and quick).
    llm_tokenizer = AutoTokenizer.from_pretrained(
        LLM_MODEL_ID,
        trust_remote_code=LLM_TRUST_REMOTE_CODE,
    )
    if llm_tokenizer.pad_token is None:
        print("LLM Tokenizer pad_token is None. Setting to eos_token.")
        llm_tokenizer.pad_token = llm_tokenizer.eos_token
        # The model config is synchronised further down, once the model exists.
    print("LLM tokenizer loaded successfully.")

    # Load the model.
    if DEVICE == "cuda":
        print(f"Loading model on CUDA with quantization_config (if configured): {bnb_config_llm is not None}")
        model_kwargs_cuda = {
            "quantization_config": bnb_config_llm,  # None when no BnB config is available
            "device_map": {"": 0},                  # Place every layer on GPU 0
            "trust_remote_code": LLM_TRUST_REMOTE_CODE,
            "attn_implementation": "eager",         # Safest attention implementation for compatibility
        }
        if bnb_config_llm is None:
            # No quantization: load the weights in fp16 instead.
            model_kwargs_cuda["torch_dtype"] = torch.float16

        try:
            llm_model = AutoModelForCausalLM.from_pretrained(LLM_MODEL_ID, **model_kwargs_cuda)
            llm_is_quantized = bnb_config_llm is not None
        except (ImportError, RuntimeError) as quantized_load_error:
            if bnb_config_llm is None:
                raise  # Already the unquantized attempt: nothing left to fall back to
            # Some bitsandbytes problems (for example a build without a working
            # CUDA backend) may only surface during the load rather than when
            # the config object is created, so honour the advertised fp16
            # fallback here as well.
            print(f"4-bit load failed: {quantized_load_error}")
            print("Retrying on CUDA in float16 without BitsAndBytes quantization...")
            torch.cuda.empty_cache()  # Release anything cached by the failed attempt
            model_kwargs_cuda["quantization_config"] = None
            model_kwargs_cuda["torch_dtype"] = torch.float16
            llm_model = AutoModelForCausalLM.from_pretrained(LLM_MODEL_ID, **model_kwargs_cuda)

    elif DEVICE == "cpu":
        print("Loading model on CPU...")
        llm_model = AutoModelForCausalLM.from_pretrained(
            LLM_MODEL_ID,
            trust_remote_code=LLM_TRUST_REMOTE_CODE,
            torch_dtype=torch.float32,  # Standard for CPU
        )
        llm_model.to(DEVICE)  # Ensure it's on CPU if not by default

    print("LLM model loaded successfully!")

    # If the tokenizer pads with its EOS token and the model config has no
    # pad_token_id yet, copy the id over so both agree during generation.
    if (
        llm_model is not None
        and llm_tokenizer.pad_token == llm_tokenizer.eos_token
        and llm_tokenizer.eos_token_id is not None
        and getattr(llm_model.config, "pad_token_id", None) is None
    ):
        print(f"Updating model's config.pad_token_id to tokenizer.eos_token_id ({llm_tokenizer.eos_token_id})")
        llm_model.config.pad_token_id = llm_tokenizer.eos_token_id

except Exception as e:
    # Notebook-level boundary: report the failure in full and leave both
    # handles as None so later cells can skip their work cleanly.
    print("---------------------------------------------")
    print(f"ERROR loading LLM model or tokenizer: {e}")
    print("---------------------------------------------")
    import traceback
    traceback.print_exc()
    llm_model = None
    llm_tokenizer = None
    llm_is_quantized = False

if llm_model is not None and llm_tokenizer is not None:
    print(f"\nModel {LLM_MODEL_ID} and its tokenizer are loaded and ready.")
    if DEVICE == "cuda":
        if llm_is_quantized:
            print("Model is on CUDA. VRAM usage (approx for TinyLlama 4-bit): ~0.7-1.5GB + overheads.")
        else:
            # 1.1B parameters x 2 bytes per fp16 weight.
            print("Model is on CUDA in float16 (no 4-bit quantization). VRAM usage: roughly 2.2GB of weights + overheads.")
        print("Run nvidia-smi in a terminal to check actual VRAM usage.")
else:
    print(f"\nFailed to load LLM {LLM_MODEL_ID} or its tokenizer. Subsequent cells may not work.")

#### Cell 9: Basic Test Generation (TinyLlama)


In [None]:
# Cell 9: Basic Test Generation (TinyLlama)
#
# Smoke test: sends one user question through the chat template, samples up
# to 100 new tokens and prints only the newly generated text. Skipped with a
# message when the model or tokenizer is not loaded.

if llm_model is not None and llm_tokenizer is not None:
    print("\n--- Performing a basic test generation with TinyLlama Chat ---")

    # TinyLlama Chat uses a specific chat template. A system message such as
    # {"role": "system", "content": "You are a helpful assistant."} may
    # optionally be placed before the user turn.
    messages_test = [
        {"role": "user", "content": "What is the EU AI Act in simple terms?"},
    ]

    try:
        # Tokenize using the chat template and move the ids to the device the
        # model actually lives on, rather than assuming it matches DEVICE.
        input_ids_test = llm_tokenizer.apply_chat_template(
            messages_test,
            add_generation_prompt=True,  # Important for chat models
            return_tensors="pt",
        ).to(llm_model.device)

        # A single, unpadded prompt: every position is a real token. The mask
        # is passed explicitly because it cannot be inferred reliably when the
        # pad token id equals the EOS token id, as it does in the call below.
        attention_mask_test = torch.ones_like(input_ids_test)

        print(f"\nUser Query: {messages_test[-1]['content']}")  # Last message is the user turn
        print(f"Input_ids shape: {input_ids_test.shape}")

        # Generate text.
        with torch.no_grad():  # Disable gradient calculations for inference
            outputs_test = llm_model.generate(
                input_ids_test,
                attention_mask=attention_mask_test,
                max_new_tokens=100,       # Max new tokens to generate
                temperature=0.6,          # Controls randomness
                top_p=0.9,                # Nucleus sampling
                do_sample=True,           # Enable sampling
                pad_token_id=llm_tokenizer.eos_token_id,  # Pad with EOS for consistent generation
            )

        # The output includes the prompt, so slice it off and decode only the
        # newly generated tokens.
        prompt_length = input_ids_test.shape[1]
        response_tokens_test = outputs_test[0][prompt_length:]
        generated_text_test = llm_tokenizer.decode(response_tokens_test, skip_special_tokens=True).strip()

        print(f"Generated Text (TinyLlama):\n'{generated_text_test}'")

    except Exception as e:
        # Notebook-level boundary: show the full traceback instead of
        # aborting the run.
        print("--------------------------------------------------")
        print(f"Error during TinyLlama test generation: {e}")
        print("--------------------------------------------------")
        import traceback
        traceback.print_exc()
else:
    print("\nTinyLlama model or tokenizer not loaded. Cannot perform test generation.")