In [9]:
!pip install transformers torch accelerate bitsandbytes sentencepiece huggingface_hub -q

# Log in to Hugging Face Hub
from huggingface_hub import notebook_login
notebook_login()

print("Libraries installed and Hugging Face login initiated.")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Libraries installed and Hugging Face login initiated.


In [10]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import gc # For garbage collection if needed


def load_model_auto_precision(model_id, trust_remote=False):
    """Load a causal language model and its tokenizer.

    Args:
        model_id: Identifier passed to both ``from_pretrained`` calls
            (e.g. ``"Qwen/Qwen2.5-0.5B-Instruct"``).
        trust_remote: Forwarded as ``trust_remote_code`` to BOTH the tokenizer
            and the model loader, so a repository that ships custom code is
            handled consistently for the two objects.

    Returns:
        A ``(tokenizer, model)`` tuple on success, or ``(None, None)`` if
        anything raised while loading (the error is printed, not re-raised).
    """
    print(f"\nLoading model: {model_id} with torch_dtype='auto'...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            model_id,
            trust_remote_code=trust_remote,
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            # "auto" follows the dtype recorded with the checkpoint rather than
            # always up-casting the weights to float32.
            torch_dtype="auto",
            # Let the library decide weight placement (GPU when one is available).
            device_map="auto",
            trust_remote_code=trust_remote,
        )
        print(f"{model_id} loaded successfully in {model.dtype}.")
        return tokenizer, model
    except Exception as e:
        # Best-effort loader: report the failure and let the caller check for None
        # instead of aborting the whole notebook run.
        print(f"Error loading {model_id}: {e}")
        print("Please ensure you have accepted the license terms on Hugging Face for Llama models and are correctly logged in.")
        return None, None

# --- Helper function for generation ---
def generate_sample_response(model, tokenizer, prompt_text, system_message_content=None, max_new_tokens=75):
    """Generate one short chat reply and print the prompt and the response.

    Args:
        model: Loaded causal LM exposing ``generate``, ``device`` and ``config``;
            if ``None`` the call is skipped.
        tokenizer: Matching tokenizer with a chat template; if ``None`` the call
            is skipped. Its ``pad_token_id`` is set to ``eos_token_id`` when unset.
        prompt_text: Content of the user message.
        system_message_content: Optional system message placed before the user turn.
        max_new_tokens: Upper bound on generated tokens (default 75 keeps the
            smoke test quick).

    Returns:
        None. Output is printed; generation errors are caught and printed.
    """
    if model is None or tokenizer is None:
        print("Model or tokenizer not loaded, skipping generation.")
        return

    messages = []
    if system_message_content:
        messages.append({"role": "system", "content": system_message_content})
    messages.append({"role": "user", "content": prompt_text})

    # Handle potential missing pad_token_id for some tokenizers (especially Llama)
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    # Resolved up front so the error handler below cannot itself fail on the lookup.
    model_name = getattr(getattr(model, "config", None), "_name_or_path", "<unknown model>")

    try:
        # apply_chat_template handles the specific formatting for the model
        text_input = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        # The rendered template already contains every special token it needs;
        # letting the tokenizer add its own again would duplicate them (e.g. a
        # second BOS on Llama-style templates).
        model_inputs = tokenizer(
            [text_input],
            return_tensors="pt",
            add_special_tokens=False,
        ).to(model.device)

        print(f"\nGenerating response for: {model_name}")
        print(f"Prompt: {prompt_text}")

        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.pad_token_id,  # Ensure pad_token_id is set
        )

        # A causal LM returns prompt + continuation, so drop the prompt tokens.
        # An empty continuation stays empty instead of echoing the prompt back.
        prompt_length = len(model_inputs.input_ids[0])
        output_ids = generated_ids[0][prompt_length:]

        response = tokenizer.decode(output_ids, skip_special_tokens=True)
        print(f"Response: {response}")
    except Exception as e:
        print(f"Error during generation for {model_name}: {e}")

In [1]:
##

In [11]:
# Smoke test for the Qwen 2.5 0.5B instruct checkpoint: load it, then ask one question.
model_id_qwen = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer_qwen, model_qwen = load_model_auto_precision(
    model_id=model_id_qwen,
    trust_remote=True,
)

if model_qwen:
    generate_sample_response(
        model=model_qwen,
        tokenizer=tokenizer_qwen,
        prompt_text="What is the capital of France?",
        system_message_content="You are Qwen, created by Alibaba Cloud. You are a helpful assistant.",
    )
    # Optional cleanup if memory gets tight before loading further models:
    #   del model_qwen
    #   del tokenizer_qwen
    #   gc.collect()
    #   torch.cuda.empty_cache()


Loading model: Qwen/Qwen2.5-0.5B-Instruct with torch_dtype='auto'...
Qwen/Qwen2.5-0.5B-Instruct loaded successfully in torch.bfloat16.

Generating response for: Qwen/Qwen2.5-0.5B-Instruct
Prompt: What is the capital of France?
Response: The capital of France is Paris.


In [12]:
# Smoke test for Llama 3.2 1B Instruct: load it, then ask one science question.
model_id_llama1b = "meta-llama/Llama-3.2-1B-Instruct"
# trust_remote is left at its default (False) for the official Meta Llama repository.
tokenizer_llama1b, model_llama1b = load_model_auto_precision(
    model_id=model_id_llama1b,
)

if model_llama1b:
    generate_sample_response(
        model=model_llama1b,
        tokenizer=tokenizer_llama1b,
        prompt_text="Explain the concept of photosynthesis in simple terms.",
        system_message_content="You are a helpful science tutor.",
    )
    # Optional cleanup if memory gets tight before loading further models:
    #   del model_llama1b
    #   del tokenizer_llama1b
    #   gc.collect()
    #   torch.cuda.empty_cache()


Loading model: meta-llama/Llama-3.2-1B-Instruct with torch_dtype='auto'...


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

meta-llama/Llama-3.2-1B-Instruct loaded successfully in torch.bfloat16.

Generating response for: meta-llama/Llama-3.2-1B-Instruct
Prompt: Explain the concept of photosynthesis in simple terms.
Response: Photosynthesis is a fundamental concept in biology that's essential for life on Earth. I'd be happy to explain it in simple terms.

**What is Photosynthesis?**

Photosynthesis is a process that occurs in plants, algae, and some bacteria. It's a way for these organisms to convert sunlight, water, and carbon dioxide into glucose (a type of sugar) and


In [13]:
# Smoke test for Llama 3.2 3B Instruct: load it, then request a short story.
model_id_llama3b = "meta-llama/Llama-3.2-3B-Instruct"
# trust_remote is left at its default (False) for the official Meta Llama repository.
tokenizer_llama3b, model_llama3b = load_model_auto_precision(
    model_id=model_id_llama3b,
)

if model_llama3b:
    generate_sample_response(
        model=model_llama3b,
        tokenizer=tokenizer_llama3b,
        prompt_text="Write a short story about a robot who discovers music.",
        system_message_content="You are a creative storyteller.",
    )
    # Optional cleanup if memory gets tight afterwards:
    #   del model_llama3b
    #   del tokenizer_llama3b
    #   gc.collect()
    #   torch.cuda.empty_cache()


Loading model: meta-llama/Llama-3.2-3B-Instruct with torch_dtype='auto'...


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

meta-llama/Llama-3.2-3B-Instruct loaded successfully in torch.bfloat16.

Generating response for: meta-llama/Llama-3.2-3B-Instruct
Prompt: Write a short story about a robot who discovers music.
Response: In a world of wires and circuits, in a factory where robots hummed and whirred, one machine stood out from the rest. His name was Zeta, a sleek and shiny robot designed to assemble components with precision and speed. For years, Zeta performed his tasks without question, his processes running on autopilot.

But one fateful day, something changed.
