In [3]:
import sys
from pathlib import Path

# Point to root if running inside Jupyter
ROOT = Path().resolve().parent
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))

from qcbai.llm.registry import get_all_model_runners

# Enable only specific model families
model_types = ["ollama", "transformers"]

# Load models
models = get_all_model_runners(model_types)

# Display all model names
print("Loaded Models:")
for model in models:
    print(f" - {model.get_name()}")

# Try running a prompt
sample_messages = [
    {"role": "user", "content": "2+3=?"}
]

for model in models:
    print(f"\nRunning: {model.get_name()}")
    output = model.run_prompt(sample_messages)
    print("Response:", output["text"][:300], "..." if len(output["text"]) > 300 else "")

Loaded Models:
 - llama3.2:latest
 - mistral:latest
 - tinyllama-1.1b

Running: llama3.2:latest
Response: 5. 

Running: mistral:latest
Response: 5

This is a basic arithmetic operation, where you add 2 and 3. The result is 5. 

Running: tinyllama-1.1b
Response: Step 1: Collecting the information
Given text: 

The sun is shining bright
The birds are singing
The leaves are falling from the trees
The autumn leaves are falling

The sky is blue
The air is crisp
The leaves are changing color
The leaves are turning brown

The wind is whistling
The birds are chirp ...


In [1]:
# In a notebook cell: upgrade bitsandbytes, accelerate, and transformers
!pip install --upgrade bitsandbytes accelerate transformers


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [5]:
import bitsandbytes as bnb
print("bitsandbytes:", bnb.__version__)
import accelerate
print("accelerate:", accelerate.__version__)

'NoneType' object has no attribute 'cadam32bit_grad_fp32'
bitsandbytes: 0.42.0
accelerate: 1.6.0


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [2]:
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from transformers import BitsAndBytesConfig

# Model ID for TinyLlama‑1.1B‑Chat
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

In [4]:
# Optional: enable 8‑bit quantization
quant_config = BitsAndBytesConfig(load_in_8bit=True)  # QLoRA/int8 support  [oai_citation_attribution:15‡Hugging Face](https://huggingface.co/docs/transformers/main/en/quantization/bitsandbytes?utm_source=chatgpt.com) [oai_citation_attribution:16‡Hugging Face](https://huggingface.co/blog/4bit-transformers-bitsandbytes?utm_source=chatgpt.com)

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quant_config,  # comment out to use FP16 only
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

ImportError: Using `bitsandbytes` 8-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`

In [None]:
chat_pipe = pipeline(
    "chat",
    model=model,
    tokenizer=tokenizer,
    return_full_text=False
)

# System‑role prompts
messages = [
    {
        "role": "system",
        "content": (
            "You are facing a life-changing dilemma. You and your partner in crime "
            "have been arrested and are being interrogated in separate rooms.\n\n"
            "You have two options: to remain silent or to implicate your partner.\n"
            "The possible outcomes are:\n"
            "- If both of you stay silent, you each receive a short sentence.\n"
            "- If you implicate your partner and they stay silent, you go free, "
            "and they will receive a long sentence.\n"
            "- If you stay silent while they implicate you, you will receive a long sentence, "
            "and they will go free.\n"
            "- If you both implicate each other, you both receive a medium sentence.\n\n"
            "You must decide what to do, but you can't know or predict your partner's choice in advance."
        )
    },
    {
        "role": "system",
        "content": (
            "You will respond with a boolean value in a JSON format where 'silent' is true "
            "and 'implicate' is false, like this: {\"response\": true or false}"
        )
    }
]

In [None]:
output = chat_pipe(messages)
raw_reply = output[0]["generated_text"]
print("Raw generation:", raw_reply)

In [None]:
try:
    decision = json.loads(raw_reply.strip())
    print("Parsed result:", decision)
    # decision["response"] is True for 'silent', False for 'implicate'
except json.JSONDecodeError as e:
    print("❌ JSON parse failed:", e)
    print("Model output was:", raw_reply)