In [1]:
import openai
import pandas as pd
import time
import os
import base64

In [2]:
# from transformers import BitsAndBytesConfig
# quant_config = BitsAndBytesConfig(
#     load_in_4bit=True,  # Use True for 4-bit, False for 8-bit
#     bnb_4bit_compute_dtype="float16",  # Recommended dtype
#     bnb_4bit_use_double_quant=True,  # Enables double quantization
#     bnb_4bit_quant_type="nf4"  # Normalized float4 (better for LLMs)
# )

In [3]:
# # Use a pipeline as a high-level helper
# from transformers import pipeline
# messages = [
#     {"role": "user", "content": "Who are you?"},
# ]
# pipe = pipeline("text-generation", model="unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit")
# pipe(messages)

### Llama 3.2 11B Vision model in 4bit config generates around 1024 tokens per min with RTX4070 (16GB)
### how to speed up the inference with limited GPU

In [4]:
# # Load model directly
# from transformers import AutoProcessor, AutoModelForImageTextToText

# processor = AutoProcessor.from_pretrained("unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit")
# model = AutoModelForImageTextToText.from_pretrained("unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit")


# # Define the text input (prompt)
# instruction = "Write a creative story about an astronaut exploring a new planet."

# # Process the text input (no image required)
# inputs = processor(text=instruction, return_tensors="pt")

# # Ensure that input_ids are on the same device as the model
# device = model.device
# inputs = inputs.to(device)  # Move inputs to the correct device

# start_time = time.time()
# # Perform inference using the model
# outputs = model.generate(**inputs,max_new_tokens=1024)
# end_time = time.time()

# # Calculate the time taken
# elapsed_time = end_time - start_time
# print(f"Time taken to generate 1024 tokens: {elapsed_time:.2f} seconds")

# # Decode and print the output
# decoded_output = processor.decode(outputs[0], skip_special_tokens=True)
# print(decoded_output)

In [5]:
from unsloth import FastVisionModel # FastLanguageModel for LLMs
import torch

modelV, tokenizerV = FastVisionModel.from_pretrained(
    "unsloth/Llama-3.2-11B-Vision-Instruct",
    load_in_4bit = True, # Use 4bit to reduce memory use. False for 16bit LoRA.
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for long context
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.2.12: Fast Mllama vision patching. Transformers: 4.46.1.
   \\   /|    GPU: NVIDIA GeForce RTX 4070 Ti SUPER. Max memory: 15.693 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### Llama 3.2 11B Vision model (unsloth) in 4bit config generates around 1500 tokens per min with RTX4070 (16GB)
### ### Image + text to text

In [6]:
from PIL import Image

FastVisionModel.for_inference(modelV) # Enable for inference!

image_path = "ex1.jpg"  # Change this to your image path
image = Image.open(image_path).convert("RGB")

instruction = "Describe accurately what you see in this image."

messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": instruction}
    ]}
]
input_text = tokenizerV.apply_chat_template(messages, add_generation_prompt = True)
inputs = tokenizerV(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizerV, skip_prompt = True)

_ = modelV.generate(**inputs, streamer = text_streamer, max_new_tokens = 128,
                   use_cache = True, temperature = 1.5, min_p = 0.1)

The image appears to show the rotational forces exerted on an object when the force is applied with the thumb of a hand at point A, as well as an angle θ, a perpendicular line, the point where the force is applied at A, and point Z on the rotational axis. It's difficult to discern the entire image.<|eot_id|>


### Llama 3.2 3B text only model (unsloth) in 4bit config generates around ?? tokens per min with RTX4070 (16GB)
### ### Text to text

In [13]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models
fourbit_models = [
    "unsloth/Llama-3.2-1B-bnb-4bit",           # NEW! Llama 3.2 models
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
] 

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2025.2.12: Fast Llama patching. Transformers: 4.46.1.
   \\   /|    GPU: NVIDIA GeForce RTX 4070 Ti SUPER. Max memory: 15.693 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.03G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.7k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

In [18]:
### Chatml style format
### Uses OpenAI’s chatml format, which structures messages in a way that aligns with popular models like Zephyr, Mistral, LLaMA, etc.
### Converts roles (user, assistant) into a standardized format (human, gpt) to match the expected input for the model.
### Ensures that the model properly recognizes the end-of-sentence token (</s>), preventing endless text generation.

from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "chatml", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
    map_eos_token = True, # Maps <|im_end|> to </s> instead
)

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"from": "human", "value": "Answer the best option only:\ Inertia of a body depends on\
                                a) weight of the object\
                                b) acceleration due to gravity of the planet\
                                c) mass of the object\
                                d) Both a & b"
    },
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

outputs = model.generate(input_ids = inputs, max_new_tokens = 1024, use_cache = True)
tokenizer.batch_decode(outputs)

['<|im_start|>user\nAnswer the best option only : Inertia of a body depends on                                a) weight of the object                                b) acceleration due to gravity of the planet                                c) mass of the object                                d) Both a & b<|im_end|>\n<|im_start|>assistant\nThe best answer is (b) acceleration due to gravity of the planet.<|im_end|>']

In [15]:
### text streamer format
from unsloth import FastLanguageModel
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"from": "human", "value": "write a story about an astronaut"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 1024, use_cache = True)

<|im_start|>user
write a story about an astronaut<|im_end|>
<|im_start|>assistant
As the spaceship soared through the cosmos, astronaut Jack Harris gazed out at the endless expanse of stars and planets. He had spent years training for this moment, and now he was finally on his way to Mars.

Jack's heart swelled with pride as he thought about the incredible journey that had brought him to this point. From his childhood dreams of becoming an astronaut to the grueling training sessions and endless research on the cutting-edge spacecraft, Jack had worked tirelessly to reach his goal.

As he floated in the ship's cramped cabin, Jack felt a sense of peace wash over him. He was surrounded by the familiar sights and sounds of his spacecraft, the silence a comforting respite from the constant hum of machinery.

The ship's computer, an efficient and reliable companion, began to monitor Jack's vital signs and perform routine checks on the ship's systems. Jack's eyes flicked to the viewscreen, whe