<a href="https://colab.research.google.com/github/weagan/Tiny-LLM/blob/main/TinyLLM_vs_TinyLlama_Benchmark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TinyLLM vs TinyLlama Performance Benchmark
This notebook compares the inference speed, memory usage, and sample outputs of TinyLLM and TinyLlama when both models complete the same prompt with greedy decoding.

In [None]:
!pip install transformers accelerate psutil



In [None]:
import time, psutil, torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def load_model(model_name):
    """Fetch the tokenizer and causal-LM weights for ``model_name``.

    Weights are requested in float16 when CUDA is available and in float32
    otherwise; ``device_map="auto"`` leaves device placement to the library.

    Args:
        model_name: Identifier handed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    print(f"\nLoading model: {model_name}")
    tok = AutoTokenizer.from_pretrained(model_name)

    # Half precision only makes sense when a GPU is present.
    if torch.cuda.is_available():
        weights_dtype = torch.float16
    else:
        weights_dtype = torch.float32

    lm = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype=weights_dtype, device_map="auto"
    )
    return tok, lm

def measure_inference_speed(tokenizer, model, prompt, max_new_tokens=50):
    """Time one greedy generation and report throughput and host-memory growth.

    Args:
        tokenizer: Tokenizer that is callable on ``prompt`` (returning an
            object with ``.to(device)`` and an ``"input_ids"`` entry) and
            provides ``decode``.
        model: Model exposing ``.device`` and ``.generate``.
        prompt: Text the model should continue.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        dict with keys:
            ``text``: decoded full output sequence (special tokens skipped).
            ``elapsed_seconds``: wall-clock duration of ``generate``.
            ``tokens_generated``: number of tokens actually produced, which
                can be smaller than ``max_new_tokens`` if generation stops
                early.
            ``tokens_per_second``: ``tokens_generated / elapsed_seconds``.
            ``memory_used_mb``: change in this process's resident set size
                (host RAM) across the call, in MiB. GPU memory is not
                included in this figure.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        torch.cuda.empty_cache()
        # Make sure no previously queued GPU work is charged to this run.
        torch.cuda.synchronize()

    process = psutil.Process()
    start_mem = process.memory_info().rss / (1024**2)
    start = time.perf_counter()

    with torch.inference_mode():
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False
        )

    if use_cuda:
        # Wait for outstanding kernels so the timer covers the whole generation.
        torch.cuda.synchronize()
    end = time.perf_counter()
    end_mem = process.memory_info().rss / (1024**2)

    text = tokenizer.decode(output[0], skip_special_tokens=True)
    # The returned sequence contains the prompt followed by the new tokens,
    # so the real count is the length difference, not the requested maximum.
    tokens_generated = int(output.shape[1]) - int(prompt_length)
    elapsed = end - start
    speed = tokens_generated / elapsed

    return {
        "text": text,
        "elapsed_seconds": elapsed,
        "tokens_generated": tokens_generated,
        "tokens_per_second": speed,
        "memory_used_mb": end_mem - start_mem
    }

In [None]:
# Hugging Face Hub identifiers of the two models being compared.
# NOTE(review): the recorded run of this notebook failed on this TinyLLM ID with
# "is not a local folder and is not a valid model identifier"; point it at a
# repository (or local folder) that actually exists before re-running.
tinyllm_model = "TinyLLMOrg/TinyLLM-30M"  # update if needed
# "TinyLlama/TinyLlama-1.1B" does not appear to be a published repo name (TODO
# confirm on the Hub); use the released 1.1B chat checkpoint instead.
tinyllama_model = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# Single prompt shared by both models so the comparison is like-for-like.
prompt = "Explain the concept of transfer learning in one paragraph:"

In [None]:
def test_model(model_name):
    """Load ``model_name`` and benchmark it on the notebook-level ``prompt``.

    Returns:
        The metrics dict produced by ``measure_inference_speed``.
    """
    # load_model returns (tokenizer, model), which are exactly the first two
    # positional arguments of measure_inference_speed.
    return measure_inference_speed(*load_model(model_name), prompt)


# Benchmark TinyLLM first, then TinyLlama; each result is kept for the
# summary table and the sample-output cells.
tinyllm_result = test_model(tinyllm_model)
tinyllama_result = test_model(tinyllama_model)


Loading model: TinyLLMOrg/TinyLLM-30M


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


OSError: TinyLLMOrg/TinyLLM-30M is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `hf auth login` or by passing `token=<your_token>`

In [None]:
import pandas as pd

# One summary row per model, built from the metrics dicts of the benchmark runs.
df = pd.DataFrame([
    {
        "Model": label,
        "Tokens/sec": metrics["tokens_per_second"],
        "Elapsed (s)": metrics["elapsed_seconds"],
        "Memory Δ (MB)": metrics["memory_used_mb"],
    }
    for label, metrics in (
        ("TinyLLM", tinyllm_result),
        ("TinyLlama", tinyllama_result),
    )
])

df

In [None]:
# Show each model's decoded text under its own banner.
for _label, _result in (
    ("TinyLLM", tinyllm_result),
    ("TinyLlama", tinyllama_result),
):
    print(f"\n=== {_label} Output ===\n")
    print(_result["text"])