<a href="https://colab.research.google.com/github/weezymatt/NER-with-Classical-ML/blob/main/src/notebooks/gemma_3_final_inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Gemma 3 — Inference

## Framework/ Library Installation

In [None]:
%%capture
# Installs unsloth and other dependencies optimized for colab
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --upgrade transformers accelerate bitsandbytes datasets asteval GPUtil

In [None]:
# Imports
import os
import re
import pdb

from tqdm import tqdm
import unsloth
from unsloth import FastLanguageModel
from datasets import load_dataset

In [None]:
# Use helper functions from repository

colab = True
if colab:
  if not os.path.exists("info621/"):
    !git clone https://github.com/srikrish2812/info621_project info621

## Load the final model and dataset

In [None]:
def load_model():
  """
  Load the fine-tuned Gemma 3 1B checkpoint through Unsloth.

  Returns:
    The `(model, tokenizer)` pair produced by
    `FastLanguageModel.from_pretrained`, after the model has been handed to
    `FastLanguageModel.for_inference`.
  """
  checkpoint = "abhay2812/gemma-3-1b-4bit-grpo"
  load_kwargs = {"load_in_4bit": True, "device_map": "auto"}

  loaded = FastLanguageModel.from_pretrained(checkpoint, **load_kwargs)
  model, tokenizer = loaded

  # Switch the model to Unsloth's inference mode before returning it.
  FastLanguageModel.for_inference(model)
  return model, tokenizer

model, tokenizer = load_model()

In [None]:
from info621.src.tasks.gsm8k import GSM8kTask
# from gsm8k import GSM8kTask


gsm8k = GSM8kTask()
dataset = gsm8k.get_questions()

## Inference
- Run inference with the final model on a single example (index 0 of the loaded `train` split).

In [None]:
from transformers import TextStreamer

In [None]:
sample = dataset['train'].select([0])
sample['prompt']

In [None]:
def run_model(sample, measure_p=False, max_new_tokens=256):
  """
  Generate a completion for a chat-formatted prompt.

  Args:
    sample: Chat messages in the form accepted by
      `tokenizer.apply_chat_template` (callers in this notebook pass the
      `prompt` field of a dataset row or of a hand-built message dict).
    measure_p: When True, no `TextStreamer` is attached to generation; the
      timing helpers call the function this way.
    max_new_tokens: Upper bound on the number of generated tokens.

  Returns:
    The value returned by `model.generate`.
  """
  prompt_text = tokenizer.apply_chat_template(
      sample,
      add_generation_prompt = True,
      tokenize = False,
  )
  encoded = tokenizer(prompt_text, return_tensors = "pt").to("cuda")

  # skip_prompt=True: stream only the newly generated text, not the prompt.
  streamer = None if measure_p else TextStreamer(tokenizer, skip_prompt=True)

  return model.generate(
      **encoded,
      max_new_tokens = max_new_tokens,
      temperature = 1.0, top_p = 0.95, top_k = 64,
      streamer = streamer,
  )

In [None]:
model_output = run_model(sample['prompt'], measure_p=False)

In [None]:
# Decode the first generated sequence back into text.
decoded_output = tokenizer.decode(model_output[0])
# Pull the predicted answer out of the decoded text with the GSM8k task helper.
y = gsm8k.extract_answer(decoded_output)
# Compare with the reference answer of the sample; as the last expression of
# the cell, the boolean result is what the notebook displays.
sample['answer'][0] == y

In [None]:
print("Code demonstration finished!")

## Measure Gemma 3 Performance — Latency on NVIDIA® T4 GPU
Latency refers to the time it takes for a model to output a response to a given input. LLM output can be delivered in one of two modes: streaming or non-streaming. The choice of mode directly affects user experience and is crucial when developing AI applications.

A few key metrics define latency, and they differ between streaming and non-streaming modes.

1. Time to first token (TTFT): The TTFT represents how quickly your application starts responding. It's the amount of time from when the user submits a query until a certain threshold is reached (i.e., first token, word, or chunk). Other variants include Time to last token (TTLT). The response time is affected by several factors:
  - Length of input prompt
  - Network conditions and geographic locations
  - **Calculation:** Time of first token - Time of query submission
  - Interpretation: lower is better

2. End-to-end latency (E2E): E2E latency measures the overall time to complete the response. Key factors that impact the response time:
  - Length of input prompt
  - Requested output length
  - Maximum amount of tokens the model produces
  - Complexity of the task
  - **Calculation:** Time at completion of request - Time of query submission
  - Interpretation: lower is better.

Warmup is shown to improve latency; therefore, each metric is evaluated with warmup.




In [None]:
import time
import GPUtil
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
def test_latency(sample, iterations=1, warmup=False, max_new_tokens=256):
  """
  Time repeated `run_model` calls on a single prompt.

  Args:
    sample: Mapping (or dataset selection) whose `prompt` entry is passed to
      `run_model`.
    iterations: Number of timed generations to perform.
    warmup: When True, run five untimed generations on samples obtained from
      `gsm8k.__getsamples__` before the timed runs.
    max_new_tokens: Generation cap forwarded to `run_model` for both the
      warmup and the timed runs.

  Returns:
    A list with one elapsed time in seconds per timed run.

  Raises:
    ValueError: If GPUtil reports no GPUs.
  """
  # Fail fast: check for a GPU before spending time on warmup generations.
  if not GPUtil.getGPUs():
    raise ValueError("No GPUs found.")

  if warmup:
    for _ in range(5):
      random_sample = gsm8k.__getsamples__(n_samples=1, split="train")
      run_model(random_sample['prompt'], measure_p=True, max_new_tokens=max_new_tokens)

  latencies = []
  for _ in range(iterations):
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # durations; time.time() can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    run_model(sample['prompt'], measure_p=True, max_new_tokens=max_new_tokens)
    latencies.append(time.perf_counter() - start_time)

  return latencies

In [None]:
def test_eval_latency(latencies, iterations=None):
  """
  Print the mean, maximum and minimum of a list of latencies.

  Args:
    latencies: Non-empty sequence of latency measurements in seconds.
    iterations: Trial count shown in the printed summary. When omitted, the
      number of measurements in `latencies` is used, so the label always
      matches the data being summarised.

  Raises:
    ValueError: If `latencies` is empty.
  """
  if len(latencies) == 0:
    raise ValueError("latencies must contain at least one measurement.")
  if iterations is None:
    iterations = len(latencies)

  print(f"Average latencies per {iterations} iterations: {np.mean(latencies):.4f} seconds")
  print(f"Maximum latency per {iterations} iterations: {np.max(latencies):.4f} seconds")
  print(f"Minimum latency per {iterations} iterations: {np.min(latencies):.4f} seconds")

In [None]:
def convert_to_milliseconds(vector):
  """
  Return `vector` as a NumPy array with every element multiplied by 1000.

  The notebook uses this to turn latencies measured in seconds into
  milliseconds.
  """
  seconds = np.array(vector)
  return seconds * 1000

Find the range of prompt token lengths in the training set of the dataset.
- The baseline number of tokens is 20.
- The interval is between 44 and 245 tokens.
- There will be 30 trials (iterations) for each token length with a step size of 25 tokens.

In [None]:
def tokenizer_(x):
  """
  Tokenize the chat prompt of one dataset row.

  Applies the tokenizer's chat template (with the generation prompt appended)
  to `x['prompt']` and returns the result under the `input_ids` key.
  """
  token_ids = tokenizer.apply_chat_template(
      x['prompt'],
      add_generation_prompt=True,
      tokenize=True,
  )
  return {"input_ids": token_ids}

def length(x):
  """Return `{"length": n}` where `n` is the number of entries in `x['input_ids']`."""
  n_tokens = len(x['input_ids'])
  return dict(length=n_tokens)

In [None]:
# Tokenize every prompt, then add a `length` column with its token count.
dataset_map = dataset.map(tokenizer_).map(length)
# Pandas copy of the train split, used to look up prompts by token length.
pd_dataset = dataset_map['train'].to_pandas()

In [None]:
intervals = [44, 70, 95, 120, 145, 170, 194, 219, 245] # token length intervals

In [None]:
# For each target token length, pick one random row whose prompt has exactly
# that many tokens and remember its index.
indices = []
for interval in intervals:
  sample_space = pd_dataset[pd_dataset['length'] == interval]
  # Sampling from an empty frame fails with an unhelpful message; report
  # which token length has no matching prompt instead.
  if sample_space.empty:
    raise ValueError(f"No prompt with exactly {interval} tokens was found in the dataset.")
  indices.append(sample_space.sample().index[0])

1. Measure baseline latencies and evaluate.

In [None]:
measure_latency = True
iterations = 30

In [None]:
baseline_prompt = "Hello, world."

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": baseline_prompt}
]

baseline_msg = {"prompt": messages}

In [None]:
if measure_latency:
  print("BASELINE_TTFT_WARMUP")
  # A single new token approximates the time to first token.
  baseline_ttft_warmup = test_latency(baseline_msg, iterations=iterations, warmup=True, max_new_tokens=1)
  # Report the summary against the number of trials actually run.
  test_eval_latency(baseline_ttft_warmup, iterations=iterations)

In [None]:
if measure_latency:
  print("BASELINE_E2E_WARMUP")
  # Up to 256 new tokens: measures the full end-to-end completion time.
  baseline_e2e_warmup = test_latency(baseline_msg, iterations=iterations, warmup=True, max_new_tokens=256)
  # Report the summary against the number of trials actually run.
  test_eval_latency(baseline_e2e_warmup, iterations=iterations)

1. Measure time to first token and evaluate.

In [None]:
if measure_latency:
  # Time to first token is approximated by capping generation at one token.
  ttft_settings = dict(iterations=iterations, warmup=True, max_new_tokens=1)

  latencies_ttft = {}
  for idx in tqdm(indices):
    random = dataset['train'].select([idx])
    ttft_warmup = test_latency(random, **ttft_settings)
    latencies_ttft[idx] = dict(warmup=ttft_warmup)

In [None]:
# One array of TTFT latencies (in milliseconds) per sampled prompt, in the
# order the prompts were measured.
data = [convert_to_milliseconds(run['warmup']) for run in latencies_ttft.values()]

Create a box plot to visualize the TTFT latency.

In [None]:
def create_boxplot(data, y, configs):
  """
  Draw a box plot, save it to disk and display it.

  Args:
    data: Groups of values passed to `ax.boxplot`; one box is drawn per group.
    y: Tick labels for the boxes.
    configs: Dict providing the `title`, `xlabel`, `ylabel`, `boxprops` and
      `png` (output file path) entries.

  Returns:
    The value returned by `plt.show()`.
  """
  _, axes = plt.subplots(figsize=(10, 5))

  # Background and grid styling.
  axes.set_facecolor(color='gainsboro')
  axes.yaxis.grid(color='white')

  # Titles and axis labels come from the config dict.
  axes.set_title(configs['title'])
  axes.set_xlabel(configs['xlabel'])
  axes.set_ylabel(configs['ylabel'])

  # patch_artist=True lets `boxprops` fill the boxes with a face colour.
  axes.boxplot(data, patch_artist=True, tick_labels=y, boxprops=configs['boxprops'])

  plt.tight_layout()
  plt.savefig(configs['png'])
  shown = plt.show()
  return shown

In [None]:
# Plot settings for the TTFT box plot. The title names the model that was
# actually benchmarked (Gemma 3 1B, loaded in 4-bit).
ttft_configs = {
    'title': 'Gemma 3 1B 4bit Latency for Time to First Token (TTFT)',
    'xlabel': f"Prompt Tokens with {iterations} Trials",
    'ylabel': 'Latency (milliseconds)',
    'boxprops': dict(facecolor="tab:blue"),
    'png': 'ttft.png',
}

In [None]:
create_boxplot(data, intervals, ttft_configs)

2. Measure end-to-end latency.

In [None]:
if measure_latency:
  # End-to-end latency: allow up to 256 generated tokens per run.
  e2e_settings = dict(iterations=iterations, warmup=True, max_new_tokens=256)

  latencies_e2e = {}
  for idx in tqdm(indices):
    random = dataset['train'].select([idx])
    e2e_warmup = test_latency(random, **e2e_settings)
    latencies_e2e[idx] = dict(warmup=e2e_warmup)

In [None]:
# One list of E2E latencies (in seconds) per sampled prompt, in the order the
# prompts were measured.
data = [run['warmup'] for run in latencies_e2e.values()]

Visualize the e2e latencies.

In [None]:
# Plot settings for the E2E box plot. The title names the model that was
# actually benchmarked (Gemma 3 1B, loaded in 4-bit).
e2e_configs = {
    'title': 'Gemma 3 1B 4bit Latency for End-to-End (E2E) Completion',
    'xlabel': f"Prompt Tokens with {iterations} Trials",
    'ylabel': 'Latency (seconds)',
    'boxprops': dict(facecolor="tab:purple"),
    'png': "e2e.png",
}

In [None]:
create_boxplot(data, intervals, e2e_configs)