Install dependencies

In [1]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -qU pyngrok vllm huggingface-hub

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.4/383.4 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m169.0/169.0 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.0/111.0 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m88.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.6/87.6 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m865.2/865.2 MB[0m [31m792.0 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m92.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m91.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Check GPU utilization

In [2]:
# Baseline reading: confirm a GPU is attached and see its memory usage before the model is served.
!nvidia-smi

Sat Jul 19 15:19:32 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   45C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

Meta's Llama models are gated on Hugging Face and require access approval. Once Meta has granted you access on Hugging Face, log in to your Hugging Face account below using an access token from that authorized account.

In [3]:
from huggingface_hub import login

# Interactive Hugging Face login; required because the Llama weights served later
# in this notebook are gated.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Serving our model on vLLM

We start by downloading the Llama 3.1 JSON tool-calling chat template from vLLM's GitHub repository, because we want our Llama 3.2 LLM to be able to call functions.

In [4]:
# -nc (--no-clobber) keeps re-runs idempotent: without it, wget saves duplicates as
# `<name>.1`, `<name>.2`, ... each time this cell is executed again.
!wget -nc https://raw.githubusercontent.com/vllm-project/vllm/refs/heads/main/examples/tool_chat_template_llama3.1_json.jinja

--2025-07-19 15:20:35--  https://raw.githubusercontent.com/vllm-project/vllm/refs/heads/main/examples/tool_chat_template_llama3.1_json.jinja
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5255 (5.1K) [text/plain]
Saving to: ‘tool_chat_template_llama3.1_json.jinja’


2025-07-19 15:20:35 (69.8 MB/s) - ‘tool_chat_template_llama3.1_json.jinja’ saved [5255/5255]



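Next, we launch `vllm serve <model>` as a background process using Python's `subprocess` library. `subprocess.Popen` returns immediately, so the notebook can keep running other cells while the server starts. Setting `start_new_session=True` also runs the server in its own session, separate from the notebook kernel's.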

In [23]:
import os
import subprocess

model = 'meta-llama/Llama-3.2-3B-Instruct'

# API key that clients must present to the server. Set the VLLM_API_KEY environment
# variable before running this cell to use a private key; the fallback keeps the
# original demo token working. The server is exposed publicly through ngrok later in
# this notebook, so the demo token should only be used for quick experiments.
vllm_api_key = os.environ.get('VLLM_API_KEY') or 'token-abc123'

# Start the vLLM server in the background. The default hosting url is http://localhost:8000
# - Popen returns immediately, so later cells can run while the server starts.
# - start_new_session=True runs the server in its own session, separate from the kernel's.
# - stdout/stderr are pipes so the monitoring cell can show the startup logs.
#   NOTE: pipes that are never drained can eventually block the child once the OS
#   pipe buffer fills.
vllm_process = subprocess.Popen([
    'vllm',
    'serve',  # Subcommand must follow vllm
    model,
    '--enable-auto-tool-choice',
    '--port', '8000',
    '--tool-call-parser', 'llama3_json',
    '--api-key', vllm_api_key,
    '--chat-template', 'tool_chat_template_llama3.1_json.jinja',
    '--trust-remote-code',
    '--max-model-len', '16548',  # reduced because the default length exceeded the KV cache limit
], stdout=subprocess.PIPE, stderr=subprocess.PIPE, start_new_session=True)

We use the following utility functions to wait for the server and to see the vLLM logs while it starts. If there is an error, we print the stderr messages for debugging.
> While starting up, vLLM downloads the model directly from the Hugging Face Hub, so this can take a while 😆

In [24]:
import requests
import time
from typing import Tuple
import sys

def check_vllm_status(url: str = "http://localhost:8000/health", timeout: float = 5.0) -> bool:
    """Check if the vLLM server is running and healthy.

    Args:
        url: Health endpoint to probe.
        timeout: Seconds to wait for the server to answer. Without a timeout,
            ``requests.get`` can wait indefinitely on a socket that accepts the
            connection but never responds.

    Returns:
        True if the endpoint answered with HTTP 200; False if the connection
        failed, the request timed out, or any other status code came back.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        # Server not listening yet (or too slow to answer): report "not healthy".
        return False
    return response.status_code == 200

def _read_available(stream, max_chunks: int = 64) -> str:
    """Return the text currently waiting on a pipe without blocking ('' if none)."""
    # Function-scope imports: these modules are not imported at the top of this cell.
    import os
    import select

    if stream is None:  # the stream was not captured with subprocess.PIPE
        return ""

    chunks = []
    for _ in range(max_chunks):  # bounded so a very chatty child cannot stall the monitor
        # With a zero timeout, select() only reports the pipe when a read would
        # return immediately (data is waiting, or the child closed its end).
        ready, _, _ = select.select([stream], [], [], 0)
        if not ready:
            break
        chunk = os.read(stream.fileno(), 65536)
        if not chunk:  # EOF
            break
        chunks.append(chunk)
    # errors='replace': a chunk boundary may fall inside a multi-byte UTF-8 sequence.
    return b"".join(chunks).decode('utf-8', errors='replace')


def monitor_vllm_process(vllm_process: subprocess.Popen, check_interval: int = 5) -> Tuple[bool, str, str]:
    """
    Wait for the vLLM server to become healthy, echoing its logs meanwhile.

    Polls the health endpoint every ``check_interval`` seconds for as long as the
    process is alive, and prints any output that has arrived on the process's
    stdout/stderr pipes. Pipes are read only when data is already waiting, so a
    silent stream never blocks the loop.

    Args:
        vllm_process: The running ``vllm serve`` process.
        check_interval: Seconds to sleep between health checks.

    Returns: (success, stdout, stderr)
        ``(True, "", "")`` as soon as the health check passes. If the process
        exits first, ``(False, remaining_stdout, remaining_stderr)``; a stream
        that was not piped is returned as an empty string.
    """
    print("Starting VLLM server monitoring...")

    while vllm_process.poll() is None:  # While process is still running
        if check_vllm_status():
            print("✓ VLLM server is up and running!")
            return True, "", ""

        print("Waiting for VLLM server to start...")
        time.sleep(check_interval)

        # Show whatever the server has logged since the last iteration.
        stdout = _read_available(vllm_process.stdout)
        if stdout:
            print("STDOUT:", stdout)

        stderr = _read_available(vllm_process.stderr)
        if stderr:
            print("STDERR:", stderr)

    # If we get here, the process has ended: collect whatever output is left.
    stdout, stderr = vllm_process.communicate()
    return (
        False,
        (stdout or b"").decode('utf-8', errors='replace'),
        (stderr or b"").decode('utf-8', errors='replace'),
    )

In [25]:
# Block until the server is healthy (or has died), then report the outcome.
try:
    success, stdout, stderr = monitor_vllm_process(vllm_process)

    if not success:
        print("\n❌ VLLM server failed to start!")
        print("\nFull STDOUT:", stdout)
        print("\nFull STDERR:", stderr)
        sys.exit(1)

except KeyboardInterrupt:
    print("\n⚠️ Monitoring interrupted by user")
    # Interrupting only stops the polling loop; the vLLM server itself keeps running.
    # To stop the server as well, uncomment the following:
    # vllm_process.terminate()
    # try:
    #     vllm_process.wait(timeout=5)
    # except subprocess.TimeoutExpired:
    #     vllm_process.kill()

    if vllm_process.poll() is None:
        # communicate() waits for the process to exit, so calling it on a live
        # server would hang this cell indefinitely.
        print("\nvLLM server process is still running; skipping final output collection.")
    else:
        stdout, stderr = vllm_process.communicate()
        if stdout: print("\nFinal STDOUT:", stdout.decode('utf-8'))
        if stderr: print("\nFinal STDERR:", stderr.decode('utf-8'))
    sys.exit(0)

Starting VLLM server monitoring...
Waiting for VLLM server to start...
STDOUT: INFO 07-19 15:28:45 [__init__.py:244] Automatically detected platform cuda.

STDERR: 2025-07-19 15:28:37.615035: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752938917.635923    4554 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752938917.641959    4554 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-07-19 15:28:37.661978: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlo

In [26]:
# Re-check the GPU after the server monitoring cell, to see how much memory is now in use.
!nvidia-smi

Sat Jul 19 15:32:15 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   65C    P0             29W /   70W |   12652MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [27]:
import os
from google.colab import userdata

# Read the ngrok auth token from the Colab user data entry named NGROK_AUTH_TOKEN,
# so the token itself is never written into the notebook.
ngrok_auth_token = userdata.get('NGROK_AUTH_TOKEN')

In [28]:
from pyngrok import ngrok

# Register the token through pyngrok's API instead of interpolating the secret into
# a shell command line (where shell metacharacters would be interpreted).
ngrok.set_auth_token(ngrok_auth_token)

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [29]:
from pyngrok import ngrok

# Local port to expose; 8000 is the value passed as `--port` to `vllm serve`.
port = 8000

# Ask ngrok for a tunnel to the local HTTP server and keep its public address.
tunnel = ngrok.connect(port)
public_url = tunnel.public_url
print(f" * ngrok tunnel \"{public_url}\" -> \"http://127.0.0.1:{port}\"")

 * ngrok tunnel "https://9398efd88ee3.ngrok-free.app" -> "http://127.0.0.1:8000"


We have our free app! The vLLM server is now reachable from the internet through the ngrok URL printed above.

In [12]:
# NOTE(review): the saved output of this cell comes from an earlier execution (In [12],
# with an earlier timestamp than the cells above); re-run it for a current reading.
!nvidia-smi

Sat Jul 19 15:26:35 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   65C    P0             29W /   70W |   12784MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

# Kill process

In [None]:
# Ask the server to shut down, then wait for it to exit.
vllm_process.terminate()
try:
    # communicate() keeps draining the stdout/stderr pipes while waiting. A bare
    # wait() can deadlock when the child is blocked writing to a full pipe.
    vllm_process.communicate(timeout=30)
except subprocess.TimeoutExpired:
    # The process ignored SIGTERM within the grace period: force it to stop.
    vllm_process.kill()
    vllm_process.communicate()

vllm_process.returncode  # shown as the cell output, like the value wait() returned

0