In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Gemma deployment to GKE using vLLM on GPU



## Overview

This notebook demonstrates downloading and deploying Gemma, a family of open models from Google DeepMind, on GKE. The guide uses L4 GPUs, but it should also work with A100 (40 GB), A100 (80 GB), and H100 (80 GB) GPUs.


### Objective

Deploy and run inference for serving Gemma with vLLM on GPUs.

### GPUs

GPUs let you accelerate specific workloads running on your nodes such as machine learning and data processing. GKE provides a range of machine type options for node configuration, including machine types with NVIDIA H100, L4, and A100 GPUs.

Before you use GPUs in GKE, we recommend that you complete the following learning path:

- Learn about [current GPU version availability](https://cloud.google.com/compute/docs/gpus)

- Learn about [GPUs in GKE](https://cloud.google.com/kubernetes-engine/docs/concepts/gpus)




### Prerequisites
- Install Google Cloud CLI
https://cloud.google.com/sdk/docs/install-sdk
- Create a .env file with the following values


```
PROJECT_ID = "<your project id>"
REGION = "us-central1"
HF_TOKEN = "<your hugging face token>"
```




## Create a GKE cluster and node pool

In [1]:
!pip install python-dotenv

**Restart the runtime session**

In [1]:
from dotenv import load_dotenv
import os
import subprocess

# Pull the variables defined in the local .env file into the process environment,
# then read the three settings this notebook uses. Each is None when unset.
load_dotenv()

PROJECT_ID, REGION, HF_TOKEN = (
    os.getenv(setting_name)
    for setting_name in ("PROJECT_ID", "REGION", "HF_TOKEN")
)

In [None]:
import datetime
import os

# The HuggingFace token used to download models.

assert HF_TOKEN, "Set Hugging Face access token in `HF_TOKEN`."


# Set up gcloud.
! gcloud config set project "$PROJECT_ID"
! gcloud services enable container.googleapis.com

# Add kubectl to the set of available tools.
! mkdir -p /tools/google-cloud-sdk/.install
! gcloud components install kubectl --quiet

# Create a unique cluster name to avoid conflicts.
now = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
CLUSTER_NAME=f"gke-gemma-cluster-test2"

print(f"Creating cluster: {CLUSTER_NAME}")

! gcloud container clusters create {CLUSTER_NAME} \
    --project={PROJECT_ID} \
    --region={REGION} \
    --subnetwork="default" \
    --workload-pool={PROJECT_ID}.svc.id.goog \
    --release-channel=rapid \
    --num-nodes=4 \
    --enable-shielded-nodes \
    --shielded-secure-boot\
    --shielded-integrity-monitoring

! gcloud container node-pools create gpupool \
    --accelerator=type=nvidia-l4,count=2,gpu-driver-version=latest \
    --project={PROJECT_ID} \
    --location={REGION} \
    --node-locations={REGION}-a \
    --cluster={CLUSTER_NAME} \
    --machine-type=g2-standard-24 \
    --num-nodes=1 \
    --shielded-secure-boot \
    --shielded-integrity-monitoring

! gcloud container clusters get-credentials {CLUSTER_NAME} --location {REGION}

### Create a Kubernetes secret for Hugging Face credentials

In [None]:
! gcloud container clusters get-credentials gke-gemma-cluster-test --location {REGION}

In [4]:
# Create Kubernetes secret for Hugging Face credentials
! kubectl create secret generic hf-secret \
    --from-literal=hf_api_token={HF_TOKEN} \
    --dry-run=client -o yaml > hf-secret.yaml

! kubectl apply -f hf-secret.yaml

In [None]:
# @title Deploy Gemma3

# @markdown This section deploys Gemma.

# @markdown Select one of the following model version and size options:

# The size of the model to launch


K8S_YAML=f"""apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-gemma-deployment
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gemma-server
  template:
    metadata:
      labels:
        app: gemma-server
        ai.gke.io/model: gemma-3-1b-it
        ai.gke.io/inference-server: vllm
        examples.ai.gke.io/source: user-guide
    spec:
      containers:
      - name: inference-server
        image: us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20250312_0916_RC01
        resources:
          requests:
            cpu: "2"
            memory: "10Gi"
            ephemeral-storage: "10Gi"
            nvidia.com/gpu: "1"
          limits:
            cpu: "2"
            memory: "10Gi"
            ephemeral-storage: "10Gi"
            nvidia.com/gpu: "1"
        command: ["python3", "-m", "vllm.entrypoints.openai.api_server"]
        args:
        - --model=$(MODEL_ID)
        - --tensor-parallel-size=1
        - --host=0.0.0.0
        - --port=8000
        - --enable-auto-tool-choice
        - --tool-call-parser=pythonic
        env:
        - name: MODEL_ID
          value: google/gemma-3-1b-it
        - name: HUGGING_FACE_HUB_TOKEN
          valueFrom:
            secretKeyRef:
              name: hf-secret
              key: hf_api_token
        volumeMounts:
        - mountPath: /dev/shm
          name: dshm
      volumes:
      - name: dshm
        emptyDir:
            medium: Memory
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-l4
        cloud.google.com/gke-gpu-driver-version: latest
---
apiVersion: v1
kind: Service
metadata:
  name: llm-service
spec:
  selector:
    app: gemma-server
  type: ClusterIP
  ports:
    - protocol: TCP
      port: 8000
      targetPort: 8000

---
# 3. INGRESS: Creates a shared, external load balancer that routes requests
#    to your service based on the URL path.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: llm-ingress
spec:
  rules:
  - http:
      paths:
      - path: /gemma
        pathType: Prefix
        backend:
          service:
            name: llm-service
            port:
              number: 8000
"""

with open("vllm-3-1b-it.yaml", "w") as f:
    f.write(K8S_YAML)

! kubectl apply -f vllm-3-1b-it.yaml

# Wait for container to be created.
import time

print("Waiting for container to be created...\n")
while True:
    shell_output = ! kubectl get pod
    container_status = "\n".join(shell_output)
    if "1/1" in container_status:
        break
    time.sleep(5)

print(container_status)

# Wait for downloading artifacts.
print("\nDownloading artifacts...")
while True:
    shell_output = ! kubectl logs -l app=gemma-server
    logs = "\n".join(shell_output)
    if "Connected" in logs:
        break
    time.sleep(5)

print("Server is up and running.")

### Follow these steps to serve the model

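1. Connect to the cluster. Run the following command in Cloud Shell, replacing `<your project id>` with your own project ID (and the region, if you did not use `us-central1`):
```
gcloud container clusters get-credentials gke-gemma-cluster-test2 --region us-central1 --project <your project id>
```

2. Then follow the "Serve the model" steps in the tutorial:
https://cloud.google.com/kubernetes-engine/docs/tutorials/serve-gemma-gpu-vllm#serve-model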

## Clean up resources

In [None]:
# @markdown  Delete the experiment models and endpoints to recycle the resources
# @markdown  and avoid unnecessary continouous charges that may incur.

! kubectl delete deployments tgi-gemma-deployment
! kubectl delete services llm-service
! kubectl delete secrets hf-secret

DELETE_CLUSTER = False # @param {type: "boolean"}

if DELETE_CLUSTER:
  ! gcloud container clusters delete {CLUSTER_NAME} \
    --region={REGION} \
    --quiet