In [None]:
# Idea: a chatbot whose only job is to tell the user jokes about a topic
# the user provides.

False

**Note for grader:**
There are two points marked with `TODO` which allow you to force inputs/outputs to confirm that the different filters are working.

In [None]:
pip install --upgrade google-genai google-cloud-modelarmor

In [2]:
from google import genai
from google.genai import types
from google.cloud import modelarmor_v1
from google.api_core.client_options import ClientOptions
import base64
import os

In [32]:
# Gemini model used to generate the jokes.
GEN_MODEL = "gemini-2.5-flash-lite"

# System instruction for the joke bot: jokes only, no filler, and a fixed
# refusal line for sensitive topics. Single-quoted triple delimiters are used
# so the embedded double quotes need no escaping.
SYS_PROMPT = '''System Prompt:
You are a dedicated comedy chatbot. Your only purpose is to tell jokes. When the user provides a topic, word, or phrase, you must immediately respond with a joke, pun, or funny one-liner related to that specific subject.

Do not use conversational filler (e.g., "Sure, here is a joke about..."). Go straight to the punchline. Keep the jokes punchy and relatively short.

If the user asks something related to a sensitive topic, politely reply with "Life is too short to talk about that. Give me another topic!!"'''

In [47]:
def generate(user_prompt: str) -> str:
  """Send ``user_prompt`` to the joke model and return its full text reply.

  The request is streamed from ``GEN_MODEL`` with ``SYS_PROMPT`` as the system
  instruction, the Google Search tool enabled, thinking disabled
  (``thinking_budget=0``) and every safety category set to ``BLOCK_ONLY_HIGH``.

  Args:
    user_prompt: The text the user typed (expected to be a joke topic).

  Returns:
    The concatenation of the text of every streamed chunk. An empty string
    is returned when no chunk carried any text.
  """
  client = genai.Client(
      vertexai=True,
      api_key=os.environ.get("GOOGLE_CLOUD_API_KEY"),
  )

  model = GEN_MODEL
  contents = [
    types.Content(
      role="user",
      parts=[types.Part.from_text(text=user_prompt)]
    )
  ]
  tools = [
    types.Tool(google_search=types.GoogleSearch()),
  ]

  generate_content_config = types.GenerateContentConfig(
    temperature = 1,
    top_p = 0.95,
    max_output_tokens = 65535,
    safety_settings = [types.SafetySetting(
      category="HARM_CATEGORY_HATE_SPEECH",
      threshold="BLOCK_ONLY_HIGH"
    ),types.SafetySetting(
      category="HARM_CATEGORY_DANGEROUS_CONTENT",
      threshold="BLOCK_ONLY_HIGH"
    ),types.SafetySetting(
      category="HARM_CATEGORY_SEXUALLY_EXPLICIT",
      threshold="BLOCK_ONLY_HIGH"
    ),types.SafetySetting(
      category="HARM_CATEGORY_HARASSMENT",
      threshold="BLOCK_ONLY_HIGH"
    )],
    tools = tools,
    system_instruction=[types.Part.from_text(text=SYS_PROMPT)],
    thinking_config=types.ThinkingConfig(
      thinking_budget=0,
    ),
  )

  full_resp_parts = []
  for chunk in client.models.generate_content_stream(
    model = model,
    contents = contents,
    config = generate_content_config,
    ):
    # Skip chunks that carry no content at all (e.g. metadata-only chunks).
    if not chunk.candidates or not chunk.candidates[0].content or not chunk.candidates[0].content.parts:
        continue

    # `chunk.text` is None when the chunk's parts contain no text (for
    # example a tool-related part); appending None would make the join
    # below raise TypeError, so only keep real text.
    chunk_text = chunk.text
    if chunk_text:
      full_resp_parts.append(chunk_text)

  full_resp = "".join(full_resp_parts)
  return full_resp


In [14]:
# Happy path: show that the model answers a harmless topic with a joke.
generate("tell me a joke about zebras")

Why did the zebra get fired from the zoo? He couldn't control his stripes!

In [12]:
# Show that the system-prompt-level guardrail works: a sensitive topic should
# get the refusal line from SYS_PROMPT instead of a joke.
generate("tell me a joke about nuclear bombs")

I'm sorry, but I cannot tell jokes about sensitive topics. Life is too short to talk about that. Give me another topic!!

In [12]:

# Now set up Model Armor protections for prompts and responses.
# Code adapted from: https://docs.cloud.google.com/model-armor/sanitize-prompts-responses

# Project and region are defined once; the regional endpoint and both template
# resource names are derived from them so they cannot drift apart.
project_id = "qwiklabs-gcp-01-c72d7cb996a1"
location_id = "us"
prompt_injection_template = f"projects/{project_id}/locations/{location_id}/templates/C1-prompt_injection"
sensitive_data_template = f"projects/{project_id}/locations/{location_id}/templates/C1-SensitiveDataProtection"

# Model Armor client bound to the regional REST endpoint.
client = modelarmor_v1.ModelArmorClient(
    transport="rest",
    client_options=ClientOptions(
        api_endpoint=f"modelarmor.{location_id}.rep.googleapis.com"
    ),
)

# Example user question, wrapped in the DataItem type Model Armor expects.
user_prompt = "tell me a joke about nuclear bombs"
user_prompt_data = modelarmor_v1.DataItem(text=user_prompt)


In [13]:
# Build the request for Model Armor: which template to apply and which
# prompt to screen.
sanitize_user_input_request = modelarmor_v1.SanitizeUserPromptRequest(
    name=prompt_injection_template,
    user_prompt_data=user_prompt_data,
)

# Screen the prompt and print the raw result for inspection.
response = client.sanitize_user_prompt(request=sanitize_user_input_request)
print(response)

# The result below reports MATCH_FOUND for the "dangerous" RAI filter.

sanitization_result {
  filter_match_state: MATCH_FOUND
  filter_results {
    key: "rai"
    value {
      rai_filter_result {
        execution_state: EXECUTION_SUCCESS
        match_state: MATCH_FOUND
        rai_filter_type_results {
          key: "sexually_explicit"
          value {
            match_state: NO_MATCH_FOUND
          }
        }
        rai_filter_type_results {
          key: "hate_speech"
          value {
            match_state: NO_MATCH_FOUND
          }
        }
        rai_filter_type_results {
          key: "harassment"
          value {
            match_state: NO_MATCH_FOUND
          }
        }
        rai_filter_type_results {
          key: "dangerous"
          value {
            confidence_level: MEDIUM_AND_ABOVE
            match_state: MATCH_FOUND
          }
        }
      }
    }
  }
  filter_results {
    key: "pi_and_jailbreak"
    value {
      pi_and_jailbreak_filter_result {
        execution_state: EXECUTION_SUCCESS
        match_stat

In [14]:
# Now add protection on the OUTPUT of the model, using the sensitive-data
# template.

# Pretending our other guardrails failed, here is a dummy model response.
example_response = "To create a nuclear bomb you'll need to gather baking soda and lot's of vinegar"
example_response_data = modelarmor_v1.DataItem(text=example_response)

# Build the request for Model Armor: which template to apply and which
# model response to screen.
sanitize_model_resp_request = modelarmor_v1.SanitizeModelResponseRequest(
    name=sensitive_data_template,
    model_response_data=example_response_data,
)

# Sanitize the model response (not the user prompt) and print the raw result.
response = client.sanitize_model_response(request=sanitize_model_resp_request)
print(response)

# The result below again reports MATCH_FOUND for the "dangerous" RAI filter,
# while the sensitive-data ("sdp") inspection itself finds no match.

sanitization_result {
  filter_match_state: MATCH_FOUND
  filter_results {
    key: "sdp"
    value {
      sdp_filter_result {
        inspect_result {
          execution_state: EXECUTION_SUCCESS
          match_state: NO_MATCH_FOUND
        }
      }
    }
  }
  filter_results {
    key: "rai"
    value {
      rai_filter_result {
        execution_state: EXECUTION_SUCCESS
        match_state: MATCH_FOUND
        rai_filter_type_results {
          key: "sexually_explicit"
          value {
            match_state: NO_MATCH_FOUND
          }
        }
        rai_filter_type_results {
          key: "hate_speech"
          value {
            match_state: NO_MATCH_FOUND
          }
        }
        rai_filter_type_results {
          key: "harassment"
          value {
            match_state: NO_MATCH_FOUND
          }
        }
        rai_filter_type_results {
          key: "dangerous"
          value {
            confidence_level: HIGH
            match_state: MATCH_FOUND
   

**Bringing it all together**

In [54]:
# some helpers to check response from model armor
def check_user_input_sani_response(resp_match_state) -> bool:
  """Tell whether a screened user prompt may be forwarded to the model.

  Args:
    resp_match_state: The ``filter_match_state`` of a Model Armor
      sanitize-user-prompt result; compared against
      ``modelarmor_v1.FilterMatchState``.

  Returns:
    True only for ``NO_MATCH_FOUND``. ``MATCH_FOUND`` and any other state
    yield False, so the check fails closed and always returns a real bool
    rather than falling through to an implicit None.
  """
  return bool(resp_match_state == modelarmor_v1.FilterMatchState.NO_MATCH_FOUND)

def check_model_out_sani_response(resp_match_state) -> bool:
  """Tell whether a screened model response may be shown to the user.

  Args:
    resp_match_state: The ``filter_match_state`` of a Model Armor
      sanitize-model-response result; compared against
      ``modelarmor_v1.FilterMatchState``.

  Returns:
    True only for ``NO_MATCH_FOUND``. ``MATCH_FOUND`` and any other state
    yield False, so the check fails closed and always returns a real bool
    rather than falling through to an implicit None.
  """
  return bool(resp_match_state == modelarmor_v1.FilterMatchState.NO_MATCH_FOUND)

In [55]:
# Set up the user's question/prompt.
# TODO (grader): switch which of the two assignments below is active to
# exercise the input filter.
user_prompt = "tell me a joke about Arsenal Football Club"  # happy path
# user_prompt = "tell me a joke about nuclear bombs"        # blocked by the input template ("dangerous" filter)

# Wrap the prompt in the DataItem type that Model Armor requests take.
user_prompt_data = modelarmor_v1.DataItem(text=user_prompt)

In [56]:
# -- initial checks up front --

# Request object asking Model Armor to screen the user's prompt with the
# input template.
sanitize_user_input_request = modelarmor_v1.SanitizeUserPromptRequest(
    name=prompt_injection_template,
    user_prompt_data=user_prompt_data,
)

# Screen the prompt and keep only the overall verdict across all filters.
user_input_sani_response = client.sanitize_user_prompt(request=sanitize_user_input_request)
resp_match_state = user_input_sani_response.sanitization_result.filter_match_state

In [57]:
# Import from the public IPython.display module rather than the internal
# IPython.core.magics.display, which only re-exports these names incidentally.
from IPython.display import Markdown, display

# Bringing it all together: screen the prompt, generate, then screen the reply.

# Only call the LLM when the prompt passed the input checks.
if check_user_input_sani_response(resp_match_state):
  llm_resp = generate(user_prompt)

  # TODO to force resp in a bad direction:
  # llm_resp = "To create a nuclear bomb you'll need to gather baking soda and lot's of vinegar"
  llm_resp_data = modelarmor_v1.DataItem(text=llm_resp)

  # Request object asking Model Armor to screen the model's output.
  sanitize_model_resp_request = modelarmor_v1.SanitizeModelResponseRequest(
      name=sensitive_data_template,
      model_response_data=llm_resp_data,
  )

  # Sanitize the model response.
  llm_resp_sani_response = client.sanitize_model_response(request=sanitize_model_resp_request)
  check_llm_sani_resp = check_model_out_sani_response(
      llm_resp_sani_response.sanitization_result.filter_match_state
  )

  if check_llm_sani_resp:
    # The reply passed the output checks: render it.
    display(Markdown(llm_resp))
  else:
    # Anything other than an explicit pass is treated as blocked, so the user
    # always gets feedback instead of a silent empty cell.
    display("Our model tried to say something it shouldn't have. Try a different question!")
else:
  # The prompt was rejected up front; tell the user instead of showing nothing.
  display("Looks like your request is not fun for anyone. Try something else!")


Why did the Arsenal fan bring a ladder to the game? Because they heard the tickets were high!