In [4]:
import datetime
from google import genai
from google.genai import types
import vertexai
from vertexai.evaluation import EvalTask, MetricPromptTemplateExamples
import pandas as pd
import datetime
import os

In [6]:
client = genai.Client(vertexai=True, project=os.environ.get("GOOGLE_CLOUD_PROJECT"), location="us-central1")

In [7]:
def classify_question(question):
  response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=f"""Context: You categorize government service questions.
Output only one of: Employment, General Information, Emergency Services, Tax Related

Question: {question}
Category:"""
  )
  return response.text.strip()

In [8]:
def generate_gov_post(topic):
  response = client.models.generate_content(
    model="gemini-2.5-flash",
    contents=f"""Context: You write social media posts for Springfield City Government.
1. Keep posts under 200 characters
2. Include the hashtag #SpringfieldGov at the end

Input: Write a post about city offices closed for Thanksgiving
Output: City offices closed Thu & Fri for Thanksgiving. Have a safe holiday! #SpringfieldGov

Input: {topic}
Output:"""
  )
  return response.text.strip()

In [None]:
import unittest

class TestClassification(unittest.TestCase):
    def test_employment(self):
        response = classify_question("How do I apply for unemployment benefits?")
        self.assertEqual(response, "Employment")

    def test_general_info(self):
        response = classify_question("What are the library hours?")
        self.assertEqual(response, "General Information")

    def test_emergency(self):
        response = classify_question("I need to report a car accident!")
        self.assertEqual(response, "Emergency Services")

    def test_tax(self):
        response = classify_question("When is the property tax deadline?")
        self.assertEqual(response, "Tax Related")

unittest.main(argv=[''], verbosity=2, exit=False)

In [None]:
def does_post_follow_rules(post):
    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=f"""Does this post follow these rules:
1. Under 200 characters
2. Includes hashtag #SpringfieldGov at the end

Only return Yes or No

Post: {post}
Output:"""
    )
    return response.text.strip()

In [None]:
class TestSocialMediaPosts(unittest.TestCase):
    def test_snow_emergency_follows_rules(self):
        post = generate_gov_post("Write a post about school closings due to snow")
        result = does_post_follow_rules(post)
        self.assertEqual(result, "Yes")

    def test_holiday_follows_rules(self):
        post = generate_gov_post("Write a post about Memorial Day office closures")
        result = does_post_follow_rules(post)
        self.assertEqual(result, "Yes")

    def test_bad_post_fails(self):
        bad_post = "City offices are closed tomorrow for the holiday."
        result = does_post_follow_rules(bad_post)
        self.assertEqual(result, "No")

unittest.main(argv=[''], verbosity=2, exit=False)

In [9]:
prompts = [
    "Write a post about a weather emergency",
    "Write a post about a holiday closure",
    "Write a post about road construction",
]

eval_data = []
for p in prompts:
    response = generate_gov_post(p)
    eval_data.append({"prompt": p, "response": response})

eval_dataset = pd.DataFrame(eval_data)
print(eval_dataset)

                                   prompt  \
0  Write a post about a weather emergency   
1    Write a post about a holiday closure   
2    Write a post about road construction   

                                            response  
0  Weather emergency declared. Stay safe, follow ...  
1  City offices closed today for the holiday. Reo...  
2  Road construction starts next Monday on Main S...  


In [11]:
eval_task = EvalTask(
    dataset=eval_dataset,
    metrics=[
        MetricPromptTemplateExamples.Pointwise.FLUENCY,
        MetricPromptTemplateExamples.Pointwise.COHERENCE,
    ],
    experiment="gov-social-posts",
)

run_ts = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
result = eval_task.evaluate(experiment_run_name=f"eval-run-{run_ts}")

print("Summary Metrics:")
print(result.summary_metrics)
print("\nDetailed Results:")
display(result.metrics_table)


INFO:vertexai.evaluation._evaluation:Computing metrics with a total of 6 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 6/6 [00:06<00:00,  1.13s/it]
INFO:vertexai.evaluation._evaluation:All 6 metric requests are successfully computed.
INFO:vertexai.evaluation._evaluation:Evaluation Took:6.777848254998389 seconds


Summary Metrics:
{'row_count': 3, 'fluency/mean': np.float64(4.666666666666667), 'fluency/std': 0.5773502691896258, 'coherence/mean': np.float64(5.0), 'coherence/std': 0.0}

Detailed Results:


Unnamed: 0,prompt,response,fluency/explanation,fluency/score,coherence/explanation,coherence/score
0,Write a post about a weather emergency,"Weather emergency declared. Stay safe, follow ...","The response is free of grammatical errors, us...",5.0,The response is exceptionally coherent for a s...,5.0
1,Write a post about a holiday closure,City offices closed today for the holiday. Reo...,The response is concise and effectively convey...,4.0,The response is a perfectly coherent and conci...,5.0
2,Write a post about road construction,Road construction starts next Monday on Main S...,The response is free of grammatical errors (co...,5.0,The response demonstrates exceptional coherenc...,5.0


In [12]:
test_questions = [
    {"prompt": "How do I get unemployment?", "reference": "Employment"},
    {"prompt": "Where is city hall?", "reference": "General Information"},
    {"prompt": "There's a fire!", "reference": "Emergency Services"},
]

simple_responses = []
for q in test_questions:
    resp = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=f"Classify as Employment, General Information, Emergency Services, or Tax Related: {q['prompt']}"
    ).text.strip()
    simple_responses.append(resp)

detailed_responses = [classify_question(q["prompt"]) for q in test_questions]

compare_df = pd.DataFrame({
    "prompt": [q["prompt"] for q in test_questions],
    "reference": [q["reference"] for q in test_questions],
    "response": detailed_responses,
    "baseline_model_response": simple_responses,
})

print(compare_df)

compare_task = EvalTask(
    dataset=compare_df,
    metrics=["exact_match"],
    experiment="prompt-comparison",
)

compare_result = compare_task.evaluate(experiment_run_name=f"compare-{run_ts}")
print("\nComparison Results:")
print(compare_result.summary_metrics)
display(compare_result.metrics_table)

                       prompt            reference             response  \
0  How do I get unemployment?           Employment           Employment   
1         Where is city hall?  General Information  General Information   
2             There's a fire!   Emergency Services   Emergency Services   

  baseline_model_response  
0          **Employment**  
1     General Information  
2  **Emergency Services**  


INFO:vertexai.evaluation._evaluation:Computing metrics with a total of 3 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 3/3 [00:00<00:00, 12.39it/s]
INFO:vertexai.evaluation._evaluation:All 3 metric requests are successfully computed.
INFO:vertexai.evaluation._evaluation:Evaluation Took:0.25004661900311476 seconds



Comparison Results:
{'row_count': 3, 'exact_match/mean': np.float64(1.0), 'exact_match/std': 0.0}


Unnamed: 0,prompt,reference,response,baseline_model_response,exact_match/score
0,How do I get unemployment?,Employment,Employment,**Employment**,1.0
1,Where is city hall?,General Information,General Information,General Information,1.0
2,There's a fire!,Emergency Services,Emergency Services,**Emergency Services**,1.0
