In [1]:
!pip install --upgrade google-cloud-aiplatform google-cloud-logging --quiet
!pip install "google-cloud-aiplatform[evaluation]" --quiet


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m54.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m229.5/229.5 kB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.6/65.6 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.6/40.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.7/8.7 MB[0m [31m66.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.6/118.6 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m739.1/739.1 kB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [1]:
import pandas as pd
import logging
import google.cloud.logging
from IPython.display import display, Markdown

import vertexai
from vertexai.generative_models import GenerativeModel, GenerationConfig
from vertexai.evaluation import (
    MetricPromptTemplateExamples,
    EvalTask,
    PairwiseMetric,
    PointwiseMetric,
)

# Do not remove logging section
# Create a Cloud Logging client and invoke its `setup_logging()` hook so that
# later `logging.info(...)` calls in this notebook go through it.
# NOTE(review): presumably this attaches a Cloud Logging handler to the standard
# `logging` module, which the lab checks — confirm against the client library docs.
client = google.cloud.logging.Client()
client.setup_logging()

# Remove pandas' column-width limit so long prompt/response text is shown in
# full when DataFrames are rendered.
pd.set_option("display.max_colwidth", None)


In [2]:
# Project and region used for every Vertex AI call in this notebook.
# NOTE(review): the project ID looks like a temporary lab project; it must be
# replaced when the notebook is run anywhere else.
PROJECT_ID = "qwiklabs-gcp-00-a45b05279191"
LOCATION = "us-central1"

# `vertexai` is already imported in the imports cell; this repeat is harmless.
import vertexai

# Initialize vertexai
vertexai.init(project=PROJECT_ID, location=LOCATION)

# Do not remove logging section
# The message interpolates the `vertexai` module object itself, so the log line
# contains the module's repr (its install path), not the project/location.
log_message = f"Vertex AI initialize: {vertexai}"
logging.info(log_message)


INFO:root:Vertex AI initialize: <module 'vertexai' from '/usr/local/lib/python3.11/dist-packages/vertexai/__init__.py'>


In [3]:
# Prompt skeleton shared by the single-shot call and the evaluation runs.
# It has three `str.format` placeholders: system_prompt, question, description.
# The pieces below are joined by implicit string concatenation into one line
# of text with a newline after each of the first two headers.
prompt_template = (
    "# System_prompt\n{system_prompt} "
    "# Question\n{question} "
    "# Description {description}"
)


In [4]:
# Inputs that get substituted into the prompt template.
# `system_prompt` and `question` are one-element lists so they can be repeated
# to line up with the product descriptions when building a DataFrame.
# NOTE(review): "an retail" reads like a typo, but the text is kept verbatim
# because changing it changes the prompt sent to the model.
system_prompt = [
    "You are an retail domestic merchandise expert",
]

question = [
    "Provide a one sentence summary of the following text",
]

# Five (truncated) product descriptions, one per catalogue item.
description = [
    "Men’s Blue Dress Shorts Elevate your warm-weather wardrobe with these tailored men's blue dress shorts — where polished style meets everyday comfort. Designed ...",
    "Summer Floral Dress. Breathe life into your summer wardrobe with this effortlessly elegant floral midi dress. Crafted from lightweight, breathable fabric, ...",
    "Outdoor Garden Furniture Transform your backyard into a personal oasis with this elegant garden furniture set designed for comfort, durability, and timeless style. ...",
    "OLED 4K Ultra HD Smart TV. Step into the future of home entertainment with breathtaking clarity, vibrant color, and cinematic sound. ...",
    "Smartwash Dishwasher. Let your kitchen work for you. Say goodbye to scrubbing and soaking — the SmartWash Dishwasher delivers a powerful, whisper-quiet clean that saves you time, energy, and water. ...",
]


In [5]:
# Handle to Gemini 2.0 Flash. The generation config pins temperature to 0
# (intended to keep responses as repeatable as possible between runs).
flash_generation_config = GenerationConfig(temperature=0)
flash_model = GenerativeModel(
    model_name="gemini-2.0-flash",
    generation_config=flash_generation_config,
)


In [6]:
# Smoke-test the template: fill it with the shared system prompt and question
# plus the second product description (index 1), then send it to the model.
summary_prompt = prompt_template.format(
    system_prompt=system_prompt[0],
    question=question[0],
    description=description[1],
)
llm_response = flash_model.generate_content(summary_prompt)

# Render the model's reply as Markdown in the cell output.
display(Markdown(llm_response.text))

# Do not remove logging section
log_message = f"Markdown output: {llm_response.text}"
logging.info(log_message)


This summer floral midi dress is a lightweight and elegant addition to your wardrobe.


INFO:root:Markdown output: This summer floral midi dress is a lightweight and elegant addition to your wardrobe.



In [7]:
import pandas as pd

# Build one evaluation row per product description.
# `system_prompt` and `question` are one-element lists, so they are repeated to
# the same length as `description`. The repeat count is derived from the list
# itself (instead of a hard-coded 5) so adding or removing a description cannot
# produce a column-length mismatch.
num_rows = len(description)

dataset = pd.DataFrame(
    {
        "system_prompt": system_prompt * num_rows,
        "question": question * num_rows,
        "description": description,
    }
)


In [13]:
import datetime

import pandas as pd
from vertexai.evaluation import (
    MetricPromptTemplateExamples,
    PointwiseMetric,
    EvalTask
)
from vertexai.generative_models import GenerativeModel

# 1. Prepare the evaluation dataset
# Columns must match the placeholders of `prompt_template`
# (system_prompt / question / description).
# NOTE(review): this rebinds `dataset`, replacing the product-description
# DataFrame built in an earlier cell — confirm that is intended.
dataset = pd.DataFrame({
    "system_prompt": [
        "You are a helpful assistant that summarizes product reviews.",
        "You are an expert summarizer for customer feedback."
    ],
    "question": [
        "Summarize the following product review: 'Great camera but battery drains quickly.'",
        "Summarize this customer comment: 'Loved the speed, disliked the interface.'"
    ],
    "description": [
        "The summary should be concise and cover the main positive and negative points.",
        "Generate a short, clear summary for internal team review."
    ]
})

# 2. Set up the metric: model-based pointwise summarization quality, using the
# SDK's built-in prompt template for that metric.
POINTWISE_METRIC = PointwiseMetric(
    metric="summarization_quality",
    metric_prompt_template=MetricPromptTemplateExamples.Pointwise.SUMMARIZATION_QUALITY
)

# 3. Create the EvalTask; runs are grouped under this experiment name.
pointwise_eval_task = EvalTask(
    dataset=dataset,
    metrics=[POINTWISE_METRIC],
    experiment="product-summarization-quality"
)

# 4. Evaluate the Gemini 2.0 Flash model
# Reuse the notebook's already-configured `flash_model` (temperature=0) rather
# than creating a second, default-configured client, so the evaluated model is
# the same one used elsewhere in the notebook.
model = flash_model
# Timestamp suffix keeps each run name unique within the experiment.
run_ts = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
pointwise_result = pointwise_eval_task.evaluate(
    model=model,
    # Single source of truth for the prompt layout (same text as before).
    prompt_template=prompt_template,
    experiment_run_name=f"prod-sumq-{run_ts}"
)

# 5. Show results
print("Summary metrics:")
print(pointwise_result.summary_metrics)
print("\nFull metrics table:")
# Rich display instead of print(): the table is wide (full prompts/responses)
# and plain-text printing wraps it into hard-to-read column chunks.
display(pointwise_result.metrics_table)


INFO:vertexai.evaluation.eval_task:Logging Eval Experiment metadata: {'prompt_template': '# System_prompt\n{system_prompt} # Question\n{question} # Description {description}', 'model_name': 'publishers/google/models/gemini-2.0-flash'}
INFO:vertexai.evaluation._evaluation:Assembling prompts from the `prompt_template`. The `prompt` column in the `EvalResult.metrics_table` has the assembled prompts used for model response generation.
INFO:vertexai.evaluation._evaluation:Generating a total of 2 responses from Gemini model gemini-2.0-flash.
100%|██████████| 2/2 [00:00<00:00,  4.56it/s]
INFO:vertexai.evaluation._evaluation:All 2 responses are successfully generated from Gemini model gemini-2.0-flash.
INFO:vertexai.evaluation._evaluation:Multithreaded Batch Inference took: 0.4557281419999981 seconds.
INFO:vertexai.evaluation._evaluation:Computing metrics with a total of 2 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 2/2 [00:00<00:00,  2.47it/s]
INFO:vertexai.evaluation._eva

Summary metrics:
{'row_count': 2, 'summarization_quality/mean': np.float64(2.0), 'summarization_quality/std': 2.8284271247461903}

Full metrics table:
                                                  system_prompt  \
0  You are a helpful assistant that summarizes product reviews.   
1           You are an expert summarizer for customer feedback.   

                                                                             question  \
0  Summarize the following product review: 'Great camera but battery drains quickly.'   
1         Summarize this customer comment: 'Loved the speed, disliked the interface.'   

                                                                      description  \
0  The summary should be concise and cover the main positive and negative points.   
1                       Generate a short, clear summary for internal team review.   

                                                                                                                           

In [14]:
# Handle to Gemini 2.0 Flash-Lite, configured with temperature 0 like the
# Flash model so the two can be compared under the same sampling setting.
flash_lite_generation_config = GenerationConfig(temperature=0)
flash_lite_model = GenerativeModel(
    model_name="gemini-2.0-flash-lite",
    generation_config=flash_lite_generation_config,
)


In [17]:
import datetime

import pandas as pd
from vertexai.evaluation import (
    MetricPromptTemplateExamples,
    PairwiseMetric,
    EvalTask,
)
from vertexai.generative_models import GenerativeModel

# Evaluation rows; column names match the placeholders of `prompt_template`.
dataset = pd.DataFrame({
    "system_prompt": [
        "You are a helpful assistant that summarizes product reviews.",
        "You are an expert summarizer for customer feedback.",
    ],
    "question": [
        "Summarize the following product review: 'Great camera but battery drains quickly.'",
        "Summarize this customer comment: 'Loved the speed, disliked the interface.'"
    ],
    "description": [
        "The summary should be concise and cover the main positive and negative points.",
        "Generate a short, clear summary for internal team review."
    ]
})

# Compare the notebook's already-configured models (both temperature=0)
# instead of creating new default-configured clients, so the comparison uses
# the same sampling settings as the rest of the notebook.
candidate_model = flash_model       # gemini-2.0-flash
baseline_model = flash_lite_model   # gemini-2.0-flash-lite

# The metric name is also the prefix of the per-row result columns.
METRIC_NAME = "summarization_quality"

PAIRWISE_METRIC = PairwiseMetric(
    metric=METRIC_NAME,
    metric_prompt_template=MetricPromptTemplateExamples.Pairwise.SUMMARIZATION_QUALITY,
    baseline_model=baseline_model,
)

pairwise_eval_task = EvalTask(
    dataset=dataset,
    metrics=[PAIRWISE_METRIC],
    experiment="pairwise-product-summarization-quality"
)

# Timestamp suffix keeps each run name unique within the experiment.
run_ts = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
pairwise_result = pairwise_eval_task.evaluate(
    model=candidate_model,
    # Single source of truth for the prompt layout (same text as before).
    prompt_template=prompt_template,
    experiment_run_name=f"pairwise-prod-sumq-{run_ts}"
)

print("Summary metrics:")
print(pairwise_result.summary_metrics)

print("\nFull metrics table:")
# Rich display instead of print(): the table is too wide for plain text.
metrics_table = pairwise_result.metrics_table
display(metrics_table)

# Identify and print the preferred response column.
# The evaluation SDK prefixes per-row results with the metric name
# ("<metric>/pairwise_choice"); the bare names are kept only as fallbacks.
preferred_candidates = [
    f"{METRIC_NAME}/pairwise_choice",
    "preferred_response",
    "winner",
    "chosen_model",
]
preferred_col = next(
    (c for c in preferred_candidates if c in metrics_table.columns), None
)
if preferred_col:
    print(f"\nPreferred response column ('{preferred_col}'):")
    print(metrics_table[preferred_col])
else:
    print("\nPreferred response column not found.")

# Identify and print the explanation column ("<metric>/explanation").
reason_candidates = [
    f"{METRIC_NAME}/explanation",
    "explanation",
    "rationale",
    "choice_reason",
]
reason_col = next(
    (c for c in reason_candidates if c in metrics_table.columns), None
)
if reason_col:
    print(f"\nModel explanations ('{reason_col}'):")
    print(metrics_table[reason_col])
else:
    print("\nExplanation column not found.")


INFO:vertexai.evaluation.eval_task:Logging Eval Experiment metadata: {'prompt_template': '# System_prompt\n{system_prompt} # Question\n{question} # Description {description}', 'model_name': 'publishers/google/models/gemini-2.0-flash'}
INFO:vertexai.evaluation._evaluation:Assembling prompts from the `prompt_template`. The `prompt` column in the `EvalResult.metrics_table` has the assembled prompts used for model response generation.
INFO:vertexai.evaluation._evaluation:Generating a total of 2 responses from Gemini model gemini-2.0-flash.
100%|██████████| 2/2 [00:01<00:00,  1.34it/s]
INFO:vertexai.evaluation._evaluation:All 2 responses are successfully generated from Gemini model gemini-2.0-flash.
INFO:vertexai.evaluation._evaluation:Multithreaded Batch Inference took: 1.5017507539996586 seconds.
INFO:vertexai.evaluation._evaluation:Generating a total of 2 responses from Gemini model gemini-2.0-flash-lite.
100%|██████████| 2/2 [00:00<00:00,  4.95it/s]
INFO:vertexai.evaluation._evaluation:

Summary metrics:
{'row_count': 2, 'summarization_quality/candidate_model_win_rate': np.float64(0.0), 'summarization_quality/baseline_model_win_rate': np.float64(1.0)}

Full metrics table:
                                                  system_prompt  \
0  You are a helpful assistant that summarizes product reviews.   
1           You are an expert summarizer for customer feedback.   

                                                                             question  \
0  Summarize the following product review: 'Great camera but battery drains quickly.'   
1         Summarize this customer comment: 'Loved the speed, disliked the interface.'   

                                                                      description  \
0  The summary should be concise and cover the main positive and negative points.   
1                       Generate a short, clear summary for internal team review.   

                                                                                      

In [18]:
import datetime

import pandas as pd
from vertexai.evaluation import (
    MetricPromptTemplateExamples,
    PairwiseMetric,
    EvalTask,
)
from vertexai.generative_models import GenerativeModel

# NOTE(review): this cell runs the same pairwise comparison (same data, models,
# metric and experiment) as the pairwise cell before it — consider keeping
# only one of the two.
dataset = pd.DataFrame({
    "system_prompt": [
        "You are a helpful assistant that summarizes product reviews.",
        "You are an expert summarizer for customer feedback.",
    ],
    "question": [
        "Summarize the following product review: 'Great camera but battery drains quickly.'",
        "Summarize this customer comment: 'Loved the speed, disliked the interface.'"
    ],
    "description": [
        "The summary should be concise and cover the main positive and negative points.",
        "Generate a short, clear summary for internal team review."
    ]
})

# Reuse the notebook's temperature=0 model handles so the comparison uses the
# same sampling settings as the rest of the notebook.
# Candidate: gemini-2.0-flash
candidate_model = flash_model
# Baseline: gemini-2.0-flash-lite
baseline_model = flash_lite_model

PAIRWISE_METRIC = PairwiseMetric(
    metric="summarization_quality",
    metric_prompt_template=MetricPromptTemplateExamples.Pairwise.SUMMARIZATION_QUALITY,
    baseline_model=baseline_model,
)

pairwise_eval_task = EvalTask(
    dataset=dataset,
    metrics=[PAIRWISE_METRIC],
    experiment="pairwise-product-summarization-quality"
)
# Timestamp suffix keeps each run name unique within the experiment.
run_ts = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

pairwise_result = pairwise_eval_task.evaluate(
    model=candidate_model,
    # Single source of truth for the prompt layout (same text as before).
    prompt_template=prompt_template,
    experiment_run_name=f"pairwise-prod-sumq-{run_ts}"
)

print("Summary metrics:")
print(pairwise_result.summary_metrics)
print("\nFull metrics table:")
# Rich display instead of print(): the table is too wide for plain text.
display(pairwise_result.metrics_table)

# Per-row results are stored under "<metric>/<field>" column names, so the
# metric-prefixed name is tried first; the bare names remain as fallbacks.
preferred_cols = [
    'summarization_quality/pairwise_choice',
    'preferred_response',
    'winner',
    'chosen_model',
]
for col in preferred_cols:
    if col in pairwise_result.metrics_table.columns:
        print(f"\nPreferred response column ('{col}'):")
        print(pairwise_result.metrics_table[col])
        break
else:
    # Report a miss explicitly instead of silently printing nothing.
    print("\nPreferred response column not found.")

explanation_cols = [
    'summarization_quality/explanation',
    'explanation',
    'rationale',
    'choice_reason',
]
for col in explanation_cols:
    if col in pairwise_result.metrics_table.columns:
        print(f"\nModel's explanation column ('{col}'):")
        print(pairwise_result.metrics_table[col])
        break
else:
    print("\nExplanation column not found.")


INFO:vertexai.evaluation.eval_task:Logging Eval Experiment metadata: {'prompt_template': '# System_prompt\n{system_prompt} # Question\n{question} # Description {description}', 'model_name': 'publishers/google/models/gemini-2.0-flash'}
INFO:vertexai.evaluation._evaluation:Assembling prompts from the `prompt_template`. The `prompt` column in the `EvalResult.metrics_table` has the assembled prompts used for model response generation.
INFO:vertexai.evaluation._evaluation:Generating a total of 2 responses from Gemini model gemini-2.0-flash.
100%|██████████| 2/2 [00:00<00:00,  4.35it/s]
INFO:vertexai.evaluation._evaluation:All 2 responses are successfully generated from Gemini model gemini-2.0-flash.
INFO:vertexai.evaluation._evaluation:Multithreaded Batch Inference took: 0.47113623099994584 seconds.
INFO:vertexai.evaluation._evaluation:Generating a total of 2 responses from Gemini model gemini-2.0-flash-lite.
100%|██████████| 2/2 [00:00<00:00,  4.79it/s]
INFO:vertexai.evaluation._evaluation

Summary metrics:
{'row_count': 2, 'summarization_quality/candidate_model_win_rate': np.float64(0.0), 'summarization_quality/baseline_model_win_rate': np.float64(1.0)}

Full metrics table:
                                                  system_prompt  \
0  You are a helpful assistant that summarizes product reviews.   
1           You are an expert summarizer for customer feedback.   

                                                                             question  \
0  Summarize the following product review: 'Great camera but battery drains quickly.'   
1         Summarize this customer comment: 'Loved the speed, disliked the interface.'   

                                                                      description  \
0  The summary should be concise and cover the main positive and negative points.   
1                       Generate a short, clear summary for internal team review.   

                                                                                      