In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# LLM Judging LLM Classifications

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/turanbulmus/spacehack/blob/main/02%20-%20LLM%20Judging%20LLM%20Classifications.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2Fturanbulmus%2Fspacehack%2Fmain%2F02%20-%20LLM%20Judging%20LLM%20Classifications.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/turanbulmus/spacehack/main/02%20-%20LLM%20Judging%20LLM%20Classifications.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/turanbulmus/spacehack/blob/main/02%20-%20LLM%20Judging%20LLM%20Classifications.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

| | |
|----------|-------------|
| Authors   | Turan Bulmus,  Fiorenzo Stoppa|
| Last updated | 2025 03 20: Final Publication |

# Overview

This notebook builds on the previous notebook and judges the results it produced. For each observation, an LLM "referee" looks at the generated results and "judges" whether the generated content is correct. By the end of this notebook, you will gain insights into:
* How to use an LLM to evaluate the results generated by another LLM
* How to generate human-readable evaluation metrics that an LLM can use to judge the results of another LLM

This notebook demonstrates these with the following steps:

1. **Import Libraries and Build Variables:** Similar to the previous notebook, we start by importing the necessary libraries for loading and processing the images and for making the LLM calls.
2. **Build System Instructions for the Prompt:** A well-crafted prompt is crucial for guiding Gemini 1.5 Pro towards the desired output. We carefully define instructions outlining the task, evaluation criteria and the method to do so.
3. **Load the Dataset:** We load the generated content derived from the previous notebook and use it for analysis.
4. **Run Gemini 1.5 Pro with the Prompt:** We execute the prompt with Gemini 1.5 Pro, iterating over every prediction produced by the previous notebook to obtain a coherence score and an interest-score validation for each one.

## Using This Notebook

Colab is recommended for running this notebook, but it can run in any iPython environment where you can connect to Google Cloud, install pip packages, etc.

If you're running outside of Colab, depending on your environment you may need to install pip packages (at the very least `pandas` and `tabulate`) that are included in the Colab environment by default but are not part of the Python Standard Library--try pipping the library name of any imports that fail. You'll also notice some comments in code cells that look like "@something"; these have special rendering in colab, but you aren't missing out on any content or important functionality.

This notebook uses the following Google Cloud services and resources:

* [Vertex AI Gemini Models](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models)
* [Vertex AI Experiments](https://cloud.google.com/vertex-ai/docs/experiments/intro-vertex-ai-experiments)
* [Google Cloud BigQuery](https://cloud.google.com/bigquery/docs/introduction)

This notebook has been tested in the following environment:

* Python version = 3.12.3
* [google-cloud-aiplatform](https://pypi.org/project/google-cloud-aiplatform/) version = 1.70.0

## Useful Tips

1. This notebook uses Generative AI capabilities. Re-running a cell that uses Generative AI capabilities may produce similar but not identical results.
2. Please note that running this notebook before running ```01 - LLM Classification Transients.ipynb``` is only possible if you have the following datasets:
    * ```data/predictions_results.csv```: The predictions generated with LLMs following instructions in the notebook: ```01 - LLM Classification Transients.ipynb```
    * ```data/new_data.npy```: The raw images file. 
    * ```data/new_labels.csv```: The labels associated with the raw images.

TODO: Add tips regarding: Quotas, possible errors (429 etc..)

# 0. Getting Started

## Install required packages and restart runtime 
Install the required packages for code to run. These packages are included in the requirements.txt file.

To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.

The restart might take a minute or longer. After it's restarted, continue to the next step.

The code assumes that the packages are already installed hence the following code is commented out. If you are running this notebook for the first time, please uncomment the next block and run the code

In [None]:
# ! pip install -r requirements.txt --quiet

# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Please wait until it is finished before continuing to the next step. ⚠️</b>
</div>

## Authenticate your notebook environment

If you're using Colab, run the code in the next cell. Follow the popups and authenticate with an account that has access to your Google Cloud [project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#identifying_projects).

If you're running this notebook somewhere besides Colab, make sure your environment has the right Google Cloud access. If that's a new concept to you, consider looking into [Application Default Credentials for your local environment](https://cloud.google.com/docs/authentication/provide-credentials-adc#local-dev) and [initializing the Google Cloud CLI](https://cloud.google.com/docs/authentication/gcloud). In many cases, running `gcloud auth application-default login` in a shell on the machine running the notebook kernel is sufficient.

More authentication options are discussed [here](https://cloud.google.com/docs/authentication).

In [None]:
# Authenticate the user when the `google.colab` module is loaded in this kernel.
# When it is not, the cell is a no-op and the environment's own Google Cloud
# credentials are expected to be in place already.
import sys

_IN_COLAB = "google.colab" in sys.modules
if _IN_COLAB:
    from google.colab import auth

    auth.authenticate_user()
    print("Authenticated")

## Set Google Cloud environment variables information and initialize Clients

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).
To get started using Vertex AI, you must have an existing Google Cloud project and the following:
* [Enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com). 
* [Enable the BigQuery API](https://console.cloud.google.com/flows/enableapi?apiid=bigquery.googleapis.com).

The following variables will be used throughout the code below. Please find the meaning of each variable in the documentation below:

* [PROJECT_ID](https://cloud.google.com/resource-manager/docs/creating-managing-projects): The ID of the GCP project which will host the Vertex AI and BigQuery resources used by this notebook.
* [LOCATION](https://cloud.google.com/about/locations): The location where the computational resources will be run. Please also check the availability of Gemini models in the associated regions from [here](https://cloud.google.com/vertex-ai/generative-ai/docs/learn/locations).
* [EXPERIMENT_NAME](https://cloud.google.com/vertex-ai/docs/experiments/create-manage-exp-run): The name of the Vertex AI experiment. This will ensure traceability of the experiment in the Vertex AI environment.
* [DATASET_ID](https://cloud.google.com/bigquery/docs/datasets-intro): The code will run a batch process for various LLM calls. The results and the input of these runs will be stored in a BigQuery table. The instructions to create a dataset can be found [here](https://cloud.google.com/bigquery/docs/datasets).

In [1]:
# Notebook-wide configuration. Edit these values before running the cell.
PROJECT_ID = "<YOUR_PROJECT_ID>"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}
EXPERIMENT_NAME = "supernovadetection" # @param {type:"string"}
# Make sure that dataset is created in Big Query
DATASET_ID = "spacehack" # @param {type:"string"}

import vertexai
from google.cloud import bigquery

# Fail fast with an actionable message: PROJECT_ID is interpolated into every
# table name and handed to both clients below, so leaving the placeholder in
# place would only surface later as a less obvious API error.
if PROJECT_ID in ("", "<YOUR_PROJECT_ID>"):
    raise ValueError(
        "PROJECT_ID is not set. Replace the placeholder with your Google Cloud "
        "project ID before running this cell."
    )

# Initiate Vertex AI and BigQuery clients
vertexai.init(project=PROJECT_ID, location=LOCATION, experiment=EXPERIMENT_NAME)
bq_client = bigquery.Client(project=PROJECT_ID)

## Import libraries and define helper functions

Import the necessary libraries that will be used throughout the code. Please note that the last import (i.e. `helper_functions`) comes from the source code in this repository.

In [41]:
import os, datetime
from collections import OrderedDict

import pandas as pd
import numpy as np

from google.cloud import bigquery
import google.cloud.aiplatform as aiplatform

from helper_functions import write_request, create_batch_prediction_job, batch_data_create, if_tbl_exists, create_ex, save_picture, save_prompt, build_experiment_vars

# 1. Build System Instructions and The Prompt

In [3]:
def _section(tag, body, trailer=""):
    """Return ``body`` wrapped as ``<tag>\\nbody\\n</tag>`` followed by ``trailer``."""
    return f"<{tag}>\n{body}\n</{tag}>{trailer}"


# Who the judge model is asked to be.
PERSONA = _section(
    "PERSONA",
    "You are an experienced astrophysicist tasked with evaluating the accuracy and coherence of astronomical classifications generated by a previous model. Your expertise ensures reliable judgments on how well the output aligns with the given astronomical images.",
)

# What the judge model is asked to do (this section ends with two newlines).
TASK = _section(
    "TASK",
    "Your task is to assess the coherence between the provided three images (New, Reference, and Difference) and the classification and description generated by a previous model. Additionally, you will verify if the assigned interest score is appropriate based on the description and images.",
    trailer="\n\n",
)

# Scoring rubric: a 0-5 coherence score plus a Yes/No interest-score check.
INSTRUCTIONS = _section(
    "INSTRUCTIONS",
    """\
**1. Coherence Evaluation**
- Review the classification and description given by the previous model.
- Judge how well the model’s output matches the observed features in the images.
- Assign a coherence score from 0 to 5:
  - **5** - Perfectly coherent
  - **4** - Almost entirely correct
  - **3** - Mostly correct with some errors
  - **2** - More incorrect than correct
  - **1** - Majority incorrect
  - **0** - Complete hallucination

**2. Interest Score Validation**
- Determine if the interest score given by the model is validated with the images and description.
- Respond with a simple **Yes** (validated) or **No** (invalidated).""",
)

# Step-by-step procedure the judge should follow (ends with one newline).
METHOD = _section(
    "METHOD",
    """\
1. Examine the images and the model’s classification and description.
2. Judge coherence, assign a score (0-5), and note any major discrepancies.
3. Validate if the interest score is consistent with the description and images, responding with Yes or No.""",
    trailer="\n",
)

# The four sections are concatenated back-to-back, with no extra separator,
# into the single system-instruction string.
stat_prompt = "".join([PERSONA, TASK, INSTRUCTIONS, METHOD])


In [29]:
# Load the dataset of images
file_path_data = 'data/new_data.npy'
file_path_labels_csv = 'data/new_labels.csv'
predictions_file = 'data/predictions_results.csv'

# Load image triplets (New, Reference, Difference)
triplets = np.load(file_path_data)

# Load labels and predictions from CSV files
labels_df = pd.read_csv(file_path_labels_csv)
predictions_df = pd.read_csv(predictions_file)

# Few-shot example observations. The order and membership of this list must
# match the hand-labelled judgements defined where `descriptions` is built
# (indexes 181, 294, 454, 2065, 216, 448): the prompt assembly pairs the
# images of sample_indexes[i] with descriptions[2*i] and descriptions[2*i+1].
# The previous list ([2065, 294, 454, 216, 2284, 2055]) differed from it in
# both order and content, attaching each judgement to another observation's images.
sample_indexes = [181, 294, 454, 2065, 216, 448]
for i in sample_indexes:
    # save_picture comes from helper_functions; presumably it writes the image
    # for observation `i` to disk for later use by create_ex - TODO confirm.
    save_picture(triplets, i, True)

# Positions along the first axis of `triplets` whose data contain no NaN at all.
valid_indexes = np.where(~np.isnan(triplets).any(axis=(1, 2, 3)))[0]

In [35]:
# Updated descriptions for the new task format
def valid_example_gen(index_no):
    """Collect the first-stage model output recorded for one observation.

    Looks up the rows of the module-level ``predictions_df`` whose ``index_no``
    column equals ``index_no`` and reads the first match.

    Args:
        index_no: Value to match against ``predictions_df.index_no``.

    Returns:
        OrderedDict with the keys ``Actual``, ``Prediction``,
        ``Prediction_Explanation`` and ``Other_LLM_Interest_score``, filled from
        the ``label``, ``predicted``, ``explanation`` and ``interest_score``
        columns respectively. NumPy scalars are unwrapped to plain Python values.

    Raises:
        IndexError: If no row of ``predictions_df`` has this ``index_no``.
    """
    # Filter once instead of once per column.
    matches = predictions_df[predictions_df.index_no == index_no]
    if matches.empty:
        raise IndexError(f"index_no {index_no!r} not found in predictions_df")

    def _native(value):
        # Callers embed str(dict(...)) of this result in the prompt. NumPy >= 2
        # renders scalars as e.g. "np.int64(3)", so unwrap them to plain Python
        # values to keep the prompt text free of NumPy type wrappers.
        return value.item() if isinstance(value, np.generic) else value

    column_for_key = (
        ("Actual", "label"),
        ("Prediction", "predicted"),
        ("Prediction_Explanation", "explanation"),
        ("Other_LLM_Interest_score", "interest_score"),
    )
    return OrderedDict(
        (key, _native(matches[column].iloc[0])) for key, column in column_for_key
    )
# Hand-labelled few-shot examples: (index_no, expected judge output).
# Keeping the index next to its judgement is what guarantees that the two
# cannot drift apart.
labelled_examples = [
    (181, {"coherence_score": 1, "interest_score_validated": "Yes"}),
    (294, {"coherence_score": 5, "interest_score_validated": "Yes"}),
    (454, {"coherence_score": 0, "interest_score_validated": "Yes"}),
    (2065, {"coherence_score": 5, "interest_score_validated": "Yes"}),
    (216, {"coherence_score": 2, "interest_score_validated": "Yes"}),
    (448, {"coherence_score": 4, "interest_score_validated": "Yes"}),
]

# The example images are selected through `sample_indexes`, position by
# position, so both lists must name the same observations in the same order.
labelled_indexes = [index_no for index_no, _ in labelled_examples]
if list(sample_indexes) != labelled_indexes:
    print(
        "WARNING: sample_indexes "
        f"{list(sample_indexes)} does not match the labelled example indexes "
        f"{labelled_indexes}; example images and descriptions will be misaligned."
    )

# `descriptions` is a flat list [model_output_1, judgement_1, model_output_2, ...]
# that is read pairwise (2*i, 2*i+1) when the prompt examples are assembled.
descriptions = []
### Write the examples used in a readable format to be saved as a txt file for traceability
# One (heading, text) pair per example. Previously six headings were zipped
# against the twelve-item `descriptions` list, so headings were attached to the
# wrong entries and the second half of the examples was dropped.
example_description = []
for index_no, judgement in labelled_examples:
    model_output = valid_example_gen(index_no)
    descriptions.extend([model_output, judgement])
    example_description.append(
        (f"DESCRIPTION INDEX: {index_no}", f"{dict(model_output)}\n{judgement}")
    )

In [36]:
# Report 6 examples for the dynamic prompt
EXAMPLES = ["<EXAMPLES>\n"]
for i in range(len(sample_indexes)):
    
    str_EX = f"""Example {i+1}:
    """
    all_list = create_ex(sample_indexes[i], True)
    all_list.insert(0, str_EX)
    all_list.append(str(dict(descriptions[2*i])))
    all_list.append(str(dict(descriptions[2*i+1])))
    
    for k in all_list:
        EXAMPLES.append(k)
EXAMPLES.append("\n</EXAMPLES>\n")

In [37]:
# Start logging the experiment

## Prepare the variables
timestamp = datetime.datetime.now()
# Minute resolution: two runs started within the same minute share a run name.
formatted_datetime = timestamp.strftime('%Y%m%d%H%M')


## Log the experiments variables
### Create the run name with timestamp
run_name = "run" + formatted_datetime
DESCRIPTION = """LLM as a judge run second try
""" # @param {type:"string"}
# The dropdown list must be a well-formed list of quoted strings; the default
# value is now included as an option.
MODEL = "gemini-1.5-pro-002" # @param ["gemini-1.5-pro-002", "gemini-1.5-pro-001", "gemini-1.5-flash-001", "gemini-1.0-pro-002"]
TEMPERATURE = 0.1 # @param {type:"slider", min:0, max:2, step:0.1}
TOP_P = 0.5 # @param {type:"slider", min:0, max:1, step:0.05}

# Text saved for traceability: the system instructions followed by one
# "heading / content" entry per item of `example_description`.
examples_text = '\n'.join([a + "\n" + str(b) + "\n" for (a, b) in example_description])
PROMPT_FILE = save_prompt(stat_prompt + examples_text, run_name)

# Build the experimentation variables
exp_vars = build_experiment_vars(description=DESCRIPTION,prompt=PROMPT_FILE, model=MODEL, temperature=TEMPERATURE, top_p=TOP_P)
# Start the run
aiplatform.start_run(run_name)
# Log the experiment variables
aiplatform.log_params(exp_vars)

Associating projects/355771430623/locations/us-central1/metadataStores/default/contexts/supernovadetection-run202503210617 to Experiment: supernovadetection


In [42]:
# Construct table names
input_table_name = f'{PROJECT_ID}.{DATASET_ID}.input{run_name}'
output_table_name = f'{PROJECT_ID}.{DATASET_ID}.output{run_name}'

# Define the table schema
schema = [
    bigquery.SchemaField('request', 'JSON'),
    bigquery.SchemaField('index_no', 'INTEGER')
]

# Create the table if it doesnt exist
table = bigquery.Table(input_table_name, schema=schema)
if_tbl_exists(bq_client, table)

# Build one request per prediction: the shared few-shot examples, then what
# create_ex returns for this observation, then the first-stage model output.
# Rows are collected in a list and the DataFrame is created once; growing a
# DataFrame with pd.concat inside the loop copies it on every iteration.
request_rows = []
for t in list(predictions_df.index_no):
    dyna_prompt = EXAMPLES + create_ex(t, False) + [str(dict(valid_example_gen(t)))]
    request_rows.append([batch_data_create(stat_prompt, dyna_prompt, TEMPERATURE, TOP_P), t])

# Create the pandas df that stores the requests
batch_df = pd.DataFrame(request_rows, columns=["request", "index_no"])

# Replace any previous content of the input table with this run's requests.
job_config = bigquery.LoadJobConfig(schema=schema, write_disposition="WRITE_TRUNCATE")
job_config.source_format = 'CSV'

load_job = bq_client.load_table_from_dataframe(
    batch_df, input_table_name, job_config=job_config
)  # Make an API request.
load_job.result()  # Wait for the job to complete.

# Generate the request.json for batch processing
write_request("spacehackbatch_check", MODEL, "bq://" + input_table_name,
            "bq://" + output_table_name)

# Send the batch response
response = create_batch_prediction_job(PROJECT_ID, "request.json")
# Run the batch process job and wait for completion.
# The job ID is the last path segment of the returned resource name.
job = aiplatform.BatchPredictionJob(response["name"].split("/")[-1])
job.wait_for_completion()


BatchPredictionJob projects/355771430623/locations/us-central1/batchPredictionJobs/8905182734805106688 current state:
3
BatchPredictionJob projects/355771430623/locations/us-central1/batchPredictionJobs/8905182734805106688 current state:
3
BatchPredictionJob projects/355771430623/locations/us-central1/batchPredictionJobs/8905182734805106688 current state:
3
BatchPredictionJob projects/355771430623/locations/us-central1/batchPredictionJobs/8905182734805106688 current state:
3
BatchPredictionJob projects/355771430623/locations/us-central1/batchPredictionJobs/8905182734805106688 current state:
3
BatchPredictionJob projects/355771430623/locations/us-central1/batchPredictionJobs/8905182734805106688 current state:
3
BatchPredictionJob projects/355771430623/locations/us-central1/batchPredictionJobs/8905182734805106688 current state:
4
BatchPredictionJob run completed. Resource name: projects/355771430623/locations/us-central1/batchPredictionJobs/8905182734805106688


In [45]:
##The query to generate a final table with results
# Single definition of the results table path, reused (and backtick-quoted)
# in both statements below so that they always address the same table.
results_table_name = f"{PROJECT_ID}.{DATASET_ID}.{run_name}"

# The judge's answer is read from candidates[0].content.parts[0].text of each
# response and parsed as JSON to pull out `coherence_score` and
# `interest_score_validated` (stored as `interest_score_coherent`).
create_table_query = f"""
CREATE OR REPLACE TABLE `{results_table_name}` AS
SELECT  t1.index_no, 
    JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(response, '$.candidates[0].content.parts[0].text'), '$.coherence_score') AS coherence_score,
    JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(response, '$.candidates[0].content.parts[0].text'), '$.interest_score_validated') AS interest_score_coherent,
t1.response, t1.request 
        FROM `{output_table_name}` as t1
"""
#Run the query
query_job = bq_client.query(create_table_query)
results = query_job.result()  # Block until the table has been created.
# Download the results to generate KPIs
download_query = f"""
SELECT index_no, coherence_score, interest_score_coherent
FROM `{results_table_name}`
"""
pred_df = bq_client.query_and_wait(download_query).to_dataframe()
# Persist a local copy of the judged results (the DataFrame index is written too).
pred_df.to_csv("data/predictions_with_Coherence.csv")

TODO: Further analysis on coherence_score and interest_score

# Clean up

In [None]:
# Clean up the resources.

# Delete the BigQuery tables created by this run (input, output and results).
# not_found_ok=True keeps the cell re-runnable: a table that is already gone
# (or was never created because an earlier cell failed) is skipped instead of
# aborting the rest of the cleanup.
for table_suffix in (f"input{run_name}", f"output{run_name}", run_name):
    bq_client.delete_table(
        f"{PROJECT_ID}.{DATASET_ID}.{table_suffix}", not_found_ok=True
    )

# Delete the request.json file, if it is still there
if os.path.exists("request.json"):
    os.remove("request.json")


In [13]:
# Display the judged results held in `pred_df` (index_no, coherence_score,
# interest_score_coherent) as the cell output.
pred_df

Unnamed: 0,index_no,coherence_score,interest_score_coherent
0,817,5,Yes
1,750,5,Yes
2,553,5,Yes
3,101,5,Yes
4,859,5,Yes
...,...,...,...
3214,800,5,Yes
3215,2466,5,Yes
3216,1129,5,Yes
3217,3071,5,Yes
