In [None]:
# Dependencies required to initialise the cloud clients used by this notebook.
import vertexai
from google.cloud import bigquery

# Notebook-wide settings, exposed as Colab form fields.
PROJECT_ID = "turan-genai-bb"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}
EXPERIMENT_NAME = "supernovadetection" # @param {type:"string"}
# The BigQuery dataset named below must already exist before the notebook is run.
DATASET_ID = "spacehack" # @param {type:"string"}

# Configure the Vertex AI SDK with the project, region and experiment chosen above.
vertexai.init(
    project=PROJECT_ID,
    location=LOCATION,
    experiment=EXPERIMENT_NAME,
)

# BigQuery client bound to the same project; later cells use it to load and query tables.
bq_client = bigquery.Client(project=PROJECT_ID)

In [None]:
import gdown 
import base64
import json
import random, os
from collections import OrderedDict
import time, datetime
from pathlib import Path

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from vertexai.generative_models import GenerativeModel, Part, FinishReason, Image
from google.cloud import bigquery
import google.cloud.aiplatform as aiplatform

from helper_functions import batch_data_create, build_run_batch, if_tbl_exists, create_ex, save_picture, save_prompt, build_experiment_vars, create_batch_prediction_job, write_request

In [None]:
# Static system instructions for the judge model, written as four tagged sections.
# The section texts are sent to the model verbatim, so any edit to them changes the experiment.

# Who the model is asked to be.
PERSONA = """<PERSONA>
You are an experienced astrophysicist tasked with evaluating the accuracy and coherence of astronomical classifications generated by a previous model. Your expertise ensures reliable judgments on how well the output aligns with the given astronomical images.
</PERSONA>"""

# What the model is asked to do (note: this section ends with two newline characters).
TASK = """<TASK>
Your task is to assess the coherence between the provided three images (New, Reference, and Difference) and the classification and description generated by a previous model. Additionally, you will verify if the assigned interest score is appropriate based on the description and images.
</TASK>\n
"""

# Scoring rubric: a 0-5 coherence score plus a Yes/No validation of the interest score.
INSTRUCTIONS = """<INSTRUCTIONS>
**1. Coherence Evaluation**
- Review the classification and description given by the previous model.
- Judge how well the model’s output matches the observed features in the images.
- Assign a coherence score from 0 to 5:
  - **5** - Perfectly coherent
  - **4** - Almost entirely correct
  - **3** - Mostly correct with some errors
  - **2** - More incorrect than correct
  - **1** - Majority incorrect
  - **0** - Complete hallucination

**2. Interest Score Validation**
- Determine if the interest score given by the model is validated with the images and description.
- Respond with a simple **Yes** (validated) or **No** (invalidated).
</INSTRUCTIONS>"""


# Step-by-step procedure the model should follow.
METHOD = """<METHOD>
1. Examine the images and the model’s classification and description.
2. Judge coherence, assign a score (0-5), and note any major discrepancies.
3. Validate if the interest score is consistent with the description and images, responding with Yes or No.
</METHOD>
"""

# Glue the sections together in order, with no extra separator, into the single
# system-instruction string used by the later cells.
stat_prompt = "".join((PERSONA, TASK, INSTRUCTIONS, METHOD))


In [None]:
# Load the dataset of images
file_path_data = 'data/new_data.npy'
file_path_labels_csv = 'data/new_labels.csv'
predictions_file = 'data/MeerLICHT_predictions.csv'

# Load image triplets (New, Reference, Difference)
triplets = np.load(file_path_data)

# Load labels and predictions from CSV files
labels_df = pd.read_csv(file_path_labels_csv)

# Download the predictions file from Google Drive, then read it.
# The Drive file id is kept in a descriptive name: calling it `id` would shadow the
# builtin id() for every cell executed afterwards in this kernel.
predictions_drive_id = "1e4TNV5evBGdnerB0K1wOIpFGHvpIWZJL"
downloaded_path = gdown.download(id=predictions_drive_id, output=predictions_file)
# NOTE(review): some gdown releases report a failed download by returning None instead of
# raising; without this check a stale local copy (or no file at all) would be read below.
if downloaded_path is None:
    raise RuntimeError(
        f"Could not download the predictions file (Drive id {predictions_drive_id}) to {predictions_file}"
    )
predictions_df = pd.read_csv(predictions_file)

# Sample indexes for saving example images
# sample_indexes = [0, 1, 3, 4, 8, 48, 77, 1179, 1180, 1181, 1191, 1193, 592, 685, 3216]
# New examples: 1 TN, 1TP, 2FP, 2FN
sample_indexes = [2259, 294, 1088 , 2065, 631, 448]
for i in sample_indexes:
    save_picture(triplets, i, True)

# Positions along the first axis of `triplets` whose entries contain no NaN
# anywhere across axes 1, 2 and 3.
valid_indexes = np.where(~np.isnan(triplets).any(axis=(1, 2, 3)))[0]

In [None]:
# Updated descriptions for the new task format
def valid_example_gen(index_no, predictions=None):
  """Collect the earlier model's output for one candidate as an ordered mapping.

  Args:
    index_no: Value to look up in the ``index_no`` column.
    predictions: DataFrame with the columns ``index_no``, ``actual``, ``predicted``,
      ``explanation`` and ``interest_score``. Defaults to the notebook-level
      ``predictions_df``.

  Returns:
    OrderedDict with the keys ``Actual``, ``Prediction``, ``Prediction_Explanation``
    and ``Other_LLM_Interest_score`` (in that order), taken from the first row whose
    ``index_no`` matches. NumPy scalars are converted to plain Python values so that
    ``str(dict(...))`` renders e.g. ``1`` rather than ``np.int64(1)`` under NumPy >= 2.

  Raises:
    IndexError: if no row has the requested ``index_no``.
  """
  if predictions is None:
    predictions = predictions_df

  # Filter once instead of rebuilding the same boolean mask for every field.
  matches = predictions.loc[predictions["index_no"] == index_no]
  if matches.empty:
    raise IndexError(f"index_no {index_no!r} not found in the predictions table")

  def _first(column):
    # Read per column (not per row) so each value keeps its column dtype.
    value = matches[column].iloc[0]
    return value.item() if isinstance(value, np.generic) else value

  return OrderedDict([
    ("Actual", _first("actual")),
    ("Prediction", _first("predicted")),
    ("Prediction_Explanation", _first("explanation")),
    ("Other_LLM_Interest_score", _first("interest_score")),
  ])
# Few-shot material for each sample index: `descN_old` is the earlier model's output
# for that candidate, `descN_new` is the hand-assigned judgement paired with it.

## DESCRIPTION INDEX 2259:
desc1_old = valid_example_gen(2259)
desc1_new = {"coherence_score": 5, "interest_score_validated": "Yes"}


## DESCRIPTION INDEX 294
desc2_old = valid_example_gen(294)
desc2_new = {"coherence_score": 5, "interest_score_validated": "Yes"}

## DESCRIPTION INDEX 1088:
desc3_old = valid_example_gen(1088)
desc3_new = {"coherence_score": 3, "interest_score_validated": "Yes"}

## DESCRIPTION INDEX 2065:
desc4_old = valid_example_gen(2065)
desc4_new = {"coherence_score": 5, "interest_score_validated": "Yes"}

## DESCRIPTION INDEX 631:
desc5_old = valid_example_gen(631)
desc5_new = {"coherence_score": 5, "interest_score_validated": "Yes"}

## DESCRIPTION INDEX 448:
desc6_old = valid_example_gen(448)
desc6_new = {"coherence_score": 4, "interest_score_validated": "Yes"}

# Interleaved as [old, new, old, new, ...], in the same order as `sample_indexes`.
descriptions = [
    desc1_old, desc1_new, desc2_old, desc2_new, desc3_old, desc3_new, desc4_old, desc4_new, desc5_old, desc5_new, desc6_old, desc6_new,
]
assert len(descriptions) == 2 * len(sample_indexes), (
    "descriptions must hold one (old, new) pair per entry of sample_indexes"
)

### Write the examples used in a readable format to be saved as a txt file for traceability
# Each sample index labels BOTH of its descriptions. A plain zip(labels, descriptions)
# would stop after len(sample_indexes) items, dropping half of the descriptions and
# attaching the remaining ones to the wrong index.
example_description = [
    ("DESCRIPTION INDEX: " + str(sample_index), description)
    for position, sample_index in enumerate(sample_indexes)
    for description in descriptions[2 * position: 2 * position + 2]
]

In [None]:
# Build the few-shot section of the dynamic prompt: one example per sample index.
# `descriptions` is interleaved [old, new, old, new, ...], so the even positions hold the
# earlier model's output and the odd positions hold the judgement paired with it.
EXAMPLES = ["<EXAMPLES>\n"]
example_parts = zip(sample_indexes, descriptions[0::2], descriptions[1::2])
for example_no, (sample_index, judged_output, expected_verdict) in enumerate(example_parts, start=1):
    EXAMPLES.append(f"Example {example_no}:\n    ")
    # create_ex is a project helper; its return value is spliced in element by element.
    EXAMPLES.extend(create_ex(sample_index, True))
    EXAMPLES.append(str(dict(judged_output)))
    # The reply text is later parsed as JSON (keys coherence_score / interest_score_validated),
    # so the target answer is shown as real JSON rather than a single-quoted Python dict repr.
    # NOTE(review): confirm batch_data_create does not already enforce a JSON response format.
    EXAMPLES.append(json.dumps(dict(expected_verdict)))
EXAMPLES.append("\n</EXAMPLES>\n")

In [None]:
# Start logging the experiment

## Prepare the variables
# Local wall-clock time at minute resolution; it becomes part of the run name.
timestamp = datetime.datetime.now()
formatted_datetime = timestamp.strftime('%Y%m%d%H%M')


## Log the experiments variables
### Create the run name with timestamp
run_name = "run" + formatted_datetime
DESCRIPTION = """LLM as a judge run
""" # @param {type:"string"}
# The dropdown options must be a well-formed list of quoted strings that includes the selected value.
MODEL = "gemini-1.5-pro-002" # @param ["gemini-1.5-pro-002", "gemini-1.5-pro-001", "gemini-1.5-flash-001", "gemini-1.0-pro-002"]
TEMPERATURE = 0.1 # @param {type:"slider", min:0, max:2, step:0.1}
TOP_P = 0.5 # @param {type:"slider", min:0, max:1, step:0.05}

# Persist the static prompt followed by a readable listing of the few-shot descriptions.
# Each entry of example_description is a (label, description) pair.
examples_listing = '\n'.join(
    label + "\n" + str(description) + "\n" for (label, description) in example_description
)
PROMPT_FILE = save_prompt(stat_prompt + examples_listing, run_name)

# Build the experimentation variables
exp_vars = build_experiment_vars(
    description=DESCRIPTION,
    prompt=PROMPT_FILE,
    model=MODEL,
    temperature=TEMPERATURE,
    top_p=TOP_P,
)
# Start the run
aiplatform.start_run(run_name)
# Log the experiment variables
aiplatform.log_params(exp_vars)

In [None]:
# Construct table names
input_table_name = f'{PROJECT_ID}.{DATASET_ID}.input{run_name}'
output_table_name = f'{PROJECT_ID}.{DATASET_ID}.output{run_name}'

# Define the table schema
schema = [
    bigquery.SchemaField('request', 'JSON'),
    bigquery.SchemaField('index_no', 'INTEGER')
]

# Create the table if it doesnt exist
table = bigquery.Table(input_table_name, schema=schema)
if_tbl_exists(bq_client, table)

# Build one request row per candidate in the predictions table.
# Rows are collected in a list and turned into a DataFrame once: growing a DataFrame with
# pd.concat inside the loop copies every previous row on each iteration, and starting from
# an empty frame leaves index_no with object dtype.
request_rows = []
for t in predictions_df.index_no.tolist():
    # Dynamic prompt = shared few-shot examples + this candidate's parts + the earlier model's output.
    dyna_prompt = EXAMPLES + create_ex(t, False) + [str(dict(valid_example_gen(t)))]
    request_rows.append([batch_data_create(stat_prompt, dyna_prompt, TEMPERATURE, TOP_P), t])
batch_df = pd.DataFrame(request_rows, columns=["request", "index_no"])

# Replace any existing contents of the input table with the freshly built requests.
job_config = bigquery.LoadJobConfig(schema=schema, write_disposition="WRITE_TRUNCATE")
job_config.source_format = 'CSV'

load_job = bq_client.load_table_from_dataframe(
    batch_df, input_table_name, job_config=job_config
)  # Make an API request.
load_job.result()  # Wait for the load to complete.

# Generate the request.json for batch processing
write_request("spacehackbatch_check", MODEL, "bq://" + input_table_name,
            "bq://" + output_table_name)

# Send the batch response
response = create_batch_prediction_job(PROJECT_ID, "request.json")
# Run the batch process job and wait for completion.
# The job id is the last path segment of the resource name returned by the helper.
job = aiplatform.BatchPredictionJob(response["name"].split("/")[-1])
job.wait_for_completion()


In [None]:
# The query to generate a final table with results.
# The model reply text (first part of the first candidate) is itself parsed as JSON to pull
# out the two judged fields; both are extracted as scalars.
create_table_query = f"""
CREATE OR REPLACE TABLE `{PROJECT_ID}.{DATASET_ID}.{run_name}` AS
SELECT  t1.index_no,
    JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(response, '$.candidates[0].content.parts[0].text'), '$.coherence_score') AS coherence_score,
    JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(response, '$.candidates[0].content.parts[0].text'), '$.interest_score_validated') AS interest_score_coherent,
t1.response, t1.request
        FROM `{output_table_name}` as t1
"""
# Run the query
query_job = bq_client.query(create_table_query)
results = query_job.result()  # Block until the results table has been written.

# Download the results to generate KPIs.
# The table path is back-quoted like the other table references in this cell; the project
# id contains hyphens, so quoting keeps the reference unambiguous.
download_query = f"""
SELECT index_no, coherence_score, interest_score_coherent
FROM `{PROJECT_ID}.{DATASET_ID}.{run_name}`
"""
pred_df = bq_client.query_and_wait(download_query).to_dataframe()


In [None]:
# Attach the judge's scores to the original predictions (default inner join on index_no)
# and write the combined table to a CSV in the working directory, without the row index.
judged_predictions_df = pred_df.merge(predictions_df, on="index_no")
judged_predictions_df.to_csv("MeerLICHT_predictions.csv", index=False)