In [1]:
# Run configuration. The trailing `# @param` annotations are notebook
# form-field markers and are kept exactly as written.
# Google Cloud project and region used for Vertex AI and BigQuery.
PROJECT_ID = "turan-genai-bb"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}
# Name of the Vertex AI experiment the runs are attached to.
EXPERIMENT_NAME = "supernovadetection" # @param {type:"string"}
# BigQuery dataset that holds the input/output tables. It is not created by
# this notebook: make sure it already exists in BigQuery before running.
DATASET_ID = "spacehack" # @param {type:"string"}
import vertexai
from google.cloud import bigquery


# Bind the Vertex AI SDK to the configured project, region and experiment,
# then create the BigQuery client that the later cells share.
vertexai.init(
    project=PROJECT_ID,
    location=LOCATION,
    experiment=EXPERIMENT_NAME,
)
bq_client = bigquery.Client(project=PROJECT_ID)





In [2]:
import gdown 
import base64
import json
import random, os
from collections import OrderedDict
import time, datetime
from pathlib import Path

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from vertexai.generative_models import GenerativeModel, Part, FinishReason, Image
from google.cloud import bigquery
import google.cloud.aiplatform as aiplatform

from helper_functions import batch_data_create, build_run_batch, if_tbl_exists, create_ex, save_picture, save_prompt, build_experiment_vars, create_batch_prediction_job, write_request

In [3]:
# PERSONA = """<PERSONA>
# You are an experienced astrophysicist tasked with evaluating the accuracy and coherence of astronomical classifications generated by a previous model. Your expertise ensures reliable judgments on how well the output aligns with the given astronomical images.
# </PERSONA>"""

# TASK = """<TASK>
# Your task is to assess the coherence between the provided three images (New, Reference, and Difference) and the classification and description generated by a previous model. Additionally, you will verify if the assigned interest score is appropriate based on the description and images.
# </TASK>\n
# """
# INSTRUCTIONS = """<INSTRUCTIONS>
# **1. Coherence Evaluation**
# - **Review Process:**
#   - Examine the classification and description provided by the previous model.
#   - Analyze how accurately the model’s output reflects the features observed in the New, Reference, and Difference images.

# - **Scoring Criteria:**
#   - **5 – Completely Coherent and Accurate**
#     - The classification and description perfectly match all relevant features in the images.
#     - No discrepancies or inaccuracies are present.
#     - The explanation is thorough and leaves no room for doubt regarding its correctness.

#   - **4 – Largely Correct with Minor Discrepancies**
#     - The classification and description are mostly accurate and align well with the images.
#     - Minor errors or omissions exist but do not significantly detract from the overall accuracy.
#     - These discrepancies are typically small details that do not alter the fundamental understanding.

#   - **3 – Generally Correct but with Notable Errors**
#     - The classification and description capture the primary features but include several significant inaccuracies.
#     - These errors may lead to a misunderstanding of key aspects depicted in the images.
#     - The overall description remains somewhat reliable but requires corrections for precision.

#   - **2 – More Inaccuracies Than Correct Elements**
#     - The classification and description contain numerous errors that overshadow the correct information.
#     - Critical features are misrepresented, leading to a largely incorrect understanding of the images.
#     - The output may include irrelevant or misleading information not supported by the images.

#   - **1 – Predominantly Incorrect**
#     - The classification and description are mostly inaccurate with very few correct elements.
#     - There is a fundamental misunderstanding of the features presented in the images.
#     - The output fails to convey the essential characteristics necessary for accurate classification.

#   - **0 – Entirely Fabricated or Unrelated**
#     - The classification and description have no basis in the provided images.
#     - The output is completely irrelevant, containing information that does not pertain to the images.
#     - It may include fabricated details with no observable support from the visual data.

# **2. Interest Score Validation**
# - **Evaluation Process:**
#   - Assess whether the interest score assigned by the model is justified based on the description and the provided images.
#   - **Self-Consistency Check:** Ensure that the interest score is coherent with the model’s own description, even in cases where the description may contain inaccuracies. This means evaluating if the model consistently assigns an appropriate interest score relative to the content and significance it describes, maintaining internal consistency.

# - **Response Format:**
#   - Respond with a clear **Yes** (validated) or **No** (invalidated).
# </INSTRUCTIONS>"""

# METHOD = """<METHOD>
# 1. Examine the images and the model’s classification and description.
# 2. Judge coherence, assign a score (0-5), and note any major discrepancies.
# 3. Validate if the interest score is consistent with the description and images, responding with Yes or No.
# </METHOD>
# """

# # Collapse the System Instructions into a single variable
# stat_prompt = PERSONA + TASK + INSTRUCTIONS + METHOD


In [4]:
# Role description for the judge model. It is the first piece concatenated
# into `stat_prompt`.
PERSONA = """<PERSONA>
You are an experienced astrophysicist tasked with evaluating the accuracy, coherence, and adherence to task requirements of astronomical classifications generated by a previous model. Your expertise ensures reliable judgments on how well the output aligns with the given astronomical images and the original instructions for the task.
</PERSONA>"""

# What the judge has to do: compare the three images with the previous
# model's classification/description, check adherence to the original
# instructions, and verify the interest score.
TASK = """<TASK>
Your task is to assess the coherence between the provided three images (New, Reference, and Difference) and the classification and description generated by a previous model. Additionally, evaluate how well the output adheres to the original instructions provided to the model. Verify if the assigned interest score is appropriate based on the description, images, and original instructions.
</TASK>"""

# Instructions shown to the judge as "the original instructions" the previous
# model was given (persona, task, Real/Bogus criteria, method, output format).
# NOTE(review): presumably a copy of the first-stage classifier prompt; if that
# prompt changes, this text has to be updated by hand - confirm the source.
# The trailing double spaces are part of the prompt text (markdown line breaks).
ORIGINAL_INSTRUCTIONS = """<ORIGINAL INSTRUCTIONS>
**Persona**  
You are an experienced astrophysicist, and your task is to classify astronomical transients into Real or Bogus based on a given set of 3 images. You have seen thousands of astronomical images during your lifetime and you are very good at making this classification by looking at the images and following the instructions.

**Task**  
Your task is to read the INSTRUCTIONS, look at the 3 images (New, Reference, and Difference images) and classify if the source at the center of the cutout and inside the red circle is a Real or Bogus astronomical transient. Provide your thought process to explain how you reasoned to provide the response. Respond in JSON format.

**Instructions**  
1. **Purpose**:  
   Help vet astronomical data for the Real/Bogus classification. The goal is for you to use your expertise to distinguish between real and bogus sources.

2. **Information Provided**:  
   You will be shown three astronomical image cutouts:  
   a) **New Image:** The newest image centered at the location of the suspected transient source.  
   b) **Reference Image:** A reference image from the same telescope of the same part of the sky to be used for comparison. It shows if the source was already there in the past or not.  
   c) **Difference Image:** The residual image after the new and reference images are subtracted. Real sources should appear in this cutout as circular objects with only positive (white pixels) or only negative (black pixels) flux.

3. **Criteria for Classification**:  
   - **Real Source:**  
     - **Shape:** Circular shape at the center of the cutout with a visual extent of ~5-10 pixels, varying with focus conditions.  
     - **Brightness:** Positive flux (white pixels) in either the new or reference image. Positive or negative flux in the Difference image.  
     - **Variability:** The source at the center can fade or brighten between the new and reference images, appearing as positive or negative in the Difference image.  
     - **Presence:** The source may (dis)appear between the new and reference images. A source may also appear on top of an underlying source (e.g., supernova on a galaxy).  

   - **Bogus Source:**  
     - **Shape:** Non-circular shape (e.g., elongated). This includes irregular shapes, positive or negative, like streaks or lines caused by cosmic-rays, diffraction spikes, and cross-talk.  
     - **Brightness:** Negative flux (black pixels) at the center of the cutout in either the new or reference image. The source at the center can never be negative in the New or Reference image, only in the Difference.  
     - **Misalignment:** If the source in the New and Reference images is misaligned, it will show a Yin-Yang pattern (both white and black) in the Difference image.

4. **Additional Guidance**:  
   - **Contextual Information:** Focus on the source at the center of the cutouts inside the red circle, but consider nearby sources to diagnose potential problems.  
   - **Examples:** Refer to provided visual examples of real and bogus sources to aid in identification.  
   - **Judgment Criteria:** For ambiguous cases or borderline scenarios, consider the overall context and consistency with known characteristics of real and bogus sources.

**Method**  
1. **Focus on the Red Circle**: Start by examining the source located at the center of the cutout and inside the red circle.  
2. **Analyze Each Image Individually**:  
   - **New Image**: Check for the presence, shape, and brightness of the source in the new image.  
   - **Reference Image**: Compare the source's properties in the reference image to those in the new image.  
   - **Difference Image**: Observe the residuals that result from subtracting the reference image from the new image. Look for patterns (circular, positive/negative flux) that match characteristics of Real or Bogus sources.  
3. **Evaluate Features**:  
   - Examine the shape, brightness, and other relevant features (e.g., artifacts, misalignments) of the source in each image.  
   - Determine if these features are consistent with a Real or Bogus classification based on the criteria provided in the instructions.  
4. **Consider Relationships Between Images**:  
   - Compare the new, reference, and difference images to understand any changes in the source over time.  
   - Look for discrepancies or confirmations that might support or contradict a particular classification.  
5. **Employ a Chain-of-Thought Reasoning**:  
   - Clearly outline each observation you make and explain how it contributes to your decision-making process.  
   - If you find any contradictions or ambiguous features, acknowledge them and provide reasoning for your final decision.  
6. **Assign an Interest Score**:  
   - After determining if the source is Real or Bogus, assign an appropriate interest score:  
     - 'No interest' for Bogus sources.  
     - 'Low interest' for variable transients.  
     - 'High interest' for explosive transients.  
7. **Prepare the Final Output in JSON Format**:  
   - Format your response as a JSON object containing:  
     - The classification ('Real' or 'Bogus').  
     - An explanation detailing your thought process and observations.  
     - The assigned interest score.  

8. **Example Output**:  
   - Refer to the provided examples to see the expected format and detail level of your response.
</ORIGINAL INSTRUCTIONS>"""

# Judge rubric: a 0-5 coherence/adherence scale, the interest-score check and
# the requested response format.
# NOTE(review): this text asks for a JSON response but does not name the keys.
# The few-shot verdicts and the results query use `coherence_score` and
# `interest_score_validated`, and the few-shot verdicts carry no reasoning
# field - confirm that the model reliably returns those keys.
# The trailing double spaces are part of the prompt text (markdown line breaks).
INSTRUCTIONS = """<INSTRUCTIONS>
**1. Coherence and Adherence Evaluation**  
- **Review Process**:  
  - Examine the classification and description provided by the previous model.  
  - Analyze how accurately the model’s output reflects the features observed in the New, Reference, and Difference images.  
  - Compare the output against the original instructions to assess adherence to task requirements.  

- **Scoring Criteria**:  
  - **5 – Completely Coherent, Accurate, and Adherent**  
    - The classification and description perfectly match all relevant features in the images and fully adhere to the original instructions.  
  - **4 – Largely Correct with Minor Issues**  
    - The output is mostly accurate and aligns well with both the images and instructions, with only minor errors or deviations.  
  - **3 – Generally Correct but with Notable Errors**  
    - The output captures primary features but includes significant inaccuracies or partial adherence to the instructions.  
  - **2 – More Inaccuracies Than Correct Elements**  
    - The output has numerous errors, misrepresenting key features and deviating from the instructions.  
  - **1 – Predominantly Incorrect**  
    - The output is mostly inaccurate and fails to follow the instructions in key areas.  
  - **0 – Entirely Fabricated or Unrelated**  
    - The output has no basis in the images or instructions.  

**2. Interest Score Validation**  
- Validate whether the interest score is consistent with both the description and the images, as well as the original instructions.

**3. Response Format**  
- Provide a JSON response with:  
  - **Coherence and adherence score (0-5).**  
  - **Validation of the interest score (Yes/No).**  
  - **Detailed reasoning** for the given score and validation decision.
</INSTRUCTIONS>"""

# Step-by-step procedure the judge should follow; last piece concatenated into
# `stat_prompt`. The trailing double spaces are part of the prompt text.
METHOD = """<METHOD>
1. Examine the images and the model’s classification and description.  
2. Compare the output against the original instructions for adherence.  
3. Judge coherence, assign a score (0-5), and note any major discrepancies.  
4. Validate if the interest score is consistent with the description, images, and instructions, responding with Yes or No.  
5. Provide detailed reasoning for all evaluations.
</METHOD>"""

# Collapse the prompt sections into the single static system prompt.
# The sections are joined with no separator, so each closing tag is followed
# immediately by the next opening tag.
stat_prompt = "".join(
    [PERSONA, TASK, ORIGINAL_INSTRUCTIONS, INSTRUCTIONS, METHOD]
)


In [5]:
# Load the dataset of images
file_path_data = 'MeerLICHT_dataset/MeerLICHT_images.npy'
file_path_labels_csv = 'MeerLICHT_dataset/MeerLICHT_labels.csv'
predictions_file = 'MeerLICHT_predictions.csv'

# Load image triplets (New, Reference, Difference)
triplets = np.load(file_path_data)

# Load the ground-truth labels
labels_df = pd.read_csv(file_path_labels_csv)

# Download the first-stage predictions from Google Drive, then read them.
# The Drive file id gets its own descriptive name so that the builtin `id()`
# is not shadowed for the rest of the kernel session.
predictions_drive_id = "1e4TNV5evBGdnerB0K1wOIpFGHvpIWZJL"
downloaded_path = gdown.download(id=predictions_drive_id, output=predictions_file)
if downloaded_path is None:
    # Without this check a failed download would silently fall through to a
    # stale local copy (or to an unrelated FileNotFoundError).
    raise RuntimeError(
        f"Could not download the predictions file (Drive id {predictions_drive_id})"
    )
predictions_df = pd.read_csv(predictions_file)

# Sample indexes used as few-shot examples.
# NOTE(review): the original note read "1 TN, 1TP, 2FP, 2FN" (six cases) but
# seven indexes are listed - confirm the category of each index.
sample_indexes = [181, 294, 454, 2065, 216, 448, 624]
# for i in sample_indexes:
#     save_picture(triplets, i, True)

# Indexes of the triplets that contain no NaN pixel at all. The reduction over
# axes 1-3 assumes `triplets` is a 4-D array with the sample on axis 0.
valid_indexes = np.where(~np.isnan(triplets).any(axis=(1, 2, 3)))[0]
# for t in valid_indexes:
#     save_picture(triplets, t, False)

Downloading...
From: https://drive.google.com/uc?id=1e4TNV5evBGdnerB0K1wOIpFGHvpIWZJL
To: /home/user/spacehack/MeerLICHT_predictions.csv
100%|██████████| 1.04M/1.04M [00:00<00:00, 92.9MB/s]


In [6]:
# Updated descriptions for the new task format
def valid_example_gen(index_no):
    """Collect the first-stage model output for one candidate.

    Looks up the first row of the module-level ``predictions_df`` whose
    ``index_no`` column equals ``index_no`` and returns an ``OrderedDict``
    with the keys ``Actual``, ``Prediction``, ``Prediction_Explanation`` and
    ``Other_LLM_Interest_score`` (in that order), filled from the columns
    ``actual``, ``predicted``, ``explanation`` and ``interest_score``.

    Raises:
        IndexError: if no row matches ``index_no``.
    """
    row_mask = predictions_df.index_no == index_no
    key_to_column = (
        ("Actual", "actual"),
        ("Prediction", "predicted"),
        ("Prediction_Explanation", "explanation"),
        ("Other_LLM_Interest_score", "interest_score"),
    )
    # Select column first, then the row, so each value keeps its column dtype.
    return OrderedDict(
        (key, predictions_df.loc[row_mask, column].iloc[0])
        for key, column in key_to_column
    )

# Few-shot material for the judge prompt. Every sample index contributes two
# entries, in this order:
#   descN_old - the previous model's output for that candidate (valid_example_gen)
#   descN_new - the hard-coded judge verdict for that output

## DESCRIPTION INDEX 181:
desc1_old = valid_example_gen(181)
desc1_new = {"coherence_score": 2, "interest_score_validated": "Yes"}

## DESCRIPTION INDEX 294
desc2_old = valid_example_gen(294)
desc2_new = {"coherence_score": 5, "interest_score_validated": "Yes"}

## DESCRIPTION INDEX 454:
desc3_old = valid_example_gen(454)
desc3_new = {"coherence_score": 1, "interest_score_validated": "Yes"}

## DESCRIPTION INDEX 2065:
desc4_old = valid_example_gen(2065)
desc4_new = {"coherence_score": 5, "interest_score_validated": "Yes"}

## DESCRIPTION INDEX 216:
desc5_old = valid_example_gen(216)
desc5_new = {"coherence_score": 2, "interest_score_validated": "Yes"}

## DESCRIPTION INDEX 448:
desc6_old = valid_example_gen(448)
desc6_new = {"coherence_score": 4, "interest_score_validated": "Yes"}

## DESCRIPTION INDEX 624:
desc7_old = valid_example_gen(624)
desc7_new = {"coherence_score": 3, "interest_score_validated": "Yes"}

# Flat list, two consecutive entries (old, new) per sample index.
descriptions = [
    desc1_old, desc1_new, desc2_old, desc2_new, desc3_old, desc3_new, desc4_old, desc4_new, desc5_old, desc5_new, desc6_old, desc6_new,  desc7_old, desc7_new,
]
assert len(descriptions) == 2 * len(sample_indexes), (
    "descriptions must hold exactly one (old, new) pair per sample index"
)

### Write the examples used in a readable format to be saved as a txt file for traceability.
# Each entry is labelled with the sample index it belongs to. Zipping the 7
# labels directly against the 14 alternating entries (as before) attached the
# wrong index to most entries and silently dropped the second half.
example_description = [
    ("DESCRIPTION INDEX: " + str(sample_indexes[position // 2]), entry)
    for position, entry in enumerate(descriptions)
]

In [7]:
# Assemble the few-shot section of the dynamic prompt: one example per entry
# in `sample_indexes`, wrapped in <EXAMPLES> tags.
EXAMPLES = ["<EXAMPLES>\n"]
for position, sample_index in enumerate(sample_indexes):
    # create_ex is a project helper; presumably it returns the list of prompt
    # parts (images/text) for this triplet - confirm in helper_functions.
    example_parts = create_ex(sample_index, True)
    example_parts.insert(0, f"Example {position + 1}:\n    ")
    # Previous model's output first, then the expected judge verdict.
    example_parts.append(str(dict(descriptions[2 * position])))
    example_parts.append(str(dict(descriptions[2 * position + 1])))
    EXAMPLES.extend(example_parts)
EXAMPLES.append("\n</EXAMPLES>\n")

In [8]:
# Start logging the experiment

## Timestamp with minute resolution; it becomes the unique part of the run name
timestamp = datetime.datetime.now()
formatted_datetime = timestamp.strftime('%Y%m%d%H%M')

## Experiment variables
### Run name = "run" + timestamp
run_name = f"run{formatted_datetime}"
DESCRIPTION = """LLM as a judge run second try
""" # @param {type:"string"}
MODEL = "gemini-1.5-pro-002" # @param ["gemini-1.5-pro-002", "gemini-1.5-pro-001", "gemini-1.5-flash-001", "gemini-1.0-pro-002"]
TEMPERATURE = 0.1 # @param {type:"slider", min:0, max:2, step:0.1}
TOP_P = 0.5 # @param {type:"slider", min:0, max:1, step:0.05}

### Persist the static prompt followed by the labelled few-shot descriptions
examples_log = '\n'.join(
    f"{label}\n{description}\n" for label, description in example_description
)
PROMPT_FILE = save_prompt(stat_prompt + examples_log, run_name)

# Build the experimentation variables
exp_vars = build_experiment_vars(
    description=DESCRIPTION,
    prompt=PROMPT_FILE,
    model=MODEL,
    temperature=TEMPERATURE,
    top_p=TOP_P,
)
# Start the run, then attach the variables to it
aiplatform.start_run(run_name)
aiplatform.log_params(exp_vars)

Associating projects/355771430623/locations/us-central1/metadataStores/default/contexts/supernovadetection-run202412150005 to Experiment: supernovadetection


In [9]:
# Construct table names
input_table_name = f'{PROJECT_ID}.{DATASET_ID}.input{run_name}'
output_table_name = f'{PROJECT_ID}.{DATASET_ID}.output{run_name}'

# Define the table schema: one JSON request per candidate plus its index
schema = [
    bigquery.SchemaField('request', 'JSON'),
    bigquery.SchemaField('index_no', 'INTEGER')
]

# Create the table if it doesn't exist
table = bigquery.Table(input_table_name, schema=schema)
if_tbl_exists(bq_client, table)

# Build one request per candidate. The rows are collected in a list and the
# DataFrame is created once: growing a frame with pd.concat inside the loop
# copies it on every iteration (quadratic) and concatenating onto an empty
# frame is deprecated in recent pandas.
request_rows = []
for candidate_index in predictions_df.index_no:
    # Dynamic prompt = few-shot examples + this candidate's images + the
    # previous model's output that has to be judged.
    dyna_prompt = (
        EXAMPLES
        + create_ex(candidate_index, False)
        + [str(dict(valid_example_gen(candidate_index)))]
    )
    request_rows.append(
        {
            "request": batch_data_create(stat_prompt, dyna_prompt, TEMPERATURE, TOP_P),
            "index_no": candidate_index,
        }
    )
batch_df = pd.DataFrame(request_rows, columns=["request", "index_no"])

# Upload the requests, replacing any previous content of the input table
job_config = bigquery.LoadJobConfig(schema=schema, write_disposition="WRITE_TRUNCATE")
job_config.source_format = 'CSV'

load_job = bq_client.load_table_from_dataframe(
    batch_df, input_table_name, job_config=job_config
)  # Make an API request.
load_job.result()  # Wait for the load to complete.

# Generate the request.json for batch processing
write_request("spacehackbatch_check", MODEL, "bq://" + input_table_name,
            "bq://" + output_table_name)

# Submit the batch prediction request
response = create_batch_prediction_job(PROJECT_ID, "request.json")
# Attach to the submitted job (the id is the last path segment of its
# resource name) and block until it finishes.
job = aiplatform.BatchPredictionJob(response["name"].split("/")[-1])
job.wait_for_completion()


BatchPredictionJob projects/355771430623/locations/us-central1/batchPredictionJobs/6366565090992324608 current state:
3
BatchPredictionJob projects/355771430623/locations/us-central1/batchPredictionJobs/6366565090992324608 current state:
3
BatchPredictionJob projects/355771430623/locations/us-central1/batchPredictionJobs/6366565090992324608 current state:
3
BatchPredictionJob projects/355771430623/locations/us-central1/batchPredictionJobs/6366565090992324608 current state:
3
BatchPredictionJob projects/355771430623/locations/us-central1/batchPredictionJobs/6366565090992324608 current state:
3
BatchPredictionJob projects/355771430623/locations/us-central1/batchPredictionJobs/6366565090992324608 current state:
3
BatchPredictionJob projects/355771430623/locations/us-central1/batchPredictionJobs/6366565090992324608 current state:
4
BatchPredictionJob run completed. Resource name: projects/355771430623/locations/us-central1/batchPredictionJobs/6366565090992324608


In [10]:
# The query to generate a final table with results.
# NOTE: run_name is deliberately pinned to an already completed batch run, so
# this cell and the following ones analyse that run rather than the one
# started earlier in this session. Drop the pin to analyse the current run.
run_name = "run" + "202412150005"
# Same naming scheme as when the batch output table was created, built from
# the configuration values instead of a hard-coded dataset literal.
output_table_name = f"{PROJECT_ID}.{DATASET_ID}.output{run_name}"
# The model answer is a JSON document stored as text inside the response, so
# the text part is extracted first and then parsed for the two verdict fields.
create_table_query = f"""
CREATE OR REPLACE TABLE `{PROJECT_ID}.{DATASET_ID}.{run_name}` AS
SELECT
    t1.index_no,
    JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(response, '$.candidates[0].content.parts[0].text'), '$.coherence_score') AS coherence_score,
    JSON_EXTRACT_SCALAR(JSON_EXTRACT_SCALAR(response, '$.candidates[0].content.parts[0].text'), '$.interest_score_validated') AS interest_score_coherent,
    t1.response,
    t1.request
FROM `{output_table_name}` AS t1
"""
# Run the query and wait for the table to be (re)created
query_job = bq_client.query(create_table_query)
results = query_job.result()


In [11]:

# Download the results to generate KPIs.
# The table reference is built from the configuration values and `run_name`
# (the table created by the previous cell) instead of a hard-coded literal,
# so changing the project, dataset or run only has to happen in one place.
download_query = f"""
SELECT index_no, coherence_score, interest_score_coherent
FROM `{PROJECT_ID}.{DATASET_ID}.{run_name}`
"""
pred_df = bq_client.query_and_wait(download_query).to_dataframe()

In [12]:
# Persist the judged results. The positional RangeIndex carries no information
# (index_no is the real key), so it is not written; otherwise it comes back as
# an "Unnamed: 0" column when the file is read again.
pred_df.to_csv("MeerLICHT_predictions_with_Coherence_updated.csv", index=False)

In [13]:
# Display the judged results (the notebook shows only the head and tail rows
# of a frame this long).
pred_df

Unnamed: 0,index_no,coherence_score,interest_score_coherent
0,817,5,Yes
1,750,5,Yes
2,553,5,Yes
3,101,5,Yes
4,859,5,Yes
...,...,...,...
3214,800,5,Yes
3215,2466,5,Yes
3216,1129,5,Yes
3217,3071,5,Yes
