In [None]:
# Notebook-wide configuration. The trailing `# @param ...` comments look like
# Colab form annotations, so each one is kept on the same line as its value.
PROJECT_ID = "turan-genai-bb"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}
EXPERIMENT_NAME = "supernovadetection" # @param {type:"string"}
# Make sure that this dataset has already been created in BigQuery
DATASET_ID = "spacehack" # @param {type:"string"}
import vertexai
from google.cloud import bigquery


# Initialise the Vertex AI SDK with the project, region and experiment configured above.
vertexai.init(project=PROJECT_ID, location=LOCATION, experiment=EXPERIMENT_NAME)
# BigQuery client bound to the same project; later cells reuse it as `bq_client`.
bq_client = bigquery.Client(project=PROJECT_ID)

In [None]:
import gdown 
import base64
import json
import random, os
import time, datetime
from pathlib import Path

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from vertexai.generative_models import GenerativeModel, Part, FinishReason, Image
from google.cloud import bigquery
import google.cloud.aiplatform as aiplatform

from helper_functions import build_run_batch, if_tbl_exists, create_ex, save_picture, save_prompt, build_experiment_vars

In [None]:
# System-instruction sections. Each constant holds one XML-style tagged block.
# They are concatenated verbatim at the bottom of this cell, so the exact
# leading/trailing whitespace of every literal ends up in the final prompt.
PERSONA = """<PERSONA>
You are an experienced astrophysicist tasked with evaluating the accuracy and coherence of astronomical classifications generated by a previous model. Your expertise ensures reliable judgments on how well the output aligns with the given astronomical images.
</PERSONA>"""

# The "\n" escape plus the real line break leave two newlines after </TASK>.
TASK = """<TASK>
Your task is to assess the coherence between the provided three images (New, Reference, and Difference) and the classification and description generated by a previous model. Additionally, you will verify if the assigned interest score is appropriate based on the description and images.
</TASK>\n
"""

# Defines the two outputs requested from the model: a 0-5 coherence score and
# a Yes/No verdict on the interest score.
INSTRUCTIONS = """<INSTRUCTIONS>
**1. Coherence Evaluation**
- Review the classification and description given by the previous model.
- Judge how well the model’s output matches the observed features in the images.
- Assign a coherence score from 0 to 5:
  - **5** - Perfectly coherent
  - **4** - Almost entirely correct
  - **3** - Mostly correct with some errors
  - **2** - More incorrect than correct
  - **1** - Majority incorrect
  - **0** - Complete hallucination

**2. Interest Score Validation**
- Determine if the interest score given by the model is coherent with the images and description.
- Respond with a simple **Yes** (coherent) or **No** (incoherent).
</INSTRUCTIONS>"""


# Step-by-step procedure the model is asked to follow.
METHOD = """<METHOD>
1. Examine the images and the model’s classification and description.
2. Judge coherence, assign a score (0-5), and note any major discrepancies.
3. Validate if the interest score is consistent with the description and images, responding with Yes or No.
</METHOD>
"""

# Collapse the System Instructions into a single variable
# NOTE(review): PERSONA and INSTRUCTIONS end without a trailing newline, so the
# joined text contains "</PERSONA><TASK>" and "</INSTRUCTIONS><METHOD>" back to
# back - confirm that this is intended.
stat_prompt = PERSONA + TASK + INSTRUCTIONS + METHOD


In [None]:
import pandas as pd
import numpy as np
import os

# Load the MeerLICHT image triplets, their labels and the previous model's
# predictions, then make sure the labels are available as a BigQuery table.

# Single place to repoint the notebook at a different copy of the data.
DATA_DIR = '/home/user/spacehack'

file_path_data = f'{DATA_DIR}/MeerLICHT_dataset/MeerLICHT_images.npy'
file_path_labels_csv = f'{DATA_DIR}/MeerLICHT_dataset/MeerLICHT_labels.csv'
predictions_file = f'{DATA_DIR}/MeerLICHT_predictions.csv'

# Image triplets (New, Reference, Difference)
triplets = np.load(file_path_data)

# Labels and predictions from CSV files
labels_df = pd.read_csv(file_path_labels_csv)
predictions_df = pd.read_csv(predictions_file)

# Check if the labels table already exists in BigQuery, and upload the labels if not.
labels_id = "MeerLICHT_labels_df"
# DatasetReference replaces the deprecated Client.dataset() helper; it resolves
# to the same project/dataset pair the client was created with.
labels_ref = bigquery.DatasetReference(bq_client.project, DATASET_ID).table(labels_id)
# Despite its name, this flag is used as "table already exists": the upload
# below only runs when it is falsy.
create_table_flag = if_tbl_exists(bq_client, labels_ref)

if not create_table_flag:
    load_job = bq_client.load_table_from_dataframe(labels_df, labels_ref)
    # The load runs asynchronously; block until it finishes so failures surface
    # here instead of being silently dropped.
    load_job.result()

# Triplet indexes used as worked examples (paired one-to-one with `descriptions`
# when the few-shot examples are assembled).
sample_indexes = [0, 1, 3, 4, 8, 48, 77, 1179, 1180, 1181, 1191, 1193, 592, 685, 3216]

# Function to save image triplets for given indexes
def save_triplet_images(triplets, index, save_dir="output_images"):
    """Write the three frames of one triplet to ``save_dir`` as ``.npy`` files.

    Args:
        triplets: Indexable collection where ``triplets[index]`` unpacks into
            exactly three arrays, in the order New, Reference, Difference.
        index: Position of the triplet to export; also embedded in the file names.
        save_dir: Output directory; created (with parents) if it does not exist.

    Produces ``new_<index>.npy``, ``reference_<index>.npy`` and
    ``difference_<index>.npy`` inside ``save_dir``.
    """
    os.makedirs(save_dir, exist_ok=True)
    # Unpack up front so a malformed triplet fails before anything is written.
    first, second, third = triplets[index]
    prefixes = ("new", "reference", "difference")
    for prefix, frame in zip(prefixes, (first, second, third)):
        target = os.path.join(save_dir, f"{prefix}_{index}.npy")
        np.save(target, frame)

# Save sample images for visual inspection
# for i in sample_indexes:
#     save_triplet_images(triplets, i)

# Filter out corrupt data entries: keep only triplets that contain no NaN
# anywhere along axes 1-3.
valid_indexes = np.where(~np.isnan(triplets).any(axis=(1, 2, 3)))[0]

# Index the predictions once instead of re-scanning the whole DataFrame with a
# boolean mask for every valid triplet. `setdefault` keeps the first row seen
# for each index_no, matching the previous `.values[0]` behaviour.
prediction_lookup = {}
for index_no, explanation, interest_score in zip(
    predictions_df['index_no'].values,
    predictions_df['explanation'].values,
    predictions_df['interest_score'].values,
):
    prediction_lookup.setdefault(index_no, (explanation, interest_score))

# Prepare data for LLM evaluation: one record per valid triplet that has a prediction.
evaluation_data = []
for idx in valid_indexes:
    prediction = prediction_lookup.get(idx)
    if prediction is None:
        # No prediction was recorded for this triplet; nothing to evaluate.
        continue

    explanation, interest_score = prediction
    new_img, ref_img, diff_img = triplets[idx]

    evaluation_data.append({
        'index_no': idx,
        'new_image': new_img,
        'reference_image': ref_img,
        'difference_image': diff_img,
        'explanation': explanation,
        'interest_score': interest_score
    })

# evaluation_data is now a list of dictionaries, each holding one image triplet
# together with the explanation and interest score predicted for it.


In [None]:
# Reference verdicts in the new task format: a 0-5 coherence score plus a
# Yes/No flag for the interest score. There is one entry per value in
# sample_indexes, in the same order; the trailing comment gives that index.
desc1 = dict(coherence_score=4, interest_score_coherent="Yes")   # index 0
desc2 = dict(coherence_score=5, interest_score_coherent="Yes")   # index 1
desc3 = dict(coherence_score=5, interest_score_coherent="Yes")   # index 3
desc4 = dict(coherence_score=5, interest_score_coherent="Yes")   # index 4
desc5 = dict(coherence_score=5, interest_score_coherent="Yes")   # index 8
desc6 = dict(coherence_score=4, interest_score_coherent="Yes")   # index 48
desc7 = dict(coherence_score=5, interest_score_coherent="Yes")   # index 77
desc8 = dict(coherence_score=5, interest_score_coherent="Yes")   # index 1179
desc9 = dict(coherence_score=5, interest_score_coherent="Yes")   # index 1180
desc10 = dict(coherence_score=5, interest_score_coherent="Yes")  # index 1181
desc11 = dict(coherence_score=5, interest_score_coherent="Yes")  # index 1191
desc12 = dict(coherence_score=5, interest_score_coherent="Yes")  # index 1193
desc13 = dict(coherence_score=4, interest_score_coherent="Yes")  # index 592
desc14 = dict(coherence_score=5, interest_score_coherent="Yes")  # index 685
desc15 = dict(coherence_score=5, interest_score_coherent="Yes")  # index 3216

# Ordered collection consumed when the few-shot examples are assembled.
descriptions = [
    desc1, desc2, desc3, desc4, desc5,
    desc6, desc7, desc8, desc9, desc10,
    desc11, desc12, desc13, desc14, desc15,
]


In [None]:
# Build the few-shot section of the dynamic prompt: one example per entry of
# sample_indexes, each followed by its reference verdict from `descriptions`.
# The two lists are paired by position, so they must be the same length.
assert len(sample_indexes) == len(descriptions), (
    "sample_indexes and descriptions must have the same number of entries"
)

EXAMPLES = ["<EXAMPLES>\n"]
for i, (sample_index, description) in enumerate(zip(sample_indexes, descriptions)):
    # Header text is unchanged: a newline followed by four spaces after the colon.
    str_EX = f"Example {i+1}:\n    "
    # create_ex comes from helper_functions; it is used here as returning a list
    # of prompt parts for the given triplet - TODO confirm what the True flag selects.
    # NOTE(review): `description` is a dict, not a string - confirm that whatever
    # consumes EXAMPLES accepts dict entries.
    all_list = [str_EX, *create_ex(sample_index, True), description]
    EXAMPLES.extend(all_list)
EXAMPLES.append("\n</EXAMPLES>\n")

In [None]:
# Start logging the experiment

## Prepare the variables
timestamp = datetime.datetime.now()
formatted_datetime = timestamp.strftime('%Y%m%d%H%M')

## Log the experiment's variables
### Create the run name with timestamp (minute resolution)
run_name = "run" + formatted_datetime

### Model and sampling hyperparameters for this run
MODEL = "gemini-1.5-pro-002" # @param ["gemini-1.5-pro-002", "gemini-1.5-pro-001", "gemini-1.5-flash-001", "gemini-1.0-pro-002"]
TEMPERATURE = 0.1 # @param {type:"slider", min:0, max:2, step:0.1}
TOP_P = 0.5 # @param {type:"slider", min:0, max:1, step:0.05}

### Free-text note about what changed in this run
RUN_NOTES = "Changed the instructions to give more structure." # @param {type:"string"}
# The hyperparameter lines are generated from the values above so the logged
# description can no longer drift from what is actually logged as parameters.
DESCRIPTION = (
    f"{RUN_NOTES}\n"
    "The hyperparameters are: \n"
    f"Temperature: {TEMPERATURE}\n"
    f"Top P: {TOP_P}\n"
)

# NOTE(review): `example_description` was referenced below but never defined in
# the notebook, which raises NameError on a fresh kernel. It is reconstructed
# here as (label, JSON-encoded verdict) string pairs - confirm that this is the
# intended layout for the saved prompt file.
example_description = [
    (f"Example {position} (sample index {sample_index}):", json.dumps(description))
    for position, (sample_index, description) in enumerate(
        zip(sample_indexes, descriptions), start=1
    )
]
PROMPT_FILE = save_prompt(stat_prompt + '\n'.join([a + "\n" + b + "\n" for (a,b) in example_description]), run_name)

# Build the experimentation variables
exp_vars = build_experiment_vars(description=DESCRIPTION,prompt=PROMPT_FILE, model=MODEL, temperature=TEMPERATURE, top_p=TOP_P)
# Start the run
aiplatform.start_run(run_name)
# Log the experiment variables
aiplatform.log_params(exp_vars)