In [1]:
from google.cloud import storage
import os
from PIL import Image
import numpy as np
import pandas as pd
import json

In [2]:
# Cleaned Prompt, Panstar

PERSONA = """<PERSONA>
You are an experienced astrophysicist, and your task is to classify astronomical transients into Real or Bogus based on a given set of 3 images. You have seen thousands of astronomical images during your lifetime and you are very good at making this classification by looking at the images and following the instructions.
</PERSONA>"""

TASK = """<TASK>
Your task is to read the INSTRUCTIONS, look at the 3 images (New, Reference and Difference images) and classify if the source at the centre of the cutout and inside the red circle is a Real or Bogus astronomical transient. Provide your thought process to explain how you reasoned to provide the response. Respond in json format
</TASK>\n
"""

INSTRUCTIONS = """\n<INSTRUCTIONS>
**1. Purpose**
Help vet astronomical data for the Real/Bogus classification. The goal is for you to use your expertise to distinguish between real and bogus sources.

**2. Information Provided**
You will be shown three astronomical image cutouts:
a) **New Image:** The newest image centered at the location of the suspected transient source.
b) **Reference Image:** A reference image from the same telescope of the same part of the sky to be used for comparison. It shows if the source was already there in the past or not.
c) **Difference Image:** The residual image after the new and reference images are subtracted. Real sources should appear in this cutout as circular objects with only positive (white pixels) flux.

**3. Criteria for Classification**
- **Real Source:**
  - **Shape:** Circular shape at the center of the cutout with a visual extent of ~5-10 pixels, varying with focus conditions.
  - **Brightness:** Positive flux source (white pixels) in the New image. Positive flux source in the Difference image.
  - **Presence:** A real source will appear in the New image. This source can either be a supernova on a galaxy or a cataclysmic variable.

- **Bogus Source:**
  - **Shape:** Non-circular shape (e.g., elongated). This includes irregular shapes, positive or negative, like streaks or lines caused by cosmic-rays, diffraction spikes and cross-talk.
  - **Brightness:** Source not present or too faint at the center of the cutout. The source at the center can never be negative in the Difference image.

**4. Additional Guidance**
- **Contextual Information:** Focus on the source at the center of the cutouts inside the red circle, but consider nearby sources to diagnose potential problems.
- **Examples:** Refer to provided visual examples of real and bogus sources to aid in identification.
- **Judgment Criteria:** For ambiguous cases or borderline scenarios, consider the overall context and consistency with known characteristics of real and bogus sources.
</INSTRUCTIONS>"""


METHOD = """<METHOD>
1. **Focus on the Red Circle**: Start by examining the source located at the center of the cutout and inside the red circle. The images are prepared so that the source of interest is clearly marked for you to analyze.

2. **Analyze Each Image Individually**:
   - **New Image**: Check for the presence, shape, and brightness of the source in the New image.
   - **Reference Image**: Compare the source's properties in the Reference image to those in the New image.
   - **Difference Image**: Observe the residuals that result from subtracting the reference image from the New image. Look for patterns (circular, positive flux) that match characteristics of Real sources.

3. **Evaluate Features**:
   - Examine the shape, brightness, and other relevant features (e.g., artifacts, misalignments) of the source in each image.
   - Determine if these features are consistent with a Real or Bogus classification based on the criteria provided in the instructions.

4. **Consider Relationships Between Images**:
   - Compare the New, Reference, and difference images to understand any changes in the source over time.
   - Look for discrepancies or confirmations that might support or contradict a particular classification.

5. **Employ a Chain-of-Thought Reasoning**:
   - Clearly outline each observation you make and explain how it contributes to your decision-making process.
   - If you find any contradictions or ambiguous features, acknowledge them and provide reasoning for your final decision.

6. **Prepare the Final Output in JSON Format**:
   - Format your response as a JSON object containing:
     - The classification ('Real' or 'Bogus').


</METHOD>
"""

"""
7. **Example Output**:
   - Refer to the provided examples to see the expected format and detail level of your response.
"""

# Collapse the System Instructions into a single variable
stat_prompt_Panst = PERSONA + TASK + INSTRUCTIONS + METHOD



PERSONA0 = """<PERSONA>
You are an experienced astrophysicist, and your task is to classify astronomical transients into Real or Bogus based on a given set of 3 images. You have seen thousands of astronomical images during your lifetime and you are very good at making this classification by looking at the images and following the instructions.
</PERSONA>"""

TASK0 = """<TASK>
Your task is to read the INSTRUCTIONS, look at the 3 images (New, Reference and Difference images) and classify if the source at the centre of the cutout and inside the red circle is a Real or Bogus astronomical transient. Provide your thought process to explain how you reasoned to provide the response. Respond in json format
</TASK>\n
"""

INSTRUCTIONS0 = """\n<INSTRUCTIONS>
**1. Purpose**
Help vet astronomical data for the Real/Bogus classification. The goal is for you to use your expertise to distinguish between real and bogus sources. 

**2. Information Provided**
You will be shown three astronomical image cutouts:
a) **New Image:** The newest image centered at the location of the suspected transient source. 
b) **Reference Image:** A reference image from the same telescope of the same part of the sky to be used for comparison. It shows if the source was already there in the past or not.
c) **Difference Image:** The residual image after the new and reference images are subtracted. Real sources should appear in this cutout as circular objects with only positive (white pixels) or only negative (black pixels) flux. 

**3. Criteria for Classification**
- **Real Source:** 
  - **Shape:** Circular shape at the center of the cutout with a visual extent of ~5-10 pixels, varying with focus conditions.
  - **Brightness:** Positive flux (white pixels) in either the new or reference image. Positive or negative flux in the Difference image. 
  - **Variability:** The source at the center can fade or brighten between the new and reference images, appearing as positive or negative in the Difference image.
  - **Presence:** The source may (dis)appear between the new and reference images. A source may also appear on top of an underlying source (e.g., supernova on a galaxy).

- **Bogus Source:** 
  - **Shape:** Non-circular shape (e.g., elongated). This includes irregular shapes, positive or negative, like streaks or lines caused by cosmic-rays, diffraction spikes and cross-talk.
  - **Brightness:** Negative flux (black pixels) at the center of the cutout in either the new or reference image. The source at the center can never be negative in the New or Reference image, only in the Differnece.
  - **Misalignment:** If the source in the New and Reference images is misaligned, it will show a Yin-Yang pattern (both white and black) in the Difference image.

**4. Additional Guidance** 
- **Contextual Information:** Focus on the source at the center of the cutouts inside the red circle, but consider nearby sources to diagnose potential problems.  
- **Examples:** Refer to provided visual examples of real and bogus sources to aid in identification. 
- **Judgment Criteria:** For ambiguous cases or borderline scenarios, consider the overall context and consistency with known characteristics of real and bogus sources.
</INSTRUCTIONS>"""


METHOD0 = """<METHOD>
1. **Focus on the Red Circle**: Start by examining the source located at the center of the cutout and inside the red circle. The images are prepared so that the source of interest is clearly marked for you to analyze.

2. **Analyze Each Image Individually**:
   - **New Image**: Check for the presence, shape, and brightness of the source in the new image.
   - **Reference Image**: Compare the source's properties in the reference image to those in the new image.
   - **Difference Image**: Observe the residuals that result from subtracting the reference image from the new image. Look for patterns (circular, positive/negative flux) that match characteristics of Real or Bogus sources.

3. **Evaluate Features**:
   - Examine the shape, brightness, and other relevant features (e.g., artifacts, misalignments) of the source in each image.
   - Determine if these features are consistent with a Real or Bogus classification based on the criteria provided in the instructions.

4. **Consider Relationships Between Images**:
   - Compare the new, reference, and difference images to understand any changes in the source over time.
   - Look for discrepancies or confirmations that might support or contradict a particular classification.

5. **Employ a Chain-of-Thought Reasoning**:
   - Clearly outline each observation you make and explain how it contributes to your decision-making process.
   - If you find any contradictions or ambiguous features, acknowledge them and provide reasoning for your final decision.

6. **Prepare the Final Output in JSON Format**:
   - Format your response as a JSON object containing:
     - The classification ('Real' or 'Bogus').

</METHOD>
"""

# Collapse the System Instructions into a single variable
stat_prompt_meer = PERSONA0 + TASK0 + INSTRUCTIONS0 + METHOD0

PERSONA1 = """<PERSONA>
You are an experienced astrophysicist, and your task is to classify astronomical transients into Real or Bogus based on a given set of 3 images. You have seen thousands of astronomical images during your lifetime and you are very good at making this classification by looking at the images and following the instructions.
</PERSONA>"""

TASK1 = """<TASK>
Your task is to read the INSTRUCTIONS, look at the 3 images (New, Reference and Difference images) and classify if the source at the centre of the cutout and inside the red circle is a Real or Bogus astronomical transient. Provide your thought process to explain how you reasoned to provide the response. Respond in json format
</TASK>\n
"""

INSTRUCTIONS1 = """\n<INSTRUCTIONS>
**1. Purpose**
Help vet astronomical data for the Real/Bogus classification. The goal is for you to use your expertise to distinguish between real and bogus sources. 

**2. Information Provided**
You will be shown three astronomical image cutouts:
a) **New Image:** The newest image centered at the location of the suspected transient source. 
b) **Reference Image:** A reference image from the same telescope of the same part of the sky to be used for comparison. It shows if the source was already there in the past or not.
c) **Difference Image:** The residual image after the new and reference images are subtracted. Real sources should appear in this cutout as circular objects with only positive (white pixels) flux. 

**3. Criteria for Classification**
- **Real Source:** 
  - **Shape:** Circular shape at the center of the cutout with a visual extent of ~2-6 pixels, varying with focus conditions.
  - **Brightness:** Positive flux source (white pixels) in the New image. Positive flux source in the Difference image. 
  - **Presence:** A real source will appear in the New image. This source can either be a supernova on a galaxy or a cataclysmic variable.

- **Bogus Source:** 
  - **Shape:** Non-circular shape (e.g., elongated). This includes irregular shapes, positive or negative, like streaks or lines caused by cosmic-rays, diffraction spikes and cross-talk.
  - **Brightness:** Source not present or too faint at the center of the cutout. The source at the center can never be negative in the Difference image.

**4. Additional Guidance** 
- **Contextual Information:** Focus on the source at the center of the cutouts inside the red circle, but consider nearby sources to diagnose potential problems.  
- **Examples:** Refer to provided visual examples of real and bogus sources to aid in identification. 
- **Judgment Criteria:** For ambiguous cases or borderline scenarios, consider the overall context and consistency with known characteristics of real and bogus sources.
</INSTRUCTIONS>"""


METHOD1 = """<METHOD>
1. **Focus on the Red Circle**: Start by examining the source located at the center of the cutout and inside the red circle. The images are prepared so that the source of interest is clearly marked for you to analyze.

2. **Analyze Each Image Individually**:
   - **New Image**: Check for the presence, shape, and brightness of the source in the New image.
   - **Reference Image**: Compare the source's properties in the Reference image to those in the New image.
   - **Difference Image**: Observe the residuals that result from subtracting the reference image from the New image. Look for patterns (circular, positive flux) that match characteristics of Real sources.

3. **Evaluate Features**:
   - Examine the shape, brightness, and other relevant features (e.g., artifacts, misalignments) of the source in each image.
   - Determine if these features are consistent with a Real or Bogus classification based on the criteria provided in the instructions.

4. **Consider Relationships Between Images**:
   - Compare the New, Reference, and difference images to understand any changes in the source over time.
   - Look for discrepancies or confirmations that might support or contradict a particular classification.

5. **Employ a Chain-of-Thought Reasoning**:
   - Clearly outline each observation you make and explain how it contributes to your decision-making process.
   - If you find any contradictions or ambiguous features, acknowledge them and provide reasoning for your final decision.

6. **Prepare the Final Output in JSON Format**:
   - Format your response as a JSON object containing:
     - The classification ('Real' or 'Bogus').


</METHOD>
"""

# Collapse the System Instructions into a single variable
stat_prompt_atlas = PERSONA1 + TASK1 + INSTRUCTIONS1 + METHOD1



In [None]:

if not os.path.exists("/home/user/spacehack/myData/"):
    os.mkdir("/home/user/spacehack/myData")

def get_image(s = "PANSTARRS"): # Default Panstar
    
    if not os.path.exists("/home/user/spacehack/myData/"+s+"_Img"):

        if s == "PANSTARRS":
            id = "1yPzvawZSM0J2DVHcphMyiAco_ZLsUUsP"
            gdown.download(id=id, output = "/home/user/spacehack/myData/"+s+"_Img")
        elif s =="MeerLICHT":
            id = "1EZZyK_E99H--7yrTumYnuogrpd8F-KOv"
            gdown.download(id=id, output = "/home/user/spacehack/myData/"+s+"_Img")
        elif s == "ATLAS":
            id = "11fC4Kxlf6QbtlAejYfq_2N_groddSrcz"
            gdown.download(id=id, output = "/home/user/spacehack/myData/"+s+"_Img")
        else:
            return "Valid Choices: \nPANSTARRS \nMeerLICHT \nATLAS"

      


    img = np.load("/home/user/spacehack/myData/"+s+"_Img")
    return img

img_array = get_image("MeerLICHT")
#print(img.shape)
def get_labels(s = "PANSTARRS"):

    if not os.path.exists("/home/user/spacehack/myData/"+s+"_Labels"):

        if s == "PANSTARRS":
            id = "1XZmqO7xbTdqb-sbosfe5NpdAUeKcB20T" 
            gdown.download(id=id, output = "/home/user/spacehack/myData/"+s+"_Labels")
        elif s =="MeerLICHT":
            id = "11qLdAGY-_v8wC4IzE9CFOIywapSFWr7g"
            gdown.download(id=id, output = "/home/user/spacehack/myData/"+s+"_Labels")
        elif s =="ATLAS":
            id = "1GCl0CZkig2f82-69bkN84xrXpsHA7qgM"
            gdown.download(id=id, output = "/home/user/spacehack/myData/"+s+"_Labels")
        else:
            return "Valid Choices: \nPANSTARRS \nMeerLICHT \nATLAS"
    
    Labels = pd.read_csv("/home/user/spacehack/myData/"+s+"_Labels").set_index("index_no")
    return Labels

labels = get_labels("MeerLICHT")

def upload_image_to_gcs(image_array, bucket_name, destination_blob_name):
    """
    Uploads a 2D image array to a specified Google Cloud Storage bucket.

    Parameters:
    - image_array (np.array): 2D array representing the image to upload.
    - bucket_name (str): The name of the GCS bucket.
    - destination_blob_name (str): The destination path in the GCS bucket.

    Returns:
    - gcs_uri (str): The GCS URI of the uploaded file.
    """
    # Convert image array to a PIL image
    image = Image.fromarray(np.uint8(image_array))

    # Save image temporarily
    temp_image_path = "temp_image.jpeg"
    image.save(temp_image_path, format='JPEG')

    # Initialize GCS client and upload the file
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename(temp_image_path)

    # Remove the temporary image file after upload
    os.remove(temp_image_path)

    # Return the GCS URI
    gcs_uri = f"gs://{bucket_name}/{destination_blob_name}"
    return gcs_uri

def generate_json_for_index(index, images, labels, bucket_name, telescope = "PANSTARRS"):
    """
    Generate a JSON structure for the image, prompt, and label at the given index, uploading images to GCS.

    Parameters:
    - index (int): The index corresponding to the image and label.
    - images (np.array): 4D array of images with shape (1998, 100, 100, 3).
    - labels (pd.DataFrame): DataFrame containing the text labels, indexed to match the images.
    - bucket_name (str): The name of the GCS bucket to upload images to.

    Returns:
    - json_structure (dict): The structured JSON data for the specified index.
    """

    # Verify the index is within bounds
    if index < 0 or index >= images.shape[0]:
        raise IndexError(f"Index {index} is out of bounds for the provided images array.")

    # Verify the index exists in the labels DataFrame
    if index not in labels.index:
        raise ValueError(f"Index {index} not found in labels DataFrame.")
    
    # Retrieve each type of image
    new_image = images[index, :, :, 0]
    reference_image = images[index, :, :, 1]
    difference_image = images[index, :, :, 2]

    # Upload images to GCS and get their URIs
    new_image_uri = upload_image_to_gcs(new_image, bucket_name, f"image_{index}_new.jpeg")
    reference_image_uri = upload_image_to_gcs(reference_image, bucket_name, f"image_{index}_reference.jpeg")
    difference_image_uri = upload_image_to_gcs(difference_image, bucket_name, f"image_{index}_difference.jpeg")

    # Get the corresponding label from the DataFrame
    label_text = labels.loc[index, 'label']  

    # Define the prompt text (customize as needed)
    prompt_text = ""
    if telescope == "PANSTARRS":    
        prompt_text = stat_prompt_Panst
    elif telescope == "MeerLICHT":
        prompt_text = stat_prompt_meer
    elif telescope == "ATLAS":
        prompt_text = stat_prompt_atlas
    else:
        return "Valid Choices: \nPANSTARRS \nMeerLICHT \nATLAS"
    # Create the JSON structure expected by Gemini
    json_structure = {
        "contents": [
            {
                "role": "user",
                "parts": [
                    {
                        "text" : "New Image"
                    },
                    {
                        "fileData": {
                            "mimeType": "image/jpeg",
                            "fileUri": new_image_uri,  # GCS URI of the new image
                        }
                    },
                    {
                        "text" : "Reference Image"
                    },
                    {
                        "fileData": {
                            "mimeType": "image/jpeg",
                            "fileUri": reference_image_uri,  # GCS URI of the reference image
                        }
                    },
                    {
                        "text" : "Difference Image"
                    },
                    {
                        "fileData": {
                            "mimeType": "image/jpeg",
                            "fileUri": difference_image_uri,  # GCS URI of the difference image
                        }
                    },
                    {
                        "text": prompt_text
                    }
                ]
            },
            {
                "role": "model",
                "parts": [
                    {
                        "text": f"classification : {label_text}"
                    }
                ]
            }
        ]
    }

    return json_structure

# Example usage:
json_result = generate_json_for_index(0, img_array, labels, 'spacehackimages', telescope = "MeerLICHT")
print(json.dumps(json_result, indent=2))



{
  "contents": [
    {
      "role": "user",
      "parts": [
        {
          "text": "New Image"
        },
        {
          "fileData": {
            "mimeType": "image/jpeg",
            "fileUri": "gs://spacehackimages/image_0_new.jpeg"
          }
        },
        {
          "text": "Reference Image"
        },
        {
          "fileData": {
            "mimeType": "image/jpeg",
            "fileUri": "gs://spacehackimages/image_0_reference.jpeg"
          }
        },
        {
          "text": "Difference Image"
        },
        {
          "fileData": {
            "mimeType": "image/jpeg",
            "fileUri": "gs://spacehackimages/image_0_difference.jpeg"
          }
        },
        {
          "text": "<PERSONA>\nYou are an experienced astrophysicist, and your task is to classify astronomical transients into Real or Bogus based on a given set of 3 images. You have seen thousands of astronomical images during your lifetime and you are very good at making

In [10]:
def getNsamples(n = 200,s = "PANSTARRS"):
    if not os.path.exists("formatted"):
        os.mkdir("formatted")
        data = []
        if s == "PANSTARRS":
            for i in range(n):
                index = np.random.randint(0,1998)
                json_result = generate_json_for_index(index, img_array, labels, 'spacehackimages', telescope = s)
                filename = f"formatted/panstarr_{index}.json"
                with open(filename, 'w') as f:
                    json.dump(json_result, f, indent=4)  # indent for pretty formatting

        elif s =="MeerLICHT":
            for i in range(n):
                index = np.random.randint(0,1998)
                json_result = generate_json_for_index(index, img_array, labels, 'spacehackimages', telescope = s)
                print(json_result)
                data.append(json_result)
                                
        elif s == "ATLAS":
            for i in range(n):
                index = np.random.randint(0,1998)
                json_result = generate_json_for_index(index, img_array, labels, 'spacehackimages', telescope = s)
                filename = f"formatted/panstarr_{index}.json"
                with open(filename, 'w') as f:
                    json.dump(json_result, f, indent=4)
        else:
            return "Valid Choices: \nPANSTARRS \nMeerLICHT \nATLAS"
            
        with open("formatted/MeerLICHT.json", 'w') as f:
            json.dump(data, f, indent=4)



In [4]:
data = []
for i in range(200):
    index = np.random.randint(0,1998)
    json_result = generate_json_for_index(index, img_array, labels, 'spacehackimages', telescope = "MeerLICHT")
    data.append(json_result)
    with open("formatted/MeerLICHT_prompt_with_im_exp.jsonl", 'a') as f:
        json.dump(json_result, f, indent= None)
        if i == 199:
            break
        f.write("\n")

In [9]:
# GS creation

bucket_name = "spacehackimages"
destination_blob_name = "MeerLICHTFineTuning1"
client = storage.Client()
bucket = client.bucket(bucket_name)
blob = bucket.blob(destination_blob_name)
blob.upload_from_filename("formatted/MeerLICHT.jsonl")
gcs_uri = f"gs://{bucket_name}/{destination_blob_name}"


In [5]:

import time

import vertexai
from vertexai.tuning import sft

# TODO(developer): Update and un-comment below line
PROJECT_ID = "turan-genai-bb"
vertexai.init(project=PROJECT_ID, location="us-central1")

sft_tuning_job = sft.train(
    source_model="gemini-1.5-pro-002",
    train_dataset="gs://spacehackimages/MeerLICHT/MeerLICHT_prompt_with_im_exp.jsonl",
    # The following parameters are optional
    epochs=4,
    adapter_size=4,
    learning_rate_multiplier=1.0,
    tuned_model_display_name="tuned_gemini_1_5_pro_fine_tuned_with_modified_prompts",
)

# Polling for job completion
while not sft_tuning_job.has_ended:
    time.sleep(60)
    sft_tuning_job.refresh()

print(sft_tuning_job.tuned_model_name)
print(sft_tuning_job.tuned_model_endpoint_name)
print(sft_tuning_job.experiment)
# Example response:
# projects/123456789012/locations/us-central1/models/1234567890@1
# projects/123456789012/locations/us-central1/endpoints/123456789012345
# <google.cloud.aiplatform.metadata.experiment_resources.Experiment object at 0x7b5b4ae07af0>

Creating SupervisedTuningJob
SupervisedTuningJob created. Resource name: projects/355771430623/locations/us-central1/tuningJobs/6206736919682875392
To use this SupervisedTuningJob in another session:
tuning_job = sft.SupervisedTuningJob('projects/355771430623/locations/us-central1/tuningJobs/6206736919682875392')
View Tuning Job:
https://console.cloud.google.com/vertex-ai/generative/language/locations/us-central1/tuning/tuningJob/6206736919682875392?project=355771430623


projects/355771430623/locations/us-central1/models/6313700331410685952@1
projects/355771430623/locations/us-central1/endpoints/1222957096761294848
<google.cloud.aiplatform.metadata.experiment_resources.Experiment object at 0x799f74d03950>


In [6]:
import pandas as pd

train_samples = pd.read_json("formatted/MeerLICHT_prompt_with_im_exp.jsonl", lines = True)


In [10]:
sample = train_samples["contents"][0]
print(sample)

[{'role': 'user', 'parts': [{'text': 'New Image'}, {'fileData': {'mimeType': 'image/jpeg', 'fileUri': 'gs://spacehackimages/image_899_new.jpeg'}}, {'text': 'Reference Image'}, {'fileData': {'mimeType': 'image/jpeg', 'fileUri': 'gs://spacehackimages/image_899_reference.jpeg'}}, {'text': 'Difference Image'}, {'fileData': {'mimeType': 'image/jpeg', 'fileUri': 'gs://spacehackimages/image_899_difference.jpeg'}}, {'text': "<PERSONA>\nYou are an experienced astrophysicist, and your task is to classify astronomical transients into Real or Bogus based on a given set of 3 images. You have seen thousands of astronomical images during your lifetime and you are very good at making this classification by looking at the images and following the instructions.\n</PERSONA><TASK>\nYour task is to read the INSTRUCTIONS, look at the 3 images (New, Reference and Difference images) and classify if the source at the centre of the cutout and inside the red circle is a Real or Bogus astronomical transient. Prov

In [20]:
sample[0]["parts"][1]["fileData"]["fileUri"].split("_")[1]

'899'

In [21]:
sample_indices_used_in_training = []
for i in range(len(train_samples)):
    sample = train_samples["contents"][i]
    index = sample[0]["parts"][1]["fileData"]["fileUri"].split("_")[1]
    sample_indices_used_in_training.append(index)

print(sample_indices_used_in_training)

['899', '371', '1941', '1095', '1436', '622', '1162', '743', '1371', '1432', '357', '966', '479', '1938', '132', '1865', '128', '881', '1886', '1373', '363', '695', '1214', '914', '1391', '89', '1713', '321', '404', '659', '1849', '1469', '1127', '1897', '631', '998', '411', '946', '1117', '1073', '1820', '272', '1096', '346', '1859', '928', '1044', '1167', '1283', '1757', '858', '1380', '1709', '1571', '498', '1248', '1820', '1316', '697', '1496', '834', '402', '48', '1604', '1154', '269', '1440', '1065', '1169', '797', '452', '138', '1514', '1257', '762', '312', '1283', '613', '1640', '1628', '912', '1780', '13', '871', '1485', '278', '1448', '234', '572', '380', '1717', '127', '60', '1402', '1493', '1735', '1455', '1506', '526', '1402', '1766', '566', '601', '1633', '295', '872', '423', '280', '1843', '781', '1268', '735', '175', '921', '1083', '766', '1331', '461', '1551', '1720', '1655', '342', '1188', '285', '1166', '1356', '352', '770', '219', '1302', '77', '1467', '764', '1531'

In [22]:
np.random.seed(0)
data_test = []
for i in range(200):
    index = np.random.randint(0,1998)
    if index in sample_indices_used_in_training:
        while (index in sample_indices_used_in_training):
            index = np.random.randint(0,1998)
    sample_indices_used_in_training.append(index)
    json_result = generate_json_for_index(index, img_array, labels, 'spacehackimages', telescope = "MeerLICHT")
    data_test.append(json_result)
    with open("formatted/MeerLICHT_Test_with_mod_prompt.jsonl", 'a') as f:
        json.dump(json_result, f, indent= None)
        if i == 199:
            break
        f.write("\n")


In [59]:
tr_set =set(sample_indices_used_in_training[:201])
test_set = set(sample_indices_used_in_training[201:])

print(set.intersection(tr_set, test_set))

set()


In [4]:
from helper_functions import write_request
PROJECT_ID = "turan-genai-bb"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}
EXPERIMENT_NAME = "supernovadetection" # @param {type:"string"}
# Make sure that dataset is created in Big Query
DATASET_ID = "spacehack" # @param {type:"string"}
write_request(name = "MeerLICHT_Test",
    model = "Already Provided",
    inputUri = "gs://spacehackimages/MeerLICHT/MeerLICHT_Test.jsonl",
    outputUri = "bq://" + f'{PROJECT_ID}.{DATASET_ID}.output_MeerLICHT_test')

In [21]:

import json

with open("formatted/MeerLICHT_Test.jsonl", "r") as file:
    for line in file:
        # Parse the JSON object in the current line
        json_obj = json.loads(line)
        real_label = json_obj["contents"][-1]["parts"][0]["text"].split(":")[-1]
        prompt = json_obj["contents"][0]
        
        break


{'role': 'user', 'parts': [{'fileData': {'mimeType': 'image/jpeg', 'fileUri': 'gs://spacehackimages/image_82_new.jpeg'}}, {'fileData': {'mimeType': 'image/jpeg', 'fileUri': 'gs://spacehackimages/image_82_reference.jpeg'}}, {'fileData': {'mimeType': 'image/jpeg', 'fileUri': 'gs://spacehackimages/image_82_difference.jpeg'}}, {'text': "<PERSONA>\nYou are an experienced astrophysicist, and your task is to classify astronomical transients into Real or Bogus based on a given set of 3 images. You have seen thousands of astronomical images during your lifetime and you are very good at making this classification by looking at the images and following the instructions.\n</PERSONA><TASK>\nYour task is to read the INSTRUCTIONS, look at the 3 images (New, Reference and Difference images) and classify if the source at the centre of the cutout and inside the red circle is a Real or Bogus astronomical transient. Provide your thought process to explain how you reasoned to provide the response. Respond 

In [37]:
import json
import base64
import vertexai
from vertexai.generative_models import GenerativeModel, Part, SafetySetting


def generate(prompt):
    vertexai.init(project="355771430623", location="us-central1")
    model = GenerativeModel(
        "gemini-1.5-pro-002",
    )
    responses = model.generate_content(
        [prompt],
        generation_config=generation_config,
        safety_settings=safety_settings,
        stream=True,
    )

    for response in responses:
        print(response.text, end="")


generation_config = {
    "max_output_tokens": 8192,
    "temperature": 1,
    "top_p": 0.95,
}

safety_settings = [
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        threshold=SafetySetting.HarmBlockThreshold.OFF
    ),
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        threshold=SafetySetting.HarmBlockThreshold.OFF
    ),
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        threshold=SafetySetting.HarmBlockThreshold.OFF
    ),
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_HARASSMENT,
        threshold=SafetySetting.HarmBlockThreshold.OFF
    ),
]

with open("formatted/MeerLICHT_Test.jsonl", "r") as file:
    for line in file:
        # Parse the JSON object in the current line
        json_obj = json.loads(line)
        real_label = json_obj["contents"][-1]["parts"][0]["text"].split(":")[-1]
        prompt = json_obj["contents"][0]
        print(real_label)
        print(prompt)
        break

 Bogus
{'role': 'user', 'parts': [{'fileData': {'mimeType': 'image/jpeg', 'fileUri': 'gs://spacehackimages/image_82_new.jpeg'}}, {'fileData': {'mimeType': 'image/jpeg', 'fileUri': 'gs://spacehackimages/image_82_reference.jpeg'}}, {'fileData': {'mimeType': 'image/jpeg', 'fileUri': 'gs://spacehackimages/image_82_difference.jpeg'}}, {'text': "<PERSONA>\nYou are an experienced astrophysicist, and your task is to classify astronomical transients into Real or Bogus based on a given set of 3 images. You have seen thousands of astronomical images during your lifetime and you are very good at making this classification by looking at the images and following the instructions.\n</PERSONA><TASK>\nYour task is to read the INSTRUCTIONS, look at the 3 images (New, Reference and Difference images) and classify if the source at the centre of the cutout and inside the red circle is a Real or Bogus astronomical transient. Provide your thought process to explain how you reasoned to provide the response. R

In [4]:
import json
import base64
import vertexai
from vertexai.generative_models import GenerativeModel, Part, SafetySetting
from google.cloud import storage

# Initialize Vertex AI and Cloud Storage client
vertexai.init(project="355771430623", location="us-central1")
storage_client = storage.Client()

def generate(prompt_parts):  # Modified to accept parts
    model = GenerativeModel(
        "projects/355771430623/locations/us-central1/endpoints/1591900422484787200",
    )

    # Create the prompt with encoded image data
    prompt = []
    for part in prompt_parts:
        print(part)
        if "fileData" in part:
            file_uri = part["fileData"]["fileUri"]
            mime_type = part["fileData"]["mimeType"]

            # Fetch and encode the image data
            bucket_name, blob_name = file_uri[5:].split("/", 1) # Extract bucket and blob name
            bucket = storage_client.bucket(bucket_name)
            blob = bucket.blob(blob_name)
            image_bytes = blob.download_as_bytes()
            encoded_image = base64.b64encode(image_bytes).decode("utf-8")

            prompt.append(Part(text=encoded_image, mime_type=mime_type))

        elif "text" in part:
            prompt.append(Part(text=part["text"]))

    responses = model.generate_content(
        [prompt],
        generation_config=generation_config,
        safety_settings=safety_settings,
        stream=True,
    )

    for response in responses:
        print(response.text, end="")


# ... (rest of your code: generation_config, safety_settings) ...
generation_config = {
    "max_output_tokens": 8192,
    "temperature": 1,
    "top_p": 0.95,
}

safety_settings = [
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        threshold=SafetySetting.HarmBlockThreshold.OFF
    ),
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        threshold=SafetySetting.HarmBlockThreshold.OFF
    ),
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        threshold=SafetySetting.HarmBlockThreshold.OFF
    ),
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_HARASSMENT,
        threshold=SafetySetting.HarmBlockThreshold.OFF
    ),
]

with open("formatted/MeerLICHT_Test.jsonl", "r") as file:
    for line in file:
        json_obj = json.loads(line)
        real_label = json_obj["contents"][-1]["parts"][0]["text"].split(":")[-1]
        prompt_parts = json_obj["contents"][0]["parts"] # Get the parts

        generate(prompt_parts)  # Pass the parts to generate()
        print(real_label)
        break


{'fileData': {'mimeType': 'image/jpeg', 'fileUri': 'gs://spacehackimages/image_82_new.jpeg'}}


TypeError: Part.__init__() got an unexpected keyword argument 'text'

In [1]:
import vertexai
print(vertexai.__version__)

1.70.0


In [42]:
import json
import base64
import vertexai
from vertexai.generative_models import GenerativeModel, Part, SafetySetting
from google.cloud import storage


storage_client = storage.Client()

def get_image_as_base64(gs_uri):
    # Extract bucket name and file name from the gs:// link
    bucket_name, blob_name = gs_uri[5:].split("/", 1)
    
    # Get the bucket and blob (file object)
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(blob_name)
    
    # Download image as bytes
    image_bytes = blob.download_as_bytes()
    
    # Encode the image bytes into base64
    encoded_image = base64.b64encode(image_bytes).decode('utf-8')
    
    return encoded_image

with open("formatted/MeerLICHT_Test_with_mod_prompt.jsonl", "r") as file:
    for line in file:
        json_obj = json.loads(line)
        real_label = json_obj["contents"][-1]["parts"][0]["text"].split(":")[-1]
        

        
        prompt_parts_text = json_obj["contents"][0]["parts"][-1]["text"]
        images = json_obj["contents"][0]["parts"]
        new = images[1]["fileData"]["fileUri"]
        ref = images[3]["fileData"]["fileUri"]
        diff = images[5]["fileData"]["fileUri"]
        
        new_enc = get_image_as_base64(new)
        ref_enc = get_image_as_base64(ref)
        diff_enc = get_image_as_base64(diff)
    

        print(new.split("_")[1], ref.split("_")[1])
        
        break

684 684


In [45]:
import pandas as pd
import base64
import vertexai
from vertexai.generative_models import GenerativeModel, Part, SafetySetting
from time import sleep

storage_client = storage.Client()


def generate(prompt_parts_text, new_enc, ref_enc, diff_enc):
    vertexai.init(project="355771430623", location="us-central1")
    model = GenerativeModel(
        "projects/355771430623/locations/us-central1/endpoints/1222957096761294848",
    )
    responses = model.generate_content(
        [prompt_parts_text, "New Image", new_enc, "Referance Image", ref_enc, "Difference Image", diff_enc],
        generation_config=generation_config,
        safety_settings=safety_settings,
        stream=True,
    )
    classification = ""
    for response in responses:
        classification += response.text
    return classification


generation_config = {
    "max_output_tokens": 8192,
    "temperature": 1,
    "top_p": 0.95,
}

safety_settings = [
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        threshold=SafetySetting.HarmBlockThreshold.OFF
    ),
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        threshold=SafetySetting.HarmBlockThreshold.OFF
    ),
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        threshold=SafetySetting.HarmBlockThreshold.OFF
    ),
    SafetySetting(
        category=SafetySetting.HarmCategory.HARM_CATEGORY_HARASSMENT,
        threshold=SafetySetting.HarmBlockThreshold.OFF
    ),
]
def get_image_as_base64(gs_uri):
    # Extract bucket name and file name from the gs:// link
    bucket_name, blob_name = gs_uri[5:].split("/", 1)
    
    # Get the bucket and blob (file object)
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(blob_name)
    
    # Download image as bytes
    image_bytes = blob.download_as_bytes()
    
    # Encode the image bytes into base64
    encoded_image = base64.b64encode(image_bytes).decode('utf-8')
    
    return encoded_image
predictions = {}
with open("formatted/MeerLICHT_Test_with_mod_prompt.jsonl", "r") as file:
    for line in file:
        json_obj = json.loads(line)
        real_label = json_obj["contents"][-1]["parts"][0]["text"].split(":")[-1]
        
        

        
        prompt_parts_text = json_obj["contents"][0]["parts"][-1]["text"]
        images = json_obj["contents"][0]["parts"]
        new = images[1]["fileData"]["fileUri"]
        ref = images[3]["fileData"]["fileUri"]
        diff = images[5]["fileData"]["fileUri"]
        index = new.split("_")[1]
        
        new_enc = get_image_as_base64(new)
        ref_enc = get_image_as_base64(ref)
        diff_enc = get_image_as_base64(diff)
    
        model_pred = generate(prompt_parts_text, new_enc, ref_enc, diff_enc)
        predictions[index] = (model_pred.split(": ")[-1].split("\n")[0], real_label)
        print(model_pred)
        print(predictions)
        
        sleep(10)
        
        
        

classification : Real


{'684': ('Real', ' Bogus')}
classification : Real


{'684': ('Real', ' Bogus'), '559': ('Real', ' Bogus')}
classification : Bogus


{'684': ('Real', ' Bogus'), '559': ('Real', ' Bogus'), '1653': ('Bogus', ' Real')}
classification : Bogus


{'684': ('Real', ' Bogus'), '559': ('Real', ' Bogus'), '1653': ('Bogus', ' Real'), '1216': ('Bogus', ' Real')}
classification : Bogus


{'684': ('Real', ' Bogus'), '559': ('Real', ' Bogus'), '1653': ('Bogus', ' Real'), '1216': ('Bogus', ' Real'), '835': ('Bogus', ' Bogus')}
classification : Real


{'684': ('Real', ' Bogus'), '559': ('Real', ' Bogus'), '1653': ('Bogus', ' Real'), '1216': ('Bogus', ' Real'), '835': ('Bogus', ' Bogus'), '763': ('Real', ' Bogus')}
classification : Bogus


{'684': ('Real', ' Bogus'), '559': ('Real', ' Bogus'), '1653': ('Bogus', ' Real'), '1216': ('Bogus', ' Real'), '835': ('Bogus', ' Bogus'), '763': ('Real', ' Bogus'), '1731': ('Bogus', ' Real')}
classification : Real


{'684': ('Real', ' Bogus'), 

In [48]:
df.to_csv("results_with_mod_prompt.csv")
print(predictions)

{'684': ('Real', ' Bogus'), '559': ('Real', ' Bogus'), '1653': ('Bogus', ' Real'), '1216': ('Bogus', ' Real'), '835': ('Bogus', ' Bogus'), '763': ('Real', ' Bogus'), '1731': ('Bogus', ' Real'), '1383': ('Real', ' Real'), '1033': ('Real', ' Bogus'), '1747': ('Real', ' Real'), '277': ('Bogus', ' Bogus'), '1778': ('Bogus', ' Real'), '1828': ('Real', ' Real'), '599': ('Bogus', ' Bogus'), '1094': ('Bogus', ' Bogus'), '1496': ('Bogus', ' Real'), '600': ('Real', ' Bogus'), '1420': ('Real', ' Real'), '314': ('Real', ' Bogus'), '705': ('Bogus', ' Bogus'), '1510': ('Bogus', ' Real'), '551': ('Bogus', ' Bogus'), '87': ('Real', ' Bogus'), '174': ('Bogus', ' Bogus'), '1624': ('Bogus', ' Real'), '849': ('Real', ' Bogus'), '1701': ('Bogus', ' Real'), '537': ('Real', ' Bogus'), '845': ('Real', ' Bogus'), '72': ('Real', ' Bogus'), '777': ('Real', ' Bogus'), '1940': ('Bogus', ' Real'), '115': ('Bogus', ' Bogus'), '976': ('Bogus', ' Bogus'), '755': ('Real', ' Bogus'), '1733': ('Bogus', ' Real'), '1871': 

In [52]:
df = pd.DataFrame.from_dict(predictions, orient="index", columns=["Model Pred", "Real Label"])
acc = 0
for i in range(len(df)):
    print(df.iloc[i]["Model Pred"], df.iloc[i]["Real Label"])

    if df.iloc[i]["Model Pred"].strip(" ") == df.iloc[i]["Real Label"].strip(" "):
        print(df.iloc[i]["Model Pred"], df.iloc[i]["Real Label"])
        acc += 1
print(acc / len(df))


Real  Bogus
Real  Bogus
Bogus  Real
Bogus  Real
Bogus  Bogus
Bogus  Bogus
Real  Bogus
Bogus  Real
Real  Real
Real  Real
Real  Bogus
Real  Real
Real  Real
Bogus  Bogus
Bogus  Bogus
Bogus  Real
Real  Real
Real  Real
Bogus  Bogus
Bogus  Bogus
Bogus  Bogus
Bogus  Bogus
Bogus  Real
Real  Bogus
Real  Real
Real  Real
Real  Bogus
Bogus  Bogus
Bogus  Bogus
Bogus  Real
Bogus  Bogus
Bogus  Bogus
Real  Bogus
Bogus  Bogus
Bogus  Bogus
Bogus  Real
Real  Bogus
Bogus  Real
Real  Bogus
Real  Bogus
Real  Bogus
Real  Bogus
Bogus  Real
Bogus  Bogus
Bogus  Bogus
Bogus  Bogus
Bogus  Bogus
Real  Bogus
Bogus  Real
Real  Real
Real  Real
Bogus  Real
Real  Bogus
Bogus  Bogus
Bogus  Bogus
Real  Bogus
Real  Real
Real  Real
Real  Bogus
Bogus  Bogus
Bogus  Bogus
Bogus  Bogus
Bogus  Bogus
Bogus  Bogus
Bogus  Bogus
Bogus  Bogus
Bogus  Bogus
Bogus  Real
Bogus  Real
Bogus  Real
Real  Bogus
Bogus  Bogus
Bogus  Bogus
Bogus  Bogus
Bogus  Bogus
Real  Bogus
Bogus  Bogus
Bogus  Bogus
Bogus  Bogus
Bogus  Bogus
Bogus  Bogus
Bog