In [None]:
# -----------------------------
# Step 1: Upload your service account key file
# -----------------------------
# Prompts for the service-account JSON key and points the Google Cloud client
# libraries at it through the GOOGLE_APPLICATION_CREDENTIALS environment variable.
import os

from google.colab import files

uploaded = files.upload()  # Upload your JSON key file when prompted.

# Fail with an actionable message instead of a bare IndexError when the upload
# produced no files (e.g. the dialog was dismissed).
if not uploaded:
    raise RuntimeError(
        "No file was uploaded. Re-run this cell and select your service account JSON key file."
    )

# Assume the key file is the first file you uploaded.
key_filename = next(iter(uploaded))

# The key is expected under /content; if it is not there (for example because the
# working directory was changed before this cell ran), fall back to resolving the
# uploaded name against the current working directory.
key_path = f'/content/{key_filename}'
if not os.path.isfile(key_path):
    key_path = os.path.abspath(key_filename)

# Set the environment variable so that Google Cloud libraries can authenticate.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = key_path

# -----------------------------
# Step 2: Mount your Google Drive to access CSV and images
# -----------------------------
# Makes the Drive contents available under /content/drive; images_folder below
# points inside this mount, so the mount must succeed before Step 6 runs.
from google.colab import drive
drive.mount('/content/drive')

# -----------------------------
# Step 3: Import necessary libraries and set configuration paths
# -----------------------------
import pandas as pd
import json
from google.cloud import storage

# Paths and bucket settings – adjust as needed.
# NOTE(review): csv_path lives directly under /content, not on the mounted Drive,
# so the CSV has to be placed there separately before running — confirm.
csv_path = '/content/new.csv'  # CSV file path
images_folder = '/content/drive/MyDrive/sidewalk_images_only'  # Folder with your images on Drive
# Destination bucket for uploaded images; also used to build the gs:// URIs.
bucket_name = 'capstone-project-yatish'  # Your GCS bucket name
# Relative path: the JSONL is written into the current working directory.
output_file = 'gemini_finetune_image_captioning.jsonl'  # Output JSONL file name

# -----------------------------
# Step 4: Load your CSV file using header row (header=0)
# -----------------------------
# Reads the annotation CSV and settles which column holds the caption text.
df = pd.read_csv(csv_path, header=0)
print("CSV columns:", df.columns.tolist())

# The upload step reads row['filename'], so fail early with a clear message
# rather than part-way through the upload loop.
if 'filename' not in df.columns:
    raise KeyError(
        f"Required column 'filename' not found in {csv_path}. "
        f"Available columns: {df.columns.tolist()}"
    )

# Verify the expected narrative column exists.
# Keep asking until the entered name is a real column: a single unchecked prompt
# would let a typo through and only surface later as a KeyError while writing
# the JSONL file.
narrative_column = 'narrative 2'
while narrative_column not in df.columns:
    narrative_column = input(
        f"Column '{narrative_column}' not found. Please enter the correct narrative column name: "
    ).strip()

# -----------------------------
# Step 5: Initialize the Google Cloud Storage client and define image upload function
# -----------------------------
# Module-level client shared by upload_image_to_gcs below. It is created with no
# explicit arguments; presumably it authenticates via the
# GOOGLE_APPLICATION_CREDENTIALS variable set in Step 1 — confirm.
storage_client = storage.Client()

def upload_image_to_gcs(local_file_path, bucket_name, destination_blob_name):
    """
    Copy one local file into a GCS bucket and report where it landed.

    Args:
        local_file_path: Path of the file on the local filesystem.
        bucket_name: Name of the destination bucket.
        destination_blob_name: Object name to store the file under.

    Returns:
        The "gs://<bucket>/<object>" URI of the uploaded object.
    """
    # Resolve bucket -> blob handle in one chain using the shared module-level client.
    target_blob = storage_client.bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(local_file_path)

    gcs_uri = f"gs://{bucket_name}/{destination_blob_name}"
    print(f"Uploaded {local_file_path} to {gcs_uri}")
    return gcs_uri

# -----------------------------
# Step 6: Walk the CSV rows and push each referenced image to GCS:
# - A '.txt' filename is mapped to the matching '.jpg' name.
# - The image is looked up inside images_folder.
# - Found images are uploaded; missing ones are logged and get an empty URI.
# -----------------------------
gcs_urls = []
for _, record in df.iterrows():
    # Coerce to str so non-string cells still go through the same path logic.
    image_name = str(record['filename'])

    # Swap a trailing '.txt' for '.jpg'.
    if image_name.endswith('.txt'):
        image_name = image_name[:-len('.txt')] + '.jpg'

    # Location of the image inside the Drive folder.
    local_image_path = os.path.join(images_folder, image_name)

    if os.path.exists(local_image_path):
        # The object name in the bucket is the (possibly renamed) filename.
        gcs_urls.append(upload_image_to_gcs(local_image_path, bucket_name, image_name))
        continue

    # Missing image: report it and keep the row aligned with an empty URI.
    print(f"File not found: {local_image_path}")
    gcs_urls.append("")

# One URI per row, in row order, stored as a new column.
df['image_url'] = gcs_urls

# (Optional) Keep a copy of the CSV including the GCS URLs for reference.
df.to_csv('cleaned_with_urls.csv', index=False)

# -----------------------------
# Step 7: Create the Gemini fine-tuning JSONL dataset with the required structure.
# -----------------------------
# Each JSON record will include a top-level "contents" field containing:
# - A user message with two parts:
#     1. The image file data (GCS URI and MIME type).
#     2. A text prompt.
# - A model message with one part: the expected caption.
#
# Rows that cannot form a valid training example are skipped rather than written:
# - rows whose image was not found/uploaded (empty or missing 'image_url'), which
#   would otherwise produce a record with an empty "fileUri";
# - rows whose narrative is missing or blank, which would otherwise be written as
#   the literal caption "nan" or an empty string.
records_written = 0
records_skipped = 0

with open(output_file, 'w', encoding='utf-8') as f:
    for idx, row in df.iterrows():
        image_uri = row['image_url']
        caption = row[narrative_column]

        if pd.isna(image_uri) or not str(image_uri).strip():
            print(f"Skipping row {idx}: no uploaded image for '{row['filename']}'")
            records_skipped += 1
            continue

        if pd.isna(caption) or not str(caption).strip():
            print(f"Skipping row {idx}: empty value in column '{narrative_column}'")
            records_skipped += 1
            continue

        record = {
            "contents": [
                {
                    "role": "user",
                    "parts": [
                        {
                            "fileData": {
                                "mimeType": "image/jpeg",
                                "fileUri": str(image_uri)
                            }
                        },
                        {
                            "text": "Describe this image in detail."
                        }
                    ]
                },
                {
                    "role": "model",
                    "parts": [
                        {
                            "text": str(caption)
                        }
                    ]
                }
            ]
        }
        # Write each record as a single line in the JSONL file.
        f.write(json.dumps(record, ensure_ascii=False) + "\n")
        records_written += 1

print(f"Fine-tuning dataset created and saved as {output_file}")
print(f"Records written: {records_written}; rows skipped: {records_skipped}")

Saving lofty-tea-453519-v8-553b9cdb5333.json to lofty-tea-453519-v8-553b9cdb5333.json
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
CSV columns: ['filename', 'damage level', 'bounding box', 'narrative 1', 'narrative 2', 'narrative 3']
Uploaded /content/drive/MyDrive/sidewalk_images_only/gsv-cdmx-2125-SurfaceProblem_png.rf.f4a443db328ac740e6dbb271591af250.jpg to gs://capstone-project-yatish/gsv-cdmx-2125-SurfaceProblem_png.rf.f4a443db328ac740e6dbb271591af250.jpg
Uploaded /content/drive/MyDrive/sidewalk_images_only/gsv-cdmx-28678-SurfaceProblem_png.rf.3abe51950026af0de515d542139fdcc2.jpg to gs://capstone-project-yatish/gsv-cdmx-28678-SurfaceProblem_png.rf.3abe51950026af0de515d542139fdcc2.jpg
Uploaded /content/drive/MyDrive/sidewalk_images_only/gsv-cdmx-3371-SurfaceProblem_png.rf.90097b30caefa3433022bcd8e52988b0.jpg to gs://capstone-project-yatish/gsv-cdmx-3371-SurfaceProblem_png.rf.90097b30caefa3433022bcd8

In [None]:
import json

# Location of the JSONL dataset to preview.
file_path = '/content/gemini_finetune_image_captioning.jsonl'

# Pull every line into memory; `lines` stays available for later inspection.
with open(file_path, 'r', encoding='utf-8') as f:
    lines = list(f)

# Pretty-print the leading records so their structure (and any empty
# fileUri values) can be checked by eye.
preview_count = 5
for raw_line in lines[:preview_count]:
    record = json.loads(raw_line)
    print(json.dumps(record, indent=2))

{
  "contents": [
    {
      "role": "user",
      "parts": [
        {
          "fileData": {
            "mimeType": "image/jpeg",
            "fileUri": "gs://capstone-project-yatish/gsv-cdmx-2125-SurfaceProblem_png.rf.f4a443db328ac740e6dbb271591af250.jpg"
          }
        },
        {
          "text": "Describe this image in detail."
        }
      ]
    },
    {
      "role": "model",
      "parts": [
        {
          "text": "There's some damage ahead on the sidewalk in front of you. You can walk over it as you normally would, but stay alert just in case. If you feel uncertain, you may want to veer slightly to your right to avoid it altogether. Enjoy your walk!"
        }
      ]
    }
  ]
}
{
  "contents": [
    {
      "role": "user",
      "parts": [
        {
          "fileData": {
            "mimeType": "image/jpeg",
            "fileUri": "gs://capstone-project-yatish/gsv-cdmx-28678-SurfaceProblem_png.rf.3abe51950026af0de515d542139fdcc2.jpg"
          }
        