In [9]:
%pip install -q -U requests pillow tqdm transformers ftfy regex sentence_transformers imagehash opencv-python simple_image_download google-generativeai google-cloud-secret-manager python-dotenv

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
import os
import json
from PIL import Image
import google.generativeai as genai
from tqdm import tqdm
# load .env if present so GEMINI_API_KEY can be set there
from dotenv import load_dotenv
load_dotenv()

# =============================
# 1. CONFIG
# =============================

# Load GEMINI API key from Secret Manager or environment (do NOT store keys directly in the notebook)
def _get_gemini_key(project_id=None, secret_id="GEMINI_API_KEY"):
    """Return the Gemini API key from the first source that provides one.

    Sources are tried in this order:
      1. The ``GEMINI_API_KEY`` environment variable.
      2. Colab Secrets (``google.colab.userdata``), secret name ``GEMINI_API_KEY``.
      3. Google Secret Manager, reading the ``latest`` version of ``secret_id``.

    Args:
        project_id: Google Cloud project that owns the secret. When omitted, the
            ``GOOGLE_CLOUD_PROJECT`` and then ``GCLOUD_PROJECT`` environment
            variables are consulted.
        secret_id: Name of the Secret Manager secret. Only used by step 3.

    Returns:
        The API key as a string.

    Raises:
        RuntimeError: If no source yields a key. The underlying failure from the
            Secret Manager step is attached as ``__cause__``.
    """
    # 1) Environment variable (an empty value counts as "not set").
    key = os.getenv("GEMINI_API_KEY")
    if key:
        return key

    # 2) Colab Secrets (if running in Colab and you saved a secret with name 'GEMINI_API_KEY')
    try:
        from google.colab import userdata
        key = userdata.get("GEMINI_API_KEY")
        if key:
            return key
    except Exception:
        # Deliberate best-effort: outside Colab the import fails, and inside Colab
        # the lookup may fail; either way we fall through to Secret Manager.
        pass

    # 3) Google Secret Manager (requires GOOGLE_CLOUD_PROJECT or explicit project_id)
    try:
        from google.cloud import secretmanager
        project_id = project_id or os.getenv("GOOGLE_CLOUD_PROJECT") or os.getenv("GCLOUD_PROJECT")
        if not project_id:
            raise RuntimeError("No Google project id found for Secret Manager. Set environment variable GOOGLE_CLOUD_PROJECT or pass project_id.")
        client = secretmanager.SecretManagerServiceClient()
        name = f"projects/{project_id}/secrets/{secret_id}/versions/latest"
        response = client.access_secret_version(request={"name": name})
        return response.payload.data.decode("UTF-8")
    except Exception as e:
        # Surface one clear error, chained to the original so the root cause
        # (missing package, missing project id, permission error, ...) stays
        # available to callers and in the traceback.
        raise RuntimeError("Could not obtain GEMINI_API_KEY from env/Colab/Secret Manager: " + str(e)) from e

# Load the Gemini API key from the process environment; load_dotenv() above has
# already been called so a local .env file can supply it.
# This simplified local-run path does not call _get_gemini_key(); use that helper
# instead if the Colab Secrets / Google Secret Manager fallbacks are needed.
# Do NOT hardcode the key in the notebook. Set GEMINI_API_KEY in d:\SE_Data\.env or as an env var.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    # Backslashes are escaped so the Windows path is not parsed as (invalid)
    # escape sequences; the resulting message text is unchanged.
    raise RuntimeError(
        "GEMINI_API_KEY not found. Please add GEMINI_API_KEY=your_key to d:\\SE_Data\\.env or set the environment variable before running."
    )
# Source images, and the training folder that receives the JPEG copies plus metadata.
INPUT_FOLDER = r"D:\SE_Data\dataset_train"
OUTPUT_FOLDER = r"D:\SE_Data\data\ghibli\train"
METADATA_FILE = os.path.join(OUTPUT_FOLDER, "metadata.jsonl")

# Restart behaviour.
FORCE_CLEAR_METADATA = False  # True: truncate metadata.jsonl and start over
RESUME = True  # True: skip images already listed in metadata.jsonl

# Quota handling: on ResourceExhausted, optionally sleep and try again.
AUTO_WAIT_ON_QUOTA = True  # False: stop the run at the first quota error
QUOTA_WAIT_SECONDS = 10 * 60  # pause between quota retries (10 minutes)
QUOTA_MAX_RETRIES = 6  # wait+retry rounds before giving up

# Source files whose stem parses as an integer below START_FROM are skipped
# (useful when the inputs are named "281.jpg", "282.jpg", ...).
START_FROM = 281

# Safety limits for a single notebook run.
MAX_API_CALLS_PER_RUN = 1_000  # successful API calls before stopping
MAX_RUN_SECONDS = 4 * 60 * 60  # wall-clock budget (4 hours)

# Instruction text sent to the model together with every image.
CAPTION_PROMPT = """
You will receive an image. Describe it in a detailed Ghibli-style caption.
Rules:
- Start with: "Ghibli style ..." make sure there are no colons in the sentence.
- Do NOT include any character names, even if recognizable.
- Describe age, gender, facial features, eyes, nose, expression, hairstyle, and clothing.
- Describe posture or action.
- Describe the background environment with cinematic detail (light, mood, atmosphere).
- Use vivid, textured adjectives.
- Make the caption at least 30–45 words.
- Never mention Studio Ghibli or movie titles.
"""

# Authenticate the client and create the model handle used for captioning.
genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel("gemini-2.5-flash")

# =============================
# 2. PREPARE OUTPUT STRUCTURE
# =============================

os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Truncate the metadata file on a forced reset, or create it empty on the first
# run; an existing file is otherwise left untouched.
if FORCE_CLEAR_METADATA or not os.path.exists(METADATA_FILE):
    with open(METADATA_FILE, "w"):
        pass

# =============================
# 3. PROCESS IMAGES (with resume + quota handling)
# =============================

# Candidate inputs: every .jpg/.jpeg/.png in INPUT_FOLDER, in plain string order.
image_files = sorted(
    name for name in os.listdir(INPUT_FOLDER)
    if name.lower().endswith((".jpg", ".jpeg", ".png"))
)

# Collect the filenames recorded in metadata.jsonl so finished images can be skipped.
processed_files = set()
if RESUME and os.path.exists(METADATA_FILE):
    try:
        with open(METADATA_FILE, "r", encoding="utf-8") as metadata_in:
            for raw_line in metadata_in:
                record_text = raw_line.strip()
                if record_text:
                    try:
                        record = json.loads(record_text)
                        # "source_file" (original filename) wins; "file_name" is
                        # the fallback for entries written without it.
                        for key in ("source_file", "file_name"):
                            if key in record:
                                processed_files.add(record[key])
                                break
                    except Exception:
                        # Malformed or unexpected line: ignore it and keep reading.
                        pass
    except Exception as e:
        print("Warning: could not read metadata file to resume:", e)

print("Found", len(image_files), "images. Already processed:", len(processed_files))

# Find the highest "<number>.jpg" already in OUTPUT_FOLDER so that newly written
# files continue the numbering instead of overwriting existing ones.
existing_indices = []
for entry_name in os.listdir(OUTPUT_FOLDER):
    if not entry_name.lower().endswith('.jpg'):
        continue
    stem_text = entry_name[:-4]
    if not stem_text.isdigit():
        continue
    try:
        existing_indices.append(int(stem_text))
    except ValueError:
        # isdigit() accepts some characters int() rejects; skip those names.
        pass
next_index = max(existing_indices, default=0)

# Open metadata for appending and keep writing one line per successful caption.
import time
from google.api_core import exceptions as g_exc

start_time = time.time()
api_call_count = 0


def _request_caption(image_bytes):
    """Send one captioning request for a JPEG payload and return the stripped text.

    Any exception raised by the request or while reading ``response.text``
    propagates to the caller, which owns the retry policy.
    """
    response = model.generate_content(
        [CAPTION_PROMPT, {"mime_type": "image/jpeg", "data": image_bytes}],
        safety_settings={"HARASSMENT": "BLOCK_NONE"},
    )
    return response.text.strip()


def _remove_quietly(path):
    """Delete ``path`` if it exists; never raise (cleanup is best-effort)."""
    try:
        if os.path.exists(path):
            os.remove(path)
    except OSError as cleanup_error:
        print(f"Warning: could not remove {path}: {cleanup_error}")


with open(METADATA_FILE, "a", encoding="utf-8") as meta:

    quota_exhausted = False
    for filename in tqdm(image_files, desc="Processing"):
        # If START_FROM is set and the filename stem is numeric, skip stems < START_FROM.
        stem = os.path.splitext(filename)[0]
        try:
            stem_idx = int(stem)
        except ValueError:
            stem_idx = None
        if START_FROM and stem_idx is not None and stem_idx < START_FROM:
            continue

        # If resuming, skip images already recorded in the metadata file.
        if RESUME and filename in processed_files:
            continue

        # Stop if we've hit runtime or api call limits
        if api_call_count >= MAX_API_CALLS_PER_RUN:
            print(f"Reached MAX_API_CALLS_PER_RUN={MAX_API_CALLS_PER_RUN}. Stopping.")
            break
        if (time.time() - start_time) > MAX_RUN_SECONDS:
            print(f"Reached MAX_RUN_SECONDS={MAX_RUN_SECONDS}. Stopping.")
            break

        input_path = os.path.join(INPUT_FOLDER, filename)
        # The index is only committed to next_index once the metadata row has
        # been written, so skipped or abandoned images do not leave gaps.
        candidate_index = next_index + 1
        output_name = f"{candidate_index}.jpg"
        output_path = os.path.join(OUTPUT_FOLDER, output_name)

        # ---- Load image, convert to RGB, and save a JPEG copy (no resizing) ----
        try:
            with Image.open(input_path) as source_image:
                source_image.convert("RGB").save(output_path, "JPEG", quality=95)
            # Read the saved bytes once; every request attempt reuses them.
            with open(output_path, "rb") as fimg:
                image_bytes = fimg.read()
        except OSError as load_error:
            # One unreadable/corrupt file must not abort a multi-hour run.
            print(f"Skipping {filename}: could not read or convert image ({load_error}).")
            _remove_quietly(output_path)
            continue

        # ---- Ask Gemini for caption with retries/backoff ----
        caption = None
        max_retries = 5
        backoff = 2  # seconds, will grow exponentially
        for attempt in range(1, max_retries + 1):
            try:
                caption = _request_caption(image_bytes)
                api_call_count += 1
                break  # success
            except g_exc.ResourceExhausted:
                # API quota exhausted for the project/account.
                if not AUTO_WAIT_ON_QUOTA:
                    print("ResourceExhausted (quota). Stopping further requests. Processed results are saved.")
                    quota_exhausted = True
                    break

                print("ResourceExhausted (quota). Will wait and retry according to QUOTA_WAIT_SECONDS / QUOTA_MAX_RETRIES.")
                # Stays True unless a waited retry fails for a reason other than quota.
                still_quota_limited = True
                for quota_retry in range(1, QUOTA_MAX_RETRIES + 1):
                    print(f"Waiting {QUOTA_WAIT_SECONDS} seconds before retry attempt {quota_retry}/{QUOTA_MAX_RETRIES}...")
                    time.sleep(QUOTA_WAIT_SECONDS)
                    try:
                        caption = _request_caption(image_bytes)
                        api_call_count += 1
                        print("Succeeded after waiting. Continuing processing.")
                        break
                    except g_exc.ResourceExhausted:
                        print(f"Still ResourceExhausted after wait attempt {quota_retry}.")
                    except Exception as e2:
                        # Other transient error while retrying after wait; fall back to outer retry logic.
                        print(f"Error when retrying after wait: {e2}. Will continue with retry/backoff.")
                        still_quota_limited = False
                        break

                if caption is not None:
                    # Success after waiting: leave the attempt loop so the same
                    # image is not sent (and counted) a second time.
                    break
                if still_quota_limited:
                    print("Quota still exhausted after configured retries. Stopping further processing. Processed results are saved.")
                    quota_exhausted = True
                    break
                # Non-quota failure: fall through to the normal backoff below.
            except g_exc.InternalServerError as e:
                # server side error, retry
                print(f"InternalServerError on attempt {attempt}: {e}. Retrying after {backoff} s...")
            except g_exc.ServiceUnavailable as e:
                print(f"ServiceUnavailable on attempt {attempt}: {e}. Retrying after {backoff} s...")
            except Exception as e:
                # Generic exception (network, timeouts, rate limits). Retry a few times before giving up
                print(f"Error on attempt {attempt}: {e}. Retrying after {backoff} s...")

            # backoff before next retry (unless we're on last attempt)
            if attempt < max_retries:
                time.sleep(backoff)
                backoff *= 2

        if quota_exhausted:
            # This image has no caption and gets no metadata row; remove its JPEG
            # so the training folder and metadata.jsonl stay in sync.
            _remove_quietly(output_path)
            # Stop processing more images; keep everything written so far
            break

        if caption is None:
            # Could not produce a caption after retries. Save an entry indicating failure so we don't keep retrying forever.
            print(f"Failed to generate caption for {filename} after {max_retries} attempts. Saving placeholder and continuing.")
            caption = "[ERROR: failed to generate caption]"

        # ---- Save metadata ----
        # "source_file" records the original filename; the resume logic looks for
        # this key first when deciding which inputs are already done.
        entry = {"file_name": output_name, "source_file": filename, "text": caption}
        meta.write(json.dumps(entry, ensure_ascii=False) + "\n")
        meta.flush()
        try:
            os.fsync(meta.fileno())
        except OSError:
            # fsync may not be available on all systems; ignore if it fails
            pass
        next_index = candidate_index

print("✔ Done (or stopped due to quota). Processed metadata written to:", METADATA_FILE)


Found 352 images. Already processed: 279


Processing: 100%|██████████| 352/352 [03:18<00:00,  1.78it/s]

✔ Done (or stopped due to quota). Processed metadata written to: D:\SE_Data\data\ghibli\train\metadata.jsonl



