In [None]:
import google.generativeai as genai

import os

# Read the Gemini API key from the environment so a real secret never has to be
# pasted into (and saved with) the notebook. Set GEMINI_API_KEY before starting
# the kernel; the literal placeholder is kept only as a backward-compatible
# fallback and will not authenticate.
genai.configure(api_key=os.environ.get("GEMINI_API_KEY", "API-KEY"))

# Model handle used by the generation loop further down.
model = genai.GenerativeModel("gemini-1.5-flash")

In [2]:
import pandas as pd

# Replace with the path to the image-metadata CSV. It is loaded with
# pd.read_csv, and later cells read its `image_id` and `path` columns.
metadata_path = "path to image input directory"
df = pd.read_csv(metadata_path)
# A bare `df.head()` in the middle of a cell is evaluated and discarded (only a
# cell's last expression is displayed), so only the row count is reported here.
print("Total images:", len(df))

# Replace with the directory that the relative `path` values are resolved against.
image_base_path = "path for the small directory in the abo-small dataset"

Total images: 20000


In [3]:
import json

# Load the image_id -> product-metadata mapping.
# The encoding is given explicitly: the listings contain non-ASCII text
# (e.g. typographic apostrophes), and the platform default is not always UTF-8.
with open("path to the JSON file", "r", encoding="utf-8") as f:
    data = json.load(f)

print(f"Total entries: {len(data)}")

# Example: access product data using an image_id.
# next(iter(...)) yields the first key without copying every key into a list.
sample_id = next(iter(data))
print("Sample image_id:", sample_id)
print("Associated product data:", data[sample_id])


Total entries: 398170
Sample image_id: 81iZlv3bjpL
Associated product data: {'brand': [{'language_tag': 'en_GB', 'value': 'find.'}], 'item_id': 'B06X9WQGQP', 'item_name': [{'language_tag': 'en_GB', 'value': 'find. Women’s Ari Heeled Closed-Toe Heels'}], 'model_name': [{'language_tag': 'en_GB', 'value': 'Ari Heeled'}], 'model_year': [{'value': 2017}], 'product_type': [{'value': 'SHOES'}], 'style': [{'language_tag': 'en_GB', 'value': 'Closed-Toe Pumps'}], 'main_image_id': '81iZlv3bjpL', 'other_image_id': ['91mIRxgziUL', '91eqBkW06wL', 'A1BHZSKNbkL'], 'country': 'GB', 'marketplace': 'Amazon', 'domain_name': 'amazon.co.uk', 'node': [{'node_id': 1769851031, 'node_name': "/Categories/Shoes/Women's Shoes/Court Shoes"}]}


In [4]:
# Directory that each row's relative `path` value is joined onto when the
# images are opened in the generation loop.
# NOTE(review): this repeats the assignment made in an earlier cell; keep both in sync.
image_base_path = "path for the small directory in the abo-small dataset"

# Positional slice of rows 7603..10902 of the metadata frame, re-indexed from 0.
# NOTE(review): the bounds are hard-coded; presumably this is one share of the
# full CSV assigned to this notebook — confirm before changing.
subset_df = df.iloc[7603:10903].reset_index(drop=True) # 3300 rows

In [5]:
# Draw a reproducible random sample of 1000 rows (fixed seed 7), re-indexed from 0.
# NOTE(review): the generation cell later rebinds `subset_df` to `sample_df`, so
# re-running this cell after that one samples from the already-sampled frame;
# re-run the slicing cell first.
sample_df = subset_df.sample(n=1000, random_state=7).reset_index(drop=True)

In [6]:
import os
import re
import csv
import time
import json
import pandas as pd
from PIL import Image
from tqdm import tqdm


def clean(text):
    """Normalise a model-generated snippet into one tidy line.

    Markdown asterisks are dropped, line breaks and whitespace runs collapse
    to single spaces, and any non-word characters at either end are trimmed.
    """
    substitutions = (
        (r"\*+", ""),        # markdown emphasis markers
        (r"\n+", " "),       # line breaks become spaces
        (r"\s+", " "),       # squeeze remaining whitespace runs
        (r"^\W+|\W+$", ""),  # trim punctuation/space from both ends
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()


def extract_english_value(entry_list):
    """Return the value of the first English-tagged entry in a metadata list.

    Args:
        entry_list: A list of dicts, each optionally carrying a
            ``language_tag`` (e.g. ``"en_GB"``) and a ``value``.

    Returns:
        The ``value`` of the first entry whose ``language_tag`` starts with
        ``"en_"``, or ``None`` when ``entry_list`` is not a list or holds no
        such entry.
    """
    if not isinstance(entry_list, list):
        return None
    for entry in entry_list:
        # Tolerate malformed rows (plain strings, None, ...) instead of
        # raising AttributeError on `.get`.
        if not isinstance(entry, dict):
            continue
        lang = entry.get("language_tag")
        # The tag may be absent or explicitly null; only strings can be matched.
        if isinstance(lang, str) and lang.startswith("en_"):
            return entry.get("value")
    return None



def get_metadata_string(metadata):
    """Render selected product-metadata fields as ``"Label: value"`` lines.

    Three value shapes are handled, one line per field:

    * list of dicts  -> the first English value (via ``extract_english_value``);
      the field is skipped when there is none.
    * list of strings -> the items joined with ``", "``.
    * dict (e.g. ``item_dimensions``) -> ``"Key: value unit"`` pairs, preferring
      each entry's ``normalized_value`` over its raw ``value``/``unit``.

    Missing or empty fields are skipped.

    Args:
        metadata: Mapping of field name to raw metadata value.

    Returns:
        The rendered lines joined by newlines; ``""`` when nothing is usable.
    """
    fields = [
        "brand", "bullet_point", "color", "fabric_type", "finish_type", "item_dimensions",
        "item_keywords", "item_shape", "item_weight", "material", "model_name",
        "pattern", "product_description", "style"
    ]

    result = []
    for field in fields:
        value = metadata.get(field)
        # Skip None as well as empty containers: indexing value[0] on an empty
        # list would raise IndexError.
        if not value:
            continue

        label = field.replace('_', ' ').title()

        if isinstance(value, list):
            first = value[0]
            if isinstance(first, dict):
                val = extract_english_value(value)
                if val:
                    result.append(f"{label}: {val}")
            elif isinstance(first, str):
                # str() guards against a stray non-string item later in the list.
                result.append(f"{label}: {', '.join(map(str, value))}")

        elif isinstance(value, dict):  # item_dimensions
            dims = []
            for k, v in value.items():
                if not isinstance(v, dict):
                    continue
                norm = v.get("normalized_value")
                if not isinstance(norm, dict):
                    norm = {}
                unit = norm.get("unit", v.get("unit"))
                val = norm.get("value", v.get("value"))
                dims.append(f"{k.title()}: {val} {unit}")
            # Do not emit a dangling "Item Dimensions: " line with nothing after it.
            if dims:
                result.append(f"{label}: {', '.join(dims)}")

    return "\n".join(result)


# Settings
output_csv = "path to the putput file"  # replace with the destination CSV path
pause_every_n_images = 4   # flush results and pause after this many API requests
rate_limit_wait = 60       # seconds to sleep at each pause
# subset_df = sample_df.iloc[369:1000].reset_index(drop=True)
subset_df = sample_df

_CSV_COLUMNS = ["image_id", "question", "answer", "difficulty"]
_QA_RE = re.compile(
    r"Question:\s*(.*?)\s*Answer:\s*(.*?)(?=(?:Question:|$))",
    re.DOTALL,
)
_LABEL_RE = re.compile(r"\b(?:Easy|Medium|Hard)\s*:\s*", flags=re.IGNORECASE)

difficulties = ["Easy", "Medium", "Hard"]


def _flush_rows(rows, path):
    """Append buffered QA rows to the CSV; the header is written only on creation."""
    if not rows:
        return
    pd.DataFrame(rows).to_csv(
        path, mode='a',
        header=not os.path.exists(path),
        index=False, quoting=csv.QUOTE_ALL,
        columns=_CSV_COLUMNS
    )


def _normalize_qa_text(raw_text):
    """Strip markdown asterisks and difficulty labels, then tidy with ``clean``."""
    # Asterisks go first so that labels written as "**Medium**:" are matched too.
    text = re.sub(r"\*+", "", raw_text)
    text = _LABEL_RE.sub("", text)
    return clean(text)


def _parse_qa_rows(image_id, text):
    """Turn a model reply into CSV rows; difficulty follows the pair order."""
    rows = []
    for i, (raw_q, raw_a) in enumerate(_QA_RE.findall(text)):
        q = _normalize_qa_text(raw_q)
        a = _normalize_qa_text(raw_a)
        # Checked after normalisation so a pair that cleans down to nothing is dropped.
        if q and a:
            rows.append({
                "image_id": image_id,
                "question": q,
                "answer": a,
                "difficulty": difficulties[i % 3]
            })
    return rows


# Ensure the CSV is reset at the start
if os.path.exists(output_csv):
    os.remove(output_csv)

vqa_data = []
image_counter = 0    # images that produced at least one parsed QA pair
request_counter = 0  # API requests actually sent; this is what drives the pause

try:
    for idx, row in tqdm(subset_df.iterrows(), total=len(subset_df)):
        image_id = str(row["image_id"])
        image_path = os.path.join(image_base_path, row["path"])

        product_metadata = data.get(image_id)
        if not product_metadata:
            print(f"[SKIP] No metadata found for image_id: {image_id}")
            continue

        english_fields = {
            "item_name": extract_english_value(product_metadata.get("item_name", [])),
            "brand": extract_english_value(product_metadata.get("brand", [])),
            "bullet_point": extract_english_value(product_metadata.get("bullet_point", [])),
            "color": extract_english_value(product_metadata.get("color", [])),
            "style": extract_english_value(product_metadata.get("style", [])),
        }

        if not any(english_fields.values()):
            # Report which languages were present to make the skip diagnosable.
            lang_info = {}
            for field in ["item_name", "brand", "bullet_point", "color", "style"]:
                entries = product_metadata.get(field, [])
                if isinstance(entries, list):
                    langs = [e.get("language_tag", "unknown") for e in entries if isinstance(e, dict)]
                    if langs:
                        lang_info[field] = langs
            print(f"[SKIP] No English entries for image_id: {image_id}. Found language tags: {json.dumps(lang_info)}")
            continue

        metadata_str = get_metadata_string(product_metadata)
        if not metadata_str:
            print(f"[SKIP] Metadata string is empty for image_id: {image_id}")
            continue

        try:
            image = Image.open(image_path)
        except Exception as e:
            print(f"[SKIP] Failed to open image at path: {image_path} (image_id: {image_id}) | Error: {e}")
            continue

        prompt = (
            "You are a Visual QA dataset assistant: for each image, generate exactly three "
            "question-answer pairs, with a single word as the answer — in Easy (basic visual attributes), "
            "Medium (advanced visual features), and Hard (logical inferences) difficulty—covering diverse types, "
            "answerable solely from the image. Format: Question: ..., Answer: ...\n\n"
            f"Product metadata:\n{metadata_str}"
        )

        text = None
        try:
            # `with image` closes the underlying file once the request is done.
            with image:
                response = model.generate_content([image, prompt])
                # Accessing .text can itself raise (e.g. when no text is
                # returned), so it stays inside the guarded region.
                text = response.text
        except Exception as e:
            # One failed request must not abort the run or lose buffered rows.
            print(f"[SKIP] Generation failed for image_id: {image_id} | Error: {e}")
        request_counter += 1

        if text is not None:
            rows = _parse_qa_rows(image_id, text)
            if rows:
                vqa_data.extend(rows)
                image_counter += 1
            else:
                print(f"[SKIP] No valid QA pairs returned for image_id: {image_id}")

        # The pause is keyed on requests sent, not on successes, so failed or
        # empty replies still count towards the rate limit.
        if request_counter % pause_every_n_images == 0:
            _flush_rows(vqa_data, output_csv)
            vqa_data = []
            time.sleep(rate_limit_wait)
finally:
    # Runs on normal completion, on errors, and on KeyboardInterrupt, so rows
    # still in the buffer are always written out.
    _flush_rows(vqa_data, output_csv)


  0%|          | 3/631 [00:03<12:33,  1.20s/it]

[SKIP] No English entries for image_id: 81lYWaNME9L. Found language tags: {"item_name": ["es_ES"], "brand": ["es_ES"], "bullet_point": ["es_ES", "es_ES", "es_ES", "es_ES", "es_ES"]}


  2%|▏         | 11/631 [02:12<2:04:22, 12.04s/it]

[SKIP] No English entries for image_id: 81WwmWOiYZL. Found language tags: {"item_name": ["de_DE"], "brand": ["de_DE"], "bullet_point": ["de_DE", "de_DE", "de_DE"]}


  3%|▎         | 19/631 [04:21<2:54:51, 17.14s/it]

[SKIP] No English entries for image_id: 81TLCuavmjL. Found language tags: {"item_name": ["es_MX"], "brand": ["es_MX"], "bullet_point": ["es_MX", "es_MX", "es_MX", "es_MX", "es_MX"]}


  5%|▌         | 34/631 [07:38<1:32:14,  9.27s/it]

[SKIP] No English entries for image_id: 81f2saaeH0L. Found language tags: {"item_name": ["es_ES", "pt_PT"], "brand": ["es_ES", "pt_PT"], "bullet_point": ["es_ES", "es_ES", "es_ES", "es_ES", "es_ES", "pt_PT", "pt_PT", "pt_PT", "pt_PT", "pt_PT"]}


  7%|▋         | 45/631 [10:51<2:53:46, 17.79s/it]

[SKIP] No English entries for image_id: 81MTAAuGkpL. Found language tags: {"item_name": ["es_ES"], "brand": ["es_ES"], "bullet_point": ["es_ES"], "color": ["es_ES"], "style": ["es_ES"]}


  8%|▊         | 48/631 [10:54<1:17:03,  7.93s/it]

[SKIP] No English entries for image_id: 81nfajljhKL. Found language tags: {"item_name": ["sv_SE"], "brand": ["sv_SE"], "bullet_point": ["sv_SE", "sv_SE", "sv_SE", "sv_SE"], "color": ["sv_SE"]}


  8%|▊         | 53/631 [11:59<1:18:22,  8.14s/it]

[SKIP] No English entries for image_id: 81i5KiO116L. Found language tags: {"item_name": ["nl_NL"], "brand": ["nl_NL"], "bullet_point": ["nl_NL", "nl_NL", "nl_NL", "nl_NL", "nl_NL"]}


 11%|█         | 70/631 [16:18<1:26:07,  9.21s/it]

[SKIP] No English entries for image_id: 81UYKzpBqjL. Found language tags: {"item_name": ["fr_FR"], "brand": ["fr_FR"], "bullet_point": ["fr_FR", "fr_FR", "fr_FR", "fr_FR", "fr_FR", "fr_FR", "fr_FR", "fr_FR"], "style": ["fr_FR"]}
[SKIP] No English entries for image_id: 81gSlLdFSdL. Found language tags: {"item_name": ["ja_JP"], "brand": ["ja_JP"], "color": ["ja_JP"], "style": ["ja_JP"]}
[SKIP] No English entries for image_id: 81ULALRBCVL. Found language tags: {"item_name": ["sv_SE"], "brand": ["sv_SE"], "bullet_point": ["sv_SE", "sv_SE", "sv_SE", "sv_SE", "sv_SE"], "color": ["sv_SE"]}
[SKIP] No English entries for image_id: 81aWA3hS+vL. Found language tags: {"item_name": ["de_DE"], "brand": ["de_DE"], "bullet_point": ["de_DE", "de_DE", "de_DE", "de_DE", "de_DE", "de_DE", "de_DE", "de_DE", "de_DE", "de_DE"]}


 12%|█▏        | 76/631 [17:20<1:29:44,  9.70s/it]

[SKIP] No English entries for image_id: 81DCCQDxXGL. Found language tags: {"item_name": ["tr_TR"], "brand": ["tr_TR"], "bullet_point": ["tr_TR", "tr_TR", "tr_TR", "tr_TR"], "color": ["tr_TR"]}


 13%|█▎        | 81/631 [18:25<2:05:01, 13.64s/it]

[SKIP] No English entries for image_id: 81l9V8UkHbL. Found language tags: {"item_name": ["sv_SE"], "brand": ["sv_SE"], "style": ["sv_SE"]}


 16%|█▌        | 100/631 [22:55<2:01:42, 13.75s/it]


KeyboardInterrupt: 