In [1]:
import pandas as pd
import json
import glob
from tqdm import tqdm
import os
import google.generativeai as genai
from PIL import Image
import io
import time
import random
import re

  from .autonotebook import tqdm as notebook_tqdm


In [26]:
images_df = pd.read_csv('../dataset/images.csv')
metadata_lookup = {}
json_files = sorted(glob.glob('../dataset/listings/metadata/listings_*.json'))

In [27]:
images_df = images_df.drop(columns=['height','width']) 

In [28]:
images_df

Unnamed: 0,image_id,path
0,010-mllS7JL,14/14fe8812.jpg
1,01dkn0Gyx0L,da/daab0cad.jpg
2,01sUPg0387L,d2/d2daaae9.jpg
3,1168jc-5r1L,3a/3a4e88e6.jpg
4,11RUV5Fs65L,d9/d91ab9cf.jpg
...,...,...
398207,B1zv8OpTkBS,6d/6d49d130.jpg
398208,B1zwflWhPIS,b1/b163e0ea.jpg
398209,C1lf45DhhRS,a1/a116d9d1.jpg
398210,C1pEt6jBLiS,9c/9c3e1158.jpg


In [30]:
def extract_field(data, key, inner_key='value'):
    if isinstance(data.get(key), list) and data[key]:
        first = data[key][0]
        if 'language_tag' in first and not first['language_tag'].startswith('en_'):
            return None
        return first.get(inner_key, None)
    return None

def extract_keywords(data):
    if isinstance(data.get('item_keywords'), list):
        keywords = [
            k['value'].strip().lower()
            for k in data['item_keywords']
            if 'language_tag' not in k or k['language_tag'].startswith('en_')
        ]
        seen = set()
        deduped_keywords = [k for k in keywords if not (k in seen or seen.add(k))]
        return ', '.join(deduped_keywords)
    return None

In [31]:
print("Building metadata lookup from JSON files...")
for file in tqdm(json_files, desc="Parsing listings JSONs"):
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                record = json.loads(line.strip())
                main_id = record.get('main_image_id', None)
                if main_id:
                    name = extract_field(record, 'item_name')
                    category = extract_field(record, 'product_type')
                    # brand = extract_field(record, 'brand')
                    color = extract_field(record, 'color')
                    keywords = extract_keywords(record)
                    metadata_lookup[main_id] = {
                        'name': name,
                        'category': category,
                        # 'brand': brand,
                        'color': color,
                        'keywords': keywords
                    }
            except json.JSONDecodeError:
                continue

Building metadata lookup from JSON files...


Parsing listings JSONs: 100%|██████████| 16/16 [00:12<00:00,  1.26it/s]


In [32]:
def get_metadata(image_id, field):
    data = metadata_lookup.get(image_id)
    if not data:
        return None
    return data.get(field, None)

In [33]:
def get_metadata(image_id, field):
    data = metadata_lookup.get(image_id)
    if not data:
        return None
    return data.get(field, None)

tqdm.pandas(desc="Matching image_ids")
images_df['name'] = images_df['image_id'].progress_apply(lambda x: get_metadata(x, 'name'))
images_df['category'] = images_df['image_id'].progress_apply(lambda x: get_metadata(x, 'category'))
# images_df['brand'] = images_df['image_id'].progress_apply(lambda x: get_metadata(x, 'brand'))
images_df['color'] = images_df['image_id'].progress_apply(lambda x: get_metadata(x, 'color'))
images_df['keywords'] = images_df['image_id'].progress_apply(lambda x: get_metadata(x, 'keywords'))

def is_ascii(text):
    try:
        if pd.isnull(text):
            return False
        text.encode('ascii')
        return True
    except UnicodeEncodeError:
        return False


Matching image_ids: 100%|██████████| 398212/398212 [00:01<00:00, 363904.87it/s]
Matching image_ids: 100%|██████████| 398212/398212 [00:01<00:00, 320664.39it/s]
Matching image_ids: 100%|██████████| 398212/398212 [00:00<00:00, 405558.52it/s]
Matching image_ids: 100%|██████████| 398212/398212 [00:00<00:00, 820864.58it/s]


In [36]:

print("Applying filters...")
images_df.dropna(subset=['name', 'category', 'color'], inplace=True)
for col in ['name', 'category', 'color']:
    images_df = images_df[images_df[col].apply(is_ascii)]

# images_df.to_csv('data/cleaned.csv', index=False)
print(f"Saved final filtered dataset to cleaned.csv")

Applying filters...
Saved final filtered dataset to cleaned.csv


In [38]:
images_df

Unnamed: 0,image_id,path,name,category,color,keywords
656,21QA1aaQcsL,1e/1e00bd6c.jpg,AmazonBasics DisplayPort to HDMI A/M cable - 3...,COMPUTER_ADD_ON,Black,"display port to hdmi, displayport to hdmi, hdm..."
712,21SgHQ5qBvL,77/77398330.jpg,AmazonBasics Translucent Protective TPU Plasti...,PHONE_ACCESSORY,Transparent,iphone x case xs 10 cases cover aquapac phone ...
1016,21hsHbAs2mL,79/799ef10c.jpg,Rhodium Plated Sterling Silver Lever Back Dang...,FINEEARRING,White,"classics with a twist, aniversary, anniversary..."
1167,21oIOJ2iaUL,ac/ac63688e.jpg,Eono by Amazon - Weight vest,SPORTING_GOODS,Viola nero-10kg,
1497,31-JG0PmbgL,dc/dc0da6c0.jpg,"AmazonBasics Microfiber Flat Sheet, 275 x 275 ...",FLAT_SHEET,Dark Grey,size sheets white black kitchen double global ...
...,...,...,...,...,...,...
398120,B1vtcrAQ91S,34/3410e4b6.jpg,"Light Grey Swatch, Ravenna Home",SOFA,Light Grey,
398125,B1w129exaHS,59/59129c2b.jpg,"Rivet Mustard Swatch, Rivet",OTTOMAN,Mustard,"furniture, ethan allen, flamant, ethan allen f..."
398190,B1zG+-uzYGS,18/18d3d6a8.jpg,"Silve Swatch, Stone & Beam",SWATCH,Silve,"farmhouse, and"
398198,B1zaWoxtyUS,e1/e1d35469.jpg,"Stone & Beam Emerald Swatch, Stone & Beam",SOFA,Emerald,"small, keepsake, for, corks, bottle, cork, tac..."


In [None]:
# import os
# import shutil
# import pandas as pd


# def copy_images(
#     df: pd.DataFrame,
#     dest_dir: str,
#     path_column: str = 'path',
#     filename_column: str = 'filename',
#     path_prefix: str = ''
# ) -> pd.DataFrame:

#     # Create destination directory if it doesn't exist
#     os.makedirs(dest_dir, exist_ok=True)

#     # Initialize the filename column
#     df[filename_column] = None

#     for idx, row in df.iterrows():
#         # Build full source path by applying prefix
#         original_path = row[path_column]
#         src_path = os.path.join(path_prefix, original_path) if path_prefix else original_path

#         # Check if source file exists
#         if not os.path.isfile(src_path):
#             print(f"[WARNING] File not found: {src_path}")
#             continue

#         # Extract filename and set destination path
#         filename = os.path.basename(src_path)
#         dest_path = os.path.join(dest_dir, filename)

#         try:
#             # Copy file metadata as well
#             shutil.copy2(src_path, dest_path)
#             # Record filename in DataFrame
#             df.at[idx, filename_column] = filename
#             print(f"[INFO] Copied: {src_path} -> {dest_path}")
#         except Exception as e:
#             print(f"[ERROR] Could not copy {src_path}: {e}")

#     return df

# destination_folder = '..data/images'
# prefix = '../dataset/small'
# images_df = copy_images(
#     images_df,
#     dest_dir=destination_folder,
#     path_column='path',
#     filename_column='filename',
#     path_prefix=prefix
# )



[INFO] Copied: ../dataset/small/1e/1e00bd6c.jpg -> ..data/images/1e00bd6c.jpg
[INFO] Copied: ../dataset/small/77/77398330.jpg -> ..data/images/77398330.jpg
[INFO] Copied: ../dataset/small/79/799ef10c.jpg -> ..data/images/799ef10c.jpg
[INFO] Copied: ../dataset/small/ac/ac63688e.jpg -> ..data/images/ac63688e.jpg
[INFO] Copied: ../dataset/small/dc/dc0da6c0.jpg -> ..data/images/dc0da6c0.jpg
[INFO] Copied: ../dataset/small/49/4962a4a8.jpg -> ..data/images/4962a4a8.jpg
[INFO] Copied: ../dataset/small/6c/6c9462cc.jpg -> ..data/images/6c9462cc.jpg
[INFO] Copied: ../dataset/small/1c/1c84195e.jpg -> ..data/images/1c84195e.jpg
[INFO] Copied: ../dataset/small/b6/b633c0a4.jpg -> ..data/images/b633c0a4.jpg
[INFO] Copied: ../dataset/small/ef/ef8a9c69.jpg -> ..data/images/ef8a9c69.jpg
[INFO] Copied: ../dataset/small/2b/2b2b955a.jpg -> ..data/images/2b2b955a.jpg
[INFO] Copied: ../dataset/small/b9/b9318f56.jpg -> ..data/images/b9318f56.jpg
[INFO] Copied: ../dataset/small/ca/cac7e26c.jpg -> ..data/images

In [43]:
images_df.to_csv("../data/cleaned.csv")

In [75]:
newdf = pd.read_csv("../data/qna_2.csv")

In [76]:
newdf

Unnamed: 0,path,question,answer
0,00/0004bef1.jpg,"Considering the bed's base, what single design...",brackets
1,00/0004bef1.jpg,What textural characteristic best describes th...,smooth
2,00/0004bef1.jpg,"Focusing on the bed's feet, what geometric sha...",rectangular
3,00/0004bef1.jpg,"Observing the overall aesthetic, what single w...",minimalist
4,00/0004bef1.jpg,What is the most prominent visual element sugg...,grain
...,...,...,...
25735,ff/ffe16ab8.jpg,"Analyzing the embellishments near the vamp, wh...",metallic
25736,ff/ffe16ab8.jpg,Considering the subtle textural variations acr...,matte
25737,ff/ffe16ab8.jpg,"Focusing on the insole's visible portion, what...",beige
25738,ff/ffe16ab8.jpg,"Observing the toe-cap's structural design, wha...",rounded


In [77]:
import os

# Method 1: using os.path.basename
newdf['filename'] = newdf['path'].apply(lambda p: os.path.basename(p) if isinstance(p, str) else p)


In [60]:
newdf.rename(columns={'image_path': 'filename'}, inplace=True)


In [78]:
newdf.drop(columns=['path'])

Unnamed: 0,question,answer,filename
0,"Considering the bed's base, what single design...",brackets,0004bef1.jpg
1,What textural characteristic best describes th...,smooth,0004bef1.jpg
2,"Focusing on the bed's feet, what geometric sha...",rectangular,0004bef1.jpg
3,"Observing the overall aesthetic, what single w...",minimalist,0004bef1.jpg
4,What is the most prominent visual element sugg...,grain,0004bef1.jpg
...,...,...,...
25735,"Analyzing the embellishments near the vamp, wh...",metallic,ffe16ab8.jpg
25736,Considering the subtle textural variations acr...,matte,ffe16ab8.jpg
25737,"Focusing on the insole's visible portion, what...",beige,ffe16ab8.jpg
25738,"Observing the toe-cap's structural design, wha...",rounded,ffe16ab8.jpg


In [79]:
newdf.to_csv("../data/qna_2.csv")

In [11]:
# GOOGLE_API_KEY = "AIzaSyAwziGqosd-np08NVMj6gEUylCS0Yc3kyE" #ananthakk26@gmail.com = My First Project (312)
# GOOGLE_API_KEY = "AIzaSyDDpeIrrCT-6uzFmyhIiVIHX3mXRUsTBNQ"  #zebramee1@gmail.com = vr-1 (1397)
# GOOGLE_API_KEY = "AIzaSyCSuPjNooXxuFmfwRMBG9KZH5tJARSqc5Y"  #zebramee1@gmail.com = vr-2 (1450)
# GOOGLE_API_KEY = "AIzaSyA0UhYe9svadSKOd0niUcMV86cCfynDGsI"  #zebramee1@gmail.com = vr-3 (1480)
# GOOGLE_API_KEY = "AIzaSyCB0PmIHRxMQn_y9-L0uBonlEfEJ9ojUwc"  #zebramee1@gmail.com = vr-4 (1470)
# GOOGLE_API_KEY = "AIzaSyDi3ylfG9Fnpbcf5nq8D9L4V1xwrGAymew"  #zebramee1@gmail.com = vr-5 (done)
# GOOGLE_API_KEY = "AIzaSyB7nGzvNvfEAShgxa1X6ywsyO1b7xeq54s"  #zebramee1@gmail.com = vr-6 (1490)
GOOGLE_API_KEY = "AIzaSyBoIpoLyA8VWVGcMvYSEwk740ZEGj01aIE"  #zebramee1@gmail.com = vr-7 (0)
# GOOGLE_API_KEY = "AIzaSyAhMC1sVMVEQJ4EUKLyWAC0y_2k6iTkV5Q"  #zebramee1@gmail.com = vr-8 (0)

In [13]:
genai.configure(api_key=GOOGLE_API_KEY)
generation_config = {
  "temperature": 0.5, # Lower temperature for more factual/constrained answers
  "top_p": 1,
  "top_k": 32,
  "max_output_tokens": 2000, # Sufficient for a Q&A pair
}

model = genai.GenerativeModel(model_name="gemini-1.5-flash",generation_config=generation_config,)
IMAGE_SUBDIRECTORY = "dataset/small" 

In [14]:
# --- Function to generate MULTIPLE Q&A pairs in ONE call ---
# Takes the FULL path to the image now
def generate_multiple_qa_for_image(full_image_path, metadata_without_brand):
    """
    Asks Gemini to generate 5 distinct Q&A pairs for an image in a single API call.

    Args:
        full_image_path (str): The complete path to the image file (including subdirectory).
        metadata_without_brand (dict): Dictionary containing metadata (excluding 'brand').

    Returns:
        list: A list of tuples, where each tuple is (question, answer).
              Returns an empty list if generation or parsing fails.
    """
    generated_pairs = []
    try:
        # Use the full_image_path passed to the function
        img = Image.open(full_image_path)
    except FileNotFoundError:
        # This error message now shows the full path it tried
        print(f"Error: Image file not found at {full_image_path}")
        return generated_pairs
    except Exception as e:
        print(f"Error opening image {full_image_path}: {e}")
        return generated_pairs

    # --- Prompt (remains the same) ---
    prompt_parts = [
        "Context about the image, make sure to also look at the image and analyse it before generating :",
        f"- Name: {metadata_without_brand.get('name', 'N/A')}",
        f"- Category: {metadata_without_brand.get('category', 'N/A')}",
        f"- Main Color Provided: {metadata_without_brand.get('color', 'N/A')}",
        f"- Keywords: {metadata_without_brand.get('keywords', 'N/A')}\n",
        "Instructions:",
        "1. Analyze the provided image and the context.",
        "2. Generate exactly 5 (five) distinct questions about prominent visual features, objects, colors, materials, or attributes clearly visible in the image.",
        "3. Each question MUST have a single-word answer directly verifiable from the image.",
        "4. The 5 questions generated MUST be different from each other, and make sure that the questions are answerable just by looking at the image for exmaple do nt ask questions like brand, if it there in the metadata but not in the image, and make the questions EASY",
        "5. Provide the output strictly in the following numbered format, with each question and answer pair clearly marked. Do not include any other text before or after this numbered list:\n",
        "6. Remember to make the questions easy and answerable just by loking at the image, like color shape etc",
        """1.
Question 1: [Your first question here]
Answer 1: [Your single-word answer here]
2.
Question 2: [Your second question here]
Answer 2: [Your single-word answer here]
3.
Question 3: [Your third question here]
Answer 3: [Your single-word answer here]
4.
Question 4: [Your fourth question here]
Answer 4: [Your single-word answer here]
5.
Question 5: [Your fifth question here]
Answer 5: [Your single-word answer here]""",
        "\nImage:",
        img,
    ]

    try:
        time.sleep(2.55)
        # Pass the full_image_path in the print statement for clarity
        # print(f"  Sending prompt to Gemini for image: {full_image_path}")
        response = model.generate_content(prompt_parts)
        response_text = response.text.strip()
        # print(f"  Received response text (length: {len(response_text)} chars).")

        # --- Parsing the response (remains the same) ---
        pattern = re.compile(
            r"^\s*\d+\.\s*\nQuestion\s*\d*:\s*(.*?)\s*\nAnswer\s*\d*:\s*(\w+)",
            re.MULTILINE | re.IGNORECASE
        )
        matches = pattern.findall(response_text)

        if matches:
            # print(f"  Successfully parsed {len(matches)} Q&A pairs.")
            for q, a in matches:
                question = q.strip().rstrip('?.!')
                answer = a.strip().lower()
                if question and answer:
                    generated_pairs.append((question, answer))
            if len(generated_pairs) < 5:
                 print(f"  Warning: Parsed fewer than 5 Q&A pairs ({len(generated_pairs)}).")
        else:
            print(f"  Warning: Could not parse Q&A pairs using regex. Check response format.")
            print(f"  Response Text was:\n{response_text[:500]}...")

    except Exception as e:
        # Pass the full_image_path in the error message
        print(f"Error during Gemini API call or processing for {full_image_path}: {e}")
        try:
            if response and response.prompt_feedback:
                print(f"Prompt Feedback: {response.prompt_feedback}")
            if response and not response.candidates:
                 print("Warning: Response has no candidates.")
            elif response and response.candidates and not response.candidates[0].content:
                 print("Warning: Received empty response content.")
        except Exception as feedback_err:
            print(f"(Error retrieving feedback: {feedback_err})")

    return generated_pairs



In [15]:
# --- Main Logic ---
all_generated_qa_data = []

# --- Path Validation ---
# Filter using the SUBDIRECTORY
print(f"Validating image paths in '{IMAGE_SUBDIRECTORY}/'...")
original_count = len(images_df)
# Drop rows where path is NaN or None first
images_df_cleaned = images_df.dropna(subset=['path']).copy()
# Check existence by joining the subdirectory with the relative path
valid_images_df = images_df_cleaned[images_df_cleaned['path'].apply(
    lambda relative_path: os.path.exists(os.path.join(IMAGE_SUBDIRECTORY, relative_path))
)].copy() # Ensure it's a copy

print(f"Found {len(valid_images_df)} valid image paths out of {original_count} total rows.")

if len(valid_images_df) == 0:
    print(f"Error: No valid image paths found in '{IMAGE_SUBDIRECTORY}/' based on the 'path' column. Please check the paths and subdirectory name.")
    exit()

Validating image paths in 'dataset/small/'...
Found 76017 valid image paths out of 76017 total rows.


In [120]:
# done till 0,1...2849 in curated.csv 
# done till 0,1,...2958 in non_case.csv

In [16]:
df_to_process = valid_images_df.iloc[0:1]

# --- Loop through the SELECTED SLICE ---
processed_count = 0
for index, row in df_to_process.iterrows(): # Iterate over the slice
    image_id = row['image_id']
    relative_path = str(row['path']) # Ensure path is string

    # --- Construct the FULL path ---
    full_image_path = os.path.join(IMAGE_SUBDIRECTORY, relative_path)

    # Print progress relative to the slice being processed
    print(f"\nProcessing index {index} (Item {processed_count + 1}/{len(df_to_process)}) | Image ID: {image_id} | Path: {full_image_path}...")

    metadata_for_prompt = row.to_dict()
    metadata_for_prompt.pop('path', None)

    # --- Make ONE API call per image using the FULL path ---
    qa_pairs_for_image = generate_multiple_qa_for_image(full_image_path, metadata_for_prompt)

    if qa_pairs_for_image:
        print(f"  -> Adding {len(qa_pairs_for_image)} generated Q&A pairs for {image_id}.")
        # Add each generated pair as a separate entry
        for i, (question, answer) in enumerate(qa_pairs_for_image):
            all_generated_qa_data.append({
                'image_id': image_id,
                'image_path': full_image_path,
                'question': question,
                'answer': answer
            })
    else:
        print(f"  -> Failed to generate/parse any Q&A pairs for {image_id}.")
        break

    processed_count += 1



Processing index 656 (Item 1/1) | Image ID: 21QA1aaQcsL | Path: dataset/small/1e/1e00bd6c.jpg...
  -> Adding 5 generated Q&A pairs for 21QA1aaQcsL.


In [None]:
# --- Processing Finished ---
print(f"\n--- Finished processing all images ---")
print(f"Generated a total of {len(all_generated_qa_data)} Q&A pairs.")

# --- Create the final dataset ---
if all_generated_qa_data:
    qa_results_df = pd.DataFrame(all_generated_qa_data)
    print("\nGenerated Q&A Dataset:")
    with pd.option_context('display.max_rows', 20, 'display.max_colwidth', 80):
        print(qa_results_df)

    try:
        qa_results_df.to_csv("data/qna.csv", index=False)
        print("\nResults saved to qna.csv")
    except Exception as e:
        print(f"\nError saving results to CSV: {e}")
else:
    print("\nNo Q&A pairs were successfully generated.")


--- Finished processing all images ---
Generated a total of 5 Q&A pairs.

Generated Q&A Dataset:
      image_id                     image_path  \
0  21QA1aaQcsL  dataset/small/1e/1e00bd6c.jpg   
1  21QA1aaQcsL  dataset/small/1e/1e00bd6c.jpg   
2  21QA1aaQcsL  dataset/small/1e/1e00bd6c.jpg   
3  21QA1aaQcsL  dataset/small/1e/1e00bd6c.jpg   
4  21QA1aaQcsL  dataset/small/1e/1e00bd6c.jpg   

                                 question       answer  
0     What is the main color of the cable        black  
1           What shape are the connectors  rectangular  
2           How many connectors are there          two  
3   What type of connector is on the left         hdmi  
4  What type of connector is on the right  displayport  

Results saved to qna.csv
