In [1]:
""" Pip Commands """
!pip install google-generativeai

Collecting google-generativeai
  Downloading google_generativeai-0.8.5-py3-none-any.whl.metadata (3.9 kB)
Collecting google-ai-generativelanguage==0.6.15 (from google-generativeai)
  Downloading google_ai_generativelanguage-0.6.15-py3-none-any.whl.metadata (5.7 kB)
Collecting google-api-core (from google-generativeai)
  Downloading google_api_core-2.24.2-py3-none-any.whl.metadata (3.0 kB)
Collecting google-api-python-client (from google-generativeai)
  Downloading google_api_python_client-2.169.0-py3-none-any.whl.metadata (6.7 kB)
Collecting google-auth>=2.15.0 (from google-generativeai)
  Downloading google_auth-2.39.0-py2.py3-none-any.whl.metadata (6.2 kB)
Collecting google-api-core (from google-generativeai)
  Downloading google_api_core-2.25.0rc0-py3-none-any.whl.metadata (3.0 kB)
Collecting proto-plus<2.0.0dev,>=1.22.3 (from google-ai-generativelanguage==0.6.15->google-generativeai)
  Downloading proto_plus-1.26.1-py3-none-any.whl.metadata (2.2 kB)
Collecting protobuf (from google

In [1]:
""" Imports """
import pandas as pd
import random
import os
from PIL import Image
import json
import glob
import google.generativeai as genai
import time

In [6]:
""" Sample API Call """
# Set your Gemini API key
api_key = "AIzaSyCk-3tjAx3U33d0hdXiYmKP-7m0_kHOFqk"
genai.configure(api_key=api_key)

# Load merged CSV
merged_path = "Datasets/filtered_merged_output.csv"
merged_df = pd.read_csv(merged_path)

# Sample a random row
sample = merged_df.sample(1).iloc[0]
image_id = sample["image_id"]
image_dir = "Datasets/small"
image_path = os.path.join(image_dir, sample["path"])
item_id = sample["item_id"]

# Load the image
image = Image.open(image_path)

# Search all JSON files for metadata of the sampled item_id
metadata = None
for json_file in glob.glob("Datasets/listings_*.json"):
    with open(json_file, "r", encoding="utf-8") as f:
        for line in f:
            doc = json.loads(line)
            if doc.get("item_id") == item_id:
                metadata = doc
                break
    if metadata:
        break

if metadata is None:
    raise ValueError(f"Metadata for item_id {item_id} not found.")

# Prepare prompt for question and single-word answer generation
prompt = f"""Based on the provided product image and metadata, generate:
1. A specific question about this product that can be answered with a single word
2. The single word answer to that question

Format your response exactly like this:
Question: [your question here]
Answer: [single word answer]

Metadata: {json.dumps(metadata, indent=2)}"""

# Gemini API call
model = genai.GenerativeModel("gemini-1.5-flash")
response = model.generate_content(
    [prompt, image],
    stream=False
)

# Show response
print(f"Image ID: {image_id}")
print(f"Item ID: {item_id}")
print(f"Image Path: {image_path}")
print("Gemini Response:")
print(response.text)

# Extract question and answer if needed for further processing
response_text = response.text
question = None
answer = None

for line in response_text.split("\n"):
    if line.startswith("Question:"):
        question = line[len("Question:"):].strip()
    elif line.startswith("Answer:"):
        answer = line[len("Answer:"):].strip()

if question and answer:
    print("\nExtracted:")
    print(f"Question: {question}")
    print(f"Answer: {answer}")
    
    # Create a dictionary with the data to save
    data = {
        'Image_ID': [image_id],
        'Item_ID': [item_id],
        'Question': [question],
        'Answer': [answer]
    }
    
    # Convert to DataFrame
    df = pd.DataFrame(data)
    
    # Check if file exists
    file_exists = os.path.isfile('qna.csv')
    
    # Save to CSV (append if file exists, create with header if it doesn't)
    if file_exists:
        df.to_csv('qna.csv', mode='a', header=False, index=False)
        print("Data appended to qna.csv")
    else:
        df.to_csv('qna.csv', index=False)
        print("Created qna.csv with the data")

Image ID: 81mEuveXFVL
Item ID: B07PW6XQY8
Image Path: Datasets/small/5a/5a1fe2c8.jpg
Gemini Response:
Question: What is the bed's finish?
Answer: Wenge

Extracted:
Question: What is the bed's finish?
Answer: Wenge
Created qna.csv with the data


In [2]:
# Set your Gemini API key
api_key = "AIzaSyCk-3tjAx3U33d0hdXiYmKP-7m0_kHOFqk"
genai.configure(api_key=api_key)

# Load merged CSV
merged_path = "Datasets/filtered_merged_output.csv"
merged_df = pd.read_csv(merged_path)

# Sample a random row
sample = merged_df.sample(1).iloc[0]
image_id = sample["image_id"]
item_id = sample["item_id"]

# Search all JSON files for metadata of the sampled item_id
metadata = None
for json_file in glob.glob("Datasets/listings_*.json"):
    with open(json_file, "r", encoding="utf-8") as f:
        for line in f:
            doc = json.loads(line)
            if doc.get("item_id") == item_id:
                metadata = doc
                break
    if metadata:
        break

if metadata is None:
    raise ValueError(f"Metadata for item_id {item_id} not found.")

# Print metadata
print("Metadata:")
print(json.dumps(metadata, indent=2))

# Prepare prompt
prompt = f"""You are given the metadata of a product listed online. Based only on this metadata, generate:
1. A specific question that a user might ask about this product, which can be answered with a single word
2. The single-word answer to that question

Format:
Question: [your question here]
Answer: [single word answer]

Metadata: {json.dumps(metadata, indent=2)}"""

# Gemini API call
model = genai.GenerativeModel("gemini-1.5-flash")
response = model.generate_content(prompt, stream=False)

# Print full API output
print("\nAPI Output:")
print(response.text)

# Extract question and answer
response_text = response.text
question = None
answer = None

for line in response_text.split("\n"):
    if line.startswith("Question:"):
        question = line[len("Question:"):].strip()
    elif line.startswith("Answer:"):
        answer = line[len("Answer:"):].strip()

# Print extracted question and answer
print("\nExtracted:")
print(f"Question: {question}")
print(f"Answer: {answer}")

Metadata:
{
  "item_dimensions": {
    "height": {
      "normalized_value": {
        "unit": "inches",
        "value": 4.5
      },
      "unit": "inches",
      "value": 4.5
    },
    "length": {
      "normalized_value": {
        "unit": "inches",
        "value": 2.375
      },
      "unit": "inches",
      "value": 2.375
    },
    "width": {
      "normalized_value": {
        "unit": "inches",
        "value": 2.375
      },
      "unit": "inches",
      "value": 2.375
    }
  },
  "brand": [
    {
      "language_tag": "en_SG",
      "value": "Amazon Elements"
    }
  ],
  "bullet_point": [
    {
      "language_tag": "en_SG",
    },
    {
      "language_tag": "en_SG",
      "value": "500mg turmeric root extract per serving"
    },
    {
      "language_tag": "en_SG",
      "value": "65 capsules, a 2 month supply (taken daily at listed serving size)"
    },
    {
      "language_tag": "en_SG",
      "value": "No artificial colors, flavors or chemical preservatives"
    },


In [3]:
""" Main API Call with Start Index Support """

# Set your Gemini API key
api_key = "AIzaSyC70OLGkkGuIwHnMzjPTOt2Bd44MHu9tAA"
genai.configure(api_key=api_key)

# Define the starting index (can be changed before running)
start_index = 3000  # Change this value to continue from where you left off

# Load merged CSV
merged_path = "Datasets/filtered_merged_output.csv"
merged_df = pd.read_csv(merged_path)

# Filter to get one image per product, preferring "main" role
unique_products_df = pd.DataFrame()

# Group by item_id
grouped = merged_df.groupby("item_id")

# For each product, get the "main" image or the first one if no main exists
for item_id, group in grouped:
    main_images = group[group["role"] == "main"]
    if len(main_images) > 0:
        # Take the first main image
        unique_products_df = pd.concat([unique_products_df, main_images.iloc[[0]]])
    else:
        # If no main image, take the first image
        unique_products_df = pd.concat([unique_products_df, group.iloc[[0]]])

# Reset index
unique_products_df = unique_products_df.reset_index(drop=True)

# Initialize model
model = genai.GenerativeModel("gemini-1.5-flash")

# Create counter for progress tracking
total_products = len(unique_products_df)
processed_count = 0
error_count = 0
max_api_calls = 1000  # Limit to 1000 API calls

print(f"Found {total_products} unique products")
print(f"Starting processing from index {start_index} up to {max_api_calls} products...")

# Process each row in the filtered DataFrame, starting from the specified index
for index, row in unique_products_df.iloc[start_index:].iterrows():
    # Check if we've reached the API call limit
    if processed_count + error_count >= max_api_calls:
        print(f"\nReached limit of {max_api_calls} API calls.")
        print(f"Stopped at index {index}, Item_ID: {row['item_id']}")
        print(f"To continue processing, set start_index = {index}")
        break
        
    try:
        # Extract image information
        image_id = row["image_id"]
        image_dir = "Datasets/small"
        image_path = os.path.join(image_dir, row["path"])
        item_id = row["item_id"]
        
        # Load the image
        image = Image.open(image_path)
        
        # Search all JSON files for metadata of the item_id
        metadata = None
        for json_file in glob.glob("Datasets/listings_*.json"):
            with open(json_file, "r", encoding="utf-8") as f:
                for line in f:
                    doc = json.loads(line)
                    if doc.get("item_id") == item_id:
                        metadata = doc
                        break
            if metadata:
                break
        
        if metadata is None:
            print(f"Warning: Metadata for item_id {item_id} not found. Skipping.")
            error_count += 1
            continue
        
        # Prepare prompt for question and single-word answer generation
        prompt = f"""Based on the provided product image and metadata, generate:
1. A specific question about this product that can be answered with a single word
2. The single word answer to that question

Format your response exactly like this:
Question: [your question here]
Answer: [single word answer]

Metadata: {json.dumps(metadata, indent=2)}"""
        
        # Gemini API call
        response = model.generate_content(
            [prompt, image],
            stream=False
        )
        
        # Extract question and answer
        response_text = response.text
        question = None
        answer = None
        
        for line in response_text.split("\n"):
            if line.startswith("Question:"):
                question = line[len("Question:"):].strip()
            elif line.startswith("Answer:"):
                answer = line[len("Answer:"):].strip()
        
        if question and answer:
            # Create a dictionary with the data to save
            data = {
                'Image_ID': [image_id],
                'Item_ID': [item_id],
                'Question': [question],
                'Answer': [answer]
            }
            
            # Convert to DataFrame
            df = pd.DataFrame(data)
            
            # Check if file exists
            file_exists = os.path.isfile('qna.csv')
            
            # Save to CSV (append if file exists, create with header if it doesn't)
            if file_exists:
                df.to_csv('qna.csv', mode='a', header=False, index=False)
            else:
                df.to_csv('qna.csv', index=False)
        
        # Update progress
        processed_count += 1
        if processed_count % 10 == 0:
            print(f"Processed {processed_count}/{max_api_calls} products")
        
        # Short summary of the current item
        print(f"Processed: Item_ID: {item_id} | Image_ID: {image_id} | Question: {question} | Answer: {answer}")
        
        # Pause between API calls to avoid rate limits
        time.sleep(1.0)  # 1 second pause between calls
        
    except Exception as e:
        error_count += 1
        print(f"Error processing row {index}: {str(e)}")
        # Continue with the next image even if there's an error
        continue

print(f"\nProcessing complete!")
print(f"Total products processed: {processed_count}")
print(f"Errors: {error_count}")
print(f"Results saved to qna.csv")
print(f"Next start_index to continue: {start_index + processed_count + error_count}")

Found 4923 unique products
Starting processing from index 3000 up to 1000 products...
Processed: Item_ID: B07TH3BQLV | Image_ID: 71w29SzrRFL | Question: Is this case 3D printed? | Answer: Yes
Processed: Item_ID: B07TH3BQM3 | Image_ID: 713qycYoerL | Question: Is this case for a Vivo Y53? | Answer: Yes
Processed: Item_ID: B07TH3BQPL | Image_ID: 71ghdUA-qtL | Question: Is this case 3D printed? | Answer: Yes
Error processing row 3003: [Errno 2] No such file or directory: '/home/saishruti/Research1/VR_Project/Datasets/small/f4/f46657c5.jpg'
Error processing row 3004: [Errno 2] No such file or directory: '/home/saishruti/Research1/VR_Project/Datasets/small/ee/ee8ee952.jpg'
Error processing row 3005: [Errno 2] No such file or directory: '/home/saishruti/Research1/VR_Project/Datasets/small/f7/f7d3b8b5.jpg'
Processed: Item_ID: B07TH3CC4Y | Image_ID: 71rMACNyHtL | Question: Is this case 3D printed? | Answer: Yes
Processed: Item_ID: B07TH3CC5J | Image_ID: 71BbPKf4xIL | Question: Is this case for 

KeyboardInterrupt: 

In [5]:
""" API Keys """
key_1 = "AIzaSyDRYg1bgxG-Yv77yqV5RmJuRFAsplOyLg0"
key_2 = "AIzaSyCk-3tjAx3U33d0hdXiYmKP-7m0_kHOFqk"
key_3 = "AIzaSyC8YBijcN_xFfST2Y4KB3YVR7DNnI-FFz8"

In [6]:
""" API call with refined instruction prompt and filtered json metadata """

import time
import json
import os
import glob
import pandas as pd
from PIL import Image
import google.generativeai as genai

# Set your Gemini API key
api_key = "AIzaSyCk-3tjAx3U33d0hdXiYmKP-7m0_kHOFqk"
genai.configure(api_key=api_key)

# Define the starting index (can be changed before running)
start_index = 2500  # Change this value to continue from where you left off

# Load merged CSV
merged_path = "Datasets/filtered_merged_output.csv"
merged_df = pd.read_csv(merged_path)

# Filter to get one image per product, preferring "main" role
unique_products_df = pd.DataFrame()

# Group by item_id
grouped = merged_df.groupby("item_id")

# For each product, get the "main" image or the first one if no main exists
for item_id, group in grouped:
    main_images = group[group["role"] == "main"]
    if len(main_images) > 0:
        # Take the first main image
        unique_products_df = pd.concat([unique_products_df, main_images.iloc[[0]]])
    else:
        # If no main image, take the first image
        unique_products_df = pd.concat([unique_products_df, group.iloc[[0]]])

# Reset index
unique_products_df = unique_products_df.reset_index(drop=True)

# Initialize model
model = genai.GenerativeModel("gemini-1.5-flash")

# Create counter for progress tracking
total_products = len(unique_products_df)
processed_count = 0
error_count = 0
max_api_calls = 500  # Limit to 1000 API calls

print(f"Found {total_products} unique products")
print(f"Starting processing from index {start_index} up to {max_api_calls} products...")

# Process each row in the filtered DataFrame, starting from the specified index
for index, row in unique_products_df.iloc[start_index:].iterrows():
    # Check if we've reached the API call limit
    if processed_count + error_count >= max_api_calls:
        print(f"\nReached limit of {max_api_calls} API calls.")
        print(f"Stopped at index {index}, Item_ID: {row['item_id']}")
        print(f"To continue processing, set start_index = {index}")
        break
        
    try:
        # Extract image information
        image_id = row["image_id"]
        image_dir = "Datasets/small"
        image_path = os.path.join(image_dir, row["path"])
        item_id = row["item_id"]
        
        # Load the image
        image = Image.open(image_path)
        
        # Search all JSON files for metadata of the item_id
        metadata = None
        for json_file in glob.glob("Datasets/listings_*.json"):
            with open(json_file, "r", encoding="utf-8") as f:
                for line in f:
                    doc = json.loads(line)
                    if doc.get("item_id") == item_id:
                        metadata = doc
                        break
            if metadata:
                break
        
        if metadata is None:
            print(f"Warning: Metadata for item_id {item_id} not found. Skipping.")
            error_count += 1
            continue
        
        # Filter metadata to include only specified fields
        filtered_metadata = {}
        for field in ["product_type", "style", "item_keywords", "bullet_point", "color"]:
            if field in metadata:
                filtered_metadata[field] = metadata[field]
        
        # Prepare prompt for customer question and service rep answer generation
        prompt = f"""Based on the provided product image and metadata, the objective is to generate:
1. A specific question that a customer might ask about this product that can be answered with a single word
2. A concise, professional single-word answer that a customer service representative would give

The question should be realistic - something a typical online shopper might ask about this specific product.
The answer should be professional, helpful, and accurate based on the product details.

Format your response exactly like this:
Question: [your customer question here]
Answer: [single word answer from customer service]

Metadata: {json.dumps(filtered_metadata, indent=2)}"""
        
        # Gemini API call
        response = model.generate_content(
            [prompt, image],
            stream=False
        )
        
        # Extract question and answer
        response_text = response.text
        question = None
        answer = None
        
        for line in response_text.split("\n"):
            if line.startswith("Question:"):
                question = line[len("Question:"):].strip()
            elif line.startswith("Answer:"):
                answer = line[len("Answer:"):].strip()
        
        if question and answer:
            # Create a dictionary with the data to save
            data = {
                'Image_ID': [image_id],
                'Item_ID': [item_id],
                'Question': [question],
                'Answer': [answer]
            }
            
            # Convert to DataFrame
            df = pd.DataFrame(data)
            
            # Check if file exists
            file_exists = os.path.isfile('qna.csv')
            
            # Save to CSV (append if file exists, create with header if it doesn't)
            if file_exists:
                df.to_csv('qna2.csv', mode='a', header=False, index=False)
            else:
                df.to_csv('qna2.csv', index=False)
        
        # Update progress
        processed_count += 1
        if processed_count % 10 == 0:
            print(f"Processed {processed_count}/{max_api_calls} products")
        
        # Short summary of the current item
        print(f"Processed: Item_ID: {item_id} | Image_ID: {image_id}")
        print(f"Q: {question}")
        print(f"A: {answer}")
        print("-" * 50)
        
        # Pause between API calls to avoid rate limits
        time.sleep(1.0)  # 1 second pause between calls
        
    except Exception as e:
        error_count += 1
        print(f"Error processing row {index}: {str(e)}")
        # Continue with the next image even if there's an error
        continue

print(f"\nProcessing complete!")
print(f"Total products processed: {processed_count}")
print(f"Errors: {error_count}")
print(f"Results saved to qna2.csv")
print(f"Next start_index to continue: {start_index + processed_count + error_count}")

Found 4923 unique products
Starting processing from index 2500 up to 500 products...
Error processing row 2500: [Errno 2] No such file or directory: '/home/saishruti/Research1/VR_Project/Datasets/small/ee/ee8ee952.jpg'
Processed: Item_ID: B07TCWSQTQ | Image_ID: 81vp+RIx+SL
Q: Is this case compatible with the LG Stylus 2?
A: Yes
--------------------------------------------------
Processed: Item_ID: B07TCWSTTP | Image_ID: 61EMcRL3paL
Q: Is this case compatible with the Samsung Galaxy J4 Core?
A: Yes
--------------------------------------------------
Processed: Item_ID: B07TCWSWP1 | Image_ID: 81MPdjCfBKL
Q: Is this case compatible with the Samsung Galaxy C7 Pro?
A: Yes
--------------------------------------------------
Processed: Item_ID: B07TCWSWYN | Image_ID: 71L4ynn0A1L
Q: Is this case compatible with the Samsung Galaxy J6 Plus?
A: Yes
--------------------------------------------------
Processed: Item_ID: B07TCWSXHQ | Image_ID: 817-R3GwBJL
Q: Is this case hard or soft?
A: Hard
--------

In [2]:
""" API Keys """
key_1 = "AIzaSyDRYg1bgxG-Yv77yqV5RmJuRFAsplOyLg0"
key_2 = "AIzaSyCk-3tjAx3U33d0hdXiYmKP-7m0_kHOFqk"
key_3 = "AIzaSyC8YBijcN_xFfST2Y4KB3YVR7DNnI-FFz8"

In [10]:
""" API call with refined instruction prompt and complete json metadata """

import time
import json
import os
import glob
import pandas as pd
from PIL import Image
import google.generativeai as genai

# Set your Gemini API key
api_key = key_1
genai.configure(api_key=api_key)

# Define the starting index (can be changed before running)
start_index = 4000  # Change this value to continue from where you left off

# Load merged CSV
merged_path = "Datasets/filtered_merged_output.csv"
merged_df = pd.read_csv(merged_path)

# Filter to get one image per product, preferring "main" role
unique_products_df = pd.DataFrame()

# Group by item_id
grouped = merged_df.groupby("item_id")

# For each product, get the "main" image or the first one if no main exists
for item_id, group in grouped:
    main_images = group[group["role"] == "main"]
    if len(main_images) > 0:
        # Take the first main image
        unique_products_df = pd.concat([unique_products_df, main_images.iloc[[0]]])
    else:
        # If no main image, take the first image
        unique_products_df = pd.concat([unique_products_df, group.iloc[[0]]])

# Reset index
unique_products_df = unique_products_df.reset_index(drop=True)

# Initialize model
model = genai.GenerativeModel("gemini-1.5-flash")

# Create counter for progress tracking
total_products = len(unique_products_df)
processed_count = 0
error_count = 0
max_api_calls = 500  # Limit to 1000 API calls

print(f"Found {total_products} unique products")
print(f"Starting processing from index {start_index} up to {max_api_calls} products...")

# Process each row in the filtered DataFrame, starting from the specified index
for index, row in unique_products_df.iloc[start_index:].iterrows():
    # Check if we've reached the API call limit
    if processed_count + error_count >= max_api_calls:
        print(f"\nReached limit of {max_api_calls} API calls.")
        print(f"Stopped at index {index}, Item_ID: {row['item_id']}")
        print(f"To continue processing, set start_index = {index}")
        break
        
    try:
        # Extract image information
        image_id = row["image_id"]
        image_dir = "Datasets/small"
        image_path = os.path.join(image_dir, row["path"])
        item_id = row["item_id"]
        
        # Load the image
        image = Image.open(image_path)
        
        # Search all JSON files for metadata of the item_id
        metadata = None
        for json_file in glob.glob("Datasets/listings_*.json"):
            with open(json_file, "r", encoding="utf-8") as f:
                for line in f:
                    doc = json.loads(line)
                    if doc.get("item_id") == item_id:
                        metadata = doc
                        break
            if metadata:
                break
        
        if metadata is None:
            print(f"Warning: Metadata for item_id {item_id} not found. Skipping.")
            error_count += 1
            continue
        
        # Using the entire metadata instead of filtering it
        
        # Prepare prompt for customer question and service rep answer generation with improved few-shot examples
        prompt = f"""Based on the provided product image and metadata, your object is to generate:
1. A realistic, specific question that a customer might ask about THIS PARTICULAR product that can be answered with a single word
2. A concise, professional single-word answer that a customer service representative would give

IMPORTANT: 
- Your question MUST be specific to the product shown in the image - consider its category, appearance, and features
- DO NOT generate generic questions or repeat the same type of question for different products
- Vary your questions based on what's most relevant to the specific product category
- Focus on what a real customer would want to know about this specific item

Here are examples of the expected format across DIFFERENT product categories:

Example 1 (Electronics):
Question: Which Android version does this device support?
Answer: Android12

Example 2 (Furniture):
Question: What type of assembly does this table require?
Answer: Flatpack



Format your response exactly like this:
Question: [your customer question here that's specific to this product]
Answer: [single word answer from customer service]

Metadata: {json.dumps(metadata, indent=2)}"""
        
        # Gemini API call
        response = model.generate_content(
            [prompt, image],
            stream=False
        )
        
        # Extract question and answer
        response_text = response.text
        question = None
        answer = None
        
        for line in response_text.split("\n"):
            if line.startswith("Question:"):
                question = line[len("Question:"):].strip()
            elif line.startswith("Answer:"):
                answer = line[len("Answer:"):].strip()
        
        if question and answer:
            # Create a dictionary with the data to save
            data = {
                'Image_ID': [image_id],
                'Item_ID': [item_id],
                'Question': [question],
                'Answer': [answer]
            }
            
            # Convert to DataFrame
            df = pd.DataFrame(data)
            
            # Check if file exists
            file_exists = os.path.isfile('qna3.csv')
            
            # Save to CSV (append if file exists, create with header if it doesn't)
            if file_exists:
                df.to_csv('qna3.csv', mode='a', header=False, index=False)
            else:
                df.to_csv('qna3.csv', index=False)
        
        # Update progress
        processed_count += 1
        if processed_count % 10 == 0:
            print(f"Processed {processed_count}/{max_api_calls} products")
        
        # Short summary of the current item
        print(f"Processed: Item_ID: {item_id} | Image_ID: {image_id}")
        print(f"Q: {question}")
        print(f"A: {answer}")
        print("-" * 50)
        
        # Pause between API calls to avoid rate limits
        time.sleep(1.0)  # 1 second pause between calls
        
    except Exception as e:
        error_count += 1
        print(f"Error processing row {index}: {str(e)}")
        # Continue with the next image even if there's an error
        continue

print(f"\nProcessing complete!")
print(f"Total products processed: {processed_count}")
print(f"Errors: {error_count}")
print(f"Results saved to qna3.csv")
print(f"Next start_index to continue: {start_index + processed_count + error_count}")

Found 4923 unique products
Starting processing from index 4000 up to 500 products...
Processed: Item_ID: B0853WSD6V | Image_ID: 71uILyygfnL
Q: Is this case compatible with the Poco X2?
A: Yes
--------------------------------------------------
Processed: Item_ID: B0853WSD9Z | Image_ID: 717wEPIxQuL
Q: Is this OnePlus 7 case made of hard plastic?
A: Yes
--------------------------------------------------
Processed: Item_ID: B0853WSH1G | Image_ID: 71rlHMYuu6L
Q: Is this case made of silicone?
A: Yes
--------------------------------------------------
Processed: Item_ID: B0853WSSWH | Image_ID: 81oQG6AzhsL
Q: Is this case made of silicone?
A: Yes
--------------------------------------------------
Processed: Item_ID: B0853WT5NG | Image_ID: 714rpinTCIL
Q: Is this case compatible with a Gionee X1?
A: Yes
--------------------------------------------------


KeyboardInterrupt: 

In [8]:
""" API call with refined instruction prompt and complete json metadata using chain-of-thought """

import time
import json
import os
import glob
import pandas as pd
from PIL import Image
import google.generativeai as genai

# Set your Gemini API key
api_key = "AIzaSyC8YBijcN_xFfST2Y4KB3YVR7DNnI-FFz8"
genai.configure(api_key=api_key)

# Define the starting index (can be changed before running)
start_index = 3500  # Change this value to continue from where you left off

# Load merged CSV
merged_path = "Datasets/filtered_merged_output.csv"
merged_df = pd.read_csv(merged_path)

# Filter to get one image per product, preferring "main" role
unique_products_df = pd.DataFrame()

# Group by item_id
grouped = merged_df.groupby("item_id")

# For each product, get the "main" image or the first one if no main exists
for item_id, group in grouped:
    main_images = group[group["role"] == "main"]
    if len(main_images) > 0:
        # Take the first main image
        unique_products_df = pd.concat([unique_products_df, main_images.iloc[[0]]])
    else:
        # If no main image, take the first image
        unique_products_df = pd.concat([unique_products_df, group.iloc[[0]]])

# Reset index
unique_products_df = unique_products_df.reset_index(drop=True)

# Initialize model
model = genai.GenerativeModel("gemini-1.5-flash")

# Create counter for progress tracking
total_products = len(unique_products_df)
processed_count = 0
error_count = 0
max_api_calls = 1000  # Limit to 1000 API calls

print(f"Found {total_products} unique products")
print(f"Starting processing from index {start_index} up to {max_api_calls} products...")

# Process each row in the filtered DataFrame, starting from the specified index
for index, row in unique_products_df.iloc[start_index:].iterrows():
    # Check if we've reached the API call limit
    if processed_count + error_count >= max_api_calls:
        print(f"\nReached limit of {max_api_calls} API calls.")
        print(f"Stopped at index {index}, Item_ID: {row['item_id']}")
        print(f"To continue processing, set start_index = {index}")
        break
        
    try:
        # Extract image information
        image_id = row["image_id"]
        image_dir = "Datasets/small"
        image_path = os.path.join(image_dir, row["path"])
        item_id = row["item_id"]
        
        # Load the image
        image = Image.open(image_path)
        
        # Search all JSON files for metadata of the item_id
        metadata = None
        for json_file in glob.glob("Datasets/listings_*.json"):
            with open(json_file, "r", encoding="utf-8") as f:
                for line in f:
                    doc = json.loads(line)
                    if doc.get("item_id") == item_id:
                        metadata = doc
                        break
            if metadata:
                break
        
        if metadata is None:
            print(f"Warning: Metadata for item_id {item_id} not found. Skipping.")
            error_count += 1
            continue
        
        prompt = f"""Based on the provided product image and metadata, I want you to generate a specific customer question about this product that can be answered with a single descriptive word. The answer must describe an attribute, material, style, or feature of the product. Let's think through this step by step:

Step 1: Analyze what type of product this is.
Look carefully at the image and examine the metadata. What category does this product belong to? What are its key features, materials, and purpose?

Step 2: Consider what aspects customers typically care about for this product category.
Based on the product type identified in Step 1, what are the top 3-4 concerns or questions that customers typically have? Consider aspects like material, style, functionality, brand, origin, or special features.

Step 3: Determine which question would be most relevant and specific to THIS PARTICULAR product, avoiding generic Yes/No questions.
From the aspects identified in Step 2, choose the one that’s most important to a customer considering this specific product. Formulate a clear, specific question that can be answered with a single descriptive word (e.g., "What is the style of shoe?" or "What is the metal of this necklace?"). Avoid any Yes/No questions.

Step 4: Determine the most accurate, single-word descriptive answer based on what's visible in the image and described in the metadata.
Based on the product details available, provide the most precise answer to the question formulated in Step 3. Ensure the answer is a single word and specific to this product (e.g., "Moccasin" for shoe style, "Silver" for necklace material).

Step 5: Verify that your question and answer pair meets the following requirements:
- The question must not be answerable with Yes/No.
- The answer must be a **single word** describing an attribute or feature of the product (e.g., material, style, brand, origin).
- Avoid asking about phone case compatibility with specific models.

Based on this analysis, provide your final output in exactly this format:
Question: [your specific, relevant customer question]
Answer: [single-word descriptive answer]

        Metadata: {json.dumps(metadata, indent=2)}"""
        
        # Gemini API call
        response = model.generate_content(
            [prompt, image],
            stream=False
        )
        
        # Extract just the question and answer from the response
        response_text = response.text
        question = None
        answer = None
        
        # Process the response to extract just the final Q&A
        in_final_output = False
        for line in response_text.split("\n"):
            if "Question:" in line and "Answer:" not in line:  # Ensure we're not capturing "Question:" from the prompt
                question = line[line.find("Question:") + len("Question:"):].strip()
                in_final_output = True
            elif "Answer:" in line and in_final_output:
                answer = line[line.find("Answer:") + len("Answer:"):].strip()
                break
        
        # If direct extraction failed, try a more general approach
        if question is None or answer is None:
            for line in response_text.split("\n"):
                if line.startswith("Question:"):
                    question = line[len("Question:"):].strip()
                elif line.startswith("Answer:"):
                    answer = line[len("Answer:"):].strip()
        
        if question and answer:
            # Create a dictionary with the data to save
            data = {
                'Image_ID': [image_id],
                'Item_ID': [item_id],
                'Question': [question],
                'Answer': [answer]
            }
            
            # Convert to DataFrame
            df = pd.DataFrame(data)
            
            # Check if file exists
            file_exists = os.path.isfile('qna4.csv')
            
            # Save to CSV (append if file exists, create with header if it doesn't)
            if file_exists:
                df.to_csv('qna4.csv', mode='a', header=False, index=False)
            else:
                df.to_csv('qna4.csv', index=False)
        
        # Update progress
        processed_count += 1
        if processed_count % 10 == 0:
            print(f"Processed {processed_count}/{max_api_calls} products")
        
        # Short summary of the current item
        print(f"Processed: Item_ID: {item_id} | Image_ID: {image_id}")
        print(f"Q: {question}")
        print(f"A: {answer}")
        print("-" * 50)
        
        # Pause between API calls to avoid rate limits
        time.sleep(1.0)  # 1 second pause between calls
        
    except Exception as e:
        error_count += 1
        print(f"Error processing row {index}: {str(e)}")
        # Continue with the next image even if there's an error
        continue

print(f"\nProcessing complete!")
print(f"Total products processed: {processed_count}")
print(f"Errors: {error_count}")
print(f"Results saved to qna4.csv")
print(f"Next start_index to continue: {start_index + processed_count + error_count}")

Found 4923 unique products
Starting processing from index 3500 up to 1000 products...
Processed: Item_ID: B07ZKPZ5M9 | Image_ID: 817tVD0fJLL
Q: What is the scent of this Presto! cleaner?
A: Floral
--------------------------------------------------
Processed: Item_ID: B07ZKQ3MPG | Image_ID: 71LHp-xRq5L
Q: What is the scent of this cleaner?
A: Citrus
--------------------------------------------------
Error processing row 3502: [Errno 2] No such file or directory: '/home/saishruti/Research1/VR_Project/Datasets/small/eb/ebc3ed5b.jpg'
Processed: Item_ID: B07ZL1Q3PY | Image_ID: 61e-hSPKcjL
Q: What is the color of the AmazonBasics bungee cord?
A: Green
--------------------------------------------------
Processed: Item_ID: B07ZL1Q56Z | Image_ID: 517h6BqDeKL
Q: What is the color of the ratchet tie-down straps?
A: Yellow
--------------------------------------------------
Processed: Item_ID: B07ZL1ZYN8 | Image_ID: 61rPasB575L
Q: What is the material of these bungee cords?
A: Rubber
--------------

In [7]:
import pandas as pd

# Load the CSV file
file_path = "qna4.csv"
df = pd.read_csv(file_path)

# Remove the last 3 rows
df = df[:-3]

# Save the updated dataframe back to the CSV file
df.to_csv(file_path, index=False)

print("Last 3 lines have been deleted from qna4.csv.")

Last 3 lines have been deleted from qna4.csv.


In [8]:
""" API call with refined instruction prompt and complete json metadata using chain-of-thought """

import time
import json
import os
import glob
import pandas as pd
from PIL import Image
import google.generativeai as genai

# Set your Gemini API key
api_key = "AIzaSyC70OLGkkGuIwHnMzjPTOt2Bd44MHu9tAA"
genai.configure(api_key=api_key)

# Define the starting index (can be changed before running)
start_index = 0  # Change this value to continue from where you left off

# Load merged CSV
merged_path = "Datasets/filtered_merged_output.csv"
merged_df = pd.read_csv(merged_path)

# Filter to get one image per product, preferring "main" role
unique_products_df = pd.DataFrame()

# Group by item_id
grouped = merged_df.groupby("item_id")

# For each product, get the "main" image or the first one if no main exists
for item_id, group in grouped:
    main_images = group[group["role"] == "main"]
    if len(main_images) > 0:
        # Take the first main image
        unique_products_df = pd.concat([unique_products_df, main_images.iloc[[0]]])
    else:
        # If no main image, take the first image
        unique_products_df = pd.concat([unique_products_df, group.iloc[[0]]])

# Reset index
unique_products_df = unique_products_df.reset_index(drop=True)

# Initialize model
model = genai.GenerativeModel("gemini-1.5-flash")

# Create counter for progress tracking
total_products = len(unique_products_df)
processed_count = 0
error_count = 0
max_api_calls = 1000  # Limit to 1000 API calls

print(f"Found {total_products} unique products")
print(f"Starting processing from index {start_index} up to {max_api_calls} products...")

# Process each row in the filtered DataFrame, starting from the specified index
for index, row in unique_products_df.iloc[start_index:].iterrows():
    # Check if we've reached the API call limit
    if processed_count + error_count >= max_api_calls:
        print(f"\nReached limit of {max_api_calls} API calls.")
        print(f"Stopped at index {index}, Item_ID: {row['item_id']}")
        print(f"To continue processing, set start_index = {index}")
        break
        
    try:
        # Extract image information
        image_id = row["image_id"]
        image_dir = "Datasets/small"
        image_path = os.path.join(image_dir, row["path"])
        item_id = row["item_id"]
        
        # Load the image
        image = Image.open(image_path)
        
        # Search all JSON files for metadata of the item_id
        metadata = None
        for json_file in glob.glob("Datasets/listings_*.json"):
            with open(json_file, "r", encoding="utf-8") as f:
                for line in f:
                    doc = json.loads(line)
                    if doc.get("item_id") == item_id:
                        metadata = doc
                        break
            if metadata:
                break
        
        if metadata is None:
            print(f"Warning: Metadata for item_id {item_id} not found. Skipping.")
            error_count += 1
            continue
        
        # Chain of thought prompting
        prompt = f"""Based on the provided product image and metadata, I want you to generate a specific customer question about this product that can be answered with a single word, along with the appropriate answer. Let's think through this step by step:

Step 1: Analyze what type of product this is.
Look carefully at the image and examine the metadata. What category does this product belong to? What are its key features, materials, and purpose?

Step 2: Consider what aspects customers typically care about for this product category.
Based on the product type identified in Step 1, what are the top 3-4 concerns or questions that customers typically have? Consider aspects like material, functionality, compatibility, care instructions, or special features.

Step 3: Determine which question would be most relevant and specific to THIS PARTICULAR product.
From the aspects identified in Step 2, which one would be most important to a customer considering this specific product? Formulate a clear, specific question that can be answered with a single word (typically Yes or No).

Step 4: Determine the most accurate answer based on what's visible in the image and described in the metadata.
Based on the product details available, what is the correct single-word answer to the question formulated in Step 3?

Step 5: Verify that your question is NOT about phone case compatibility with specific models.

Based on this analysis, provide your final output in exactly this format:
Question: [your specific, relevant customer question]
Answer: [single-word answer]

Metadata: {json.dumps(metadata, indent=2)}"""
        
        # Gemini API call
        response = model.generate_content(
            [prompt, image],
            stream=False
        )
        
        # Extract just the question and answer from the response
        response_text = response.text
        question = None
        answer = None
        
        # Process the response to extract just the final Q&A
        in_final_output = False
        for line in response_text.split("\n"):
            if "Question:" in line and "Answer:" not in line:  # Ensure we're not capturing "Question:" from the prompt
                question = line[line.find("Question:") + len("Question:"):].strip()
                in_final_output = True
            elif "Answer:" in line and in_final_output:
                answer = line[line.find("Answer:") + len("Answer:"):].strip()
                break
        
        # If direct extraction failed, try a more general approach
        if question is None or answer is None:
            for line in response_text.split("\n"):
                if line.startswith("Question:"):
                    question = line[len("Question:"):].strip()
                elif line.startswith("Answer:"):
                    answer = line[len("Answer:"):].strip()
        
        if question and answer:
            # Create a dictionary with the data to save
            data = {
                'Image_ID': [image_id],
                'Item_ID': [item_id],
                'Question': [question],
                'Answer': [answer]
            }
            
            # Convert to DataFrame
            df = pd.DataFrame(data)
            
            # Check if file exists
            file_exists = os.path.isfile('qna4.csv')
            
            # Save to CSV (append if file exists, create with header if it doesn't)
            if file_exists:
                df.to_csv('qna4.csv', mode='a', header=False, index=False)
            else:
                df.to_csv('qna4.csv', index=False)
        
        # Update progress
        processed_count += 1
        if processed_count % 10 == 0:
            print(f"Processed {processed_count}/{max_api_calls} products")
        
        # Short summary of the current item
        print(f"Processed: Item_ID: {item_id} | Image_ID: {image_id}")
        print(f"Q: {question}")
        print(f"A: {answer}")
        print("-" * 50)
        
        # Pause between API calls to avoid rate limits
        time.sleep(1.0)  # 1 second pause between calls
        
    except Exception as e:
        error_count += 1
        print(f"Error processing row {index}: {str(e)}")
        # Continue with the next image even if there's an error
        continue

print(f"\nProcessing complete!")
print(f"Total products processed: {processed_count}")
print(f"Errors: {error_count}")
print(f"Results saved to qna4.csv")
print(f"Next start_index to continue: {start_index + processed_count + error_count}")

Found 4923 unique products
Starting processing from index 0 up to 1000 products...
Processed: Item_ID: B00004SD6V | Image_ID: 51JKXD3XTJL
Q: Is the trowel rustproof?
A: Yes
--------------------------------------------------
Processed: Item_ID: B00005041B | Image_ID: 41ZZFRX9TXL
Q: Are bulbs included?
A: Yes
--------------------------------------------------
Error processing row 2: [Errno 2] No such file or directory: '/home/saishruti/Research1/VR_Project/Datasets/small/ee/eeeea998.jpg'
Error processing row 3: [Errno 2] No such file or directory: '/home/saishruti/Research1/VR_Project/Datasets/small/fb/fbf7478f.jpg'
Processed: Item_ID: B000GXF4UC | Image_ID: 51jNv-Fzw6L
Q: Is this sun lounger foldable?
A: No
--------------------------------------------------
Processed: Item_ID: B000HBVO5W | Image_ID: 516VDAEVA4L
Q: Is this a set of two chairs?
A: Yes
--------------------------------------------------
Processed: Item_ID: B000HVGJMK | Image_ID: 413YDDWEAEL
Q: Is this table suitable for out

In [None]:
""" API Keys """
key_1 = "AIzaSyDRYg1bgxG-Yv77yqV5RmJuRFAsplOyLg0"
key_2 = "AIzaSyCk-3tjAx3U33d0hdXiYmKP-7m0_kHOFqk"
key_3 = "AIzaSyC8YBijcN_xFfST2Y4KB3YVR7DNnI-FFz8"

In [9]:
""" Getting the final csv file """
""" Merging """
import pandas as pd

# List of input CSV files
csv_files = ["qna.csv", "qna2.csv", "qna3.csv", "qna4.csv"]

# Merge the CSV files
merged_df = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)

# Remove duplicate questions within each Item_ID
merged_df = merged_df.drop_duplicates(subset=["Item_ID", "Question"])

# Save the merged DataFrame to qna_main.csv
merged_file_path = "qna_main.csv"
merged_df.to_csv(merged_file_path, index=False)
print(f"Merged file saved as {merged_file_path}")

# Count the number of "Yes" and "No" answers
yes_count = (merged_df["Answer"].str.strip().str.lower() == "yes").sum()
no_count = (merged_df["Answer"].str.strip().str.lower() == "no").sum()

# Print the results
print(f"Number of questions with 'Yes' as the answer: {yes_count}")
print(f"Number of questions with 'No' as the answer: {no_count}")

Merged file saved as qna_main.csv
Number of questions with 'Yes' as the answer: 3186
Number of questions with 'No' as the answer: 498


In [None]:
""" Removing additional yes/no's """

In [13]:
""" Adding the image paths and duplicating """
""" Only if its valid """
import pandas as pd
import os

# File paths
qna_main_path = "qna_main.csv"  # Input file
filtered_merged_path = "Datasets/filtered_merged_output.csv"  # File with image paths
qna_final_path = "qna_final.csv"  # Output file
small_directory = "Datasets/small"  # Directory where images are stored

# Load DataFrames
qna_main_df = pd.read_csv(qna_main_path)
filtered_merged_df = pd.read_csv(filtered_merged_path)

# Add a new column for the image path
qna_main_df["Image_Path"] = None

# Prepend the "small" directory path to each image path
filtered_merged_df["full_path"] = filtered_merged_df["path"].str.strip().apply(lambda p: os.path.join(small_directory, p))

# Create a dictionary of valid image paths for each Item_ID
valid_paths = filtered_merged_df[filtered_merged_df["full_path"].apply(os.path.isfile)]  # Check for valid paths
item_id_to_paths = valid_paths.groupby("item_id")["full_path"].apply(list).to_dict()

# Create a new DataFrame for the final output
final_rows = []

for _, row in qna_main_df.iterrows():
    item_id = row["Item_ID"]
    question = row["Question"]
    answer = row["Answer"]
    
    # Get all valid paths for this Item_ID
    image_paths = item_id_to_paths.get(item_id, [])
    
    # Add a row for each valid image path
    for image_path in image_paths:
        new_row = row.copy()
        new_row["Image_Path"] = image_path
        final_rows.append(new_row)

# Create the final DataFrame
qna_final_df = pd.DataFrame(final_rows)

# Save the final DataFrame to qna_final.csv
qna_final_df.to_csv(qna_final_path, index=False)
print(f"Final file saved as {qna_final_path}")

Final file saved as qna_final.csv


In [14]:
import pandas as pd

# Load the CSV file
qna_final_path = "qna_final.csv"
qna_final_df = pd.read_csv(qna_final_path)

# Print the column names
print("Column names in qna_final.csv:")
print(qna_final_df.columns.tolist())

Column names in qna_final.csv:
['Image_ID', 'Item_ID', 'Question', 'Answer', '61-HYrslQUL', 'B07CSY4P36', 'What color is the chair?', 'Brown', 'Image_Path']
