1️⃣ Install dependencies

In [4]:
!pip install faker pandas diffusers transformers accelerate safetensors torch --quiet



2️⃣ Imports


In [5]:
import json
import pandas as pd
import numpy as np
import random
from faker import Faker
import uuid
import os
import torch
from diffusers import StableDiffusionPipeline

# Shared Faker instance for generating fake values.
# NOTE(review): none of the cells shown in this notebook reference `fake` — confirm whether it is still needed.
fake = Faker()

3️⃣ Load my 28 real products

In [7]:
# Read the real product records that the synthetic data is modelled on.
# JSON text is UTF-8, so the encoding is passed explicitly instead of relying
# on the platform default (which is not UTF-8 on every system).
with open("products.json", "r", encoding="utf-8") as f:
    real_data = json.load(f)

# Tabular view of the records; later cells read its 'category', 'title',
# 'description' and 'price' columns.
df_real = pd.json_normalize(real_data)

4️⃣ Collect existing titles and descriptions per category

In [8]:
# Lookup tables: category -> list of real titles / list of real descriptions.
category_titles = {}
category_descriptions = {}

for cat in df_real['category'].unique():
    # Rows of the real data belonging to this category (filter evaluated once).
    rows_in_category = df_real.loc[df_real['category'] == cat]
    category_titles[cat] = rows_in_category['title'].tolist()
    category_descriptions[cat] = rows_in_category['description'].tolist()

5️⃣ Synthetic image function

In [None]:
model_id = "runwayml/stable-diffusion-v1-5"

# Run on the GPU in half precision when CUDA is available; otherwise fall back
# to the CPU in full precision.
if torch.cuda.is_available():
    device = "cuda"
    weights_dtype = torch.float16
else:
    device = "cpu"
    weights_dtype = torch.float32

pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=weights_dtype).to(device)

# Output folder for the generated product images (no error if it already exists).
os.makedirs("synthetic_images", exist_ok=True)

def generate_synthetic_image(title, category):
    """Generate one product image for `title` with the module-level `pipe`.

    The prompt combines the title, the lower-cased category and a fixed
    product-photo style suffix. The pipeline is called with 25 inference
    steps and a guidance scale of 7.5.

    Args:
        title: Product title inserted into the prompt.
        category: Product category; lower-cased before being inserted.

    Returns:
        The first entry of the pipeline result's `images`.
    """
    subject = f"high quality product photo of {title}, {category.lower()}, "
    styling = "studio lighting, isolated on white background, e-commerce style, ultra realistic"
    result = pipe(subject + styling, num_inference_steps=25, guidance_scale=7.5)
    return result.images[0]


6️⃣ Synthetic product generator

In [10]:
def generate_synthetic_product(idx, category, excerpt_words=12):
    """Build one synthetic product record modelled on the real products of `category`.

    Reads the module-level `category_titles`, `category_descriptions` and
    `df_real`, generates an image through `generate_synthetic_image`, and
    saves it as ``synthetic_images/product_<idx>.png`` (the directory must
    already exist).

    Args:
        idx: Integer index of the product; used (zero-padded to 3 digits)
            in the image file name.
        category: Category to imitate; must be a key of `category_titles`
            and `category_descriptions` and occur in `df_real['category']`.
        excerpt_words: Positive number of consecutive words kept from a real
            description when it is longer than that; shorter descriptions
            are used unchanged.

    Returns:
        dict with keys ``id`` (random UUID string), ``title``, ``price``,
        ``description``, ``category``, ``image`` (path of the saved PNG),
        ``rating`` (``{"rate": float, "count": int}``), ``payment_methods``
        (one or two of "card"/"cash") and ``availability`` (bool).

    Raises:
        KeyError: if `category` is missing from the per-category lookups.
    """
    # Title: take a real title of this category and shuffle its words.
    title_words = random.choice(category_titles[category]).split()
    random.shuffle(title_words)
    title = " ".join(title_words)

    # Description: a random window of exactly `excerpt_words` consecutive words.
    # The start bound uses the same window size as the slice, so the excerpt
    # is never cut short at the end of the text.
    base_desc = random.choice(category_descriptions[category])
    desc_words = base_desc.split()
    if len(desc_words) > excerpt_words:
        start = random.randint(0, len(desc_words) - excerpt_words)
        description = " ".join(desc_words[start:start + excerpt_words])
    else:
        description = base_desc

    # Price: normal perturbation (std = 20% of a real price from the category),
    # floored at 1. Converted to a plain float so the record holds only
    # built-in Python types.
    prices = df_real.loc[df_real['category'] == category, 'price'].tolist()
    base_price = random.choice(prices)
    price = round(float(np.clip(np.random.normal(base_price, base_price * 0.2), 1, None)), 2)

    # Rating: rate ~ N(4.5, 0.5) clipped to [1, 5]; count ~ N(200, 100) clipped to [1, 1000].
    rating_rate = round(float(np.clip(np.random.normal(4.5, 0.5), 1.0, 5.0)), 1)
    rating_count = int(np.clip(np.random.normal(200, 100), 1, 1000))

    # Payment methods: one or both of card / cash, in random order.
    payment_methods = random.sample(["card", "cash"], random.randint(1, 2))

    # Availability: True with probability 0.9.
    availability = random.random() < 0.9

    # Image: generate it and save it under the pre-created output directory.
    image = generate_synthetic_image(title, category)
    image_path = f"synthetic_images/product_{idx:03d}.png"
    image.save(image_path)

    return {
        "id": str(uuid.uuid4()),
        "title": title,
        "price": price,
        "description": description,
        "category": category,
        "image": image_path,
        "rating": {"rate": rating_rate, "count": rating_count},
        "payment_methods": payment_methods,
        "availability": availability
    }


7️⃣ Generate 200 synthetic products, sampling each category in proportion to its share of the real data

In [None]:
NUM_TOTAL = 200
synthetic_data = []

# Share of each category in the real data; used as sampling weights, so the
# per-category totals are random rather than fixed quotas.
cat_counts = df_real['category'].value_counts()
cat_probs = cat_counts / cat_counts.sum()
category_names = list(cat_probs.index)

for idx in range(NUM_TOTAL):
    # Draw one category per product, weighted by its share of the real data.
    (category,) = random.choices(category_names, weights=cat_probs.values, k=1)
    synthetic_data.append(generate_synthetic_product(idx, category))

8️⃣ Save synthetic dataset

In [None]:
# Write all generated records to a single pretty-printed JSON file.
with open("synthetic_200_products_fit_to_data.json", "w") as out_file:
    json.dump(synthetic_data, out_file, indent=2)

# Tabular view of the same records (nested fields become separate columns).
df_synthetic = pd.json_normalize(synthetic_data)
# CSV export is currently disabled:
#df_synthetic.to_csv("synthetic_200_products_fit_to_data.csv", index=False)

print("✅ Synthetic dataset with titles and descriptions fit to data generated successfully!")
