In [1]:
import pandas as pd
import gdown
import os
import re
import ast
from tqdm.notebook import tqdm
import torch
import timm
from sentence_transformers import SentenceTransformer
from PIL import Image
import requests
from torchvision import transforms
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv

# --- 1. Load & Clean Raw Data ---
# Read the raw product catalogue; the rest of this cell works on `df`.
print("Loading and cleaning dataset...")
df = pd.read_csv(filepath_or_buffer="furniture_dataset.csv")

def clean_price(price):
    """Convert a raw price value into a float where possible.

    String inputs have dollar signs, thousands separators and surrounding
    whitespace removed before parsing; a string that still cannot be parsed
    yields ``None``. Non-string inputs are returned untouched.
    """
    if not isinstance(price, str):
        return price
    stripped = price.replace('$', '').replace(',', '').strip()
    try:
        return float(stripped)
    except ValueError:
        return None

def normalize_brand(brand):
    """Remove the standalone word "Store" from a brand label.

    Args:
        brand: Raw brand value. Non-string values (e.g. ``None``/NaN) are
            returned unchanged.

    Returns:
        The brand with every whole-word occurrence of "Store" (and the
        whitespace directly before it) removed and outer whitespace trimmed.
    """
    if not isinstance(brand, str):
        return brand
    # Match "Store" only as a whole word: a plain substring replace would also
    # eat the prefix of names such as "Storehouse" and leave double spaces
    # behind when the word sits in the middle of the label.
    return re.sub(r'\s*\bStore\b', '', brand).strip()

# Derive parsed/normalised versions of the raw price and brand columns.
df['price_numeric'] = df['price'].map(clean_price)
df['brand_normalized'] = df['brand'].map(normalize_brand)

# CORRECTED: assign the result of fillna back to the column (modern syntax,
# avoids pandas warnings). Missing text becomes an empty string.
for text_column in ('description', 'title', 'brand_normalized'):
    df[text_column] = df[text_column].fillna('')

def safe_literal_eval(val):
    """Parse a stringified Python list, always returning a list.

    Args:
        val: Cell value, expected to be a string such as ``"['a', 'b']"``.

    Returns:
        The parsed list (a parsed tuple is converted to a list). An empty
        list is returned when ``val`` is not a string, is blank, cannot be
        parsed as a literal, or parses to something that is not a list/tuple.
    """
    if not isinstance(val, str):
        return []
    text = val.strip()
    if not text:
        return []
    try:
        # Parse the stripped text: leading whitespace is a syntax error for
        # literal_eval on older Python versions.
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval documents all of these for malformed input.
        return []
    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, tuple):
        return list(parsed)
    # Callers join or index the result as a list; a scalar literal (number,
    # bare string, dict, ...) is treated as "no usable list".
    return []

# Turn the stringified category lists into real Python lists.
df['categories_list'] = df['categories'].map(safe_literal_eval)

# --- 2. Feature Engineering for Embeddings ---
# One text blob per product: title, description, brand and categories.
df['text_for_embedding'] = (
    df['title']
    + '. '
    + df['description']
    + ' Brand: '
    + df['brand_normalized']
    + '. Categories: '
    + df['categories_list'].map(lambda cats: ', '.join(cats))
)
print("Data cleaning complete.")

# --- 3. Generate Embeddings ---
DEVICE = "cpu"

# Sentence encoder used for the text index.
text_model = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2",
    device=DEVICE,
)

# Image backbone; num_classes=0 is passed so it is used as a feature
# extractor rather than a classifier (see the timm docs).
image_model = timm.create_model("resnet50", pretrained=True, num_classes=0)
image_model = image_model.to(DEVICE)
image_model.eval()

# Preprocessing pipeline derived from the model's own data configuration.
data_config = timm.data.resolve_model_data_config(image_model)
image_transforms = timm.data.create_transform(is_training=False, **data_config)

def get_text_embedding(text):
    """Encode ``text`` with ``text_model`` and return the vector as a list.

    Anything that is not a string, or a string that is empty or only
    whitespace, yields a 384-element list of zeros instead of being sent to
    the model.
    """
    has_content = isinstance(text, str) and bool(text.strip())
    if not has_content:
        return [0.0] * 384
    return text_model.encode(text).tolist()

def get_image_embedding(image_url):
    """Download an image and embed it with ``image_model``.

    Args:
        image_url: Expected to be a string starting with ``http``.

    Returns:
        The model output flattened to a list of floats. On any failure
        (non-string or non-HTTP URL, network/HTTP error, undecodable image,
        model error) a list of 2048 zeros is returned; the upload step skips
        such all-zero vectors.
    """
    try:
        if not isinstance(image_url, str) or not image_url.startswith('http'):
            raise ValueError("Invalid URL")
        # The context manager releases the streamed connection even when the
        # download or decode fails; without it every row leaks a connection.
        with requests.get(image_url, stream=True, timeout=10) as response:
            response.raise_for_status()
            # The raw stream is not content-decoded by default; enable it so
            # gzip/deflate-encoded responses still reach PIL as image bytes.
            response.raw.decode_content = True
            # convert() forces the full decode while the stream is still open.
            image = Image.open(response.raw).convert("RGB")
        with torch.no_grad():
            processed_image = image_transforms(image).unsqueeze(0).to(DEVICE)
            embedding = image_model(processed_image)
        return embedding.cpu().numpy().flatten().tolist()
    except Exception:
        # Deliberate best-effort: one bad image must not abort the whole run.
        return [0.0] * 2048

# Register `progress_apply` on pandas objects.
tqdm.pandas()

print("\nGenerating text embeddings...")
df['text_embedding'] = df['text_for_embedding'].progress_apply(get_text_embedding)

print("\nGenerating image embeddings...")
# Only the first entry of each product's image list is embedded.
df['image_url'] = (
    df['images']
    .map(safe_literal_eval)
    .map(lambda urls: urls[0] if urls else None)
)
df['image_embedding'] = df['image_url'].progress_apply(get_image_embedding)

print("\nEmbedding generation complete.")

# --- 4. Upload to Pinecone ---
# The API key is read from the environment (populated from a .env file).
load_dotenv()
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
pc = Pinecone(api_key=PINECONE_API_KEY)
TEXT_INDEX_NAME = "furniture-text-search"
IMAGE_INDEX_NAME = "furniture-image-search"

# Start from empty indexes so stale vectors from earlier runs do not linger.
print("Clearing old data from Pinecone indexes...")
text_index = pc.Index(TEXT_INDEX_NAME)
image_index = pc.Index(IMAGE_INDEX_NAME)
for index in (text_index, image_index):
    index.delete(delete_all=True)
print("Indexes cleared.")

BATCH_SIZE = 100
print("Starting upsert of new embeddings...")
for start in tqdm(range(0, len(df), BATCH_SIZE)):
    batch = df.iloc[start:start + BATCH_SIZE]
    text_vectors_to_upsert = []
    image_vectors_to_upsert = []
    for _, row in batch.iterrows():
        vector_id = str(row['uniq_id'])
        row_price = row['price_numeric']
        row_image_url = row['image_url']
        # The same metadata dict is attached to both the text and image vector.
        metadata = {
            'uniq_id': vector_id,
            'title': row['title'],
            'brand': row['brand_normalized'],
            'price': str(row_price) if pd.notna(row_price) else '',
            'image_url': row_image_url if pd.notna(row_image_url) else '',
        }
        # All-zero embeddings mark rows where embedding failed; skip them.
        if any(row['text_embedding']):
            text_vectors_to_upsert.append((vector_id, row['text_embedding'], metadata))
        if any(row['image_embedding']):
            image_vectors_to_upsert.append((vector_id, row['image_embedding'], metadata))
    if text_vectors_to_upsert:
        text_index.upsert(vectors=text_vectors_to_upsert)
    if image_vectors_to_upsert:
        image_index.upsert(vectors=image_vectors_to_upsert)

print("✅ Success! Your Pinecone database has been updated.")

Loading and cleaning dataset...
Data cleaning complete.

Generating text embeddings...


  0%|          | 0/312 [00:00<?, ?it/s]


Generating image embeddings...


  0%|          | 0/312 [00:00<?, ?it/s]


Embedding generation complete.
Clearing old data from Pinecone indexes...
Indexes cleared.
Starting upsert of new embeddings...


  0%|          | 0/4 [00:00<?, ?it/s]

✅ Success! Your Pinecone database has been updated.


In [2]:
print("--- Checking Metadata in Pinecone ---")

# Connect to the image index
image_index = pc.Index("furniture-image-search")

# An all-zero dummy vector is enough here: the ranking is irrelevant, we only
# want one stored record back so its metadata can be inspected.
query_response = image_index.query(vector=[0.0] * 2048, top_k=1, include_metadata=True)

# Report the metadata of the first hit, or note that nothing came back.
if not query_response.matches:
    print("The image index appears to be empty.")
else:
    first_match_metadata = query_response.matches[0].metadata
    print("Metadata for one vector in your image index:")
    print(first_match_metadata)

print("\n--- End of Check ---")

--- Checking Metadata in Pinecone ---
Metadata for one vector in your image index:
{'brand': 'Allied Brass', 'image_url': 'https://m.media-amazon.com/images/I/21+UCtQ6p9L._SS522_.jpg', 'price': '', 'title': 'Allied Brass Carolina Crystal Collection Frameless Oval Tilt Beveled Edge Wall Mirror, Antique Brass', 'uniq_id': 'b7838b38-a622-52b8-b226-972abfab2abc'}

--- End of Check ---
