# 01 - Data Processing and Embedding Generation

This notebook covers:
- Loading and preprocessing text and image data
- Generating embeddings using CLIP and OpenAI models
- Saving processed embeddings for indexing

In [None]:
# Install required packages (uncomment on first run).
# Prefer %pip over !pip so the install targets the running kernel's environment.
# %pip install -r ../requirements.txt

In [None]:
import os
import sys
sys.path.append('../src')

import torch
import clip
import numpy as np
import pandas as pd
from PIL import Image
from pathlib import Path
from tqdm import tqdm
from openai import OpenAI
from dotenv import load_dotenv

# Populate os.environ first: the OpenAI client created later in this notebook
# reads OPENAI_API_KEY from the environment.
load_dotenv()

# Configuration: where raw inputs are read from and where processed
# artifacts are written. Only the output directory is created here.
DATA_DIR, PROCESSED_DIR = Path('../data/raw'), Path('../data/processed')
PROCESSED_DIR.mkdir(exist_ok=True, parents=True)

## 1. Load CLIP Model

In [None]:
# Pick the compute device: default to CPU and switch to the GPU when CUDA is usable.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print(f"Using device: {device}")

# clip.load returns the model together with the image preprocessing transform
# that the image-embedding step applies before encoding.
clip_model, preprocess = clip.load("ViT-B/32", device=device)
print("CLIP model loaded successfully")

## 2. Load Sample Dataset

Example structure:
```
data/raw/
├── images/
│   ├── prod_001.jpg
│   ├── prod_002.jpg
│   └── ...
└── metadata.csv  # Contains: id, title, description, image_path, category
```

In [None]:
# Load metadata, or bootstrap a small sample file when none exists yet.
metadata_path = DATA_DIR / 'metadata.csv'

if metadata_path.exists():
    df = pd.read_csv(metadata_path)
else:
    # Create sample data if not exists.
    # NOTE: the sample rows reference image files that this cell does not
    # create; the image-embedding step reports them as "Image not found".
    print("Creating sample metadata...")
    sample_data = {
        'id': ['prod_001', 'prod_002', 'prod_003'],
        'title': ['Red T-Shirt', 'Blue Jeans', 'White Sneakers'],
        'description': [
            'Comfortable red cotton t-shirt with round neck',
            'Classic blue denim jeans with regular fit',
            'White canvas sneakers with rubber sole'
        ],
        'image_path': ['images/prod_001.jpg', 'images/prod_002.jpg', 'images/prod_003.jpg'],
        'category': ['clothing', 'clothing', 'footwear']
    }
    df = pd.DataFrame(sample_data)
    # to_csv does not create missing parent directories, and only the
    # processed-data directory is created by the configuration cell, so make
    # sure the raw-data directory exists before writing into it.
    metadata_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(metadata_path, index=False)

print(f"Loaded {len(df)} items")
df.head()

## 3. Generate Text Embeddings (OpenAI)

In [None]:
# Initialize OpenAI client (the key is read from the OPENAI_API_KEY environment variable)
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

def get_text_embedding(text, model="text-embedding-3-large"):
    """Embed ``text`` with the OpenAI embeddings endpoint.

    Args:
        text: The text to embed.
        model: Name of the OpenAI embedding model to request.

    Returns:
        The embedding of the first item in the API response, or ``None`` when
        the request (or reading its result) raises. Failures are printed
        rather than propagated so a single bad row does not stop a batch run.
    """
    vector = None
    try:
        api_result = client.embeddings.create(model=model, input=text)
        vector = api_result.data[0].embedding
    except Exception as err:
        print(f"Error generating embedding: {err}")
    return vector

# Generate embeddings for all text descriptions.
# One entry is appended per row (None when the API call failed), so the list
# stays positionally aligned with df and can be assigned as a column.
print("Generating text embeddings...")
text_embeddings = []

for idx, row in tqdm(df.iterrows(), total=len(df)):
    # Combine title and description
    combined_text = f"{row['title']}. {row['description']}"
    text_embeddings.append(get_text_embedding(combined_text))

df['text_embedding'] = text_embeddings

# Count only successful embeddings: get_text_embedding returns None on error,
# and reporting len(text_embeddings) would present failures as generated.
n_text_ok = sum(e is not None for e in text_embeddings)
print(f"Generated {n_text_ok} text embeddings")

## 4. Generate Image Embeddings (CLIP)

In [None]:
def get_image_embedding(image_path):
    """Generate a unit-length CLIP embedding for one image.

    Args:
        image_path: Path of the image file to embed.

    Returns:
        The embedding as a list of floats, divided by its L2 norm, or ``None``
        if the image cannot be opened or encoded (the error is printed).
    """
    try:
        # Open inside a context manager so the underlying file handle is
        # released deterministically; convert() returns an independent
        # in-memory image that remains valid after the file is closed.
        with Image.open(image_path) as img:
            image = img.convert('RGB')

        # preprocess yields a single image tensor; add the batch dimension
        # expected by the encoder and move it to the model's device.
        image_input = preprocess(image).unsqueeze(0).to(device)

        with torch.no_grad():
            image_features = clip_model.encode_image(image_input)
            image_features /= image_features.norm(dim=-1, keepdim=True)

        return image_features.cpu().numpy()[0].tolist()
    except Exception as e:
        print(f"Error processing image {image_path}: {e}")
        return None

# Generate image embeddings.
# Every row contributes exactly one entry (None for a missing or unreadable
# image) so the list can be assigned back to df as a column.
print("Generating image embeddings...")
image_embeddings = []

for idx, row in tqdm(df.iterrows(), total=len(df)):
    image_path = DATA_DIR / row['image_path']
    if not image_path.exists():
        print(f"Image not found: {image_path}")
        image_embeddings.append(None)
        continue
    image_embeddings.append(get_image_embedding(image_path))

df['image_embedding'] = image_embeddings
print(f"Generated {sum(vec is not None for vec in image_embeddings)} image embeddings")

## 5. Generate CLIP Text Embeddings (for multimodal alignment)

In [None]:
def get_clip_text_embedding(text, truncate=True):
    """Generate a unit-length text embedding using CLIP (aligned with image space).

    Args:
        text: The text to embed.
        truncate: Forwarded to ``clip.tokenize``. When True (default), text
            longer than CLIP's context window is cut to fit instead of making
            tokenization raise, which previously turned every long
            title + description into a ``None`` embedding.

    Returns:
        The embedding as a list of floats, divided by its L2 norm, or ``None``
        if tokenization or encoding raises (the error is printed).
    """
    try:
        text_input = clip.tokenize([text], truncate=truncate).to(device)

        with torch.no_grad():
            text_features = clip_model.encode_text(text_input)
            text_features /= text_features.norm(dim=-1, keepdim=True)

        return text_features.cpu().numpy()[0].tolist()
    except Exception as e:
        print(f"Error generating CLIP text embedding: {e}")
        return None

# Generate CLIP text embeddings.
# One entry is appended per row (None on failure) so the list stays
# positionally aligned with df.
print("Generating CLIP text embeddings...")
clip_text_embeddings = []

for idx, row in tqdm(df.iterrows(), total=len(df)):
    combined_text = f"{row['title']}. {row['description']}"
    clip_text_embeddings.append(get_clip_text_embedding(combined_text))

df['clip_text_embedding'] = clip_text_embeddings

# Count only successful embeddings: get_clip_text_embedding returns None on
# error, so len(clip_text_embeddings) would report failures as generated.
n_clip_text_ok = sum(e is not None for e in clip_text_embeddings)
print(f"Generated {n_clip_text_ok} CLIP text embeddings")

## 6. Save Processed Data

In [None]:
# Save embeddings as separate numpy files for efficiency
import pickle


def _stack_embeddings(column):
    """Stack the non-missing (not None) embeddings of a column into one array."""
    return np.array([e for e in column if e is not None])


# Save text embeddings
text_emb_array = _stack_embeddings(df['text_embedding'])
np.save(PROCESSED_DIR / 'text_embeddings.npy', text_emb_array)

# Save image embeddings
image_emb_array = _stack_embeddings(df['image_embedding'])
np.save(PROCESSED_DIR / 'image_embeddings.npy', image_emb_array)

# Save CLIP text embeddings
clip_text_emb_array = _stack_embeddings(df['clip_text_embedding'])
np.save(PROCESSED_DIR / 'clip_text_embeddings.npy', clip_text_emb_array)

# Save metadata (without embeddings for readability).
# Rows whose embedding is None are dropped from the arrays above, so array
# row k is NOT metadata row k once anything failed or an image was missing.
# The has_* flags record which rows made it into each array: row k of a .npy
# file corresponds to the k-th metadata row whose matching flag is True.
df_metadata = df[['id', 'title', 'description', 'image_path', 'category']].copy()
for emb_col in ('text_embedding', 'image_embedding', 'clip_text_embedding'):
    df_metadata[f'has_{emb_col}'] = [e is not None for e in df[emb_col]]
df_metadata.to_csv(PROCESSED_DIR / 'metadata_processed.csv', index=False)

# Save full dataframe with embeddings (keeps None placeholders, so it is
# always row-aligned with the metadata).
# NOTE: pickle files must only be loaded from trusted sources.
with open(PROCESSED_DIR / 'full_data.pkl', 'wb') as f:
    pickle.dump(df, f)

print("\nData saved successfully!")
print(f"Text embeddings shape: {text_emb_array.shape}")
print(f"Image embeddings shape: {image_emb_array.shape}")
print(f"CLIP text embeddings shape: {clip_text_emb_array.shape}")

## 7. Visualization (Optional)

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Visualize embeddings using t-SNE
def visualize_embeddings(embeddings, labels, title, perplexity=30.0):
    """Project embeddings to 2-D with t-SNE and draw a labelled scatter plot.

    Args:
        embeddings: Sequence or array with one embedding vector per row.
        labels: One text label per row of ``embeddings``, in the same order.
        title: Figure title.
        perplexity: Upper bound for the t-SNE perplexity. It is lowered
            automatically to ``n_samples - 1`` on small datasets.

    Returns:
        None. Prints a message and returns early when fewer than two samples
        are given; otherwise shows the figure.

    Raises:
        ValueError: If ``labels`` and ``embeddings`` differ in length.
    """
    n_samples = len(embeddings)
    if n_samples < 2:
        print("Not enough samples for visualization")
        return

    # Fail early with a clear message instead of a late plotting error (or
    # silently mislabelled points) when labels and rows are out of step.
    if len(labels) != n_samples:
        raise ValueError(
            f"Got {len(labels)} labels for {n_samples} embeddings; "
            "they must have the same length"
        )

    # scikit-learn requires perplexity < n_samples, so the default of 30
    # raises on small datasets such as the 3-row sample; clamp it.
    effective_perplexity = float(min(perplexity, n_samples - 1))
    tsne = TSNE(n_components=2, random_state=42, perplexity=effective_perplexity)
    embeddings_2d = tsne.fit_transform(embeddings)

    plt.figure(figsize=(10, 8))
    # Colour encodes row order only (0..n-1); it carries no category meaning.
    scatter = plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1],
                          c=range(n_samples), cmap='viridis', s=100)

    for i, label in enumerate(labels):
        plt.annotate(label, (embeddings_2d[i, 0], embeddings_2d[i, 1]),
                     fontsize=8, alpha=0.7)

    plt.title(title)
    plt.xlabel('t-SNE Dimension 1')
    plt.ylabel('t-SNE Dimension 2')
    plt.colorbar(scatter)
    plt.tight_layout()
    plt.show()

# Visualize CLIP text embeddings
if len(clip_text_emb_array) > 0:
    # clip_text_emb_array only contains rows whose embedding is not None, so
    # take the titles from those same rows; using every title would shift or
    # mismatch the labels as soon as a single embedding failed.
    has_clip_text = [e is not None for e in df['clip_text_embedding']]
    visualize_embeddings(
        clip_text_emb_array,
        df_metadata.loc[has_clip_text, 'title'].tolist(),
        'CLIP Text Embeddings (t-SNE)'
    )

## Summary

In this notebook, we:
1. Loaded and preprocessed multimodal data
2. Generated text embeddings using OpenAI's text-embedding-3-large
3. Generated image embeddings using CLIP
4. Generated CLIP text embeddings for multimodal alignment
5. Saved all processed embeddings for vector database indexing

Next step: Notebook 02 - Vector Database Setup and Indexing