# AI Model Training for Furniture Recommendation Platform

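This notebook covers:
1. Data Loading and Preparation
2. NLP Model (Text Embeddings)
3. Computer Vision Model (Image Embeddings)
4. Generative AI Integration (Product Descriptions)
5. Vector Database Setup and Testing
6. Model Evaluation and Performance Metrics
7. Model Export and Deployment Preparation
8. Deployment Integration Guide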

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import requests
from io import BytesIO
import warnings

# Silence every warning category so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Seed the NumPy and PyTorch random number generators with the same value
# so repeated runs draw the same random numbers.
for seed_rng in (np.random.seed, torch.manual_seed):
    seed_rng(42)

# Report the environment the notebook is running in.
print("✅ Libraries imported successfully")
print(f"PyTorch version: {torch.__version__}")
cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")

## 1. Data Loading and Preparation

In [None]:
# Load the cleaned dataset
import ast


def _parse_list_cell(value):
    """Return a list for a cell that holds a list or the text form of one.

    ``pd.read_csv`` returns list-valued columns as plain strings (for example
    ``"['a', 'b']"``), which the ``isinstance(x, list)`` checks used later in
    this notebook would reject. Values that are neither a list nor text that
    parses into a list/tuple become an empty list.
    """
    if isinstance(value, list):
        return value
    if isinstance(value, str):
        try:
            parsed = ast.literal_eval(value)
        except (ValueError, SyntaxError):
            return []
        if isinstance(parsed, (list, tuple)):
            return list(parsed)
    return []


try:
    df = pd.read_csv('../data/cleaned_furniture_data.csv')
except FileNotFoundError:
    # Fallback to original dataset
    print("⚠️ Cleaned dataset not found. Loading and cleaning original dataset...")
    df = pd.read_csv('../data/intern_data_ikarus.csv')

    # Quick cleaning
    from utils.helpers import safe_parse_list, clean_price, create_combined_text

    df['price_numeric'] = df['price'].apply(clean_price)
    df['categories_list'] = df['categories'].apply(safe_parse_list)
    df['images_list'] = df['images'].apply(safe_parse_list)
    df['combined_text'] = df.apply(create_combined_text, axis=1)

    print(f"✅ Cleaned {len(df)} products")
else:
    print(f"✅ Loaded cleaned dataset with {len(df)} products")

    # A CSV round-trip turns list columns into strings; restore real lists so
    # the image and category lookups further down can use them.
    # NOTE(review): assumes the cleaning step wrote these columns as
    # Python-literal list text — confirm against the cleaning notebook.
    for list_column in ('categories_list', 'images_list'):
        if list_column in df.columns:
            df[list_column] = df[list_column].apply(_parse_list_cell)

# Display basic info
print(f"\nDataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

# Sample products for testing (at most 100 rows, fixed seed for repeatability)
sample_products = df.sample(n=min(100, len(df)), random_state=42)
print(f"\n📋 Working with {len(sample_products)} sample products for model training")

## 2. NLP Model Training - Text Embeddings

In [None]:
# Initialize Sentence Transformer model
model_name = 'sentence-transformers/all-MiniLM-L6-v2'
print(f"Loading embedding model: {model_name}")

# Instantiate the pre-trained encoder by name and report its output size
embedding_model = SentenceTransformer(model_name)
print(f"✅ Model loaded successfully")
embedding_dim = embedding_model.get_sentence_embedding_dimension()
print(f"Model dimension: {embedding_dim}")

# Smoke test: encode a few furniture phrases and inspect the result
test_texts = [
    "Modern comfortable office chair",
    "Wooden dining table for 6 people",
    "Grey sectional sofa with storage",
]
test_embeddings = embedding_model.encode(test_texts)

first_embedding = test_embeddings[0]
print(f"\n🧪 Test embeddings shape: {test_embeddings.shape}")
print(f"Sample embedding (first 10 dimensions): {first_embedding[:10]}")

In [None]:
# Generate embeddings for product texts
print("🔄 Generating embeddings for product texts...")

# Use combined text for embeddings; missing text is encoded as an empty string
product_texts = sample_products['combined_text'].fillna('').tolist()
total_texts = len(product_texts)

# Generate embeddings in batches to avoid memory issues
batch_size = 32
all_embeddings = []

for i in range(0, total_texts, batch_size):
    batch_texts = product_texts[i:i + batch_size]
    batch_embeddings = embedding_model.encode(batch_texts, convert_to_numpy=True)
    all_embeddings.extend(batch_embeddings)

    # Report after every batch. The previous "multiple of 100" test could not
    # fire for a batch size of 32 until 800 items had been processed.
    print(f"Processed {min(i + batch_size, total_texts)}/{total_texts} products")

if all_embeddings:
    product_embeddings = np.array(all_embeddings)
else:
    # Keep a 2-D (0, dim) array for an empty sample rather than a shapeless (0,)
    product_embeddings = np.empty(
        (0, embedding_model.get_sentence_embedding_dimension())
    )

print(f"\n✅ Generated embeddings for {len(product_embeddings)} products")
print(f"Embeddings shape: {product_embeddings.shape}")

In [None]:
# Test semantic search functionality
def semantic_search(query, embeddings, texts, top_k=5):
    """
    Rank ``texts`` by cosine similarity between ``query`` and ``embeddings``.

    The query is encoded with the module-level ``embedding_model``. Each hit
    is a dict with the row ``index``, its ``similarity`` score and a ``text``
    preview (cut to 200 characters plus ``'...'`` when longer). At most
    ``top_k`` hits are returned, best match first.
    """
    def _preview(text):
        # Long product texts are shortened for display only
        return text[:200] + '...' if len(text) > 200 else text

    query_vector = embedding_model.encode([query], convert_to_numpy=True)
    scores = cosine_similarity(query_vector, embeddings)[0]

    # argsort is ascending, so reverse it before taking the first top_k rows
    best_first = np.argsort(scores)[::-1][:top_k]

    return [
        {
            'index': row,
            'similarity': scores[row],
            'text': _preview(texts[row]),
        }
        for row in best_first
    ]

# Test semantic search
test_queries = [
    "comfortable office chair",
    "wooden dining furniture",
    "storage solutions for bedroom",
]

print("🔍 Testing Semantic Search:")
print("=" * 50)

# Show the three best matches for each sample query
for query in test_queries:
    print(f"\nQuery: '{query}'")
    results = semantic_search(query, product_embeddings, product_texts, top_k=3)

    i = 0
    for result in results:
        i += 1
        score, preview = result['similarity'], result['text']
        print(f"{i}. Similarity: {score:.3f}")
        print(f"   Text: {preview}\n")

## 3. Computer Vision Model - Image Processing

In [None]:
# Import computer vision libraries
import torchvision.transforms as transforms
import torchvision.models as models
from PIL import Image
import io

# Initialize pre-trained ResNet model for image feature extraction
print("🖼️ Loading computer vision model...")

# Load pre-trained ResNet50.
# torchvision 0.13+ deprecates `pretrained=True` in favour of weight enums;
# IMAGENET1K_V1 is the checkpoint that flag selected. Older releases lack the
# enum, so fall back to the legacy flag there.
if hasattr(models, 'ResNet50_Weights'):
    resnet_model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
else:
    resnet_model = models.resnet50(pretrained=True)
resnet_model.eval()  # Set to evaluation mode

# Remove the final classification layer to get feature vectors
feature_extractor = torch.nn.Sequential(*list(resnet_model.children())[:-1])
feature_extractor.eval()  # make inference mode explicit on the wrapper too

# Image preprocessing pipeline: resize, centre-crop to 224x224, convert to a
# tensor, then normalise each channel with the mean/std below.
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

print("✅ Computer vision model loaded successfully")

def extract_image_features(image_url):
    """
    Download an image and run it through the feature extractor.

    Args:
        image_url: Address of the image to fetch (10 second timeout).

    Returns:
        NumPy array holding the output of ``feature_extractor`` with
        singleton dimensions squeezed out, or ``None`` when the download,
        decoding or inference fails (the error is printed).
    """
    try:
        # Download image
        response = requests.get(image_url, timeout=10)
        # Reject 4xx/5xx responses up front: their body is an error page (or
        # a placeholder image) and must not be embedded as product imagery.
        response.raise_for_status()
        image = Image.open(io.BytesIO(response.content)).convert('RGB')

        # Preprocess image and add the batch dimension the model expects
        input_tensor = preprocess(image).unsqueeze(0)

        # Extract features without tracking gradients
        with torch.no_grad():
            features = feature_extractor(input_tensor)

        return features.squeeze().numpy()

    except Exception as e:
        # Deliberately best-effort: one bad image must not stop the notebook.
        print(f"Error processing image {image_url}: {str(e)}")
        return None

# Test image feature extraction with a few sample images
print("\n🧪 Testing image feature extraction...")

# Keep only rows whose image list is a non-empty list, then take the first five
has_image_list = sample_products['images_list'].apply(
    lambda urls: isinstance(urls, list) and len(urls) > 0
)
products_with_images = sample_products[has_image_list].head(5)

image_features = []
valid_images = []

for idx, product in products_with_images.iterrows():
    url_list = product['images_list']
    if not (isinstance(url_list, list) and len(url_list) > 0):
        continue

    # Only the first listed image of each product is used
    image_url = url_list[0].strip()
    if not image_url.startswith('http'):
        continue

    print(f"Processing: {product['title'][:50]}...")
    features = extract_image_features(image_url)
    if features is None:
        continue

    image_features.append(features)
    valid_images.append({
        'title': product['title'],
        'url': image_url,
        'features': features,
    })

if not image_features:
    print("⚠️ No valid images found for feature extraction")
else:
    image_features_array = np.array(image_features)
    print(f"\n✅ Extracted features from {len(image_features)} images")
    print(f"Feature vector shape: {image_features_array.shape}")

## 4. Generative AI - Product Description Generation

In [None]:
# Initialize Generative AI model for product descriptions
print("🎨 Loading Generative AI model...")

genai_model_name = 'google/flan-t5-small'  # Lightweight model for testing
try:
    tokenizer = AutoTokenizer.from_pretrained(genai_model_name)
    genai_model = AutoModelForSeq2SeqLM.from_pretrained(genai_model_name)
    print(f"✅ Generative AI model loaded: {genai_model_name}")
    genai_available = True
except Exception as e:
    # Best-effort: any load failure switches the notebook to the
    # template-based descriptions below.
    print(f"⚠️ Could not load GenAI model: {str(e)}")
    print("Will use template-based descriptions instead")
    genai_available = False


def _clean_attribute(value):
    """Return ``value`` as stripped text, or ``''`` for None / NaN.

    Rows read through pandas carry missing values as float NaN, which is
    truthy and would otherwise appear in prompts and templates as "nan".
    """
    if value is None:
        return ""
    # NaN is the only float that is not equal to itself
    if isinstance(value, float) and value != value:
        return ""
    return str(value).strip()


def generate_product_description(title, category="", material="", color=""):
    """
    Generate a creative product description.

    When the generative model loaded successfully, a prompt is built from the
    available attributes and sampled from the model. If the model is
    unavailable, raises during generation, or returns empty text, a fixed
    template is chosen from the attributes that are present.

    Args:
        title: Product name; used in the model prompt only.
        category: Product category (optional).
        material: Main material (optional).
        color: Main colour (optional).
        Missing values may be passed as ``""``, ``None`` or NaN.

    Returns:
        The description as a string.
    """
    title = _clean_attribute(title)
    category = _clean_attribute(category)
    material = _clean_attribute(material)
    color = _clean_attribute(color)

    if genai_available:
        # Create prompt from the attributes that are actually present
        prompt_parts = [
            "Write a creative, engaging product description for this furniture:",
            f"Product: {title}",
        ]
        if category:
            prompt_parts.append(f"Category: {category}")
        if material:
            prompt_parts.append(f"Material: {material}")
        if color:
            prompt_parts.append(f"Color: {color}")

        prompt = " ".join(prompt_parts)

        try:
            # Generate description
            inputs = tokenizer(prompt, return_tensors="pt", max_length=256, truncation=True)

            with torch.no_grad():
                outputs = genai_model.generate(
                    inputs.input_ids,
                    max_length=80,
                    num_return_sequences=1,
                    temperature=0.7,
                    do_sample=True,
                    pad_token_id=tokenizer.eos_token_id
                )

            description = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
            # An empty generation is useless to callers; use a template instead
            if description:
                return description

        except Exception as e:
            # Fall through to the template-based description
            print(f"Error generating description: {e}")

    # Fallback: template-based description, most specific template first
    if color and category:
        return f"Discover the perfect blend of style and functionality with this {color} {category}."
    if material and category:
        return f"Transform your space with this beautifully crafted {material} {category}."
    if category:
        return f"Experience comfort and elegance with this premium {category} piece."
    return "Enhance your living space with this thoughtfully designed furniture piece."

# Test description generation
print("\n🧪 Testing product description generation:")
print("=" * 60)

# Hand-written products covering three different rooms
test_products = [
    dict(title='Modern Ergonomic Office Chair', category='Office Furniture',
         material='Fabric', color='Black'),
    dict(title='Scandinavian Dining Table', category='Dining Room',
         material='Oak Wood', color='Natural'),
    dict(title='Vintage Leather Armchair', category='Living Room',
         material='Leather', color='Brown'),
]

for product in test_products:
    # Positional order expected by the generator: title, category, material, color
    description = generate_product_description(
        *(product[field] for field in ('title', 'category', 'material', 'color'))
    )

    print(f"\n📋 Product: {product['title']}")
    print(f"🎨 Generated: {description}")

## 5. Vector Database Setup & Testing

In [None]:
# Simulate vector database operations (in-memory for testing)
class SimpleVectorDB:
    """Minimal in-memory vector store with brute-force cosine search.

    Vectors and their metadata are kept in two dicts keyed by product id.
    """

    def __init__(self):
        self.vectors = {}   # product_id -> embedding vector
        self.metadata = {}  # product_id -> metadata supplied at upsert time

    def upsert(self, product_id, vector, metadata):
        """Insert a product, or overwrite the entry stored under the same id."""
        self.vectors[product_id] = vector
        self.metadata[product_id] = metadata

    def query(self, query_vector, top_k=10):
        """Return the ``top_k`` stored items most similar to ``query_vector``.

        Args:
            query_vector: 1-D sequence with the same length as stored vectors.
            top_k: Maximum number of hits to return.

        Returns:
            List of ``{'id', 'score', 'metadata'}`` dicts ordered by
            descending cosine similarity; empty when nothing is stored.
        """
        if not self.vectors:
            return []

        product_ids = list(self.vectors)
        matrix = np.asarray([self.vectors[pid] for pid in product_ids], dtype=float)
        query = np.asarray(query_vector, dtype=float).ravel()

        # One matrix product scores every stored vector at once instead of
        # calling a similarity routine per item.
        row_norms = np.linalg.norm(matrix, axis=1)
        query_norm = np.linalg.norm(query)
        # A zero-length vector has no direction; dividing by 1 instead of 0
        # gives it similarity 0 rather than NaN.
        row_norms[row_norms == 0] = 1.0
        if query_norm == 0:
            query_norm = 1.0
        scores = (matrix @ query) / (row_norms * query_norm)

        # sorted() is stable, so ties keep insertion order
        ranked = sorted(zip(product_ids, scores), key=lambda pair: pair[1], reverse=True)
        return [
            {
                'id': pid,
                'score': float(score),
                'metadata': self.metadata[pid],
            }
            for pid, score in ranked[:top_k]
        ]

    def stats(self):
        """Return the number of stored vectors and the length of the first one."""
        first_vector = next(iter(self.vectors.values()), None)
        return {
            'total_vectors': len(self.vectors),
            'dimension': len(first_vector) if first_vector is not None else 0
        }

# Initialize vector database
vector_db = SimpleVectorDB()

print("🗄️ Setting up vector database...")


def _none_if_missing(value):
    """Map a float NaN (pandas' missing-value marker) to None; pass others through."""
    if isinstance(value, float) and value != value:
        return None
    return value


# Add products to vector database.
# product_embeddings was built from sample_products in the same row order, so
# the two are paired positionally; zip stops at the shorter of the two.
for idx, ((_, product), vector) in enumerate(
    zip(sample_products.iterrows(), product_embeddings)
):
    # Fall back to a positional id when the row has no usable unique id
    product_id = _none_if_missing(product.get('uniq_id'))
    if product_id is None:
        product_id = f'product_{idx}'

    categories = product.get('categories_list')
    has_category = isinstance(categories, list) and categories

    # NaN is truthy and formats as "nan", so store missing values as None
    metadata = {
        'title': _none_if_missing(product.get('title', '')) or '',
        'price': _none_if_missing(product.get('price_numeric')),
        'category': categories[0] if has_category else 'Unknown',
        'material': _none_if_missing(product.get('material')),
        'color': _none_if_missing(product.get('color')),
        'brand': _none_if_missing(product.get('brand'))
    }

    vector_db.upsert(product_id, vector, metadata)

stats = vector_db.stats()
print(f"✅ Vector database setup complete")
print(f"📊 Stats: {stats['total_vectors']} products, {stats['dimension']} dimensions")

In [None]:
# Test complete search pipeline
def complete_search_pipeline(query, top_k=5):
    """
    Run and print the complete search pipeline:
    Query -> Embedding -> Vector Search -> Description Generation

    Args:
        query: Free-text search string.
        top_k: Maximum number of results to retrieve from ``vector_db``.

    Returns:
        List of dicts, one per hit in rank order, with the keys ``rank``,
        ``similarity``, ``title``, ``price``, ``category``, ``material``,
        ``color`` and ``ai_description``.
    """
    print(f"🔍 Query: '{query}'")
    print("-" * 50)

    # 1. Generate query embedding (encode returns a batch; take its only row)
    query_embedding = embedding_model.encode([query], convert_to_numpy=True)[0]

    # 2. Search vector database
    results = vector_db.query(query_embedding, top_k=top_k)

    # 3. Generate descriptions and format results
    formatted_results = []

    for i, result in enumerate(results, 1):
        metadata = result['metadata']
        price = metadata['price']

        # Generate AI description
        ai_description = generate_product_description(
            metadata['title'],
            metadata['category'],
            metadata['material'],
            metadata['color']
        )

        formatted_results.append({
            'rank': i,
            'similarity': result['score'],
            'title': metadata['title'],
            'price': price,
            'category': metadata['category'],
            'material': metadata['material'],
            'color': metadata['color'],
            'ai_description': ai_description
        })

        # A NaN price is truthy and would print as "$nan"; `price == price`
        # is False only for NaN, so NaN is reported as unavailable too.
        has_price = bool(price) and price == price

        # Print result
        print(f"{i}. {metadata['title']}")
        if has_price:
            print(f"   💰 Price: ${price:.2f}")
        else:
            print("   💰 Price: Not available")
        print(f"   📊 Similarity: {result['score']:.3f}")
        print(f"   🎨 AI Description: {ai_description}")
        print()

    return formatted_results

# Test with various queries
test_queries = [
    "comfortable office chair for work",
    "dining table for family meals",
    "storage furniture for bedroom",
]

print("🚀 Testing Complete Search Pipeline:")
print("=" * 60)

# Collect the formatted hits of every query, keyed by the query text
all_results = {}
for query in test_queries:
    print(f"\n{'='*60}")
    results = complete_search_pipeline(query, top_k=3)
    all_results.update({query: results})

## 6. Model Evaluation and Performance Metrics

In [None]:
# Evaluate model performance
def evaluate_search_quality(test_cases):
    """
    Estimate search precision with a keyword-overlap heuristic.

    A result counts as relevant when its title or category shares at least
    one meaningful word with the query. Words are lower-cased, split on
    non-word characters (so ``"Table,"`` matches ``"table"``) and common
    function words such as "for" or "with" are ignored, because they would
    otherwise make unrelated products look relevant.

    Args:
        test_cases: Mapping of query string -> list of result dicts, each
            providing ``'title'`` and ``'category'``. Non-string values
            (None, NaN) are treated as having no words.

    Returns:
        Dict with ``overall_precision``, ``total_queries``,
        ``relevant_results`` and ``total_results``.
    """
    import re

    stop_words = {
        'a', 'an', 'and', 'at', 'by', 'for', 'from', 'in',
        'of', 'on', 'or', 'the', 'to', 'with',
    }

    def _keywords(text):
        # Missing values arrive as None or float NaN; they carry no words
        if not isinstance(text, str):
            return set()
        return set(re.findall(r"\w+", text.lower())) - stop_words

    print("📈 Evaluating Search Quality:")
    print("=" * 40)

    total_queries = len(test_cases)
    relevant_results = 0
    total_results = 0

    for query, results in test_cases.items():
        print(f"\nQuery: '{query}'")

        # The query's keywords are the same for every result; compute once
        query_words = _keywords(query)

        query_relevant = sum(
            1
            for result in results
            if query_words & (_keywords(result['title']) | _keywords(result['category']))
        )

        total_results += len(results)
        relevant_results += query_relevant

        precision = query_relevant / len(results) if results else 0
        print(f"  Precision: {precision:.2f} ({query_relevant}/{len(results)} relevant)")

    overall_precision = relevant_results / total_results if total_results > 0 else 0
    print(f"\n🎯 Overall Precision: {overall_precision:.2f}")
    print(f"📊 Total relevant results: {relevant_results}/{total_results}")

    return {
        'overall_precision': overall_precision,
        'total_queries': total_queries,
        'relevant_results': relevant_results,
        'total_results': total_results
    }

# Performance timing
import time

def measure_performance(query, num_runs=5):
    """
    Time the search pipeline over several runs.

    Each run encodes ``query``, retrieves the top 5 results from
    ``vector_db`` and generates a description for the best hit.

    Args:
        query: Search text to benchmark.
        num_runs: Number of timed repetitions; must be at least 1.

    Returns:
        Dict with ``avg_time``, ``min_time``, ``max_time`` and ``std_time``
        in seconds.

    Raises:
        ValueError: If ``num_runs`` is smaller than 1.
    """
    if num_runs < 1:
        # With no samples the statistics below are undefined
        raise ValueError("num_runs must be at least 1")

    times = []

    for _ in range(num_runs):
        # perf_counter is monotonic and high-resolution; time.time() follows
        # the wall clock, which can be adjusted while a run is in progress.
        start_time = time.perf_counter()

        # Simulate full search pipeline
        query_embedding = embedding_model.encode([query], convert_to_numpy=True)[0]
        results = vector_db.query(query_embedding, top_k=5)

        # Generate description for top result
        if results:
            top_result = results[0]['metadata']
            _ = generate_product_description(
                top_result['title'],
                top_result['category'],
                top_result['material'],
                top_result['color']
            )

        times.append(time.perf_counter() - start_time)

    # Plain floats keep the result easy to print and serialise
    return {
        'avg_time': float(np.mean(times)),
        'min_time': float(np.min(times)),
        'max_time': float(np.max(times)),
        'std_time': float(np.std(times))
    }

# Run evaluations
print("🔬 Running Model Evaluation:")
print("=" * 50)

# Search quality evaluation over the results collected by the pipeline test
quality_metrics = evaluate_search_quality(all_results)

# Performance evaluation
print("\n⏱️ Performance Evaluation:")
perf_metrics = measure_performance("comfortable office chair", num_runs=3)
fastest, slowest = perf_metrics['min_time'], perf_metrics['max_time']
print(f"Average search time: {perf_metrics['avg_time']:.3f} seconds")
print(f"Min/Max time: {fastest:.3f}s / {slowest:.3f}s")

# Model statistics
genai_status = 'Available' if genai_available else 'Fallback templates'
# The image cell may not have run, so check that the name exists before using it
image_status = 'Available' if 'image_features' in locals() and image_features else 'Limited'

print("\n📊 Model Statistics:")
print(f"Embedding model dimension: {embedding_model.get_sentence_embedding_dimension()}")
print(f"Vector database size: {stats['total_vectors']} products")
print(f"Generative AI model: {genai_status}")
print(f"Image processing: {image_status}")

## 7. Model Export and Deployment Preparation

In [None]:
# Save embeddings and model artifacts for deployment
import pickle
import os

# Create models directory
import json

os.makedirs('../models', exist_ok=True)

print("💾 Saving model artifacts for deployment...")

# Product embeddings as a NumPy binary file
np.save('../models/product_embeddings.npy', product_embeddings)
print("✅ Product embeddings saved")

# Product metadata: one dict per sampled product row
metadata_records = sample_products.to_dict('records')
with open('../models/product_metadata.pkl', 'wb') as f:
    pickle.dump(metadata_records, f)
print("✅ Product metadata saved")

# Vector database: the two dicts held by the in-memory store
db_snapshot = {
    'vectors': vector_db.vectors,
    'metadata': vector_db.metadata,
}
with open('../models/vector_db.pkl', 'wb') as f:
    pickle.dump(db_snapshot, f)
print("✅ Vector database saved")

# Model configuration: model names, sizes and the metrics measured above
model_config = dict(
    embedding_model=model_name,
    genai_model=genai_model_name if genai_available else None,
    embedding_dimension=embedding_model.get_sentence_embedding_dimension(),
    num_products=len(sample_products),
    performance_metrics=dict(
        precision=quality_metrics['overall_precision'],
        avg_search_time=perf_metrics['avg_time'],
    ),
    created_at=pd.Timestamp.now().isoformat(),
)

with open('../models/model_config.json', 'w') as f:
    json.dump(model_config, f, indent=2)
print("✅ Model configuration saved")

print(f"\n🎉 Model training and evaluation complete!")
print(f"📁 Artifacts saved in '../models/' directory")
print(f"🚀 Ready for deployment integration")

## 8. Model Deployment Integration Guide

In [None]:
# Generate deployment integration code.
# The template below is written verbatim to ../models/deployment_guide.py.
# It defines its own module-level `logger`, because the loader reports what it
# loaded through `logger.info` and would otherwise fail with a NameError.
deployment_code = '''
# AI Models Integration for FastAPI Backend
# Add this code to your AIModelManager class

import json
import logging
import pickle

import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

logger = logging.getLogger(__name__)

class AIModelManager:
    def __init__(self, settings):
        self.settings = settings
        self.embedding_model = None
        self.genai_model = None
        self.genai_tokenizer = None
        self.vector_db = {}
        self.product_metadata = []
        
    async def load_pretrained_models(self):
        """Load pre-trained models from training artifacts"""
        
        # Load model configuration
        with open('models/model_config.json', 'r') as f:
            config = json.load(f)
        
        # Load embedding model
        self.embedding_model = SentenceTransformer(config['embedding_model'])
        
        # Load GenAI model if available
        if config['genai_model']:
            self.genai_tokenizer = AutoTokenizer.from_pretrained(config['genai_model'])
            self.genai_model = AutoModelForSeq2SeqLM.from_pretrained(config['genai_model'])
        
        # NOTE: pickle.load can execute arbitrary code. Only load artifact
        # files that were produced by your own training run.

        # Load vector database
        with open('models/vector_db.pkl', 'rb') as f:
            db_data = pickle.load(f)
            self.vector_db = db_data
        
        # Load product metadata
        with open('models/product_metadata.pkl', 'rb') as f:
            self.product_metadata = pickle.load(f)
        
        logger.info(f"Loaded {len(self.vector_db['vectors'])} product vectors")
        logger.info(f"Model performance: {config['performance_metrics']}")
        
        return True

# Usage in main.py:
# 1. Replace the embedding generation code with:
#    await ai_manager.load_pretrained_models()
# 
# 2. The vector search will work immediately with the loaded data
#
# 3. All search functionality is ready for production use
'''

with open('../models/deployment_guide.py', 'w', encoding='utf-8') as f:
    f.write(deployment_code)

print("📚 Deployment Guide Generated:")
print("=" * 40)
print("1. Copy model artifacts to your backend/models/ directory")
print("2. Update AIModelManager with the deployment code")
print("3. Call load_pretrained_models() instead of training from scratch")
print("4. Your search API will be ready with trained embeddings!")
print("\n✨ Training complete and ready for deployment!")