In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler

# --- 1. DATA SETUP: Item Inventory and Feature List ---

# Stock-related tuning constants.
MAX_STOCK = 50                  # Max stock for normalization/threshold setting
LOW_STOCK_THRESHOLD = 5         # Quantity at or below this counts as "low stock"
STOCK_ADJUSTMENT_FACTOR = 0.9   # Low stock items get 10% score reduction

# --- A. Load the item inventory ---
# The feature columns referenced below must exist in this CSV.
df_items = pd.read_csv("ready_for_ml_adjusted.csv")

# --- B. FEATURE CONSOLIDATION ---
# The raw data carries separate singular/plural flag columns for the same gem.
# Each consolidated flag is the element-wise OR of its two source columns.
GEM_COLUMN_PAIRS = {
    'has_diamond': ('diamond', 'diamonds'),
    'has_amethyst': ('amethyst', 'amethysts'),
    'has_blue_sapphire': ('sapphire', 'sapphires'),  # Assuming generic sapphire is blue
    'has_pink_sapphire': ('pink sapphire', 'pink sapphires'),
    # ... add the remaining singular/plural pairs here
}
for consolidated_col, (singular_col, plural_col) in GEM_COLUMN_PAIRS.items():
    df_items[consolidated_col] = df_items[singular_col] | df_items[plural_col]

# --- C. Final Feature List (Must be comprehensive) ---
# The order of this list fixes the column order of every feature vector
# built from it, so the item matrix and user vector must both use it.
NUMERIC_FEATURES = ['price_norm']
GEM_FEATURES = list(GEM_COLUMN_PAIRS)  # consolidated gems (add all 30+ above)
METAL_FEATURES = ['yellow gold', 'white gold', 'platinum', 'pink gold']
PRODUCT_TYPE_FEATURES = ['rings', 'earrings', 'necklaces', 'bracelets']
CONTENT_FEATURES = (
    NUMERIC_FEATURES + GEM_FEATURES + METAL_FEATURES + PRODUCT_TYPE_FEATURES
)

# --- D. Normalize Price and Create Final Item Feature Matrix ---
# Min-max scaling maps price onto [0, 1] so it is comparable to the 0/1 flags.
price_matrix = df_items[['price']]
scaler = MinMaxScaler()
scaler.fit(price_matrix)
df_items['price_norm'] = scaler.transform(price_matrix)

# Item feature matrix: one row per item, one column per content feature.
I = df_items[CONTENT_FEATURES].to_numpy()

print(f"Total features used by the model: {len(CONTENT_FEATURES)}")

Total features used by the model: 13


In [None]:
# --- 2. THEME MAPPING: Generalized Knowledge Base (T) ---
# NOTE: The weights below are examples. You must set these based on your domain knowledge.
# Every key must be a name from CONTENT_FEATURES: the reindex() call below
# discards any key that is not in that list, so a misspelled feature name
# would silently contribute nothing to the theme vector.
THEME_WEIGHTS = {
    "Formal Evening Gala": {
        'necklaces': 1.0, 'has_diamond': 0.9, 'white gold': 0.7, 'price_norm': 0.8
    },
    "Casual Beach Day": {
        'bracelets': 0.9, 'has_aquamarine': 1.0, 'yellow gold': 0.5, 'price_norm': 0.1
    },
    "Business Professional": {
        'earrings': 0.8, 'platinum': 0.9, 'price_norm': 0.5
    },
    "Romantic Date Night": {
        'rings': 0.7, 'has_pink_sapphire': 1.0, 'pink gold': 1.0, 'price_norm': 0.6
    },
    "Outdoor Adventure": {
        'bracelets': 0.7, 'has_carnelian': 0.8, 'yellow gold': 0.3, 'price_norm': 0.1
    },
    "The 5 remaining themes would be defined here": {'price_norm': 0.2},
    "Default Theme": {'price_norm': 0.1}
}

# Expand each sparse weight dict into a dense vector aligned with CONTENT_FEATURES.
theme_vectors = {}
for theme, weights in THEME_WEIGHTS.items():
    # Surface weights that cannot be applied yet (e.g. gems whose consolidated
    # column has not been added to CONTENT_FEATURES) instead of dropping them silently.
    unknown_features = [name for name in weights if name not in CONTENT_FEATURES]
    if unknown_features:
        print(f"Theme '{theme}': ignoring weights for features not in CONTENT_FEATURES: {unknown_features}")
    theme_vectors[theme] = pd.Series(weights).reindex(CONTENT_FEATURES, fill_value=0)

# --- 3. USER INPUT (Simulated JSON) ---
USER_CONTEXT = {
    "theme": "Romantic Date Night",
    "preferences_text": "I am wearing a green dress and like yellow metal.",
    "item_type_focus": "rings", # Must match a column: 'rings', 'earrings', 'necklaces', 'bracelets'
    "price_min": 1000,
    "price_max": 2000
}
# Fractional slack applied to both ends of the user's price range when filtering.
PRICE_TOLERANCE_PERCENT = 0.05

# --- 4. ML MODEL: TF-IDF Setup for Preference Text ---
# NOTE: the vocabulary is learned from item titles only. Words in the user's
# preference text that never occur in a title are out-of-vocabulary and get
# no TF-IDF weight, so they cannot influence the NLP part of the user vector.
corpus = df_items['title'].tolist()
tfidf_vectorizer = TfidfVectorizer(stop_words='english', token_pattern=r'\b\w+\b')
tfidf_vectorizer.fit(corpus)
vocab = tfidf_vectorizer.get_feature_names_out()

# Generalized mapping function to check against the CONSOLIDATED features
def map_nlp_to_features(word, score, U_nlp_vector):
    """
    Raise feature weights in ``U_nlp_vector`` for keywords found in ``word``.

    Each rule pairs a group of keywords with one feature name. If any keyword
    of a group occurs as a substring of ``word``, that feature is set to the
    larger of its current value and ``score``. The vector is updated in place
    and nothing is returned.

    Args:
        word: A single vocabulary term.
        score: Weight associated with the term.
        U_nlp_vector: Mapping with ``.get`` and item assignment
            (e.g. a pandas Series or a dict) keyed by feature name.
    """
    keyword_rules = (
        (('blue',), 'has_blue_sapphire'),
        (('yellow', 'metal'), 'yellow gold'),
        (('sparkle', 'diamond'), 'has_diamond'),
    )
    for keywords, feature in keyword_rules:
        if any(keyword in word for keyword in keywords):
            current = U_nlp_vector.get(feature, 0)
            U_nlp_vector[feature] = max(current, score)

# --- 5. BUILD COMBINED USER VECTOR (U) ---

# 5a. Preference-text component: TF-IDF weights of the user's text, projected
#     onto item features through the keyword rules in map_nlp_to_features.
user_doc = USER_CONTEXT['preferences_text']
user_tfidf_sparse = tfidf_vectorizer.transform([user_doc])
user_tfidf_vector = user_tfidf_sparse.toarray()[0]

U_nlp_vector = pd.Series(0.0, index=CONTENT_FEATURES)
for word, score in zip(vocab, user_tfidf_vector):
    if not score > 0:
        continue  # term does not occur in the user's text
    map_nlp_to_features(word, score, U_nlp_vector)

# 5b. Theme component: fall back to the default theme for unknown theme names.
U_theme_vector = theme_vectors.get(USER_CONTEXT['theme'], theme_vectors['Default Theme'])

# 5c. Explicit item-type component: weight 1.0 on the requested type column,
#     all zeros when the requested type is not a content feature.
type_focus_col = USER_CONTEXT["item_type_focus"]
U_type_vector = pd.Series(
    [1.0 if feature == type_focus_col else 0.0 for feature in CONTENT_FEATURES],
    index=CONTENT_FEATURES,
)

# 5d. Weighted blend of the three components.
# Tune Weights (Example: Theme=60%, NLP=30%, Explicit Type=10%)
ALPHA, BETA, GAMMA = 0.6, 0.3, 0.1

weighted_theme = ALPHA * U_theme_vector
weighted_nlp = BETA * U_nlp_vector
weighted_type = GAMMA * U_type_vector
U_combined = weighted_theme + weighted_nlp + weighted_type

# Reshape to a single-row 2-D array for the similarity call in the next cell.
U = U_combined.values.reshape(1, -1)
# Debug aid: uncomment to inspect the combined weights per feature.
# print("--- U Vector (Combined Weights) ---")
# print(pd.Series(U[0], index=CONTENT_FEATURES).round(3))

In [None]:
# --- 6. FILTERING AND SCORING ---

top_k = 5
RESULT_COLUMNS = ['ref', 'title', 'price', 'quantity', 'final_score']

# 6.1. Price Filtering (Hard Constraint)
# The user's range is widened by PRICE_TOLERANCE_PERCENT on both ends.
min_p = USER_CONTEXT['price_min'] * (1 - PRICE_TOLERANCE_PERCENT)
max_p = USER_CONTEXT['price_max'] * (1 + PRICE_TOLERANCE_PERCENT)

# Keep only in-stock items inside the widened price range.
df_filtered = df_items[
    (df_items['quantity'] > 0) &
    (df_items['price'] >= min_p) &
    (df_items['price'] <= max_p)
].copy()

if df_filtered.empty:
    # Do not call exit() here: inside a notebook it shuts the kernel down and
    # discards every variable. Report the situation and leave an empty result.
    print("\nNo items matched the price and stock constraints.")
    recommendations = pd.DataFrame(columns=RESULT_COLUMNS)
else:
    I_filtered = df_filtered[CONTENT_FEATURES].values

    # 6.2. Core Similarity Calculation (Content-Based Score)
    # U is a single row, so row 0 of the result holds one score per item.
    similarity_scores = cosine_similarity(U, I_filtered)[0]
    df_filtered['similarity_score'] = similarity_scores

    # 6.3. Stock Adjustment (Soft Constraint)
    # Low-stock items stay eligible but their score is scaled down.
    is_low_stock = df_filtered['quantity'] <= LOW_STOCK_THRESHOLD
    df_filtered['stock_adjustment'] = np.where(is_low_stock, STOCK_ADJUSTMENT_FACTOR, 1.0)

    # Final Score Calculation
    df_filtered['final_score'] = df_filtered['similarity_score'] * df_filtered['stock_adjustment']

    # --- 7. FINAL RECOMMENDATIONS ---
    recommendations = df_filtered.sort_values(
        by='final_score',
        ascending=False
    ).head(top_k)[RESULT_COLUMNS]

    print(f"\n--- Final Recommendations (Top {top_k}) ---")
    print(recommendations.to_string(index=False))


--- Final Recommendations (Top 5) ---
      ref                         title  price  quantity  final_score
 B4084800                     Love ring 1650.0        23     0.749078
 B4051100  Logo de Cartier wedding band 1380.0         7     0.748903
 B4079800 Maillon Panthère wedding band 1130.0        12     0.748741
 B4225800         Juste un Clou ring SM 1120.0        38     0.748735
 B4085200             Love wedding band 1070.0        11     0.748702


In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from typing import List, Dict, Optional
import json

class JewelryRecommendationSystem:
    """
    Agentic AI + E-Commerce Database Jewelry Recommendation System
    Based on the project proposal for styling and sales uplift

    Typical use: load an inventory with ``load_data`` (or build a synthetic one
    with ``create_sample_data``), call ``recommend_jewelry`` once per request,
    then inspect the accumulated request log with ``analyze_trends``.
    """

    def __init__(self, jewelry_data_path: Optional[str] = None):
        """
        Initialize the recommendation system

        Args:
            jewelry_data_path: Path to CSV file with jewelry inventory.
                When omitted, no inventory is loaded until ``load_data`` or
                ``create_sample_data`` is called.
        """
        self.jewelry_df = None
        self.recommendation_log = []

        # Define style vocabularies for consistent tagging
        self.style_tags = ['elegant', 'minimalist', 'boho', 'classic',
                          'modern', 'vintage', 'casual', 'formal']

        # Material categories from your data columns
        self.materials = ['amazonite', 'amethyst', 'aquamarines', 'aventurine',
                         'black ceramic', 'carnelian', 'ceramic', 'chrysoprase',
                         'citrine', 'coral', 'diamond', 'emeralds', 'garnets',
                         'lapis lazuli', 'malachite', 'mother_of_pearl', 'obsidians',
                         'onyx', 'pearl', 'peridots', 'pink gold', 'pink sapphire',
                         'platinum', 'rubies', 'sapphire', 'spessartite garnet',
                         'spinels', 'tsavorite garnet', 'white gold', 'yellow gold']

        # Category columns from your data
        self.categories = ['bracelets', 'earrings', 'necklaces', 'rings']

        if jewelry_data_path:
            self.load_data(jewelry_data_path)

    def load_data(self, data_path: str):
        """
        Load jewelry inventory data from CSV

        Args:
            data_path: Path of the CSV file to read.
        """
        self.jewelry_df = pd.read_csv(data_path)
        print(f"Loaded {len(self.jewelry_df)} jewelry items")

        # Preprocess: ensure quantity is numeric (unparseable values count as 0 stock)
        self.jewelry_df['quantity'] = pd.to_numeric(self.jewelry_df['quantity'], errors='coerce').fillna(0)
        # Unparseable prices become NaN; NaN fails both budget comparisons in
        # recommend_jewelry, so such rows are never recommended.
        self.jewelry_df['price'] = pd.to_numeric(self.jewelry_df['price'], errors='coerce')

    def create_sample_data(self, n_items: int = 100):
        """
        Create sample jewelry data matching your schema
        Useful for testing when you don't have the full dataset yet

        Args:
            n_items: Number of synthetic items to generate.
        """
        # Fixed seed so the generated inventory is reproducible.
        # NOTE: this reseeds NumPy's global random state.
        np.random.seed(42)

        data = {
            'ref': [f'JEWEL{str(i).zfill(4)}' for i in range(n_items)],
            'title': [self._generate_title() for _ in range(n_items)],
            'price': np.random.randint(50, 2000, n_items),
            'image': [f'https://example.com/jewelry/{i}.jpg' for i in range(n_items)],
            'quantity': np.random.randint(0, 50, n_items),
        }

        # Add category columns (one-hot encoded).
        # Each column is drawn independently, so a sample item may carry
        # no category or several at once.
        for cat in self.categories:
            data[cat] = np.random.choice([0, 1], n_items, p=[0.75, 0.25])

        # Add material columns (one-hot encoded), also drawn independently
        for mat in self.materials:
            data[mat] = np.random.choice([0, 1], n_items, p=[0.9, 0.1])

        self.jewelry_df = pd.DataFrame(data)
        print(f"Created {n_items} sample jewelry items")

    def _generate_title(self):
        """Generate a random "<adjective> <material> <item>" jewelry title"""
        adj = np.random.choice(['Classic', 'Elegant', 'Modern', 'Vintage', 'Delicate', 'Bold'])
        material = np.random.choice(['Gold', 'Silver', 'Diamond', 'Pearl', 'Gemstone'])
        item = np.random.choice(['Ring', 'Necklace', 'Earrings', 'Bracelet'])
        return f"{adj} {material} {item}"

    def extract_item_features(self, item_row):
        """
        Extract features from a jewelry item row

        Args:
            item_row: Mapping-like row (e.g. a pandas Series) keyed by column name.

        Returns: dict with category, materials, and inferred styles
        """
        features = {
            'category': None,
            'materials': [],
            'styles': []
        }

        # Extract category: the first flagged category column wins
        for cat in self.categories:
            if cat in item_row and item_row[cat] == 1:
                features['category'] = cat
                break

        # Extract materials: every flagged material column is collected
        features['materials'] = [
            mat for mat in self.materials
            if mat in item_row and item_row[mat] == 1
        ]

        # Infer styles based on materials and category
        # This is a simple rule-based approach; you can make it more sophisticated
        features['styles'] = self._infer_styles(features)

        return features

    def _infer_styles(self, features: Dict) -> List[str]:
        """
        Infer style tags from materials and category
        This is a rule-based heuristic; you can improve with ML later

        Args:
            features: Dict with optional 'materials' (list) and 'category' keys.

        Returns:
            Up to 5 unique style tags, in the order the rules produced them.
            Falls back to ['classic', 'modern'] when no rule fires.
        """
        styles = []
        materials = features.get('materials', [])

        # Material-based style inference
        if 'diamond' in materials or 'platinum' in materials:
            styles.extend(['elegant', 'formal', 'classic'])
        if 'pearl' in materials:
            styles.extend(['elegant', 'vintage'])
        if 'yellow gold' in materials or 'white gold' in materials:
            styles.extend(['classic', 'modern'])
        if 'pink gold' in materials:
            styles.extend(['modern', 'minimalist'])
        # NOTE: 'turquoise' is not in self.materials, so extract_item_features
        # never reports it; it only matters for callers passing their own dict.
        if any(stone in materials for stone in ['turquoise', 'carnelian', 'malachite']):
            styles.append('boho')

        # Category-based style inference
        category = features.get('category')
        if category == 'rings':
            styles.append('formal')
        elif category == 'bracelets':
            styles.extend(['casual', 'boho'])
        elif category == 'necklaces':
            styles.extend(['elegant', 'classic'])

        # Remove duplicates while keeping rule order. A plain set() has no
        # stable order for strings across runs, which made both the displayed
        # order and the styles surviving the cut below vary between runs.
        styles = list(dict.fromkeys(styles))
        if not styles:
            styles = ['classic', 'modern']

        return styles[:5]  # Limit to top 5 styles

    def recommend_jewelry(self,
                         budget_min: float,
                         budget_max: float,
                         style_preferences: Optional[List[str]] = None,
                         event: Optional[str] = None,
                         category: Optional[str] = None,
                         material_preferences: Optional[List[str]] = None,
                         top_k: int = 6,
                         session_id: Optional[str] = None) -> List[Dict]:
        """
        Main recommendation function implementing the scoring logic from your proposal

        Items are hard-filtered (in stock, within budget, optional category and
        material), then scored as ``style_overlap + 0.5 * price_fit`` and the
        ``top_k`` highest-scoring items are returned and logged.

        Args:
            budget_min: Minimum price
            budget_max: Maximum price
            style_preferences: List of style tags (e.g., ['minimalist', 'modern'])
            event: Event type (e.g., 'wedding', 'casual'); only recorded in the
                log, it does not affect filtering or scoring
            category: Jewelry category (e.g., 'rings', 'necklaces')
            material_preferences: List of preferred materials; an item matches
                when it has at least one of them
            top_k: Number of recommendations to return
            session_id: Session identifier for logging

        Returns:
            List of recommended items with scores and explanations
            (empty, and nothing is logged, when no item passes the filters)

        Raises:
            ValueError: If no inventory has been loaded yet.
        """
        if self.jewelry_df is None:
            raise ValueError("No data loaded. Call load_data() or create_sample_data() first.")

        if session_id is None:
            session_id = f"session_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

        # STEP 1: Apply hard filters
        inventory = self.jewelry_df

        # Filter 1: Must be in stock
        # Filter 2: Must be within budget (inclusive on both ends)
        filtered_df = inventory[
            (inventory['quantity'] > 0) &
            (inventory['price'] >= budget_min) &
            (inventory['price'] <= budget_max)
        ]

        # Filter 3: Category filter (if specified)
        if category:
            if category in self.categories and category in filtered_df.columns:
                filtered_df = filtered_df[filtered_df[category] == 1]
            else:
                # Previously ignored without notice, which returned items of
                # every category; keep that result but tell the caller.
                print(f"Unknown category '{category}'; category filter skipped.")

        # Filter 4: Material filter (if specified) - keep items having ANY
        # of the requested materials; names without a matching column are skipped
        if material_preferences:
            material_mask = pd.Series(False, index=filtered_df.index, dtype=bool)
            for material in material_preferences:
                if material in filtered_df.columns:
                    material_mask |= (filtered_df[material] == 1)
            filtered_df = filtered_df[material_mask]

        if len(filtered_df) == 0:
            print("No items match your filters. Try broadening your criteria.")
            return []

        # STEP 2: Score and rank items
        recommendations = []
        budget_mid = (budget_min + budget_max) / 2
        budget_range = budget_max - budget_min
        preferred_styles = set(style_preferences or [])

        for idx, row in filtered_df.iterrows():
            # Extract item features
            item_features = self.extract_item_features(row)
            item_styles = item_features['styles']

            # Calculate style fit (overlap count)
            style_overlap = len(preferred_styles & set(item_styles))

            # Calculate price fit (favor items near budget midpoint).
            # Items are already inside the budget, so this lies in [0.5, 1];
            # a zero-width budget gives every item a perfect fit.
            price_diff = abs(row['price'] - budget_mid)
            price_fit = 1 - (price_diff / budget_range) if budget_range > 0 else 1
            price_fit = max(0, price_fit)  # Ensure non-negative

            # Final score: overlap + 0.5 × price_fit (from your proposal)
            final_score = style_overlap + (0.5 * price_fit)

            # Build recommendation object
            recommendation = {
                'ref': row['ref'],
                'title': row['title'],
                'price': float(row['price']),
                'image': row['image'],
                'quantity': int(row['quantity']),
                'category': item_features['category'],
                'materials': item_features['materials'],
                'styles': item_styles,
                'score': final_score,
                'style_overlap': style_overlap,
                'price_fit': price_fit,
                'explanation': self._generate_explanation(style_overlap, price_fit, item_features, style_preferences)
            }

            recommendations.append(recommendation)

        # STEP 3: Sort by score and return top K (stable sort: ties keep inventory order)
        recommendations.sort(key=lambda x: x['score'], reverse=True)
        top_recommendations = recommendations[:top_k]

        # STEP 4: Log recommendations
        self._log_recommendations(session_id, top_recommendations, {
            'budget_min': budget_min,
            'budget_max': budget_max,
            'style_preferences': style_preferences,
            'event': event,
            'category': category,
            'material_preferences': material_preferences
        })

        return top_recommendations

    def _generate_explanation(self, style_overlap, price_fit, features, user_styles):
        """
        Generate human-readable explanation for why item was recommended

        Args:
            style_overlap: Number of style tags shared with the user's preferences.
            price_fit: Price-fit component of the score.
            features: Item feature dict; its 'styles' list is read.
            user_styles: The user's style preferences (may be None).

        Returns:
            Explanation fragments joined with "; ".
        """
        explanations = []

        if style_overlap > 0:
            wanted = set(user_styles or [])
            # Follow the item's own style order so the wording is stable
            # between runs (set intersection has no defined order).
            matching_styles = [s for s in features['styles'] if s in wanted]
            if matching_styles:
                explanations.append(f"Matches your {', '.join(matching_styles[:2])} style")

        if price_fit > 0.7:
            explanations.append("Great value within your budget")

        if not explanations:
            explanations.append("In stock and within budget")

        return "; ".join(explanations)

    def _log_recommendations(self, session_id, recommendations, user_inputs):
        """
        Log recommendations for analytics (simulating MongoDB storage)

        Appends one entry to ``self.recommendation_log`` with the session id,
        an ISO timestamp, the raw user inputs and a ranked summary
        (ref, rank, score, price) of the returned items.
        """
        log_entry = {
            'session_id': session_id,
            'timestamp': datetime.now().isoformat(),
            'user_inputs': user_inputs,
            'recommendations': [
                {
                    'ref': rec['ref'],
                    'rank': idx + 1,
                    'score': rec['score'],
                    'price': rec['price']
                }
                for idx, rec in enumerate(recommendations)
            ],
            'num_results': len(recommendations)
        }

        self.recommendation_log.append(log_entry)

    def get_recommendation_log(self) -> List[Dict]:
        """Retrieve recommendation logs for analytics (the live list, not a copy)"""
        return self.recommendation_log

    def analyze_trends(self) -> Dict:
        """
        Analyze recommendation logs to identify trends
        Useful for merchandising team use case

        Returns:
            Dict with 'total_sessions', 'popular_styles' (top 5 as
            (style, count) pairs), 'popular_categories' ((category, count)
            pairs) and 'avg_budget' (mean of the logged budget_max values),
            or a dict with a 'message' key when nothing has been logged.
        """
        if not self.recommendation_log:
            return {"message": "No logs available yet"}

        all_inputs = [log['user_inputs'] for log in self.recommendation_log]

        # Count most requested styles.
        # The key is always present in the log but holds None when the caller
        # gave no style preferences, so a .get() default alone is not enough.
        style_counts = {}
        for inputs in all_inputs:
            for style in inputs.get('style_preferences') or []:
                style_counts[style] = style_counts.get(style, 0) + 1

        # Count most requested categories
        category_counts = {}
        for inputs in all_inputs:
            cat = inputs.get('category')
            if cat:
                category_counts[cat] = category_counts.get(cat, 0) + 1

        return {
            'total_sessions': len(self.recommendation_log),
            'popular_styles': sorted(style_counts.items(), key=lambda x: x[1], reverse=True)[:5],
            'popular_categories': sorted(category_counts.items(), key=lambda x: x[1], reverse=True),
            # Plain float keeps the result JSON-serializable
            'avg_budget': float(np.mean([inp.get('budget_max', 0) for inp in all_inputs]))
        }


# Example usage
if __name__ == "__main__":
    # Initialize system
    system = JewelryRecommendationSystem()

    system.load_data('ready_for_ml_adjusted.csv')

    # Example recommendation request
    print("\n" + "="*60)
    print("EXAMPLE: Getting recommendations for a formal wedding")
    print("="*60)

    recommendations = system.recommend_jewelry(
        budget_min=2000,
        budget_max=3000,
        style_preferences=['elegant', 'formal', 'classic'],
        event='wedding',
        category='earrings',
        material_preferences=['diamond', 'pearl'],
        top_k=6
    )

    print(f"\nFound {len(recommendations)} recommendations:\n")

    for i, rec in enumerate(recommendations, 1):
        # The numbered title comes first so that every detail line below it
        # is visually grouped under the item it belongs to.
        print(f"{i}. {rec['title']}")
        print(f"   Ref Number: {rec['ref']}")
        print(f"   Image: {rec['image']}")
        print(f"   Price: ${rec['price']}")
        print(f"   Score: {rec['score']:.2f}")
        print(f"   Styles: {', '.join(rec['styles'][:3])}")
        print(f"   Materials: {', '.join(rec['materials'][:3])}")
        print(f"   Explanation: {rec['explanation']}")
        print(f"   Stock: {rec['quantity']} available")
        print()

    # Show analytics computed from the request(s) logged above
    print("\n" + "="*60)
    print("ANALYTICS DASHBOARD")
    print("="*60)

    trends = system.analyze_trends()
    print(json.dumps(trends, indent=2))

Loaded 692 jewelry items

EXAMPLE: Getting recommendations for a formal wedding

Found 2 recommendations:

   Ref Number:  B8041800
   Image: https://www.cartier.com/content/dam/rcq/car/15/43/36/5/1543365.png
1. C de Cartier earrings
   Price: $2220.0
   Score: 2.36
   Styles: modern, elegant, vintage
   Materials: pearl, white gold
   Explanation: Matches your elegant, classic style; Great value within your budget
   Stock: 44 available

   Ref Number:  B8041700
   Image: https://www.cartier.com/content/dam/rcq/car/15/43/35/7/1543357.png
2. C de Cartier earrings
   Price: $2080.0
   Score: 1.29
   Styles: elegant, vintage, modern
   Materials: pearl, pink gold
   Explanation: Matches your elegant style
   Stock: 50 available


ANALYTICS DASHBOARD
{
  "total_sessions": 1,
  "popular_styles": [
    [
      "elegant",
      1
    ],
    [
      "formal",
      1
    ],
    [
      "classic",
      1
    ]
  ],
  "popular_categories": [
    [
      "earrings",
      1
    ]
  ],
  "avg_bu