In [None]:
# ================================================================
# PHASE 2 - RECOMMENDATION SYSTEMS
# ================================================================
# 
# Goal: Build hybrid recommendation system combining:
#   - Collaborative Filtering (user-item interactions)
#   - Content-Based Filtering (product features)
#   - Segment-aware recommendations
#
# Sections:
#   2.0 - Setup & Data Loading
#   2.1 - Collaborative Filtering (CF)
#   2.2 - Content-Based Filtering (CBF)
#   2.3 - Hybrid Recommendations
#   2.4 - Evaluation
#   2.5 - Segment-Specific Recommendations
# ================================================================

In [None]:
# ================================================================
# 2.0 - Setup & Data Loading
# ================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import normalize
from collections import defaultdict
import pickle
import warnings
from tqdm import tqdm
warnings.filterwarnings('ignore')

# Set random seed
SEED = 000
np.random.seed(SEED)

# Visualization settings
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

print("="*60)
print("PHASE 2 - RECOMMENDATION SYSTEMS")
print("="*60 + "\n")

# -----------------------------------------------
# Load Phase 0 Data (Temporal Split)
# -----------------------------------------------
print("Loading temporal split data from Phase 0...")

orders_train = pd.read_parquet('../data/processed/orders_train.parquet')
orders_val = pd.read_parquet('../data/processed/orders_val.parquet')
orders_test = pd.read_parquet('../data/processed/orders_test.parquet')

order_products_train = pd.read_parquet('../data/processed/order_products_train.parquet')
order_products_val = pd.read_parquet('../data/processed/order_products_val.parquet')
order_products_test = pd.read_parquet('../data/processed/order_products_test.parquet')

products = pd.read_parquet('../data/processed/products.parquet')
departments = pd.read_parquet('../data/processed/departments.parquet')
aisles = pd.read_parquet('../data/processed/aisles.parquet')

print(f" Loaded temporal splits:")
print(f"   Train: {len(orders_train):,} orders, {len(order_products_train):,} items")
print(f"   Val:   {len(orders_val):,} orders, {len(order_products_val):,} items")
print(f"   Test:  {len(orders_test):,} orders, {len(order_products_test):,} items")

# -----------------------------------------------
# Load Phase 1 Data (Clusters)
# -----------------------------------------------
print("\nLoading cluster assignments from Phase 1...")

user_features_clustered = pd.read_parquet('../data/processed/user_features_raw_clustered.parquet')

print(f" Loaded cluster assignments for {len(user_features_clustered):,} users")
print(f"   Clusters: {user_features_clustered['cluster'].nunique()}")
print(f"   Segment names: {user_features_clustered['segment_name'].unique().tolist()}")

# -----------------------------------------------
# Merge cluster info with orders
# -----------------------------------------------
print("\nMerging cluster information with temporal splits...")

orders_train = orders_train.merge(
    user_features_clustered[['user_id', 'cluster', 'segment_name']], 
    on='user_id', 
    how='left'
)

orders_val = orders_val.merge(
    user_features_clustered[['user_id', 'cluster', 'segment_name']], 
    on='user_id', 
    how='left'
)

orders_test = orders_test.merge(
    user_features_clustered[['user_id', 'cluster', 'segment_name']], 
    on='user_id', 
    how='left'
)

print(f" Cluster info merged with orders")

PHASE 2 - RECOMMENDATION SYSTEMS

Loading temporal split data from Phase 0...
 Loaded temporal splits:
   Train: 2,880,077 orders, 29,014,490 items
   Val:   175,072 orders, 1,830,111 items
   Test:  175,072 orders, 1,861,372 items

Loading cluster assignments from Phase 1...
 Loaded cluster assignments for 175,072 users
   Clusters: 5
   Segment names: ['Routine Snackers', 'Power Users', 'Bulk Shoppers', 'Household Essentials', 'Alcohol Enthusiasts']

Merging cluster information with temporal splits...
 Cluster info merged with orders


In [7]:
# ================================================================
# 2.1 - Collaborative Filtering (CF)
# ================================================================

print("\n" + "="*60)
print("2.1 - Collaborative Filtering")
print("="*60 + "\n")

# -----------------------------------------------
# Install surprise if needed
# -----------------------------------------------

from surprise import Dataset, Reader, SVD

print(" Surprise library imported")

# -----------------------------------------------
# Prepare data for Surprise
# -----------------------------------------------
print("\nPreparing user-item interaction data...")

# Merge train orders with products
train_interactions = order_products_train.merge(
    orders_train[['order_id', 'user_id']], 
    on='order_id'
)

print(f"Training interactions: {len(train_interactions):,}")
print(f"Unique users: {train_interactions['user_id'].nunique():,}")
print(f"Unique products: {train_interactions['product_id'].nunique():,}")

# Create rating-like data
# Use log-transformed purchase frequency
purchase_counts = train_interactions.groupby(['user_id', 'product_id']).size().reset_index(name='frequency')
purchase_counts['rating'] = np.log1p(purchase_counts['frequency'])  # log(1 + freq)

cf_data = purchase_counts[['user_id', 'product_id', 'rating']].copy()

print(f"\nRating statistics:")
print(f"  Min: {cf_data['rating'].min():.3f}")
print(f"  Max: {cf_data['rating'].max():.3f}")
print(f"  Mean: {cf_data['rating'].mean():.3f}")
print(f"  Median: {cf_data['rating'].median():.3f}")

# -----------------------------------------------
# Convert to Surprise format
# -----------------------------------------------
print("\nConverting to Surprise Dataset format...")

# Train SVD with defaults
reader = Reader(rating_scale=(cf_data['rating'].min(), cf_data['rating'].max()))

# Load data
surprise_data = Dataset.load_from_df(cf_data, reader)

# Build full trainset
trainset = surprise_data.build_full_trainset()

print(f" Trainset created:")
print(f"   Users: {trainset.n_users:,}")
print(f"   Items: {trainset.n_items:,}")
print(f"   Ratings: {trainset.n_ratings:,}")
print(f"   Sparsity: {1 - (trainset.n_ratings / (trainset.n_users * trainset.n_items)):.4f}")

# -----------------------------------------------
# Train SVD Model
# -----------------------------------------------
print("\n" + "-"*60)
print("Training SVD (Matrix Factorization) Model")
print("-"*60)

svd_model = SVD(random_state=SEED)  # All defaults
svd_model.fit(trainset)

print(" SVD model trained")

# -----------------------------------------------
# Generate Recommendations Function
# -----------------------------------------------
print("\n" + "-"*60)
print("Creating Recommendation Functions")
print("-"*60)

def get_top_n_recommendations(model, user_id, n=10, exclude_purchased=True):
    """
    Get top N product recommendations for a user
    
    Args:
        model: Trained Surprise model (SVD)
        user_id: Target user ID
        n: Number of recommendations
        exclude_purchased: Whether to exclude already purchased items
        
    Returns:
        List of (product_id, predicted_rating) tuples
    """
    # Get all product IDs
    all_products = cf_data['product_id'].unique()
    
    # Get products user has already purchased
    if exclude_purchased:
        purchased = cf_data[cf_data['user_id'] == user_id]['product_id'].unique()
        candidate_products = [p for p in all_products if p not in purchased]
    else:
        candidate_products = all_products
    
    # Predict ratings for all candidate products
    predictions = []
    for product_id in candidate_products:
        pred = model.predict(user_id, product_id)
        predictions.append((product_id, pred.est))
    
    # Sort by predicted rating (descending)
    predictions.sort(key=lambda x: x[1], reverse=True)
    
    return predictions[:n]

def get_recommendations_for_users(model, user_ids, n=10):
    """
    Get recommendations for multiple users
    
    Returns:
        Dictionary: {user_id: [(product_id, score), ...]}
    """
    recommendations = {}
    for user_id in user_ids:
        recs = get_top_n_recommendations(model, user_id, n=n)
        recommendations[user_id] = recs
    return recommendations

print(" Recommendation functions created:")
print(" - get_top_n_recommendations")
print(" - get_recommendations_for_users")

# -----------------------------------------------
# Test Recommendations
# -----------------------------------------------
print("\n" + "-"*60)
print("Testing Recommendations on Sample Users")
print("-"*60)

# Sample 3 random users
sample_users = cf_data['user_id'].sample(3, random_state=SEED).tolist()

print(f"\nSVD Recommendations:")
print("-" * 40)

for user_id in sample_users:
    recs = get_top_n_recommendations(svd_model, user_id, n=5)
    
    print(f"\nUser {user_id}:")
    print(f"  Past purchases: {cf_data[cf_data['user_id']==user_id]['product_id'].nunique()} products")
    print(f"  Top 5 recommendations:")
    
    for i, (prod_id, score) in enumerate(recs, 1):
        prod_name = products[products['product_id']==prod_id]['product_name'].values[0]
        print(f"    {i}. {prod_name} (score: {score:.3f})")

print("\n" + "="*60)
print("Section 2.1 Complete - CF Models Trained")
print("="*60)


2.1 - Collaborative Filtering

 Surprise library imported

Preparing user-item interaction data...
Training interactions: 29,014,490
Unique users: 175,072
Unique products: 49,623

Rating statistics:
  Min: 0.693
  Max: 4.595
  Mean: 1.040
  Median: 0.693

Converting to Surprise Dataset format...
 Trainset created:
   Users: 175,072
   Items: 49,623
   Ratings: 11,629,304
   Sparsity: 0.9987

------------------------------------------------------------
Training SVD (Matrix Factorization) Model
------------------------------------------------------------
 SVD model trained

------------------------------------------------------------
Creating Recommendation Functions
------------------------------------------------------------
 Recommendation functions created:
 - get_top_n_recommendations
 - get_recommendations_for_users

------------------------------------------------------------
Testing Recommendations on Sample Users
------------------------------------------------------------

SVD

In [None]:
# ================================================================
# 2.2 - Content-Based Filtering (CBF)
# ================================================================

print("\n" + "="*60)
print("2.2 - Content-Based Filtering")
print("="*60 + "\n")

from sklearn.metrics.pairwise import cosine_similarity

# -----------------------------------------------
# Build Item Profiles
# -----------------------------------------------
print("Building Item Profiles...")

# Merge product metadata
products_full = products.merge(departments, on='department_id', how='left')
products_full = products_full.merge(aisles, on='aisle_id', how='left')

print(f" Products with metadata: {len(products_full):,}")
print(f"   Departments: {products_full['department_id'].nunique()}")
print(f"   Aisles: {products_full['aisle_id'].nunique()}")

# Create feature vectors
# One-hot encode departments and aisles
dept_encoded = pd.get_dummies(products_full['department_id'], prefix='dept')
aisle_encoded = pd.get_dummies(products_full['aisle_id'], prefix='aisle')

# Combine features
item_profile = pd.concat([
    products_full[['product_id']],
    dept_encoded,
    aisle_encoded
], axis=1)

print(f"\nItem Profiles:")
print(f"  Products: {len(item_profile):,}")
print(f"  Features: {item_profile.shape[1] - 1}")

# -----------------------------------------------
# Create User Profiles (Weighted Item Features)
# -----------------------------------------------
print("\n" + "-"*60)
print("Creating User Profiles")
print("-"*60)

def create_user_profile(user_id, purchase_counts, item_features):
    """
    Create weighted user profile based on purchase history
    
    Args:
        user_id: Target user ID
        purchase_counts: DataFrame with [user_id, product_id, frequency]
        item_features: Items profiles (product_id + features)
        
    Returns:
        User profile vector (weighted average of item features)
    """
    # Get user's purchases
    user_purchases = purchase_counts[purchase_counts['user_id'] == user_id]
    
    if len(user_purchases) == 0:
        return None
    
    purchased_items = user_purchases['product_id'].values
    purchase_freqs = user_purchases['frequency'].values
    
    # Apply SAME log transformation as CF
    log_freqs = np.log1p(purchase_freqs)
    
    # Normalize to weights (sum to 1)
    weights = log_freqs / log_freqs.sum()
    
    # Get item feature vectors
    item_vectors = item_features.loc[
        item_features['product_id'].isin(purchased_items)
    ].drop('product_id', axis=1).values
    
    # Weighted average of item features
    user_profile = (item_vectors.T @ weights).reshape(1, -1)
    
    return user_profile

print(f" User profile function created")

# -----------------------------------------------
# CBF Recommendation Function
# -----------------------------------------------

def get_cbf_recommendations(user_id, n=10):
    """
    Get content-based recommendations using user profile
    
    Args:
        user_id: Target user ID
        n: Number of recommendations
        
    Returns:
        List of (product_id, score) tuples
    """
    # Create user profile
    user_profile = create_user_profile(user_id, purchase_counts, item_profile)
    
    if user_profile is None:
        return []
    
    # Get user's already purchased items
    user_products = purchase_counts[purchase_counts['user_id'] == user_id]['product_id'].values
    
    # Compute similarity between user profile and all items
    feature_matrix = item_profile.drop('product_id', axis=1).values
    similarities = cosine_similarity(user_profile, feature_matrix)[0]
    
    # Create product_id to index mapping
    idx_to_product = {idx: pid for idx, pid in enumerate(item_profile['product_id'])}

    # Get top N similar items (excluding already purchased)
    recommendations = []
    for idx, score in enumerate(similarities):
        product_id = idx_to_product[idx]
        if product_id not in user_products:
            recommendations.append((product_id, score))
    
    # Sort by score and return top N
    recommendations.sort(key=lambda x: x[1], reverse=True)
    return recommendations[:n]

print(" CBF recommendation function created")

# -----------------------------------------------
# Test CBF Recommendations
# -----------------------------------------------
print("\n" + "-"*60)
print("Testing Content-Based Recommendations")
print("-"*60)

# Test on same sample users as CF
print("\nCBF Recommendations:")
print("-" * 40)

for user_id in sample_users:
    recs = get_cbf_recommendations(user_id, n=5)
    
    print(f"\nUser {user_id}:")
    print(f"  Past purchases: {purchase_counts[purchase_counts['user_id']==user_id]['product_id'].nunique()} products")
    print(f"  Top 5 recommendations:")
    
    if len(recs) == 0:
        print("    (No recommendations - user not in training data)")
    else:
        for i, (prod_id, score) in enumerate(recs, 1):
            prod_name = products[products['product_id']==prod_id]['product_name'].values[0]
            print(f"    {i}. {prod_name} (score: {score:.3f})")

print("\n" + "="*60)
print("Section 2.2 Complete - CBF with User Profiles")
print("="*60)


2.2 - Content-Based Filtering

Building Item Profiles...
 Products with metadata: 49,688
   Departments: 21
   Aisles: 134

Item Profiles:
  Products: 49,688
  Features: 155

------------------------------------------------------------
Creating User Profiles
------------------------------------------------------------
 User profile function created
 CBF recommendation function created

------------------------------------------------------------
Testing Content-Based Recommendations
------------------------------------------------------------

CBF Recommendations:
----------------------------------------

User 132637:
  Past purchases: 28 products
  Top 5 recommendations:
    1. Organic Lemons (score: 0.691)
    2. Nectarines (score: 0.691)
    3. Cantaloupe (score: 0.691)
    4. Red Seedless Grapes Imported (score: 0.691)
    5. Mini Watermelon (score: 0.691)

User 160970:
  Past purchases: 52 products
  Top 5 recommendations:
    1. Organic Lemons (score: 0.655)
    2. Nectarines (s

In [64]:
# ================================================================
# 2.3 - Hybrid Recommendations
# ================================================================

# Normalization Strategy:
# - Retrieve top-400 candidates from both CF and CBF to ensure 
#   sufficient score variance (CBF assigns many tied scores due to 
#   limited product features). Normalized scores then combined via
#   weighted average.
# - Final recommendations: top-5 shown here; evaluation at k={5,10,20}
#   performed in Section 2.4
# ================================================================

print("\n" + "="*60)
print("2.3 - Hybrid Recommendations")
print("="*60 + "\n")

# -----------------------------------------------
# Hybrid Recommendation Function
# -----------------------------------------------
print("Creating hybrid recommendation system...")

def get_hybrid_recommendations(user_id, n=10, cf_weight=0.5, cbf_weight=0.5, debug=False):
    """
    Combine CF and CBF recommendations with weighted scoring
    
    Args:
        user_id: Target user ID
        n: Number of recommendations
        cf_weight: Weight for collaborative filtering (0-1)
        cbf_weight: Weight for content-based filtering (0-1)
        debug: Print diagnostic information
        
    Returns:
        List of (product_id, combined_score) tuples
    """
    # Get broader candidate set for better normalization (top-400)
    cf_recs = get_top_n_recommendations(svd_model, user_id, n=400, exclude_purchased=True)
    cbf_recs = get_cbf_recommendations(user_id, n=400)
    
    # Debug: Show raw score distributions
    if debug:
        print(f"\n{'='*60}")
        print(f"DIAGNOSTIC for User {user_id}")
        print(f"{'='*60}")
        
        if len(cf_recs) > 0:
            cf_raw = [score for _, score in cf_recs]
            print(f"\nCF Raw Scores (top {len(cf_recs)}):")
            print(f"  Min: {min(cf_raw):.4f}, Max: {max(cf_raw):.4f}")
            print(f"  Mean: {np.mean(cf_raw):.4f}, Std: {np.std(cf_raw):.4f}")
            print(f"  Range: {max(cf_raw) - min(cf_raw):.4f}")
        
        if len(cbf_recs) > 0:
            cbf_raw = [score for _, score in cbf_recs]
            print(f"\nCBF Raw Scores (top {len(cbf_recs)}):")
            print(f"  Min: {min(cbf_raw):.4f}, Max: {max(cbf_raw):.4f}")
            print(f"  Mean: {np.mean(cbf_raw):.4f}, Std: {np.std(cbf_raw):.4f}")
            print(f"  Range: {max(cbf_raw) - min(cbf_raw):.4f}")
    
    # Normalize scores to 0-1 range for fair combination
    # CF scores
    if len(cf_recs) > 0:
        cf_scores_dict = {pid: score for pid, score in cf_recs}
        cf_min = min(cf_scores_dict.values())
        cf_max = max(cf_scores_dict.values())
        cf_range = cf_max - cf_min if cf_max > cf_min else 1
        cf_scores_norm = {pid: (score - cf_min) / cf_range for pid, score in cf_scores_dict.items()}
    else:
        cf_scores_norm = {}
    
    # CBF scores
    if len(cbf_recs) > 0:
        cbf_scores_dict = {pid: score for pid, score in cbf_recs}
        cbf_min = min(cbf_scores_dict.values())
        cbf_max = max(cbf_scores_dict.values())
        cbf_range = cbf_max - cbf_min if cbf_max > cbf_min else 1
        cbf_scores_norm = {pid: (score - cbf_min) / cbf_range for pid, score in cbf_scores_dict.items()}
    else:
        cbf_scores_norm = {}
    
    # Debug: Show normalized distributions
    if debug:
        print(f"\nAfter Min-Max Normalization:")
        if len(cf_scores_norm) > 0:
            cf_norm_vals = list(cf_scores_norm.values())
            print(f"CF Normalized: Min={min(cf_norm_vals):.4f}, Max={max(cf_norm_vals):.4f}, Mean={np.mean(cf_norm_vals):.4f}")
        if len(cbf_scores_norm) > 0:
            cbf_norm_vals = list(cbf_scores_norm.values())
            print(f"CBF Normalized: Min={min(cbf_norm_vals):.4f}, Max={max(cbf_norm_vals):.4f}, Mean={np.mean(cbf_norm_vals):.4f}")
        print(f"{'='*60}\n")
    
    # Combine scores
    all_products = set(cf_scores_norm.keys()) | set(cbf_scores_norm.keys())
    
    hybrid_scores = {}
    for product_id in all_products:
        cf_score = cf_scores_norm.get(product_id, 0)
        cbf_score = cbf_scores_norm.get(product_id, 0)
        
        # Weighted combination
        hybrid_scores[product_id] = (cf_weight * cf_score) + (cbf_weight * cbf_score)
    
    # Sort by combined score and return top n
    recommendations = sorted(hybrid_scores.items(), key=lambda x: x[1], reverse=True)[:n]
    
    return recommendations

print(" Hybrid recommendation function created")

# -----------------------------------------------
# Test Different Weight Combinations
# -----------------------------------------------
print("\n" + "-"*60)
print("Testing Hybrid Recommendations with Different Weights")
print("-"*60)

# Test user
test_user = sample_users[0]

print(f"\nUser {test_user}:")
print(f"Past purchases: {cf_data[cf_data['user_id']==test_user]['product_id'].nunique()} products\n")

# Test different weight combinations
weight_configs = [
    (0.7, 0.3, "CF-Heavy"),
    (0.5, 0.5, "Balanced"),
    (0.3, 0.7, "CBF-Heavy")
]

for cf_w, cbf_w, label in weight_configs:
    print(f"{label} (CF={cf_w}, CBF={cbf_w}):")
    print("-" * 40)
    
    recs = get_hybrid_recommendations(test_user, n=5, cf_weight=cf_w, cbf_weight=cbf_w)
    
    for i, (prod_id, score) in enumerate(recs, 1):
        prod_name = products[products['product_id']==prod_id]['product_name'].values[0]
        print(f"  {i}. {prod_name} (score: {score:.3f})")
    print()

# -----------------------------------------------
# Compare All Three Approaches
# -----------------------------------------------
print("\n" + "-"*60)
print("Side-by-Side Comparison: CF vs CBF vs Hybrid")
print("-"*60)

comparison_user = sample_users[1]

print(f"\nUser {comparison_user}:")
print(f"Past purchases: {cf_data[cf_data['user_id']==comparison_user]['product_id'].nunique()} products\n")

# Get recommendations from all methods
cf_only = get_top_n_recommendations(svd_model, comparison_user, n=5)
cbf_only = get_cbf_recommendations(comparison_user, n=5)
hybrid = get_hybrid_recommendations(comparison_user, n=5, cf_weight=0.5, cbf_weight=0.5)

# Display side by side
print(f"{'CF Only':<50} | {'CBF Only':<50} | {'Hybrid (50/50)':<50}")
print("-" * 155)

for i in range(5):
    # CF
    if i < len(cf_only):
        cf_name = products[products['product_id']==cf_only[i][0]]['product_name'].values[0]
        cf_text = f"{i+1}. {cf_name[:40]}"
    else:
        cf_text = ""
    
    # CBF
    if i < len(cbf_only):
        cbf_name = products[products['product_id']==cbf_only[i][0]]['product_name'].values[0]
        cbf_text = f"{i+1}. {cbf_name[:40]}"
    else:
        cbf_text = ""
    
    # Hybrid
    if i < len(hybrid):
        hyb_name = products[products['product_id']==hybrid[i][0]]['product_name'].values[0]
        hyb_text = f"{i+1}. {hyb_name[:40]}"
    else:
        hyb_text = ""
    
    print(f"{cf_text:<50} | {cbf_text:<50} | {hyb_text:<50}")

print("\n" + "="*60)
print("Section 2.3 Complete - Hybrid System Built")
print("="*60)


2.3 - Hybrid Recommendations

Creating hybrid recommendation system...
 Hybrid recommendation function created

------------------------------------------------------------
Testing Hybrid Recommendations with Different Weights
------------------------------------------------------------

User 132637:
Past purchases: 28 products

CF-Heavy (CF=0.7, CBF=0.3):
----------------------------------------
  1. Half And Half Ultra Pasteurized (score: 0.700)
  2. Organic Lactose Free Whole Milk (score: 0.434)
  3. Bananas (score: 0.416)
  4. 1% Milkfat Low Fat Vitamin A & D Milk (score: 0.409)
  5. Organic Reduced Fat Milk (score: 0.397)

Balanced (CF=0.5, CBF=0.5):
----------------------------------------
  1. Bananas (score: 0.583)
  2. Bag of Organic Bananas (score: 0.521)
  3. Ataulfo Mango (score: 0.500)
  4. Organic Red Delicious Apples (score: 0.500)
  5. Mandarin Clementine  Bag (score: 0.500)

CBF-Heavy (CF=0.3, CBF=0.7):
----------------------------------------
  1. Bananas (score: 0.7

In [65]:
# ================================================================
# 2.4 - Evaluation of Global Models
# ================================================================
# 
# Goal: Compare 4 recommendation approaches on validation set:
#   1. Baseline (popularity-based)
#   2. Collaborative Filtering (SVD)
#   3. Content-Based Filtering (cosine similarity)
#   4. Hybrid (CF + CBF weighted combination)
#
# Metrics: Precision@K, Recall@K, F1@K for K ∈ {5, 10, 20}
# Best model selected for segment-specific training in Section 2.5
#
# Note: Due to computational constraints, evaluation performed on 
# stratified sample of 2,000 validation users (400 per segment)
# ================================================================

print("\n" + "="*60)
print("2.4 - Evaluation of Global Models")
print("="*60 + "\n")

# -----------------------------------------------
# Prepare Validation Ground Truth
# -----------------------------------------------
print("Preparing validation ground truth...")

# Merge orders with order_products to get user_id → product_id mapping
val_data = orders_val[['order_id', 'user_id', 'cluster']].merge(
    order_products_val[['order_id', 'product_id']], 
    on='order_id'
)

# Get validation ground truth (actual purchases per user)
val_ground_truth = val_data.groupby('user_id')['product_id'].apply(list).to_dict()

print(f" Total validation users: {len(val_ground_truth):,}")
print(f" Total validation purchases: {len(val_data):,}")

# -----------------------------------------------
# Stratified Sampling for Computational Efficiency
# -----------------------------------------------
print("\nPerforming stratified sampling...")

SAMPLE_SIZE = 2000
USERS_PER_SEGMENT = SAMPLE_SIZE // 5

# Sample equal number of users from each segment
val_users_df = orders_val[['user_id', 'cluster']].drop_duplicates()
val_users_sampled = (
    val_users_df.groupby('cluster', group_keys=False)
    .apply(lambda x: x.sample(min(len(x), USERS_PER_SEGMENT), random_state=SEED))
)

# Filter to users with ground truth
val_users = [u for u in val_users_sampled['user_id'].values if u in val_ground_truth]

print(f" Sampled {len(val_users):,} users for evaluation")
print(f"  Users per segment: ~{USERS_PER_SEGMENT}")
print(f"  Segment distribution:")
for cluster_id in range(5):
    count = sum(val_users_df[val_users_df['user_id'].isin(val_users)]['cluster'] == cluster_id)
    segment_name = user_features_clustered[user_features_clustered['cluster']==cluster_id]['segment_name'].iloc[0]
    print(f"    Segment {cluster_id} ({segment_name}): {count} users")

# -----------------------------------------------
# Cache User Purchase Histories
# -----------------------------------------------
print("\nCaching user purchase histories for faster processing...")

user_purchase_cache = {}
for user_id in tqdm(val_users, desc="Building cache"):
    user_purchase_cache[user_id] = set(train_interactions[train_interactions['user_id']==user_id]['product_id'])

print(f" Cached purchase histories for {len(user_purchase_cache):,} users")

# -----------------------------------------------
# Baseline: Popularity-Based Recommendations
# -----------------------------------------------
print("\n" + "-"*60)
print("Creating Baseline Model (Popularity-Based)")
print("-"*60)

# Calculate global popularity from training data
global_popularity = (train_interactions
                     .groupby('product_id')
                     .size()
                     .reset_index(name='frequency')
                     .sort_values('frequency', ascending=False))

print(f" Most popular products computed from training data")

def get_baseline_recommendations(user_id, n=10):
    """
    Baseline: Recommend most popular products globally
    (excluding user's past purchases)
    
    Args:
        user_id: Target user ID
        n: Number of recommendations
        
    Returns:
        List of product_ids
    """
    # Get user's purchase history from cache
    user_purchases = user_purchase_cache.get(user_id, set())
    
    # Filter out already purchased items
    recommendations = global_popularity[~global_popularity['product_id'].isin(user_purchases)]
    
    return list(recommendations['product_id'].head(n))

print(" Baseline recommendation function created")

# -----------------------------------------------
# Evaluation Metrics
# -----------------------------------------------
print("\n" + "-"*60)
print("Defining Evaluation Metrics")
print("-"*60)

def precision_at_k(recommended, actual, k):
    """Precision@K: Proportion of recommended items that were purchased"""
    rec_k = recommended[:k]
    relevant = set(rec_k) & set(actual)
    return len(relevant) / k if k > 0 else 0

def recall_at_k(recommended, actual, k):
    """Recall@K: Proportion of purchased items that were recommended"""
    rec_k = recommended[:k]
    relevant = set(rec_k) & set(actual)
    return len(relevant) / len(actual) if len(actual) > 0 else 0

def f1_at_k(recommended, actual, k):
    """F1@K: Harmonic mean of Precision@K and Recall@K"""
    prec = precision_at_k(recommended, actual, k)
    rec = recall_at_k(recommended, actual, k)
    return 2 * (prec * rec) / (prec + rec) if (prec + rec) > 0 else 0

print(" Evaluation metrics defined: Precision@K, Recall@K, F1@K")

# -----------------------------------------------
# Performance Profiling (First 20 Users)
# -----------------------------------------------
print("\n" + "-"*60)
print("Profiling Performance on First 20 Users")
print("-"*60)

import time

profile_users = val_users[:20]
timing_results = {
    'baseline': [],
    'cf': [],
    'cbf': [],
    'hybrid': []
}

for user_id in tqdm(profile_users, desc="Profiling"):
    actual = val_ground_truth[user_id]
    
    # Time each model
    start = time.time()
    baseline_recs = get_baseline_recommendations(user_id, n=20)
    timing_results['baseline'].append(time.time() - start)
    
    start = time.time()
    cf_recs = [pid for pid, _ in get_top_n_recommendations(svd_model, user_id, n=20, exclude_purchased=True)]
    timing_results['cf'].append(time.time() - start)
    
    start = time.time()
    cbf_recs = [pid for pid, _ in get_cbf_recommendations(user_id, n=20)]
    timing_results['cbf'].append(time.time() - start)
    
    start = time.time()
    hybrid_recs = [pid for pid, _ in get_hybrid_recommendations(user_id, n=20, cf_weight=0.5, cbf_weight=0.5)]
    timing_results['hybrid'].append(time.time() - start)

# Display timing results
print("\nTiming Summary (average per user):")
print("-" * 40)
for model_name, times in timing_results.items():
    avg_time = np.mean(times)
    print(f"  {model_name.upper():12s}: {avg_time:.3f}s")

total_avg = sum(np.mean(times) for times in timing_results.values())
print(f"\n  Total per user: {total_avg:.3f}s")
print(f"  Estimated time for {len(val_users):,} users: {(total_avg * len(val_users) / 60):.1f} minutes")

# -----------------------------------------------
# Evaluate All Models on Validation Set
# -----------------------------------------------
print("\n" + "-"*60)
print("Evaluating Models on Full Validation Sample")
print("-"*60)

# Initialize results storage
results = {
    'Baseline': {'P@5': [], 'R@5': [], 'F1@5': [], 
                 'P@10': [], 'R@10': [], 'F1@10': [],
                 'P@20': [], 'R@20': [], 'F1@20': []},
    'CF': {'P@5': [], 'R@5': [], 'F1@5': [], 
           'P@10': [], 'R@10': [], 'F1@10': [],
           'P@20': [], 'R@20': [], 'F1@20': []},
    'CBF': {'P@5': [], 'R@5': [], 'F1@5': [], 
            'P@10': [], 'R@10': [], 'F1@10': [],
            'P@20': [], 'R@20': [], 'F1@20': []},
    'Hybrid': {'P@5': [], 'R@5': [], 'F1@5': [], 
               'P@10': [], 'R@10': [], 'F1@10': [],
               'P@20': [], 'R@20': [], 'F1@20': []}
}

print(f"Evaluating on {len(val_users):,} sampled validation users...\n")

for user_id in tqdm(val_users, desc="Processing users"):
    actual = val_ground_truth[user_id]
    
    # Generate recommendations from all models
    baseline_recs = get_baseline_recommendations(user_id, n=20)
    cf_recs = [pid for pid, _ in get_top_n_recommendations(svd_model, user_id, n=20, exclude_purchased=True)]
    cbf_recs = [pid for pid, _ in get_cbf_recommendations(user_id, n=20)]
    hybrid_recs = [pid for pid, _ in get_hybrid_recommendations(user_id, n=20, cf_weight=0.5, cbf_weight=0.5)]
    
    # Evaluate at K = 5, 10, 20
    for k in [5, 10, 20]:
        for model_name, recs in [('Baseline', baseline_recs), 
                                  ('CF', cf_recs), 
                                  ('CBF', cbf_recs), 
                                  ('Hybrid', hybrid_recs)]:
            results[model_name][f'P@{k}'].append(precision_at_k(recs, actual, k))
            results[model_name][f'R@{k}'].append(recall_at_k(recs, actual, k))
            results[model_name][f'F1@{k}'].append(f1_at_k(recs, actual, k))

print(" Evaluation complete")

# -----------------------------------------------
# Aggregate Results
# -----------------------------------------------
print("\n" + "-"*60)
print("Global Model Performance (Validation Set)")
print("-"*60 + "\n")

# Compute mean metrics across sampled users
results_df = pd.DataFrame({
    'Model': [],
    'P@5': [], 'R@5': [], 'F1@5': [],
    'P@10': [], 'R@10': [], 'F1@10': [],
    'P@20': [], 'R@20': [], 'F1@20': []
})

for model_name in ['Baseline', 'CF', 'CBF', 'Hybrid']:
    row = {'Model': model_name}
    for metric in ['P@5', 'R@5', 'F1@5', 'P@10', 'R@10', 'F1@10', 'P@20', 'R@20', 'F1@20']:
        # Average metric across sampled validation users
        row[metric] = np.mean(results[model_name][metric])
    results_df = pd.concat([results_df, pd.DataFrame([row])], ignore_index=True)

# Display results
print(results_df.to_string(index=False))

# Identify best model based on average F1 score
best_f1_avg = results_df[['F1@5', 'F1@10', 'F1@20']].mean(axis=1)
best_model = results_df.loc[best_f1_avg.idxmax(), 'Model']

print(f"\n Best performing model: {best_model}")
print(f"  (Selected based on average F1 score across K ∈ {{5, 10, 20}})")

print("\n" + "="*60)
print("Section 2.4 Complete - Global Models Evaluated")
print("="*60)
print(f"\nNote: Results based on stratified sample of {len(val_users):,} users")
print("      (computational constraints, representative across all segments)")


2.4 - Evaluation of Global Models

Preparing validation ground truth...
 Total validation users: 175,072
 Total validation purchases: 1,830,111

Performing stratified sampling...
 Sampled 2,000 users for evaluation
  Users per segment: ~400
  Segment distribution:
    Segment 0 (Power Users): 400 users
    Segment 1 (Routine Snackers): 400 users
    Segment 2 (Bulk Shoppers): 400 users
    Segment 3 (Alcohol Enthusiasts): 400 users
    Segment 4 (Household Essentials): 400 users

Caching user purchase histories for faster processing...


Building cache: 100%|██████████| 2000/2000 [00:42<00:00, 47.41it/s]


 Cached purchase histories for 2,000 users

------------------------------------------------------------
Creating Baseline Model (Popularity-Based)
------------------------------------------------------------
 Most popular products computed from training data
 Baseline recommendation function created

------------------------------------------------------------
Defining Evaluation Metrics
------------------------------------------------------------
 Evaluation metrics defined: Precision@K, Recall@K, F1@K

------------------------------------------------------------
Profiling Performance on First 20 Users
------------------------------------------------------------


Profiling: 100%|██████████| 20/20 [00:26<00:00,  1.34s/it]



Timing Summary (average per user):
----------------------------------------
  BASELINE    : 0.002s
  CF          : 0.406s
  CBF         : 0.261s
  HYBRID      : 0.669s

  Total per user: 1.339s
  Estimated time for 2,000 users: 44.6 minutes

------------------------------------------------------------
Evaluating Models on Full Validation Sample
------------------------------------------------------------
Evaluating on 2,000 sampled validation users...



Processing users: 100%|██████████| 2000/2000 [2:09:33<00:00,  3.89s/it]     

 Evaluation complete

------------------------------------------------------------
Global Model Performance (Validation Set)
------------------------------------------------------------

   Model    P@5      R@5     F1@5    P@10     R@10    F1@10     P@20     R@20    F1@20
Baseline 0.0126 0.007857 0.008421 0.01055 0.014319 0.010418 0.007800 0.018921 0.009896
      CF 0.0003 0.000141 0.000192 0.00045 0.000496 0.000450 0.000475 0.001133 0.000624
     CBF 0.0006 0.000154 0.000234 0.00055 0.000419 0.000426 0.000600 0.001073 0.000692
  Hybrid 0.0017 0.001512 0.001304 0.00125 0.002029 0.001342 0.000975 0.002888 0.001319

 Best performing model: Baseline
  (Selected based on average F1 score across K ∈ {5, 10, 20})

Section 2.4 Complete - Global Models Evaluated

Note: Results based on stratified sample of 2,000 users
      (computational constraints, representative across all segments)





In [69]:
# ================================================================
# 2.5 - Train Segment-Specific Models
# ================================================================
#
# Train specialized models for each customer segment:
#   1. Segment-Specific Baseline (popularity per segment)
#   2. Segment-Specific CF (5 SVD models, one per segment)
#   3. Segment-Specific Hybrid (segment CF + global CBF)
#
# Models saved for evaluation in Phase 3 (Notebook 04)
# ================================================================

print("\n" + "="*60)
print("2.5 - Train Segment-Specific Models")
print("="*60 + "\n")

print(f"Best global model from Section 2.4: {best_model}")
print("Training segment-specific models for all approaches...\n")

# -----------------------------------------------
# Cache Training User Purchase Histories
# -----------------------------------------------
print("-"*60)
print("Caching Training User Purchase Histories")
print("-"*60)

train_user_purchase_cache = {}
all_train_users = train_interactions['user_id'].unique()

for user_id in tqdm(all_train_users, desc="Building training cache"):
    train_user_purchase_cache[user_id] = set(train_interactions[train_interactions['user_id']==user_id]['product_id'])

print(f" Cached purchase histories for {len(train_user_purchase_cache):,} training users")

# -----------------------------------------------
# 1. Segment-Specific Baseline (Popularity per Segment)
# -----------------------------------------------
print("\n" + "-"*60)
print("1. Creating Segment-Specific Baseline Models")
print("-"*60)

segment_popularity = {}

for cluster_id in range(5):
    segment_name = user_features_clustered[user_features_clustered['cluster']==cluster_id]['segment_name'].iloc[0]
    
    print(f"\nSegment {cluster_id}: {segment_name}")
    
    # Filter training data for this segment
    segment_users = user_features_clustered[user_features_clustered['cluster']==cluster_id]['user_id'].values
    segment_train = train_interactions[train_interactions['user_id'].isin(segment_users)]
    
    # Calculate popularity within this segment
    segment_pop = (segment_train
                   .groupby('product_id')
                   .size()
                   .reset_index(name='frequency')
                   .sort_values('frequency', ascending=False))
    
    segment_popularity[cluster_id] = segment_pop
    
    print(f"  Users: {len(segment_users):,}")
    print(f"  Interactions: {len(segment_train):,}")
    print(f"  Unique products: {len(segment_pop):,}")
    
    # Show top 3 popular products in this segment
    top_3 = segment_pop.head(3)
    print(f"  Top 3 popular products:")
    for idx, row in top_3.iterrows():
        prod_name = products[products['product_id']==row['product_id']]['product_name'].values[0]
        print(f"    - {prod_name} ({row['frequency']} purchases)")

print(f"\n All {len(segment_popularity)} segment-specific baseline models created")

# -----------------------------------------------
# 2. Segment-Specific CF Models
# -----------------------------------------------
print("\n" + "-"*60)
print("2. Training Segment-Specific CF Models")
print("-"*60)

from surprise import SVD, Dataset, Reader

segment_cf_models = {}

for cluster_id in range(5):
    segment_name = user_features_clustered[user_features_clustered['cluster']==cluster_id]['segment_name'].iloc[0]
    
    print(f"\nTraining CF model for Segment {cluster_id}: {segment_name}")
    print("-" * 40)
    
    # Filter training data for this segment
    segment_users = user_features_clustered[user_features_clustered['cluster']==cluster_id]['user_id'].values
    
    # Get segment interactions and calculate ratings (same as global CF)
    segment_train = train_interactions[train_interactions['user_id'].isin(segment_users)]
    segment_purchase_counts = segment_train.groupby(['user_id', 'product_id']).size().reset_index(name='frequency')
    
    # Apply log transformation (same as global CF)
    segment_purchase_counts['rating'] = np.log1p(segment_purchase_counts['frequency'])  # log(1 + freq)
    
    # Prepare data for Surprise
    segment_cf_data = segment_purchase_counts[['user_id', 'product_id', 'rating']].copy()
    
    print(f"  Users: {len(segment_users):,}")
    print(f"  Interactions: {len(segment_cf_data):,}")
    print(f"  Rating statistics:")
    print(f"    Min: {segment_cf_data['rating'].min():.3f}")
    print(f"    Max: {segment_cf_data['rating'].max():.3f}")
    print(f"    Mean: {segment_cf_data['rating'].mean():.3f}")
    
    # Build dataset (same as global CF)
    reader = Reader(rating_scale=(segment_cf_data['rating'].min(), segment_cf_data['rating'].max()))
    segment_data = Dataset.load_from_df(segment_cf_data, reader)
    segment_trainset = segment_data.build_full_trainset()
    
    # Train SVD model (same parameters as global CF)
    segment_model = SVD(random_state=SEED)
    segment_model.fit(segment_trainset)
    
    # Store model
    segment_cf_models[cluster_id] = segment_model
    
    print(f"  Model trained")

print(f"\n All {len(segment_cf_models)} segment-specific CF models trained")

# -----------------------------------------------
# 3. Segment-Specific Recommendation Functions
# -----------------------------------------------
print("\n" + "-"*60)
print("3. Creating Segment-Specific Recommendation Functions")
print("-"*60)

def get_segment_baseline_recommendations(user_id, cluster_id, n=10):
    """
    Segment-specific baseline: Recommend popular products within segment
    (excluding user's past purchases)
    
    Args:
        user_id: Target user ID
        cluster_id: User's assigned segment
        n: Number of recommendations
        
    Returns:
        List of product_ids
    """
    # Get user's purchase history from training cache
    user_purchases = train_user_purchase_cache.get(user_id, set())
    
    # Get segment-specific popularity
    segment_pop = segment_popularity[cluster_id]
    
    # Filter out already purchased items
    recommendations = segment_pop[~segment_pop['product_id'].isin(user_purchases)]
    
    return list(recommendations['product_id'].head(n))

def get_segment_cf_recommendations(user_id, cluster_id, n=10, exclude_purchased=True):
    """
    Segment-specific CF recommendations
    
    Args:
        user_id: Target user ID
        cluster_id: User's assigned segment
        n: Number of recommendations
        exclude_purchased: Whether to exclude items user already purchased
        
    Returns:
        List of (product_id, score) tuples
    """
    # Use the segment-specific CF model
    segment_model = segment_cf_models[cluster_id]
    
    # Get all product IDs from training data
    all_products = train_interactions['product_id'].unique()
    
    # Get products user has already purchased
    if exclude_purchased:
        purchased = train_user_purchase_cache.get(user_id, set())
        candidate_products = [p for p in all_products if p not in purchased]
    else:
        candidate_products = all_products
    
    # Predict ratings for all candidate products
    predictions = []
    for product_id in candidate_products:
        pred = segment_model.predict(user_id, product_id)
        predictions.append((product_id, pred.est))
    
    # Sort by predicted rating (descending)
    predictions.sort(key=lambda x: x[1], reverse=True)
    
    return predictions[:n]

def get_segment_hybrid_recommendations(user_id, cluster_id, n=10, cf_weight=0.5, cbf_weight=0.5):
    """
    Segment-specific hybrid: Segment CF + Global CBF
    
    Args:
        user_id: Target user
        cluster_id: User's assigned segment
        n: Number of recommendations
        cf_weight, cbf_weight: Weights for CF and CBF
    
    Returns:
        List of (product_id, score) tuples
    """
    # Get segment-specific CF recommendations
    cf_recs = get_segment_cf_recommendations(user_id, cluster_id, n=400, exclude_purchased=True)
    
    # Get global CBF recommendations (content features don't change per segment)
    cbf_recs = get_cbf_recommendations(user_id, n=400)
    
    # Normalize CF scores
    if len(cf_recs) > 0:
        cf_scores_dict = {pid: score for pid, score in cf_recs}
        cf_min, cf_max = min(cf_scores_dict.values()), max(cf_scores_dict.values())
        cf_range = cf_max - cf_min if cf_max > cf_min else 1
        cf_scores_norm = {pid: (score - cf_min) / cf_range for pid, score in cf_scores_dict.items()}
    else:
        cf_scores_norm = {}
    
    # Normalize CBF scores
    if len(cbf_recs) > 0:
        cbf_scores_dict = {pid: score for pid, score in cbf_recs}
        cbf_min, cbf_max = min(cbf_scores_dict.values()), max(cbf_scores_dict.values())
        cbf_range = cbf_max - cbf_min if cbf_max > cbf_min else 1
        cbf_scores_norm = {pid: (score - cbf_min) / cbf_range for pid, score in cbf_scores_dict.items()}
    else:
        cbf_scores_norm = {}
    
    # Combine
    all_products = set(cf_scores_norm.keys()) | set(cbf_scores_norm.keys())
    hybrid_scores = {
        pid: (cf_weight * cf_scores_norm.get(pid, 0)) + (cbf_weight * cbf_scores_norm.get(pid, 0))
        for pid in all_products
    }
    
    return sorted(hybrid_scores.items(), key=lambda x: x[1], reverse=True)[:n]

print(" Segment-specific recommendation functions created:")
print("  - get_segment_baseline_recommendations()")
print("  - get_segment_cf_recommendations()")
print("  - get_segment_hybrid_recommendations()")

# -----------------------------------------------
# Test Segment-Specific Recommendations
# -----------------------------------------------
print("\n" + "-"*60)
print("Testing Segment-Specific Recommendations")
print("-"*60)

# Test on a sample user from each segment
print("\nSample recommendations for one user per segment:\n")

for cluster_id in range(5):
    segment_name = user_features_clustered[user_features_clustered['cluster']==cluster_id]['segment_name'].iloc[0]
    
    # Get a sample user from this segment
    segment_users = user_features_clustered[user_features_clustered['cluster']==cluster_id]['user_id'].values
    sample_user = segment_users[0]
    
    print(f"Segment {cluster_id} ({segment_name}) - User {sample_user}:")
    print("-" * 40)
    
    # Get recommendations from segment-specific models
    baseline_recs = get_segment_baseline_recommendations(sample_user, cluster_id, n=3)
    cf_recs = [pid for pid, _ in get_segment_cf_recommendations(sample_user, cluster_id, n=3)]
    hybrid_recs = [pid for pid, _ in get_segment_hybrid_recommendations(sample_user, cluster_id, n=3)]
    
    print("  Baseline:")
    for i, pid in enumerate(baseline_recs, 1):
        prod_name = products[products['product_id']==pid]['product_name'].values[0]
        print(f"    {i}. {prod_name}")
    
    print("  CF:")
    for i, pid in enumerate(cf_recs, 1):
        prod_name = products[products['product_id']==pid]['product_name'].values[0]
        print(f"    {i}. {prod_name}")
    
    print("  Hybrid:")
    for i, pid in enumerate(hybrid_recs, 1):
        prod_name = products[products['product_id']==pid]['product_name'].values[0]
        print(f"    {i}. {prod_name}")
    
    print()

print("="*60)
print("Segment-Specific Models Trained & Tested")
print("="*60)


2.5 - Train Segment-Specific Models

Best global model from Section 2.4: Baseline
Training segment-specific models for all approaches...

------------------------------------------------------------
Caching Training User Purchase Histories
------------------------------------------------------------


Building training cache: 100%|██████████| 175072/175072 [1:15:07<00:00, 38.84it/s]


 Cached purchase histories for 175,072 training users

------------------------------------------------------------
1. Creating Segment-Specific Baseline Models
------------------------------------------------------------

Segment 0: Power Users
  Users: 78,862
  Interactions: 18,402,176
  Unique products: 43,969
  Top 3 popular products:
    - Banana (300299 purchases)
    - Bag of Organic Bananas (274348 purchases)
    - Organic Strawberries (205765 purchases)

Segment 1: Routine Snackers
  Users: 23,533
  Interactions: 2,579,429
  Unique products: 36,598
  Top 3 popular products:
    - Banana (31337 purchases)
    - Bag of Organic Bananas (26537 purchases)
    - Soda (22661 purchases)

Segment 2: Bulk Shoppers
  Users: 63,000
  Interactions: 7,310,155
  Unique products: 48,046
  Top 3 popular products:
    - Banana (88853 purchases)
    - Bag of Organic Bananas (38467 purchases)
    - Strawberries (27246 purchases)

Segment 3: Alcohol Enthusiasts
  Users: 1,906
  Interactions: 129,2

In [72]:
# ================================================================
# FINAL SAVE CHECKLIST
# ================================================================

import os

all_data = {
    # -----------------------------------------------
    # 1. TRAINED MODELS
    # -----------------------------------------------
    'global_cf_model': svd_model,
    'segment_cf_models': segment_cf_models,
    
    # -----------------------------------------------
    # 2. FEATURES & DATA STRUCTURES
    # -----------------------------------------------
    'item_profile': item_profile,
    'global_popularity': global_popularity,
    'segment_popularity': segment_popularity,
    'purchase_counts': purchase_counts,
    'train_interactions': train_interactions,
    'train_user_purchase_cache': train_user_purchase_cache,
    
    # -----------------------------------------------
    # 3. EVALUATION RESULTS
    # -----------------------------------------------
    'validation_results_df': results_df,
    'validation_raw_results': results,
    'best_model_name': best_model,
    'sample_size': len(val_users),
    
    # -----------------------------------------------
    # 4. METADATA
    # -----------------------------------------------
    'segment_names': {
        cluster_id: user_features_clustered[user_features_clustered['cluster']==cluster_id]['segment_name'].iloc[0]
        for cluster_id in range(5)
    },
    'n_clusters': 5,
    'n_train_users': train_interactions['user_id'].nunique(),
    'n_train_interactions': len(train_interactions),
    'n_products': train_interactions['product_id'].nunique(),
    'seed': SEED,
    
}

with open('../data/processed/phase2_models.pkl', 'wb') as f:
    pickle.dump(all_data, f)

print(" Models saved to: ../data/processed/phase2_models.pkl")
print(f" Total size: {os.path.getsize('../data/processed/phase2_models.pkl') / 1024 / 1024:.1f} MB")

 Models saved to: ../data/processed/phase2_models.pkl
 Total size: 2505.5 MB
