## ⚠️ Proper Evaluation with Train/Test Split

In [None]:
# Proper train/test split evaluation
from matchmaker import matchmaker
import cudf
import numpy as np
from tqdm import tqdm

# 1. Read the raw swipe log and order it chronologically so the split below
#    holds out the most recent interactions.
print("Loading original interaction data...")
raw_data = cudf.read_csv("data/swipes_clean.csv").sort_values('timestamp')

# First 80% of rows (by timestamp) -> train, remaining 20% -> test.
split_idx = int(len(raw_data) * 0.8)
train_data, test_data = raw_data.iloc[:split_idx], raw_data.iloc[split_idx:]

print(f"Train set: {len(train_data):,} interactions")
print(f"Test set:  {len(test_data):,} interactions")

# Persist both splits; the engine below is loaded from the train CSV path.
train_data.to_csv("/tmp/train_split.csv", index=False)
test_data.to_csv("/tmp/test_split.csv", index=False)

# 2. Fit a fresh engine that only ever sees the training split.
print("\n🔄 Training model on 80% of data...")
engine_test = matchmaker.MatchingEngine()
engine_test.load_interactions(
    "/tmp/train_split.csv",
    **{
        'decider_col': 'decidermemberid',
        'other_col': 'othermemberid',
        'like_col': 'like',
        'timestamp_col': 'timestamp',
        'gender_col': 'decidergender',
    },
)
# Run the pipeline stages in the same order as before: engagement,
# popularity, then the recommender build.
for pipeline_stage in (
    engine_test.run_engagement,
    engine_test.run_popularity,
    engine_test.build_recommender,
):
    pipeline_stage()

print("✅ Training complete on train set")

  from .autonotebook import tqdm as notebook_tqdm


Loading original interaction data...
Train set: 7,862,310 interactions
Test set:  1,965,578 interactions


In [None]:
# 3. Evaluate on HELD-OUT test set
print("\n📊 Evaluating on held-out test set (20% of data)...\n")

# Ground truth: for each decider, the list of members they liked in the
# test split (rows where like == 1).
test_likes_df = test_data[test_data['like'] == 1][['decidermemberid', 'othermemberid']].to_pandas()
test_likes = test_likes_df.groupby('decidermemberid')['othermemberid'].apply(list).to_dict()

# Candidate users come from the engine trained on the train split; keep the
# sampled ones that also have at least one like in the test split.
user_df_test = engine_test.user_df
eligible_users = user_df_test[user_df_test.gender == 'M'].dropna()

# Cap the sample at the size of the *filtered* frame. Capping at
# len(user_df_test) could request more rows than remain after the gender
# filter and dropna, which makes sampling without replacement fail.
sample_size = min(1000, len(eligible_users))
# NOTE: no random_state is passed, so the sampled users differ between runs.
test_users_sample = (
    eligible_users.sample(sample_size).user_id.to_arrow().to_pylist()
    if sample_size
    else []
)
test_users_valid = [u for u in test_users_sample if u in test_likes]

print(f"Test users with held-out likes: {len(test_users_valid)}")

if not test_users_valid:
    print("⚠️ No test users found - need users who appear in both train and test sets")
else:
    # Generate recommendations using model trained on train set
    recs_batch = engine_test.recommend_batch(test_users_valid, k=100)

    hits = 0
    all_recs = []

    for user_id in tqdm(test_users_valid, desc="Evaluating"):
        # First element of each recommendation entry is taken as the
        # recommended user's ID.
        recs = [rec[0] for rec in recs_batch[user_id]]
        all_recs.extend(recs)

        # A "hit" means at least one recommended user was actually liked
        # IN THE TEST SET (unseen data).
        actual = set(test_likes[user_id])
        recommended = set(recs)

        if actual & recommended:
            hits += 1

    # Calculate metrics
    hit_rate = hits / len(test_users_valid)
    # Share of distinct IDs among all recommendations handed out.
    unique_rec_count = len(set(all_recs))
    personalization = unique_rec_count / len(all_recs) if all_recs else 0

    print(f"\n{'='*70}")
    print("📊 HELD-OUT TEST SET EVALUATION (k=100)")
    print(f"{'='*70}")
    print(f"Hit Rate:        {hit_rate:.2%} ({hits}/{len(test_users_valid)} users)")
    print(f"Personalization: {personalization:.2%}")
    print(f"Unique recs:     {unique_rec_count:,}")
    print(f"{'='*70}")
    print("\n✅ This is the TRUE performance on unseen data!")

In [None]:
# Show the recommended IDs left in `recs` by the last iteration of the
# evaluation loop above; `recs` is only bound if that loop ran at least once.
recs