## ⚠️ Proper Evaluation with Train/Test Split

In [1]:
# Proper train/test split evaluation
from matchmaker import MatchingEngine
import cudf
import numpy as np
from tqdm import tqdm

# 1. Load original data and split by timestamp
# The full interaction log is read with cuDF, so `raw_data` is a cudf.DataFrame.
print("Loading original interaction data...")
raw_data = cudf.read_csv("data/swipes_clean.csv")

# Sort by timestamp and split 80/20
# Chronological split: after sorting, the first 80% of rows form the training
# set and the remaining 20% the held-out test set, so every test interaction
# sorts at or after every training interaction.
# NOTE(review): assumes the 'timestamp' column sorts chronologically as read
# from CSV (e.g. ISO-8601 strings or numeric epochs) — confirm.
raw_data = raw_data.sort_values('timestamp')
split_idx = int(len(raw_data) * 0.8)

train_data = raw_data.iloc[:split_idx]
test_data = raw_data.iloc[split_idx:]

print(f"Train set: {len(train_data):,} interactions")
print(f"Test set:  {len(test_data):,} interactions")

# Save splits temporarily
# The engine is fed a file path below, so the training split is round-tripped
# through CSV. Only the train file is read back in this cell; later cells use
# the in-memory `test_data` frame directly.
train_data.to_csv("/tmp/train_split.csv", index=False)
test_data.to_csv("/tmp/test_split.csv", index=False)

# 2. Build a NEW engine on ONLY the training data
# `engine_test` must never see the test split, otherwise the held-out metrics
# computed in the following cells would be contaminated.
print("\n🔄 Training model on 80% of data...")
engine_test = MatchingEngine()
engine_test.load_interactions("/tmp/train_split.csv",
    decider_col='decidermemberid',
    other_col='othermemberid', 
    like_col='like', 
    timestamp_col='timestamp',
    gender_col='decidergender')
# Pipeline stages run in this order; each is a MatchingEngine method whose
# internals are not visible in this notebook.
engine_test.run_engagement()
engine_test.run_elo()
engine_test.build_recommender()
print("✅ Training complete on train set")

  from .autonotebook import tqdm as notebook_tqdm


Loading original interaction data...
Train set: 7,862,310 interactions
Test set:  1,965,578 interactions
Train set: 7,862,310 interactions
Test set:  1,965,578 interactions

🔄 Training model on 80% of data...
Reading data... 
🔄 Training model on 80% of data...
Reading data... ✅
Fitting ALS... 
🚀 Preparing data...
✅
Fitting ALS... 
🚀 Preparing data...
🎯 Training male→female ALS...
🎯 Training male→female ALS...


100%|██████████| 15/15 [00:00<00:00, 16.93it/s]
100%|██████████| 15/15 [00:00<00:00, 16.93it/s]


🎯 Training female→male ALS...


100%|██████████| 15/15 [00:00<00:00, 309.95it/s]



🔄 Converting factors to CuPy arrays...
✅ Trained M2F ALS with 31134 males × 32994 females
✅ Trained F2M ALS with 9925 females × 38446 males
Complete! ✅
User DF updated ✅
User DF updated ✅
User DF updated ✅
Building FAISS recommender (pop)... User DF updated ✅
Building FAISS recommender (pop)... ✅
✅ Training complete on train set
✅
✅ Training complete on train set


---

In [None]:
def evaluate_mutual_compatibility(engine_test, test_data, gender='M', k=100):
    """
    Evaluate one-sided and mutual hit rates on the held-out test set.

    A "hit" is a user who liked (in the test set) at least one of their top-k
    recommendations. A "mutual hit" is a hit where the liked person also liked
    the user back in the test set. Match recall is computed on (user, other)
    pairs: the number of mutual pairs recovered in the top-k lists divided by
    the number of mutual pairs present in the test set.

    Parameters:
    -----------
    engine_test : MatchingEngine
        The trained recommendation engine. Only ``user_df`` (columns
        ``user_id`` and ``gender``) and ``recommend_batch`` are used here.
    test_data : cudf.DataFrame
        Test set interactions with ``decidermemberid``, ``othermemberid`` and
        ``like`` columns.
    gender : str
        'M' for males viewing females; any other value is treated as females
        viewing males.
    k : int
        Number of recommendations to generate per user

    Returns:
    --------
    dict or None
        Evaluation results including metrics and recommendations, or None when
        no user of the requested gender has held-out likes. ``mutual_hits`` and
        ``mutual_matches`` are per-user (at most one entry per user), while
        ``mutual_pairs_found`` and ``recall_of_matches`` are per-pair.
    """
    from tqdm import tqdm

    print(f"\n📊 Evaluating MUTUAL compatibility - {gender} users viewing {'females' if gender=='M' else 'males'}...\n")

    # ⚡ KEEP ON GPU: filter likes, fetch the gender mapping and join them on GPU
    test_likes = test_data[test_data['like'] == 1][['decidermemberid', 'othermemberid']]
    user_genders_df = engine_test.user_df[['user_id', 'gender']].rename(columns={'user_id': 'decidermemberid'})
    # Left join: deciders absent from the engine's user table get a null gender
    # and therefore drop out of both gender filters below.
    test_likes_with_gender = test_likes.merge(user_genders_df, on='decidermemberid', how='left')

    def _likes_by_decider(decider_gender):
        """Map each decider of the given gender to the set of user ids they liked."""
        subset = test_likes_with_gender[test_likes_with_gender['gender'] == decider_gender]
        grouped = subset.groupby('decidermemberid')['othermemberid'].agg(list).to_pandas()
        return {decider_id: set(liked_ids) for decider_id, liked_ids in grouped.items()}

    if gender == 'M':
        viewer_gender, viewed_gender = 'M', 'F'
        label = "MALES VIEWING FEMALES"
    else:
        viewer_gender, viewed_gender = 'F', 'M'
        label = "FEMALES VIEWING MALES"

    my_likes = _likes_by_decider(viewer_gender)       # viewer -> people they liked
    their_likes = _likes_by_decider(viewed_gender)    # viewed person -> people they liked

    def _is_mutual(user_id, other_id):
        """True when `other_id` also liked `user_id` in the test set."""
        return user_id in their_likes.get(other_id, ())

    # Total mutual (user, other) pairs present in the test set
    total_mutual_matches_in_test = sum(
        1
        for user_id, liked_ids in my_likes.items()
        for other_id in liked_ids
        if _is_mutual(user_id, other_id)
    )

    # ⚡ KEEP ON GPU: Filter valid users on GPU
    user_df_test = engine_test.user_df
    valid_users_gpu = user_df_test[user_df_test['user_id'].isin(list(my_likes.keys()))]
    test_users_valid = valid_users_gpu['user_id'].to_arrow().to_pylist()

    print(f"Test users ({gender}) with held-out likes: {len(test_users_valid):,}")
    print(f"Opposite gender users who liked someone: {len(their_likes):,}")
    print(f"Total mutual matches in test set: {total_mutual_matches_in_test:,}")

    if len(test_users_valid) == 0:
        print("⚠️ No test users found")
        return None

    # Generate recommendations for ALL users
    print(f"Generating recommendations for all {len(test_users_valid):,} users...")
    recs_batch = engine_test.recommend_batch(test_users_valid, k=k)

    hits = 0                 # users with >= 1 liked recommendation
    mutual_hits = 0          # users with >= 1 mutually-liked recommendation
    mutual_pairs_found = 0   # mutually-liked (user, other) pairs inside top-k
    all_recs = []
    mutual_matches = []

    for user_id in tqdm(test_users_valid, desc=f"Evaluating {gender}"):
        # Each recommendation's first element is the recommended user id.
        # `.get` keeps a user the engine returned nothing for from aborting the
        # whole evaluation (same convention as the other evaluators in this file).
        recs = [rec[0] for rec in recs_batch.get(user_id, [])]
        all_recs.extend(recs)

        liked_and_recommended = my_likes.get(user_id, set()) & set(recs)
        if not liked_and_recommended:
            continue

        # One-sided hit (user liked someone we recommended)
        hits += 1

        # MUTUAL compatibility: the other person ALSO liked this user
        mutual_for_user = [other_id for other_id in liked_and_recommended if _is_mutual(user_id, other_id)]
        if mutual_for_user:
            mutual_hits += 1  # counted once per user
            mutual_matches.append((user_id, mutual_for_user[0]))
            mutual_pairs_found += len(mutual_for_user)

    # Calculate metrics
    hit_rate = hits / len(test_users_valid)
    mutual_hit_rate = mutual_hits / len(test_users_valid)
    personalization = len(set(all_recs)) / len(all_recs) if len(all_recs) > 0 else 0
    # Recall compares pairs with pairs. Using the per-user `mutual_hits` as the
    # numerator would cap recall below 100% whenever a user has several matches.
    recall_of_matches = mutual_pairs_found / total_mutual_matches_in_test if total_mutual_matches_in_test > 0 else 0

    print(f"\n{'='*70}")
    print(f"📊 HELD-OUT TEST SET EVALUATION (k={k}) - {label}")
    print(f"{'='*70}")
    print(f"One-Sided Hit Rate:    {hit_rate:.2%} ({hits:,}/{len(test_users_valid):,} users)")
    print(f"   ↳ User liked someone we recommended")
    print(f"\nMUTUAL Match Rate:     {mutual_hit_rate:.2%} ({mutual_hits:,}/{len(test_users_valid):,} users)")
    print(f"   ↳ Both users liked each other (TRUE compatibility!)")
    print(f"\nMatch Recall:          {recall_of_matches:.2%} ({mutual_pairs_found:,}/{total_mutual_matches_in_test:,} matches)")
    print(f"   ↳ Proportion of ALL mutual matches we found in top-{k}")
    print(f"\nPersonalization:       {personalization:.2%}")
    print(f"Unique recs:           {len(set(all_recs)):,}")
    print(f"{'='*70}")

    if mutual_hits > 0:
        print(f"\n✅ Found {mutual_pairs_found:,} mutual matches out of {total_mutual_matches_in_test:,} total in test set!")
        print(f"💡 Match recall of {recall_of_matches:.1%} means we found {recall_of_matches:.1%} of all possible matches in top-{k}!")

    return {
        'gender': gender,
        'test_users_valid': test_users_valid,
        'all_recs': all_recs,
        'hits': hits,
        'mutual_hits': mutual_hits,
        'mutual_matches': mutual_matches,
        'mutual_pairs_found': mutual_pairs_found,
        'hit_rate': hit_rate,
        'mutual_hit_rate': mutual_hit_rate,
        'personalization': personalization,
        'total_mutual_matches_in_test': total_mutual_matches_in_test,
        'recall_of_matches': recall_of_matches
    }

In [None]:
def analyse_recommendation_coverage(engine, results_dict, opposite_gender='F'):
    """
    Analyse which users are being recommended and correlate with popularity/engagement.

    Counts how often every candidate of `opposite_gender` appears in the
    recommendation lists, prints coverage statistics, draws two scatter plots
    (times recommended vs `elo_rating` and vs `engagement_score`), saves the
    figure to ``recommendation_coverage_<opposite_gender>.png`` and prints a
    recommended vs never-recommended comparison table.

    Parameters:
    -----------
    engine : MatchingEngine
        The trained recommendation engine. Its ``user_df`` must provide the
        ``user_id``, ``gender``, ``elo_rating``, ``engagement_score`` and
        ``league`` columns.
    results_dict : dict
        Results from evaluate_mutual_compatibility function; only the
        ``'all_recs'`` entry is read.
    opposite_gender : str
        'F' to analyse females being recommended (to males)
        'M' to analyse males being recommended (to females)

    Returns:
    --------
    tuple of pandas.DataFrame
        ``(candidates, recommended_users, never_recommended)``. When nobody of
        the requested gender was recommended (including when there are no such
        candidates at all) the function returns early without plotting.
    """
    from collections import Counter

    all_recs = results_dict['all_recs']

    # Get all potential candidates of specified gender (moved to host memory)
    user_df_test = engine.user_df
    candidates = user_df_test[user_df_test.gender == opposite_gender].copy().to_pandas()

    # Count how many times each candidate was recommended
    rec_counts_dict = Counter(all_recs)
    candidates['times_recommended'] = candidates['user_id'].map(lambda x: rec_counts_dict.get(x, 0))

    never_recommended = candidates[candidates['times_recommended'] == 0]
    recommended_users = candidates[candidates['times_recommended'] > 0]

    gender_label = "Female" if opposite_gender == 'F' else "Male"

    # Guard the percentage: with no candidates the share is reported as 0
    # instead of raising ZeroDivisionError.
    never_pct = len(never_recommended) / len(candidates) * 100 if len(candidates) else 0.0

    print(f"\n📊 RECOMMENDATION COVERAGE ANALYSIS ({gender_label} Users)")
    print(f"{'='*70}")
    print(f"Total {gender_label.lower()} users available: {len(candidates):,}")
    print(f"{gender_label}s recommended at least once: {len(recommended_users):,}")
    print(f"{gender_label}s NEVER recommended: {len(never_recommended):,} ({never_pct:.1f}%)")

    # Nothing to plot or compare when nobody was recommended
    if len(recommended_users) == 0:
        print(f"\n⚠️  WARNING: No {gender_label.lower()}s were recommended in this evaluation!")
        return candidates, recommended_users, never_recommended

    print(f"{'='*70}\n")

    # Plotting/statistics dependencies are only needed from here on.
    import matplotlib.pyplot as plt
    from scipy.stats import pearsonr, spearmanr

    # (column, x-axis label, title suffix, colormap, log-scaled x axis)
    panels = [
        ('elo_rating', 'elo_rating Score (Popularity)', 'Popularity Score', 'viridis', True),
        ('engagement_score', 'Engagement Score', 'Engagement Score', 'plasma', False),
    ]

    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    for ax, (column, x_label, title_suffix, cmap, log_x) in zip(axes, panels):
        ax.scatter(candidates[column], candidates['times_recommended'],
                   alpha=0.5, s=20, c=candidates['times_recommended'], cmap=cmap)
        ax.set_xlabel(x_label, fontsize=12)
        ax.set_ylabel('Times Recommended', fontsize=12)
        ax.set_title(f'{gender_label} Recommendation Frequency vs {title_suffix}', fontsize=14)
        ax.grid(True, alpha=0.3)
        if log_x:
            ax.set_xscale('log')

        # Correlation annotation; pearsonr/spearmanr need at least two points
        valid_mask = candidates[column].notna() & candidates['times_recommended'].notna()
        if valid_mask.sum() >= 2:
            pearson_corr, _ = pearsonr(candidates[valid_mask][column],
                                       candidates[valid_mask]['times_recommended'])
            spearman_corr, _ = spearmanr(candidates[valid_mask][column],
                                         candidates[valid_mask]['times_recommended'])
            ax.text(0.05, 0.95, f'Pearson: {pearson_corr:.3f}\nSpearman: {spearman_corr:.3f}',
                    transform=ax.transAxes, verticalalignment='top',
                    bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

    plt.tight_layout()
    plt.savefig(f'recommendation_coverage_{opposite_gender}.png', dpi=150, bbox_inches='tight')
    plt.show()

    # Detailed stats on never-recommended users
    if len(never_recommended) > 0:
        print(f"\n📉 NEVER-RECOMMENDED {gender_label.upper()}S ANALYSIS:")
        print(f"   • Avg elo_rating: {never_recommended['elo_rating'].mean():.6f} (vs {candidates['elo_rating'].mean():.6f} overall)")
        print(f"   • Avg Engagement: {never_recommended['engagement_score'].mean():.2f} (vs {candidates['engagement_score'].mean():.2f} overall)")
        print(f"   • League distribution:")
        for league in ['Bronze', 'Silver', 'Gold', 'Platinum', 'Diamond']:
            count = len(never_recommended[never_recommended['league'] == league])
            pct = count / len(never_recommended) * 100
            print(f"      - {league}: {count} ({pct:.1f}%)")

    # Compare recommended vs never-recommended users
    print(f"\n📊 COMPARISON: Recommended vs Never-Recommended {gender_label}s")
    print(f"{'='*70}")

    print(f"\n{'Metric':<25} {'Recommended':<20} {'Never Recommended':<20}")
    print(f"{'-'*65}")
    print(f"{'Count':<25} {len(recommended_users):<20} {len(never_recommended):<20}")
    print(f"{'Avg elo_rating':<25} {recommended_users['elo_rating'].mean():<20.6f} {never_recommended['elo_rating'].mean():<20.6f}")
    print(f"{'Median elo_rating':<25} {recommended_users['elo_rating'].median():<20.6f} {never_recommended['elo_rating'].median():<20.6f}")
    print(f"{'Avg Engagement':<25} {recommended_users['engagement_score'].mean():<20.2f} {never_recommended['engagement_score'].mean():<20.2f}")
    print(f"{'Median Engagement':<25} {recommended_users['engagement_score'].median():<20.2f} {never_recommended['engagement_score'].median():<20.2f}")

    return candidates, recommended_users, never_recommended

In [None]:
# Evaluate ALL males viewing females (no sampling)
# Uses the train-only engine against the held-out split, with 100
# recommendations requested per user.
results_males = evaluate_mutual_compatibility(engine_test, test_data, gender='M', k=100)

In [None]:
# Evaluate females viewing males
# Same engine, split and k as the male run, so the two result dicts are
# directly comparable.
results_females = evaluate_mutual_compatibility(engine_test, test_data, gender='F', k=100)

---

In [None]:
def test_case_statistics(target_id: int,
                         interactions: "cudf.DataFrame",
                         engine,
                         rec_k: int = 30) -> None:
    
    import pandas as pd
    import numpy as np

    print(f"### Test case statistics for user {target_id} ###\n")

    decider_df = interactions[interactions["decidermemberid"] == target_id]
    total_decisions = len(decider_df)
    like_rate = float(decider_df["like"].mean()) if total_decisions else 0.0

    likes_given = decider_df[decider_df["like"] == 1]
    likes_received = interactions[
        (interactions["othermemberid"] == target_id) & (interactions["like"] == 1)
    ]

    matches = likes_given.merge(
        likes_received,
        left_on="othermemberid",
        right_on="decidermemberid",
        how="inner",
    )
    match_rate = len(matches) / total_decisions if total_decisions else 0.0
    match_rate_given_likes = len(matches) / len(likes_given) if len(likes_given) else 0.0

    print(f"Like rate: {like_rate:.2%}")
    print(f"Match rate: {match_rate:.2%}")
    print(f"Match rate given likes: {match_rate_given_likes:.2%}\n")

    recs = engine.recommend_batch([target_id], k=rec_k).get(target_id, [])
    if not recs:
        print("No recommendations available.\n")
        return

    rec_df = pd.DataFrame(recs, columns=["candidate_id", "score"])
    liked_ids = set(likes_given["othermemberid"].to_pandas().tolist())
    rec_df["label"] = rec_df["candidate_id"].apply(lambda cid: 1 if cid in liked_ids else 0)
    
    # 🔍 DIAGNOSTIC INFO
    print(f"DEBUG INFO:")
    print(f"  Total people user liked in dataset: {len(liked_ids)}")
    print(f"  People with label=1 in top-{rec_k}: {rec_df['label'].sum()}")
    print(f"  Hit rate in top-{rec_k}: {rec_df['label'].sum() / len(liked_ids) * 100:.1f}%\n")
    
    print(rec_df.nlargest(5, "score"), end="\n\n")

    dcg = 0.0
    for rank, rel in enumerate(rec_df.nlargest(rec_k, "score")["label"], start=1):
        dcg += (2 ** rel - 1) / np.log2(rank + 1)

    ideal_labels = rec_df.nlargest(rec_k, "label")["label"]
    idcg = 0.0
    for rank, rel in enumerate(ideal_labels, start=1):
        idcg += (2 ** rel - 1) / np.log2(rank + 1)

    ndcg_30 = dcg / idcg if idcg > 0 else 0.0
    print(f"NDCG@{rec_k} for user {target_id}: {ndcg_30:.4f}\n")

In [None]:
# Spot-check one user on the held-out split with the train-only engine;
# rec_k is left at its default of 30.
test_case_statistics(1142425, test_data, engine_test)

In [None]:
# Second spot-check user, same split, engine and default rec_k=30.
test_case_statistics(336132, test_data, engine_test)

---

In [2]:
from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss, log_loss
from scipy.special import expit
import pandas as pd
import numpy as np
from tqdm import tqdm

def summarize_classification_pairs(pairs_df):
    """
    Attach pseudo-probabilities to scored pairs, print and return metrics.

    `pairs_df` needs a numeric ``score`` column and a binary ``label`` column.
    A ``proba`` column is added to a copy of the frame by z-scoring the scores
    and passing them through a sigmoid. This is a monotone squashing into
    (0, 1), not a fitted calibration, so the Brier score and log loss describe
    that transform rather than calibrated likelihoods.

    Returns ``(pairs_with_proba, metrics)``. For ``None`` or an empty frame the
    result is ``(None, {})``. If a metric raises, the frame is returned with
    whichever metrics were computed up to that point.
    """
    has_pairs = pairs_df is not None and len(pairs_df) > 0
    if not has_pairs:
        print("⚠️ No pairs generated!")
        return None, {}

    pairs_df = pairs_df.copy()

    # Standardise the raw scores (population std via NumPy), then squash.
    raw_scores = pairs_df['score'].values
    centred = raw_scores - raw_scores.mean()
    spread = raw_scores.std() + 1e-12  # epsilon protects constant scores
    pairs_df['proba'] = expit(centred / spread)  # sigmoid -> (0, 1)

    labels = pairs_df['label']
    metrics = dict(
        num_pairs=int(len(pairs_df)),
        positive_pairs=int(labels.sum()),
        negative_pairs=int((1 - labels).sum()),
        positive_rate=float(labels.mean()),
    )

    try:
        # Ranking metrics use the raw score; probabilistic ones use `proba`.
        scorers = (
            ('auc_micro', roc_auc_score, 'score'),
            ('ap_micro', average_precision_score, 'score'),
            ('brier', brier_score_loss, 'proba'),
            ('logloss', log_loss, 'proba'),
        )
        for metric_name, scorer, column in scorers:
            metrics[metric_name] = float(scorer(pairs_df['label'], pairs_df[column]))
    except Exception as e:
        print(f"❌ Error computing metrics: {e}")
        print(f"Label distribution: {pairs_df['label'].value_counts()}")
        return pairs_df, metrics

    print(f"\n{'='*70}")
    print("### ML (ALS RECOMMENDER) PERFORMANCE ON TEST SET ###")
    print(f"{'='*70}")
    print(f"Total pairs evaluated:             {metrics['num_pairs']:,}")
    print(f"Positive pairs (likes):            {metrics['positive_pairs']:,} ({metrics['positive_rate']:.2%})")
    print(f"Negative pairs (no like):          {metrics['negative_pairs']:,}")
    print(f"\nMicro ROC AUC:                     {metrics['auc_micro']:.4f}")
    print(f"Micro Average Precision (PR-AUC):  {metrics['ap_micro']:.4f}")
    print(f"Brier score:                       {metrics['brier']:.4f}")
    print(f"Log loss:                          {metrics['logloss']:.4f}")
    print(f"{'='*70}\n")

    return pairs_df, metrics

def evaluate_classification_metrics(engine, test_data, sample_size=None, k=100, random_state=None):
    """
    Evaluate classification metrics (AUC, AP, Brier, Log Loss) on test set.

    Every (user, recommended candidate) pair from the engine's top-k lists is
    labelled 1 when the user liked that candidate in `test_data`, else 0. The
    labelled pairs are then summarised by `summarize_classification_pairs`.

    Parameters:
    -----------
    engine : MatchingEngine
        The trained recommendation engine
    test_data : cudf.DataFrame
        Test set interactions
    sample_size : int, optional
        Number of users to sample for evaluation (None = all users). Values
        that are not smaller than the number of test users also evaluate
        everyone.
    k : int, optional
        Number of recommendations generated per user
    random_state : int, optional
        Seed for the user sample. None (default) draws from NumPy's global
        random state as before; pass an int for a reproducible sample.

    Returns:
    --------
    Tuple[pd.DataFrame | None, dict]: pairs with labels/scores and summary metrics
    """
    print("\n📊 CLASSIFICATION METRICS EVALUATION ON TEST SET\n")

    test_users = test_data['decidermemberid'].unique().to_pandas().tolist()

    if sample_size and sample_size < len(test_users):
        if random_state is None:
            sampled = np.random.choice(test_users, size=sample_size, replace=False)
        else:
            # Dedicated generator: reproducible and independent of global state
            sampled = np.random.default_rng(random_state).choice(
                test_users, size=sample_size, replace=False
            )
        test_users = sampled.tolist()
        print(f"Sampled {sample_size:,} users from test set")
    else:
        print(f"Evaluating all {len(test_users):,} users in test set")

    # user id -> set of ids that user liked in the test set
    test_likes = test_data[test_data['like'] == 1][['decidermemberid', 'othermemberid']]
    liked_lists = test_likes.groupby('decidermemberid')['othermemberid'].agg(list).to_pandas()
    ground_truth = {user_id: set(liked_ids) for user_id, liked_ids in liked_lists.items()}

    print(f"Generating recommendations for {len(test_users):,} users...")
    recs_batch = engine.recommend_batch(test_users, k=k)

    no_likes = frozenset()
    pairs_list = []
    for user_id in tqdm(test_users, desc="Building pairs"):
        # Look the user's likes up once instead of once per recommendation
        liked_by_user = ground_truth.get(user_id, no_likes)
        pairs_list.extend(
            {
                'user': user_id,
                'item': candidate_id,
                'score': score,
                'label': 1 if candidate_id in liked_by_user else 0,
            }
            for candidate_id, score in recs_batch.get(user_id, [])
        )

    if not pairs_list:
        print("⚠️ No pairs generated!")
        return None, {}

    pairs_df = pd.DataFrame(pairs_list)
    return summarize_classification_pairs(pairs_df)

In [None]:
# Module-level lookup tables shared by the "seen only" evaluators below.

# Every distinct (user, item) pair that appears in the test split, i.e. each
# profile a user actually decided on. Moved to pandas so it can be merged with
# the pandas `pairs_df` produced by evaluate_classification_metrics.
impressions_df = (
    test_data[['decidermemberid', 'othermemberid']]
    .rename(columns={'decidermemberid': 'user', 'othermemberid': 'item'})
    .drop_duplicates()
    .to_pandas()
 )

# user id -> set of item ids that user liked (like == 1) in the test split.
# Users without any like in the split have no key in this dict.
likes_by_user = (
    test_data[test_data['like'] == 1][['decidermemberid', 'othermemberid']]
    .rename(columns={'decidermemberid': 'user', 'othermemberid': 'item'})
    .groupby('user')['item']
    .agg(list)
    .to_pandas()
    .apply(set)
    .to_dict()
 )

def evaluate_classification_metrics_seen_only(pairs_df):
    """
    Summarise only the recommended pairs that were real test-set impressions.

    `pairs_df` (columns ``user`` and ``item`` plus scores/labels) is
    inner-joined with the module-level ``impressions_df``. Pairs the user
    never decided on are dropped before the metrics are computed.

    Returns the ``(pairs, metrics)`` tuple of `summarize_classification_pairs`,
    or ``(None, {})`` when there is no input or no overlap.
    """
    nothing_supplied = pairs_df is None or not len(pairs_df)
    if nothing_supplied:
        print('⚠️ No recommendation pairs supplied.')
        return None, {}

    # Keep only the recommendations that overlap with logged impressions.
    seen_pairs = pairs_df.merge(impressions_df, how='inner', on=['user', 'item'])

    if not seen_pairs.empty:
        return summarize_classification_pairs(seen_pairs)

    print('⚠️ No recommendations overlapped with held-out impressions.')
    return None, {}

def matched_hits_at_k(pairs_df, k=100):
    """
    Off-policy Matched Hits@K: share of seen top-k recommendations that were liked.

    Each user's `k` highest-scored pairs are kept, then restricted to pairs
    present in the module-level ``impressions_df``. A seen pair counts as a
    hit when the item is in ``likes_by_user`` for that user.

    Returns ``(micro_rate, seen_topk, per_user)``: the pooled hit fraction, the
    seen top-k pairs with a ``matched_like`` column, and a per-user table
    (``matched_likes``, ``items_returned``, ``matched_hits_at_k``). Degenerate
    inputs return ``0.0`` together with whatever frames exist at that point.
    """
    if pairs_df is None or len(pairs_df) == 0:
        print('⚠️ No recommendation pairs supplied.')
        return 0.0, pd.DataFrame(), pd.DataFrame()

    # Best-scored pairs first within each user, then cut at k per user.
    topk = (
        pairs_df
        .sort_values(by=['user', 'score'], ascending=[True, False])
        .groupby('user', as_index=False)
        .head(k)
    )
    if topk.empty:
        print('⚠️ No recommendations available for matched hits calculation.')
        return 0.0, topk, pd.DataFrame()

    # Only pairs the user actually saw can be judged.
    seen_topk = topk.merge(impressions_df, how='inner', on=['user', 'item'])
    if seen_topk.empty:
        print('⚠️ None of the recommended pairs were observed under the logged policy.')
        return 0.0, seen_topk, pd.DataFrame()

    def _liked_flag(row):
        """1 when the row's user liked the row's item in the test set, else 0."""
        if row['item'] in likes_by_user.get(row['user'], set()):
            return 1
        return 0

    seen_topk['matched_like'] = seen_topk.apply(_liked_flag, axis=1)

    per_user = seen_topk.groupby('user')['matched_like'].agg(
        matched_likes='sum',
        items_returned='count',
    )

    num_users = len(per_user)
    if num_users == 0:
        print('⚠️ No users with seen recommendations for matched hits calculation.')
        return 0.0, seen_topk, per_user

    total_items_returned = per_user['items_returned'].sum()
    if total_items_returned:
        matched_hits = per_user['matched_likes'].sum() / total_items_returned
    else:
        matched_hits = 0.0
    per_user['matched_hits_at_k'] = per_user['matched_likes'] / per_user['items_returned']

    print(f"\n{'='*70}")
    print("### OFF-POLICY MATCHED HITS@K (SEEN IMPRESSIONS) ###")
    print(f"{'='*70}")
    print(f"Users evaluated:                  {num_users:,}")
    print(f"Total seen recommendations:       {total_items_returned:,}")
    print(f"Matched Hits@{k} (micro):          {matched_hits:.4f}")
    print(f"Median per-user matched hits:     {per_user['matched_hits_at_k'].median():.4f}")
    print(f"{'='*70}\n")

    return matched_hits, seen_topk, per_user

def logged_policy_matched_hits(interactions):
    """
    Baseline like rate of the historical (logged) policy.

    `interactions` must expose ``decidermemberid``, ``othermemberid`` and
    ``like`` columns and support ``.to_pandas()``. Each row counts as one
    impression and the ``like`` column is summed as the number of hits.

    Returns ``(micro_rate, per_user)``: total likes over total impressions,
    and one row per decider with ``impressions``, ``matched_likes`` and
    ``matched_hits_rate``. Empty input yields ``(0.0, empty DataFrame)``.
    """
    df = interactions[['decidermemberid', 'othermemberid', 'like']].to_pandas()

    if df.empty:
        print('⚠️ Interaction data is empty.')
        return 0.0, pd.DataFrame()

    # One row per decider: how many profiles they saw and how many they liked.
    by_decider = df.groupby('decidermemberid')
    per_user = pd.DataFrame({
        'impressions': by_decider['othermemberid'].count(),
        'matched_likes': by_decider['like'].sum(),
    }).reset_index()

    per_user['matched_hits_rate'] = per_user['matched_likes'] / per_user['impressions']

    total_impressions = per_user['impressions'].sum()
    total_matched_likes = per_user['matched_likes'].sum()

    if total_impressions:
        micro_rate = total_matched_likes / total_impressions
    else:
        micro_rate = 0.0

    if per_user.empty:
        median_rate = 0.0
    else:
        median_rate = per_user['matched_hits_rate'].median()

    print(f"\n{'='*70}")
    print("### LOGGED POLICY MATCHED HITS ###")
    print(f"{'='*70}")
    print(f"Users evaluated:                  {len(per_user):,}")
    print(f"Total logged impressions:         {total_impressions:,}")
    print(f"Total logged likes (matches):     {int(total_matched_likes):,}")
    print(f"Matched Hits (micro):             {micro_rate:.4f}")
    print(f"Median per-user matched hits:     {median_rate:.4f}")
    print(f"{'='*70}\n")

    return micro_rate, per_user

In [4]:
# generate the raw recommendation pairs
# sample_size only takes effect when it is smaller than the number of distinct
# test users; in the recorded run below all 30,327 users were evaluated.
pairs_df, overall_metrics = evaluate_classification_metrics(engine_test, test_data, sample_size=40000, k=100)

print()

# re-score using only impressions the user actually saw
# (unseen recommendations are dropped rather than counted as negatives)
seen_pairs_df, seen_metrics = evaluate_classification_metrics_seen_only(pairs_df)

print()

# compute matched hits@K (off-policy precision style metric)
# Takes the unfiltered pairs; the function restricts to seen impressions itself.
matched_hits, topk_matched_pairs, per_user_matched = matched_hits_at_k(pairs_df, k=100)


📊 CLASSIFICATION METRICS EVALUATION ON TEST SET

Evaluating all 30,327 users in test set
Generating recommendations for 30,327 users...
Generating recommendations for 30,327 users...


Building pairs: 100%|██████████| 30327/30327 [00:01<00:00, 25059.66it/s]




### ML (ALS RECOMMENDER) PERFORMANCE ON TEST SET ###
Total pairs evaluated:             2,442,961
Positive pairs (likes):            25,337 (1.04%)
Negative pairs (no like):          2,417,624

Micro ROC AUC:                     0.7418
Micro Average Precision (PR-AUC):  0.0252
Brier score:                       0.2530
Log loss:                          0.7781



### ML (ALS RECOMMENDER) PERFORMANCE ON TEST SET ###
Total pairs evaluated:             47,645
Positive pairs (likes):            25,337 (53.18%)
Negative pairs (no like):          22,308

Micro ROC AUC:                     0.7051
Micro Average Precision (PR-AUC):  0.6945
Brier score:                       0.2329
Log loss:                          0.6655



### ML (ALS RECOMMENDER) PERFORMANCE ON TEST SET ###
Total pairs evaluated:             47,645
Positive pairs (likes):            25,337 (53.18%)
Negative pairs (no like):          22,308

Micro ROC AUC:                     0.7051
Micro Average Precision (PR-AUC):  0.6945
B