# League vs. No-League Recommender Comparison

This notebook trains a fresh `MatchingEngine`, builds both the league-filtered and league-agnostic FAISS recommenders, and evaluates them on a held-out test split to compare mutual-match performance and coverage.

In [1]:
# Imports
from matchmaker import MatchingEngine
from matchmaker.serving import LeagueFilteredRecommender, ALSFaissRecommender
import cudf
import pandas as pd
import numpy as np
import time
from tqdm import tqdm
from typing import Dict, List, Tuple

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load data and create train/test split
DATA_PATH = 'data/swipes_clean.csv'
SPLIT_FRACTION = 0.8

print('Loading interaction data...')
raw_data = cudf.read_csv(DATA_PATH)
raw_data = raw_data.sort_values('timestamp')
split_idx = int(len(raw_data) * SPLIT_FRACTION)
train_data = raw_data.iloc[:split_idx]
test_data = raw_data.iloc[split_idx:]
print(f'Train rows: {len(train_data):,} | Test rows: {len(test_data):,}')

train_path = '/tmp/matchmaker_train_split.csv'
test_path = '/tmp/matchmaker_test_split.csv'
train_data.to_csv(train_path, index=False)
test_data.to_csv(test_path, index=False)

Loading interaction data...
Train rows: 7,862,310 | Test rows: 1,965,578


In [3]:
# Train a new MatchingEngine on the training split
engine = MatchingEngine()
engine.load_interactions(
    train_path,
    decider_col='decidermemberid',
    other_col='othermemberid',
    like_col='like',
    timestamp_col='timestamp',
    gender_col='decidergender'
)
engine.run_engagement()
engine.run_elo()
engine.build_recommender()

Reading data... ✅
Fitting ALS... 
🚀 Preparing data...
🎯 Training male→female ALS...


100%|██████████| 15/15 [00:00<00:00, 18.07it/s]


🎯 Training female→male ALS...


100%|██████████| 15/15 [00:00<00:00, 334.78it/s]


🔄 Converting factors to CuPy arrays...
✅ Trained M2F ALS with 31134 males × 32994 females
✅ Trained F2M ALS with 9925 females × 38446 males
Complete! ✅
User DF updated ✅
User DF updated ✅
Building FAISS recommender... ✅


In [4]:
# Instantiate both recommenders for comparison
league_recommender = engine.recommender
noleague_recommender = ALSFaissRecommender(als_model=engine.als_model, use_gpu=True)

# Prepare shared user metadata dictionary
if isinstance(engine.user_df, cudf.DataFrame):
    user_df_pd = engine.user_df.to_pandas()
else:
    user_df_pd = engine.user_df.copy()

user_metadata = {
    row['user_id']: {
        'gender': row['gender'],
        'league': row.get('league'),
    }
    for _, row in user_df_pd.iterrows()
}

In [5]:
def evaluate_mutual_compatibility(recommender, test_data, gender='M', k=100, max_users=None):
    """Evaluate mutual compatibility for a given recommender on the held-out test set."""
    import cudf

    rec_type = "league-filtered" if isinstance(recommender, LeagueFilteredRecommender) else "no-league"
    print(f"\nEvaluating {rec_type} recommender for {gender} users...")

    # Step 1: collect positive interactions from the test split on GPU
    test_likes = test_data[test_data['like'] == 1][['decidermemberid', 'othermemberid']]

    # Step 2: pull gender information from engine.user_df (stay in cuDF)
    if not isinstance(engine.user_df, cudf.DataFrame):
        user_df_gpu = cudf.DataFrame.from_pandas(engine.user_df[['user_id', 'gender']])
    else:
        user_df_gpu = engine.user_df[['user_id', 'gender']]

    gender_df = user_df_gpu.rename(columns={'user_id': 'decidermemberid'})
    test_likes_with_gender = test_likes.merge(gender_df, on='decidermemberid', how='left')

    # Step 3: build dictionaries of likes for deciders and receivers
    if gender == 'M':
        my_likes_series = (test_likes_with_gender[test_likes_with_gender['gender'] == 'M']
                           .groupby('decidermemberid')['othermemberid']
                           .agg(list))
        their_likes_series = (test_likes_with_gender[test_likes_with_gender['gender'] == 'F']
                              .groupby('decidermemberid')['othermemberid']
                              .agg(list))
        viewing_label = 'males viewing females'
    else:
        my_likes_series = (test_likes_with_gender[test_likes_with_gender['gender'] == 'F']
                           .groupby('decidermemberid')['othermemberid']
                           .agg(list))
        their_likes_series = (test_likes_with_gender[test_likes_with_gender['gender'] == 'M']
                              .groupby('decidermemberid')['othermemberid']
                              .agg(list))
        viewing_label = 'females viewing males'

    # Convert grouped lists to compact Python structures (only once per user)
    def _series_to_dict(list_series: cudf.Series) -> Dict[int, set]:
        if list_series.empty:
            return {}
        keys = list_series.index.values_host.tolist()
        values = list_series.to_arrow().to_pylist()
        return {int(k): set(map(int, v)) for k, v in zip(keys, values)}

    my_likes = _series_to_dict(my_likes_series)
    their_likes = _series_to_dict(their_likes_series)

    # Step 4: compute total mutual matches in the test set
    total_mutual_matches = 0
    for uid, liked_set in my_likes.items():
        for other_id in liked_set:
            if uid in their_likes.get(other_id, set()):
                total_mutual_matches += 1

    # Step 5: identify evaluation users using GPU set membership
    eval_user_ids_gpu = cudf.Series(list(my_likes.keys()), dtype='int64')
    valid_users_gpu = eval_user_ids_gpu[eval_user_ids_gpu.isin(user_df_gpu['user_id'])]
    eval_user_ids = valid_users_gpu.values_host.tolist()

    if max_users is not None:
        eval_user_ids = eval_user_ids[:max_users]

    if not eval_user_ids:
        print('No eligible users found in test set.')
        return None

    # Step 6: gather metadata for these users (metadata dict may be on CPU already)
    eval_metadata = {uid: user_metadata.get(uid, {}) for uid in eval_user_ids}

    # Step 7: generate recommendations and measure runtime
    start = time.perf_counter()
    recs_batch = recommender.recommend_batch(eval_user_ids, eval_metadata, k=k)
    runtime = time.perf_counter() - start

    # Step 8: compute metrics on CPU-sized outputs
    hits = 0
    mutual_hits = 0
    all_recs: List[int] = []
    mutual_pairs: List[Tuple[int, int]] = []

    for uid in tqdm(eval_user_ids, desc=f'Checking hits for {viewing_label}'):
        recs = recs_batch.get(uid, [])
        candidate_ids = [candidate for candidate, _ in recs]
        all_recs.extend(candidate_ids)
        actual_likes = my_likes.get(uid, set())
        recommended_set = set(candidate_ids)
        overlap = actual_likes & recommended_set
        if overlap:
            hits += 1
            for other_id in overlap:
                if uid in their_likes.get(other_id, set()):
                    mutual_hits += 1
                    mutual_pairs.append((uid, other_id))
                    break

    n_users = len(eval_user_ids)
    hit_rate = hits / n_users
    mutual_hit_rate = mutual_hits / n_users if n_users else 0
    personalization = len(set(all_recs)) / len(all_recs) if all_recs else 0
    recall_of_matches = mutual_hits / total_mutual_matches if total_mutual_matches else 0

    return {
        'gender': gender,
        'k': k,
        'users_evaluated': n_users,
        'hit_rate': hit_rate,
        'mutual_hit_rate': mutual_hit_rate,
        'match_recall': recall_of_matches,
        'personalization': personalization,
        'unique_recs': len(set(all_recs)),
        'total_recs': len(all_recs),
        'runtime_seconds': runtime,
        'mutual_pairs': mutual_pairs,
    }


In [6]:
# Run evaluations for both recommenders
K = 100
MAX_USERS = 20000  # cap evaluation users for practicality

league_results_m = evaluate_mutual_compatibility(league_recommender, test_data, gender='M', k=K, max_users=MAX_USERS)
noleague_results_m = evaluate_mutual_compatibility(noleague_recommender, test_data, gender='M', k=K, max_users=MAX_USERS)

league_results_f = evaluate_mutual_compatibility(league_recommender, test_data, gender='F', k=K, max_users=MAX_USERS)
noleague_results_f = evaluate_mutual_compatibility(noleague_recommender, test_data, gender='F', k=K, max_users=MAX_USERS)


Evaluating league-filtered recommender for M users...


Checking hits for males viewing females: 100%|██████████| 19123/19123 [00:00<00:00, 132766.03it/s]



Evaluating no-league recommender for M users...


Checking hits for males viewing females: 100%|██████████| 19123/19123 [00:00<00:00, 143786.71it/s]



Evaluating league-filtered recommender for F users...


Checking hits for females viewing males: 100%|██████████| 4725/4725 [00:00<00:00, 140938.64it/s]



Evaluating no-league recommender for F users...


Checking hits for females viewing males: 100%|██████████| 4725/4725 [00:00<00:00, 126775.71it/s]


In [7]:
# Summarize results side-by-side
def summarize_results(label, league_metrics, noleague_metrics):
    return {
        'segment': label,
        'league_hit_rate': league_metrics['hit_rate'],
        'noleague_hit_rate': noleague_metrics['hit_rate'],
        'league_mutual_rate': league_metrics['mutual_hit_rate'],
        'noleague_mutual_rate': noleague_metrics['mutual_hit_rate'],
        'league_recall': league_metrics['match_recall'],
        'noleague_recall': noleague_metrics['match_recall'],
        'league_personalization': league_metrics['personalization'],
        'noleague_personalization': noleague_metrics['personalization'],
        'league_runtime_sec': league_metrics['runtime_seconds'],
        'noleague_runtime_sec': noleague_metrics['runtime_seconds'],
    }

summary_df = pd.DataFrame([
    summarize_results('M→F', league_results_m, noleague_results_m),
    summarize_results('F→M', league_results_f, noleague_results_f),
]).assign(
    hit_rate_delta=lambda df: df['noleague_hit_rate'] - df['league_hit_rate'],
    mutual_rate_delta=lambda df: df['noleague_mutual_rate'] - df['league_mutual_rate'],
    recall_delta=lambda df: df['noleague_recall'] - df['league_recall'],
    personalization_delta=lambda df: df['noleague_personalization'] - df['league_personalization'],
    runtime_delta_sec=lambda df: df['noleague_runtime_sec'] - df['league_runtime_sec'],
)

summary_df.round(4)

Unnamed: 0,segment,league_hit_rate,noleague_hit_rate,league_mutual_rate,noleague_mutual_rate,league_recall,noleague_recall,league_personalization,noleague_personalization,league_runtime_sec,noleague_runtime_sec,hit_rate_delta,mutual_rate_delta,recall_delta,personalization_delta,runtime_delta_sec
0,M→F,0.4412,0.5195,0.0093,0.0132,0.0479,0.0679,0.0146,0.0101,4.2425,3.653,0.0783,0.0039,0.0199,-0.0046,-0.5895
1,F→M,0.0237,0.0322,0.0076,0.014,0.0097,0.0178,0.0321,0.0086,0.8435,0.4721,0.0085,0.0063,0.0081,-0.0234,-0.3713
