# Cross-Encoder Fine-Tuning & Evaluation Notebook

This notebook outlines the end-to-end process for fine-tuning a Sentence-Transformers `CrossEncoder`.

## 1. Install & Import Dependencies

In [2]:
import os
import random

import numpy as np
import pandas as pd
import torch
from sentence_transformers import CrossEncoder, InputExample
from sklearn.metrics import ndcg_score, roc_auc_score, average_precision_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [1]:
import os
import boto3
from botocore.config import Config

def download_from_r2(object_name, local_path, bucket_name="bookdbio"):
    """Download one object from the Cloudflare R2 bucket to a local file.

    Credentials are read from the environment instead of being embedded in the
    notebook (keys that were ever committed in a notebook should be treated as
    compromised and rotated):

    * ``R2_ACCESS_KEY_ID``     - access key id (required)
    * ``R2_SECRET_ACCESS_KEY`` - secret access key (required)
    * ``R2_ENDPOINT_URL``      - optional override of the S3-compatible endpoint

    Args:
        object_name: Key of the object inside the bucket.
        local_path: Destination path; its parent directory is created if needed.
        bucket_name: Bucket to read from.

    Raises:
        RuntimeError: If either credential variable is unset or empty.

    Download failures are reported with ``print`` and not re-raised, so a failed
    download does not abort the cell.
    """
    # Validate credentials first so a misconfigured environment fails loudly
    # before any directory is created or any network call is attempted.
    access_key_id = os.environ.get("R2_ACCESS_KEY_ID")
    secret_access_key = os.environ.get("R2_SECRET_ACCESS_KEY")
    if not access_key_id or not secret_access_key:
        raise RuntimeError(
            "R2 credentials not found: set the R2_ACCESS_KEY_ID and "
            "R2_SECRET_ACCESS_KEY environment variables before downloading."
        )
    endpoint_url = os.environ.get(
        "R2_ENDPOINT_URL",
        "https://a9a190ee80813000e18bacf626b1281b.r2.cloudflarestorage.com/",
    )

    # ensure parent dir exists (a bare filename has no parent to create)
    parent_dir = os.path.dirname(local_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    s3 = boto3.client(
        's3',
        endpoint_url=endpoint_url,
        aws_access_key_id=access_key_id,
        aws_secret_access_key=secret_access_key,
        config=Config(signature_version='s3v4'),
    )

    try:
        s3.download_file(bucket_name, object_name, local_path)
        print(f"Successfully downloaded {object_name} to {local_path}")
    except Exception as e:
        # Best-effort: report the failure and let the notebook continue.
        print(f"Download failed for {object_name}: {e}")

In [2]:
# Fetch the zipped training pairs; the object key and the local path are the same.
# NOTE(review): the next cell reads from the directory 'data/training_pairs.parquet/',
# so this archive presumably has to be extracted first - no unzip step is visible here.
training_pairs_archive = "data/training_pairs.parquet.zip"
download_from_r2(training_pairs_archive, training_pairs_archive)

Successfully downloaded data/training_pairs.parquet.zip to data/training_pairs.parquet.zip


In [10]:
import dask.dataframe as dd

# Directory holding the Parquet parts. The trailing slash matters because the
# part paths below are built by plain string formatting.
base_path = 'data/training_pairs.parquet/'

# Parts 0 through 12, generated instead of being listed by hand.
parts_to_load = [f"{base_path}part.{part_idx}.parquet" for part_idx in range(13)]

# Lazily open the selected parts, then materialise them as one pandas DataFrame.
df_dd = dd.read_parquet(parts_to_load)
training_pairs_df = df_dd.compute()

print(f"Dask DataFrame loaded with {df_dd.npartitions} partitions.")

: 

In [3]:
# Preview the first 20 rows of the combined training-pairs frame.
training_pairs_df.head(20)

Unnamed: 0,user_id,book_id,user_ctx,book_text,label
0,001af7947e217e17694c5a9c097afffb,57854,Favorite books: Tao Te Ching by Lao Tzu and Gi...,"Title: Tao Te Ching | Genres: history, literat...",1
1,001af7947e217e17694c5a9c097afffb,15808287,Favorite books: Tao Te Ching by Lao Tzu and Gi...,Title: Mrs. Lincoln's Dressmaker | Genres: bio...,0
2,001af7947e217e17694c5a9c097afffb,3692,Favorite books: Tao Te Ching by Lao Tzu and Gi...,Title: The Heart of the Matter | Genres: conte...,0
3,001af7947e217e17694c5a9c097afffb,603515,Favorite books: Tao Te Ching by Lao Tzu and Gi...,"Title: The Hound of Rowan (The Tapestry, #1) |...",0
4,001af7947e217e17694c5a9c097afffb,34,Favorite books: Tao Te Ching by Lao Tzu and Gi...,Title: The Fellowship of the Ring (The Lord of...,1
5,001af7947e217e17694c5a9c097afffb,73965,Favorite books: Tao Te Ching by Lao Tzu and Gi...,Title: Drinking: A Love Story | Genres: biogra...,0
6,001af7947e217e17694c5a9c097afffb,1215919,Favorite books: Tao Te Ching by Lao Tzu and Gi...,Title: Highlander Untamed (MacLeods of Skye Tr...,0
7,001af7947e217e17694c5a9c097afffb,218038,Favorite books: Tao Te Ching by Lao Tzu and Gi...,"Title: All About Love (Cynster, #6) | Genres: ...",0
8,001af7947e217e17694c5a9c097afffb,7332,Favorite books: Tao Te Ching by Lao Tzu and Gi...,"Title: The Silmarillion | Genres: adventure, a...",1
9,001af7947e217e17694c5a9c097afffb,455930,Favorite books: Tao Te Ching by Lao Tzu and Gi...,"Title: Echo Burning (Jack Reacher, #5) | Genre...",0


In [4]:
# Report (rows, columns) of the combined frame built from the loaded parts.
print(f"Shape of loaded data: {training_pairs_df.shape}")

Shape of loaded data: (2862473, 5)


In [5]:
import pandas as pd
import dask.dataframe as dd

# Read only the first Parquet part and materialise it in pandas for inspection.
training_pairs = dd.read_parquet('data/training_pairs.parquet/part.0.parquet')
training_pairs_pd = training_pairs.compute()

# Size and schema
print("Shape of the dataset:", training_pairs_pd.shape)
print("\nColumns:", training_pairs_pd.columns.tolist())

# Each remaining section is a heading followed by the value it describes:
# a data sample, the label balance, and the texts of the first row.
first_row = training_pairs_pd.iloc[0]
for heading, value in (
    ("\nSample of the data:", training_pairs_pd.head()),
    ("\nLabel distribution:", training_pairs_pd['label'].value_counts()),
    ("\nExample user context:", first_row['user_ctx']),
    ("\nExample book text:", first_row['book_text']),
):
    print(heading)
    print(value)

Shape of the dataset: (936681, 5)

Columns: ['user_id', 'book_id', 'user_ctx', 'book_text', 'label']

Sample of the data:
                            user_id   book_id  \
0  001af7947e217e17694c5a9c097afffb     57854   
1  001af7947e217e17694c5a9c097afffb  15808287   
2  001af7947e217e17694c5a9c097afffb      3692   
3  001af7947e217e17694c5a9c097afffb    603515   
4  001af7947e217e17694c5a9c097afffb        34   

                                            user_ctx  \
0  Favorite books: Tao Te Ching by Lao Tzu and Gi...   
1  Favorite books: Tao Te Ching by Lao Tzu and Gi...   
2  Favorite books: Tao Te Ching by Lao Tzu and Gi...   
3  Favorite books: Tao Te Ching by Lao Tzu and Gi...   
4  Favorite books: Tao Te Ching by Lao Tzu and Gi...   

                                           book_text  label  
0  Title: Tao Te Ching | Genres: history, literat...      1  
1  Title: Mrs. Lincoln's Dressmaker | Genres: bio...      0  
2  Title: The Heart of the Matter | Genres: conte...      0 

In [7]:
# Positives (label == 1) per user
is_positive = training_pairs_df['label'] == 1
positive_counts = training_pairs_df.loc[is_positive].groupby('user_id').size()
print("\nNumber of positive examples per user:")
print(positive_counts)

# Summary statistics of those per-user counts
print("\nStatistics about positive examples per user:")
print(positive_counts.describe())

# Users that have fewer than five positives
users_with_few_positives = positive_counts.loc[positive_counts.lt(5)]
print(f"\nNumber of users with less than 5 positive examples: {len(users_with_few_positives)}")
print("\nExample users with few positives:")
print(users_with_few_positives.head())



Number of positive examples per user:
user_id
0005f52944ea1992e95d61f287acaea9     65
0006260f85929db85eddee3a0bd0e504     20
0006db397ebf02b2e891d1048fb70dbc    166
0006de2967df1ec4432c51090803966e     76
000883382802f2d95a3dd545bb953882    154
                                   ... 
1d364492146d00ceebf9b7ec4e7d45af    298
1d4a2185b490d26a3ab0faedc4adf6c7     42
1d6a5b005de5e4c27945dca1c13d47f2    121
1de44842d3080ec55181e46b0cf16ed1     78
1dfed2c58f01fe899666bd9a6ce319e1    155
Length: 5357, dtype: int64

Statistics about positive examples per user:
count    5357.000000
mean      136.083816
std       101.274552
min         1.000000
25%        77.000000
50%       109.000000
75%       165.000000
max      1478.000000
dtype: float64

Number of users with less than 5 positive examples: 30

Example users with few positives:
user_id
008c374625966c32477ebab37e835a4e    1
00e8157279aa30f4b919aea0a887f49a    2
01e2d286d0361edf8c62bc580d3baa18    1
02ac01d9ebc7165e80d8967f075adbd3    3
0378ae

In [None]:
import dask.dataframe as dd

# Get all unique user IDs present in `training_pairs_pd`.
# NOTE(review): `training_pairs_pd` was loaded from part.0.parquet only, so this is
# not the user set of the combined `training_pairs_df` - confirm which frame is intended.
user_ids = training_pairs_pd['user_id'].unique()

print(user_ids)

<ArrowStringArray>
['001af7947e217e17694c5a9c097afffb', '0006260f85929db85eddee3a0bd0e504',
 '000bcda59ab565512f51f9e1f531b5e5', '0005f52944ea1992e95d61f287acaea9',
 '000883382802f2d95a3dd545bb953882', '0006db397ebf02b2e891d1048fb70dbc',
 '0009b61b9879bb2e5b84ce24f43450c8', '00281bdc3b8dd584ca6c5cb867de959f',
 '0006de2967df1ec4432c51090803966e', '002c10ebc541a4303b4d2c0aa2bff335',
 ...
 '090ebce33f677e84d5ee0e8510996a15', '093a06eb1563ef6d2a6c443b5189db47',
 '0a97f788f5707a7f116f5cc16875597e', '0951f343eed8911a4451ae2fa80dc1f3',
 '08c70632f3c4ca2793d221f9d47037fb', '096eb5757df185c7793fac23085a5b62',
 '08c0a7ae8992a65d7792dd9c69b41369', '090fd9cea80dbfb06cf11885af8a1e38',
 '093ef1f64c3baa83e54d9e63a550369d', '089c7ad67ccf2f81c8dc476db53f3235']
Length: 1785, dtype: string


In [9]:
import pandas as pd
import random

# Builds a per-user sample of positives plus distinct negatives from
# `training_pairs_df` (columns: 'user_id', 'book_id', 'user_ctx', 'book_text', 'label').
#
# NOTE(review): `processed_df` is only built in memory here; no visible cell writes
# it to disk, while the "Load & Split Data" section reads a processed Parquet file.

# --- Configuration ---
MIN_POSITIVES_TO_KEEP_USER = 3   # users with fewer positives than this are dropped
MAX_POSITIVES_TO_SAMPLE = 3      # at most this many positives are kept per user
NEGATIVES_PER_POSITIVE = 3
RANDOM_SEED = 42 # For reproducibility

random.seed(RANDOM_SEED)
print(f"Original DataFrame shape: {training_pairs_df.shape}")

# --- Step 1: Identify positive and negative interactions ---
df_positives = training_pairs_df[training_pairs_df['label'] == 1]
df_negatives = training_pairs_df[training_pairs_df['label'] == 0]

# --- Step 2: Choose the positives to keep for each user ---
# One groupby pass replaces re-filtering the whole frame once per user.
# sort=False keeps users in order of first appearance, like `.unique()` did.
selected_positive_by_user = {}
for user_id, user_positive_df in df_positives.groupby('user_id', sort=False):
    num_user_positives = len(user_positive_df)

    if num_user_positives >= MAX_POSITIVES_TO_SAMPLE:
        # Enough positives: draw exactly MAX_POSITIVES_TO_SAMPLE of them.
        selected_positive_by_user[user_id] = user_positive_df.sample(
            n=MAX_POSITIVES_TO_SAMPLE, random_state=RANDOM_SEED
        )
    elif num_user_positives >= MIN_POSITIVES_TO_KEEP_USER:
        # Between the minimum and the cap: keep every positive.
        # (Unreachable while both constants are equal.)
        selected_positive_by_user[user_id] = user_positive_df
    # Else (fewer than MIN_POSITIVES_TO_KEEP_USER positives): drop the user.

selected_positive_samples = list(selected_positive_by_user.values())

# Combine all selected positive samples
if selected_positive_samples:
    final_positives_df = pd.concat(selected_positive_samples).reset_index(drop=True)
else:
    # Empty frame with the source schema. (`df` is not defined until a later
    # cell, so the source frame's columns are used instead.)
    final_positives_df = pd.DataFrame(columns=training_pairs_df.columns)

print(f"Number of positive samples after filtering/sampling: {len(final_positives_df)}")
print(f"Number of unique users after filtering/sampling positives: {final_positives_df['user_id'].nunique()}")

# --- Step 3: Sample negatives for each selected positive ---
# Negatives are drawn ONCE per user, without replacement. Sampling separately for
# every positive with the same random_state returns the same rows each time, which
# would give every positive of a user an identical set of negatives.
sampled_negative_by_user = {}
for user_id, user_negative_df in df_negatives.groupby('user_id', sort=False):
    user_positive_df = selected_positive_by_user.get(user_id)
    if user_positive_df is None:
        continue
    num_negs_to_sample = min(
        NEGATIVES_PER_POSITIVE * len(user_positive_df), len(user_negative_df)
    )
    sampled_negative_by_user[user_id] = user_negative_df.sample(
        n=num_negs_to_sample, random_state=RANDOM_SEED
    )

# Interleave rows as: positive, its negatives, next positive, its negatives, ...
# If a user has too few negatives, later positives receive fewer (possibly none).
final_samples_list = []
for user_id, user_positive_df in selected_positive_by_user.items():
    positive_records = user_positive_df.to_dict('records')
    user_negative_sample = sampled_negative_by_user.get(user_id)
    negative_records = (
        user_negative_sample.to_dict('records') if user_negative_sample is not None else []
    )
    for position, positive_record in enumerate(positive_records):
        final_samples_list.append(positive_record)
        start = position * NEGATIVES_PER_POSITIVE
        final_samples_list.extend(negative_records[start:start + NEGATIVES_PER_POSITIVE])

# Create the final DataFrame
processed_df = pd.DataFrame(final_samples_list)

if not processed_df.empty:
    # Ensure correct dtypes, especially for label
    processed_df['label'] = processed_df['label'].astype(int)
    print(f"Final processed DataFrame shape: {processed_df.shape}")
    print(f"Label distribution in final DataFrame:\n{processed_df['label'].value_counts(normalize=True)}")
    print(f"Number of unique users in final DataFrame: {processed_df['user_id'].nunique()}")
else:
    print("No samples met the criteria. The processed DataFrame is empty.")


Original DataFrame shape: (2862473, 5)
Number of positive samples after filtering/sampling: 16017
Number of unique users after filtering/sampling positives: 5339
Final processed DataFrame shape: (64068, 5)
Label distribution in final DataFrame:
label
0    0.75
1    0.25
Name: proportion, dtype: float64
Number of unique users in final DataFrame: 5339


## 2. Configuration

In [3]:
# Paths
# Processed (sampled) training pairs, read by the "Load & Split Data" cell.
# NOTE(review): no visible cell writes this file - the sampling cell above only
# builds `processed_df` in memory. Confirm how this Parquet file was produced.
data_path = 'data/processed_training_pairs_parts_0_to_12.parquet'
# Directory passed as `output_path` to `model.fit` in the fine-tuning cell.
output_model_dir = 'reranker_model'

In [4]:
random_seed = 42

# Reproducibility: seed every RNG the pipeline can touch, not only Python's.
# DataLoader shuffling and dropout draw from torch's generator, so seeding
# `random` alone leaves training runs non-repeatable.
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)

## 3. Load & Split Data

In [5]:
from sklearn.model_selection import train_test_split
import pandas as pd

# 1) Read the processed pairs; `book_id` is dropped straight away.
df = pd.read_parquet(data_path).drop(columns='book_id')

# 2) Splitting is done on users, so a user's pairs all land in the same split.
users = df['user_id'].unique()

# 3) 80% of users for training, the remaining 20% held out.
train_users, temp_users = train_test_split(users, test_size=0.2, random_state=random_seed)

# 4) Held-out users are halved: 10% validation, 10% test.
val_users, test_users = train_test_split(temp_users, test_size=0.5, random_state=random_seed)

# 5) Select each split's rows by user membership and renumber the index.
train_df, val_df, test_df = (
    df.loc[df['user_id'].isin(split_users)].reset_index(drop=True)
    for split_users in (train_users, val_users, test_users)
)

print(f"Train pairs: {len(train_df)}, Val pairs: {len(val_df)}, Test pairs: {len(test_df)}")

Train pairs: 222360, Val pairs: 27792, Test pairs: 27804


In [6]:
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,user_id,user_ctx,book_text,label
0,001af7947e217e17694c5a9c097afffb,Favorite books: Tao Te Ching by Lao Tzu and Gi...,Title: Down and Out in Paris and London | Genr...,1
1,001af7947e217e17694c5a9c097afffb,Favorite books: Tao Te Ching by Lao Tzu and Gi...,Title: The Virgin Suicides | Genres: coming-of...,0
2,001af7947e217e17694c5a9c097afffb,Favorite books: Tao Te Ching by Lao Tzu and Gi...,"Title: Seducing Cinderella (Fighting for Love,...",0
3,001af7947e217e17694c5a9c097afffb,Favorite books: Tao Te Ching by Lao Tzu and Gi...,Title: Highlander Untamed (MacLeods of Skye Tr...,0
4,001af7947e217e17694c5a9c097afffb,Favorite books: Tao Te Ching by Lao Tzu and Gi...,Title: A Woman in Berlin: Eight Weeks in the C...,1


## 4. Prepare InputExamples & DataLoaders

In [7]:
def to_input_examples(pairs_df):
    """Convert a pairs DataFrame into a list of Sentence-Transformers InputExamples.

    Args:
        pairs_df: DataFrame with `user_ctx`, `book_text` and `label` columns.

    Returns:
        One InputExample per row, with texts [user_ctx, book_text] and the label
        cast to float.
    """
    return [
        InputExample(texts=[row.user_ctx, row.book_text], label=float(row.label))
        for row in pairs_df.itertuples()
    ]


# Convert to InputExample
train_examples = to_input_examples(train_df)
val_examples = to_input_examples(val_df)
test_examples = to_input_examples(test_df)

# `batch_size` must exist before the loaders are built. It is otherwise only
# assigned in the hyper-parameter cell further down (same value, 16), which made
# this cell fail with a NameError on a fresh kernel.
batch_size = 16

# DataLoaders (only the training loader is shuffled)
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)
val_dataloader   = DataLoader(val_examples, shuffle=False, batch_size=batch_size)
test_dataloader  = DataLoader(test_examples, shuffle=False, batch_size=batch_size)


## 5. Baseline Pre Fine Tuning

In [10]:
# --- Baseline Model: Pre-trained CrossEncoder (No Fine-Tuning) ---
from sentence_transformers import CrossEncoder
from sklearn.metrics import roc_auc_score, average_precision_score, ndcg_score
import pandas as pd
import numpy as np


def collect_per_user_ranking_scores(evaluation_df, ks=(3, 5, 10)):
    """Compute average precision and NDCG@k separately for every user.

    Args:
        evaluation_df: DataFrame with `user_id`, `label` and `score` columns.
        ks: Cut-offs for NDCG; each is capped at the user's number of candidates.

    Returns:
        (ap_scores, ndcg_scores): a list with one average-precision value per
        evaluated user, and a dict mapping each k to the list of per-user NDCG@k
        values. Users without any positive label are skipped.
    """
    ap_scores = []
    ndcg_scores = {k: [] for k in ks}
    # sort=False keeps users in order of first appearance.
    for _, user_rows in evaluation_df.groupby('user_id', sort=False):
        y_true = user_rows['label'].values.tolist()
        if sum(y_true) <= 0:
            continue  # ranking metrics need at least one relevant item
        y_score = user_rows['score'].values.tolist()

        ap_scores.append(average_precision_score(y_true, y_score))
        for k in ks:
            # ndcg_score expects 2-D input: one row per query (here: one user).
            ndcg_scores[k].append(ndcg_score([y_true], [y_score], k=min(k, len(y_true))))
    return ap_scores, ndcg_scores


print("\n--- Starting Baseline Model: Pre-trained CrossEncoder ---")

# Metrics stay None unless the corresponding evaluation step completes.
baseline_global_roc_auc = None
baseline_global_ap = None
baseline_map = None
baseline_ndcg_3 = None
baseline_ndcg_5 = None
baseline_ndcg_10 = None

# --- Configuration ---
PRETRAINED_MODEL_NAME = 'cross-encoder/ms-marco-MiniLM-L-6-v2'
MAX_LENGTH_PRETRAINED = 256
BASELINE_NDCG_KS = (3, 5, 10)

if 'test_df' not in locals() or not isinstance(test_df, pd.DataFrame) or test_df.empty:
    print("Error: `test_df` is not defined, not a DataFrame, or is empty.")
    print("Please ensure `test_df` is created and populated from your data splitting cells before running this baseline.")
else:
    print(f"Using test_df with shape: {test_df.shape} for pre-trained baseline evaluation.")
    baseline_model_instance = None
    try:
        baseline_model_instance = CrossEncoder(
            PRETRAINED_MODEL_NAME,
            max_length=MAX_LENGTH_PRETRAINED
        )
        print(f"Successfully loaded pre-trained model: {PRETRAINED_MODEL_NAME}")
    except Exception as e:
        print(f"Error loading pre-trained model {PRETRAINED_MODEL_NAME}: {e}")

    if baseline_model_instance:
        if 'user_ctx' not in test_df.columns or 'book_text' not in test_df.columns:
            print("Error: `test_df` is missing 'user_ctx' or 'book_text' columns.")
        else:
            test_pairs_for_baseline = [[row.user_ctx, row.book_text] for row in test_df.itertuples()]

            if not test_pairs_for_baseline:
                print("Error: `test_pairs_for_baseline` list is empty. Cannot make predictions.")
            else:
                print("Making predictions with the pre-trained model...")
                baseline_scores_pred = []
                try:
                    baseline_scores_pred = baseline_model_instance.predict(test_pairs_for_baseline, show_progress_bar=True)

                    test_true_labels = test_df['label'].values if 'label' in test_df.columns else None
                    if test_true_labels is None:
                        print("Error: `test_df` is missing 'label' column for evaluation.")
                    elif len(baseline_scores_pred) != len(test_true_labels):
                        print(f"Error: Mismatch in length between predicted scores ({len(baseline_scores_pred)}) and true labels ({len(test_true_labels)}). Evaluation skipped.")
                    else:
                        print(f"\n--- Pre-trained CrossEncoder Baseline Results ---")

                        # Global metrics: every (user, book) pair pooled together.
                        baseline_global_roc_auc = roc_auc_score(test_true_labels, baseline_scores_pred)
                        baseline_global_ap = average_precision_score(test_true_labels, baseline_scores_pred)
                        print(f"Global ROC AUC: {baseline_global_roc_auc:.4f}")
                        print(f"Global Average Precision (across all items): {baseline_global_ap:.4f}")

                        user_ndcg_scores_at_3_bl = [] # Suffix _bl for baseline
                        user_ndcg_scores_at_5_bl = []
                        user_ndcg_scores_at_10_bl = []
                        user_ap_scores_bl = []

                        if 'user_id' not in test_df.columns:
                            print("Error: 'user_id' column not found in test_df. Cannot calculate per-user metrics.")
                        else:
                            evaluation_df_baseline = pd.DataFrame({
                                'user_id': test_df['user_id'],
                                'label': test_true_labels,
                                'score': baseline_scores_pred
                            })

                            # Per-user metrics: each user's candidates ranked on their own.
                            user_ap_scores_bl, per_user_ndcg_bl = collect_per_user_ranking_scores(
                                evaluation_df_baseline, ks=BASELINE_NDCG_KS
                            )
                            user_ndcg_scores_at_3_bl = per_user_ndcg_bl[3]
                            user_ndcg_scores_at_5_bl = per_user_ndcg_bl[5]
                            user_ndcg_scores_at_10_bl = per_user_ndcg_bl[10]

                            if user_ap_scores_bl:
                                baseline_map = np.mean(user_ap_scores_bl)
                                print(f"Mean Average Precision (MAP): {baseline_map:.4f}")
                            else:
                                print("MAP could not be calculated (no valid user AP scores).")

                            mean_ndcg_bl = {}
                            for k in BASELINE_NDCG_KS:
                                if per_user_ndcg_bl[k]:
                                    mean_ndcg_bl[k] = np.mean(per_user_ndcg_bl[k])
                                    print(f"Mean Per-User NDCG@{k}: {mean_ndcg_bl[k]:.4f}")
                                else:
                                    print(f"Mean Per-User NDCG@{k} could not be calculated.")
                            baseline_ndcg_3 = mean_ndcg_bl.get(3)
                            baseline_ndcg_5 = mean_ndcg_bl.get(5)
                            baseline_ndcg_10 = mean_ndcg_bl.get(10)
                except Exception as e:
                    # Best-effort: report the failure; metrics not yet computed stay None.
                    print(f"Error during prediction or evaluation with pre-trained model: {e}")

print("--- Baseline Model: Pre-trained CrossEncoder Finished ---")

# Verification print for baseline metrics
print("\n--- Saved Metrics for Baseline Model (for later comparison) ---")
print(f"baseline_global_roc_auc: {baseline_global_roc_auc}")
print(f"baseline_global_ap: {baseline_global_ap}")
print(f"baseline_map: {baseline_map}")
print(f"baseline_ndcg_3: {baseline_ndcg_3}")
print(f"baseline_ndcg_5: {baseline_ndcg_5}")
print(f"baseline_ndcg_10: {baseline_ndcg_10}")


--- Starting Baseline Model: Pre-trained CrossEncoder ---
Using test_df with shape: (27804, 4) for pre-trained baseline evaluation.
Successfully loaded pre-trained model: cross-encoder/ms-marco-MiniLM-L-6-v2
Making predictions with the pre-trained model...


Batches: 100%|██████████| 869/869 [02:33<00:00,  5.68it/s]



--- Pre-trained CrossEncoder Baseline Results ---
Global ROC AUC: 0.5988
Global Average Precision (across all items): 0.4133
Mean Average Precision (MAP): 0.5921
Mean Per-User NDCG@3: 0.4806
Mean Per-User NDCG@5: 0.5731
Mean Per-User NDCG@10: 0.6940
--- Baseline Model: Pre-trained CrossEncoder Finished ---


Global ROC AUC: 0.5988
Interpretation: Indicates a modest ability to globally distinguish positive from negative pairs — better than random (0.5), but not by much.
Global Average Precision (across all items): 0.4133
Interpretation: The model has some capability to rank positive items above negative items when all user-item pairs are pooled together. This is better than a random baseline, which would sit at roughly 0.25 given the 1:3 positive-to-negative ratio within each user.
Mean Average Precision (MAP): 0.5921
Interpretation: This is a strong indicator of per-user ranking quality. A MAP of ~0.59 is quite good for a pre-trained baseline. It means that, on average, when you look at the ranked list for each user, the precision is maintained reasonably well as you go down the list to find all relevant items. It's a more robust measure of overall per-user ranking than just looking at one point (like P@k).
Comparison to Global AP: Notice that MAP (0.5921) is significantly higher than the Global AP (0.4133). This is common and important. It suggests that while the model might struggle a bit when all items are jumbled, its performance within each user's individual list of 12 candidates is notably better. This is exactly what you want for a recommendation reranker.
Mean Per-User NDCG@3: 0.4806
Interpretation: This tells you about the quality of the ranking specifically for the top 3 positions. Since each user has exactly 3 positive items, this metric is crucial.
An NDCG@3 of ~0.48 means that, on average, the model is doing a moderately good job of getting the 3 positive items into the top 3 slots, but it's far from perfect. If it were perfect for every user (all 3 positives in the top 3), NDCG@3 would be 1.0.
This score suggests that often, one or more of the positive items might be ranked below position 3, or some negative items are intruding into the top 3.
Mean Per-User NDCG@5: 0.5731
Interpretation: Looking at the top 5 positions, the ranking quality improves. This means that even if not all 3 positive items make it into the top 3, they are often found within the top 5.
The increase from NDCG@3 (0.4806) to NDCG@5 (0.5731) is logical and expected.
Mean Per-User NDCG@10: 0.6940
Interpretation: This remains a good score. By considering the top 10 out of 12 items, the model has more opportunity to place all 3 positive items correctly, and this score reflects that. The jump from NDCG@5 to NDCG@10 is also substantial, indicating that many of the relevant items that weren't in the top 5 are indeed captured within the top 10.

## 5. Define Validation Evaluator

In [None]:
from sentence_transformers.cross_encoder.evaluation import CERerankingEvaluator

# Cross-encoder evaluators live in `sentence_transformers.cross_encoder.evaluation`.
# The `CrossEncoderEvaluator` class with a `main_score_function` hook that was
# imported here before is not part of the library, so the cell could not run.
# CERerankingEvaluator scores each query's candidates and ranks them per query,
# which matches the per-user reranking task.
#
# One sample per validation user: the user context is the query and the user's
# books are split into relevant / non-relevant documents.
# Assumes `user_ctx` is identical across a user's rows (as in the previews above),
# so the first row's value is used - TODO confirm.
val_rerank_samples = [
    {
        'query': user_rows['user_ctx'].iloc[0],
        'positive': user_rows.loc[user_rows['label'] == 1, 'book_text'].tolist(),
        'negative': user_rows.loc[user_rows['label'] == 0, 'book_text'].tolist(),
    }
    for _, user_rows in val_df.groupby('user_id', sort=False)
]

# NOTE(review): the score this evaluator returns (used by `save_best_model`) is its
# MRR at the default cut-off, not NDCG@3 - confirm that is acceptable for model
# selection, or compute NDCG@3 separately as the baseline cell does.
evaluator = CERerankingEvaluator(val_rerank_samples, name='val')

## 5. Instantiate & Fine-Tune CrossEncoder

cross-encoder/ms-marco-MiniLM-L-6-v2 is a 6-layer MiniLM distilled into a cross-encoder architecture and pretrained on the MS MARCO passage ranking task. It takes a paired input (e.g. user context + book text) and produces a single relevance score via full token-level attention.

Justification
	•	Ranking-Tuned Pretraining
Its MS MARCO heritage means it already knows how to judge fine-grained relevance patterns—crucial for matching nuanced book descriptions to user tastes.
	•	Speed-Quality Sweet Spot
At ~60 MB and with inference under 15 ms per candidate, it delivers ~90–95 % of full BERT-base accuracy, keeping end-to-end latency low.
	•	Efficient Fine-Tuning
Requires only 2–3 epochs over the ~220 K training (user, book) pairs to adapt to book-domain language, making rapid iteration feasible.
	•	Compact & Deployable
Its small footprint simplifies packaging, loading, and scaling in production environments with moderate memory and compute budgets.

In [11]:
# Training hyper-parameters
epochs, batch_size, learning_rate = 3, 16, 2e-5

# 1. One optimisation step per batch, so steps per epoch = number of batches.
num_train_steps_per_epoch = len(train_dataloader)
print(f"Number of training steps per epoch: {num_train_steps_per_epoch}")

# 2. Warm up over 10% of a single epoch (not of the whole run).
warmup_ratio = 0.10
actual_warmup_steps = int(num_train_steps_per_epoch * warmup_ratio)
print(f"Calculated warmup steps (10% of one epoch): {actual_warmup_steps}")

# Very small datasets can round down to zero; use one step whenever there is
# anything to train on.
rounded_down_to_zero = num_train_steps_per_epoch > 0 and actual_warmup_steps == 0
if rounded_down_to_zero:
    actual_warmup_steps = 1
    print(f"Adjusted warmup steps to 1 as calculated value was 0.")




Number of training steps per epoch: 13898
Calculated warmup steps (10% of one epoch): 1389


In [None]:
from sentence_transformers import CrossEncoder

# Start from the MS MARCO cross-encoder with a single-score output head.
# NOTE(review): max_length is 384 here but the pre-trained baseline above was
# evaluated with max_length=256, so the two are not compared on equal input
# truncation - confirm this is intended.
model = CrossEncoder(
    'cross-encoder/ms-marco-MiniLM-L-6-v2',
    num_labels=1,
    max_length=384,
)

# `CrossEncoder.fit` has no `learning_rate`, `early_stopping` or
# `early_stopping_patience` parameters; passing them raises a TypeError.
# The learning rate is set through `optimizer_params`. Early stopping is dropped:
# `save_best_model=True` already keeps the checkpoint with the best evaluator
# score, and a patience of 3 could never trigger within 3 epochs evaluated once each.
model.fit(
    train_dataloader=train_dataloader,
    evaluator=evaluator,
    epochs=epochs, # e.g., 3
    optimizer_params={'lr': learning_rate}, # e.g., 2e-5
    scheduler='WarmupLinear',
    warmup_steps=actual_warmup_steps,
    evaluation_steps=num_train_steps_per_epoch,
    output_path=output_model_dir,
    save_best_model=True,
    use_amp=True,
)

In [None]:
import glob
import os
import pandas as pd
import matplotlib.pyplot as plt

# --- Plotting Validation Performance for CrossEncoder ---
print("\n--- Plotting CrossEncoder Validation Performance ---")

# Directory model.fit() wrote to (the value passed as its `output_path`).
output_model_dir_from_training = output_model_dir

# Name the evaluator was created with; its CSV log is expected to end in
# "<name>_results.csv".
evaluator_name = 'val'
eval_filename = f"{evaluator_name}_results.csv"

# Candidate names for the score column, tried in order. If none matches, the
# cell prints the actual CSV header so this list can be extended.
potential_score_column_names = [
    f'{evaluator_name}_ndcg',  # e.g., val_ndcg
    'score',                   # generic name for the main score
    'ndcg',                    # metric name on its own
]


def _locate_eval_csv(output_dir, filename):
    """Return the path of the evaluation CSV under `output_dir`, or None.

    Exact matches are preferred, in this order: `output_dir/filename`, then
    `output_dir/eval/filename`. If neither exists, the first file (sorted by
    name) in those directories whose name ends with `_<filename>` is returned.

    NOTE(review): sentence-transformers' built-in evaluators appear to prefix
    the CSV name with the evaluator class name, which is what the fallback is
    for — confirm against the files actually present in `output_path`.
    """
    search_dirs = [output_dir, os.path.join(output_dir, "eval")]
    for directory in search_dirs:
        exact_path = os.path.join(directory, filename)
        if os.path.exists(exact_path):
            return exact_path
    for directory in search_dirs:
        pattern = os.path.join(glob.escape(directory), "*_" + glob.escape(filename))
        prefixed_matches = sorted(glob.glob(pattern))
        if prefixed_matches:
            return prefixed_matches[0]
    return None


eval_filepath = _locate_eval_csv(output_model_dir_from_training, eval_filename)
eval_results_df = None
score_column = None

if eval_filepath is None:
    # Nothing to plot: report it directly instead of fabricating a bogus path
    # just to provoke a FileNotFoundError.
    print(f"Evaluation results file not found. Checked paths ending with {eval_filename}")
    print("Plotting skipped. Ensure `output_path` was set in `model.fit()` and training completed at least one evaluation step.")
else:
    try:
        eval_results_df = pd.read_csv(eval_filepath)
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as e:
        # Only unreadable/malformed files are tolerated here; genuine plotting
        # bugs further down are allowed to surface.
        print(f"Could not read evaluation results from {eval_filepath}: {e}")

if eval_results_df is not None:
    print(f"Successfully loaded evaluation results from: {eval_filepath}")
    print("Columns in evaluation CSV:", eval_results_df.columns.tolist())

    # First candidate column present in the CSV, if any.
    score_column = next(
        (name for name in potential_score_column_names if name in eval_results_df.columns),
        None,
    )

    missing_cols = [col for col in ('epoch', 'steps') if col not in eval_results_df.columns]
    if score_column is None:
        missing_cols.append(f"one of {potential_score_column_names}")

    if missing_cols:
        print(f"Required columns {missing_cols} not found in {eval_filepath}. Cannot plot.")
    else:
        score_label = score_column.replace("_", " ").title()
        val_fig, (ax_steps, ax_epochs) = plt.subplots(1, 2, figsize=(12, 6))

        # Left panel: every logged evaluation against its step counter.
        ax_steps.plot(eval_results_df['steps'], eval_results_df[score_column], marker='o', linestyle='-')
        ax_steps.set_title(f'Validation {score_label} vs. Training Steps')
        ax_steps.set_xlabel('Training Steps')
        ax_steps.set_ylabel(score_label)
        ax_steps.grid(True)

        # Right panel: one point per epoch — the row with the largest `steps`
        # value inside each epoch.
        epoch_end_evals = eval_results_df.loc[eval_results_df.groupby('epoch')['steps'].idxmax()]
        ax_epochs.plot(epoch_end_evals['epoch'], epoch_end_evals[score_column], marker='o', linestyle='-')
        ax_epochs.set_title(f'Validation {score_label} vs. Epoch')
        ax_epochs.set_xlabel('Epoch')
        ax_epochs.set_ylabel(score_label)
        ax_epochs.set_xticks(epoch_end_evals['epoch'].unique())
        ax_epochs.grid(True)

        val_fig.tight_layout()

        # Persist the figure next to the model artifacts.
        plot_save_path = os.path.join(output_model_dir_from_training, f'{evaluator_name}_performance_plot.png')
        val_fig.savefig(plot_save_path)
        print(f"Validation plot saved to: {plot_save_path}")
        plt.show()


## 6. Evaluation on Test Set

In [None]:
from sklearn.metrics import roc_auc_score, average_precision_score, ndcg_score
import pandas as pd
import numpy as np

NDCG_CUTOFFS = (3, 5, 10)
REQUIRED_TEST_COLUMNS = ('user_ctx', 'book_text', 'label')


def _per_user_ranking_scores(scored_df, cutoffs=NDCG_CUTOFFS):
    """Compute per-user AP and NDCG@k from a frame of scored candidates.

    Args:
        scored_df: DataFrame with `user_id`, `label` and `score` columns, one
            row per (user, candidate) pair.
        cutoffs: NDCG cut-off values to evaluate.

    Returns:
        (ap_per_user, ndcg_per_user): a list of average-precision values and a
        dict mapping each cut-off to a list of NDCG values. Users whose labels
        do not sum to a positive number are skipped.
    """
    ap_per_user = []
    ndcg_per_user = {k: [] for k in cutoffs}

    # groupby visits each user's rows once instead of re-filtering the whole
    # frame for every user id.
    for _, user_rows in scored_df.groupby('user_id', sort=False):
        if not user_rows['label'].sum() > 0:
            continue
        labels = user_rows['label'].to_numpy()
        scores = user_rows['score'].to_numpy()

        ap_per_user.append(average_precision_score(labels, scores))

        for k in cutoffs:
            if len(labels) == 1:
                # Recent scikit-learn releases raise ValueError for a
                # single-document query; the blanket `except` around the
                # evaluation would then discard every per-user metric. A lone
                # candidate that is relevant is trivially ranked perfectly.
                ndcg_per_user[k].append(1.0)
            else:
                ndcg_per_user[k].append(ndcg_score([labels], [scores], k=min(k, len(labels))))

    return ap_per_user, ndcg_per_user


print("\n--- Evaluating Fine-Tuned CrossEncoder Model on Test Set ---")

# Metrics stay None when evaluation cannot run, so later cells can detect it.
finetuned_global_roc_auc = None
finetuned_global_ap = None
finetuned_map = None
finetuned_ndcg_3 = None
finetuned_ndcg_5 = None
finetuned_ndcg_10 = None

if 'test_df' not in locals() or not isinstance(test_df, pd.DataFrame) or test_df.empty:
    print("Error: `test_df` is not defined, not a DataFrame, or is empty.")
    print("Please ensure `test_df` is available for final model evaluation.")
elif any(column not in test_df.columns for column in REQUIRED_TEST_COLUMNS):
    print("Error: `test_df` is missing one or more required columns: 'user_ctx', 'book_text', 'label'.")
else:
    # 1. One [user context, book text] pair per test row, in row order.
    test_pairs_for_model = [
        [ctx, text] for ctx, text in zip(test_df['user_ctx'], test_df['book_text'])
    ]

    print("Making predictions with the fine-tuned model...")
    # 2. Score the pairs; `model` must be the fine-tuned CrossEncoder instance.
    try:
        model_scores = model.predict(test_pairs_for_model, show_progress_bar=True)
        true_labels = test_df['label'].values

        if len(model_scores) != len(true_labels):
            print(f"Error: Mismatch in length between model_scores ({len(model_scores)}) and true_labels ({len(true_labels)}). Evaluation skipped.")
        else:
            print(f"\n--- Fine-Tuned Model Test Set Results ---")

            # 3. Global metrics over all (user, book) pairs pooled together.
            finetuned_global_roc_auc = roc_auc_score(true_labels, model_scores)
            finetuned_global_ap = average_precision_score(true_labels, model_scores)
            print(f"Global ROC AUC: {finetuned_global_roc_auc:.4f}")
            print(f"Global Average Precision (across all items): {finetuned_global_ap:.4f}")

            # 4. Per-user metrics (MAP, NDCG@3, NDCG@5, NDCG@10).
            if 'user_id' not in test_df.columns:
                print("Error: 'user_id' column not found in test_df. Cannot calculate per-user metrics.")
            else:
                evaluation_df_ft = pd.DataFrame({
                    'user_id': test_df['user_id'],
                    'label': true_labels,
                    'score': model_scores
                })
                user_ap_scores_ft, user_ndcg_scores_ft = _per_user_ranking_scores(evaluation_df_ft)

                if user_ap_scores_ft:
                    finetuned_map = np.mean(user_ap_scores_ft)
                    print(f"Mean Average Precision (MAP): {finetuned_map:.4f}")
                else:
                    print("MAP could not be calculated.")

                mean_ndcg_by_k = {}
                for k in NDCG_CUTOFFS:
                    if user_ndcg_scores_ft[k]:
                        mean_ndcg_by_k[k] = np.mean(user_ndcg_scores_ft[k])
                        print(f"Mean Per-User NDCG@{k}: {mean_ndcg_by_k[k]:.4f}")
                    else:
                        print(f"Mean Per-User NDCG@{k} could not be calculated.")

                finetuned_ndcg_3 = mean_ndcg_by_k.get(3)
                finetuned_ndcg_5 = mean_ndcg_by_k.get(5)
                finetuned_ndcg_10 = mean_ndcg_by_k.get(10)
    except Exception as e:
        # Deliberately best-effort: report the failure and leave the metrics
        # that were not computed as None.
        print(f"Error during prediction or evaluation with the fine-tuned model: {e}")

print("--- Fine-Tuned Model Evaluation Finished ---")

# You can now verify the variables:
print("\n--- Saved Metrics for Fine-Tuned Model (for later comparison) ---")
print(f"finetuned_global_roc_auc: {finetuned_global_roc_auc}")
print(f"finetuned_global_ap: {finetuned_global_ap}")
print(f"finetuned_map: {finetuned_map}")
print(f"finetuned_ndcg_3: {finetuned_ndcg_3}")
print(f"finetuned_ndcg_5: {finetuned_ndcg_5}")
print(f"finetuned_ndcg_10: {finetuned_ndcg_10}")

## 7. Comparison

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# --- Metrics from Baseline Model ---
baseline_metrics = {
    'Global ROC AUC': baseline_global_roc_auc,
    'Global AP': baseline_global_ap,
    'MAP': baseline_map,
    'NDCG@3': baseline_ndcg_3,
    'NDCG@5': baseline_ndcg_5,
    'NDCG@10': baseline_ndcg_10
}

# --- Metrics from Fine-Tuned Model ---
finetuned_metrics = {
    'Global ROC AUC': finetuned_global_roc_auc,
    'Global AP': finetuned_global_ap,
    'MAP': finetuned_map,
    'NDCG@3': finetuned_ndcg_3,
    'NDCG@5': finetuned_ndcg_5,
    'NDCG@10': finetuned_ndcg_10
}

# 1. Comparison table. Fine-tuned values are looked up by metric name so the
#    two columns stay aligned regardless of dict ordering.
metric_names = list(baseline_metrics.keys())
metrics_data = {
    'Metric': metric_names,
    'Baseline': [baseline_metrics[name] for name in metric_names],
    'Fine-Tuned': [finetuned_metrics[name] for name in metric_names]
}
comparison_df = pd.DataFrame(metrics_data)

# A metric is None when its evaluation did not complete; turn those into NaN so
# the arithmetic and the charts below degrade gracefully instead of raising.
for value_column in ('Baseline', 'Fine-Tuned'):
    comparison_df[value_column] = pd.to_numeric(comparison_df[value_column], errors='coerce')

comparison_df['Improvement'] = comparison_df['Fine-Tuned'] - comparison_df['Baseline']
# A zero baseline has no meaningful relative change; report NaN rather than inf.
comparison_df['Improvement (%)'] = (
    comparison_df['Improvement'] / comparison_df['Baseline'].replace(0, np.nan) * 100
).round(2)

print("--- Model Performance Comparison ---")
print(comparison_df.to_string(index=False))
print("\n" + "="*50 + "\n")


# 2. Bar charts for visual comparison
bar_width = 0.35


def autolabel(rects, target_ax=None):
    """Write each bar's height above it.

    Args:
        rects: bar container returned by `Axes.bar`.
        target_ax: axes the bars belong to. Defaults to the module-level `ax`
            (the first chart) for backward compatibility; pass it explicitly
            for any other chart, otherwise the labels land on the wrong figure.
    """
    if target_ax is None:
        target_ax = ax
    for rect in rects:
        height = rect.get_height()
        if not np.isfinite(height):
            continue  # missing metric: nothing sensible to annotate
        target_ax.annotate(f'{height:.4f}',
                           xy=(rect.get_x() + rect.get_width() / 2, height),
                           xytext=(0, 3),
                           textcoords="offset points",
                           ha='center', va='bottom',
                           fontsize=8)


def _grouped_bar_chart(labels, baseline_heights, finetuned_heights, title, figsize, rotate_labels=False):
    """Draw a baseline-vs-fine-tuned grouped bar chart.

    Returns (figure, axes, baseline_bars, finetuned_bars, positions).
    """
    positions = np.arange(len(labels))
    figure, axes = plt.subplots(figsize=figsize)

    baseline_bars = axes.bar(positions - bar_width/2, baseline_heights, bar_width, label='Baseline', color='skyblue')
    finetuned_bars = axes.bar(positions + bar_width/2, finetuned_heights, bar_width, label='Fine-Tuned', color='coral')

    axes.set_ylabel('Scores')
    axes.set_title(title)
    axes.set_xticks(positions)
    if rotate_labels:
        axes.set_xticklabels(labels, rotation=45, ha="right")
    else:
        axes.set_xticklabels(labels)
    axes.legend()

    # Annotate on THIS chart's axes.
    autolabel(baseline_bars, axes)
    autolabel(finetuned_bars, axes)

    figure.tight_layout()
    return figure, axes, baseline_bars, finetuned_bars, positions


# All metrics
baseline_values = comparison_df['Baseline'].tolist()
finetuned_values = comparison_df['Fine-Tuned'].tolist()
num_metrics = len(metric_names)

fig, ax, rects1, rects2, index = _grouped_bar_chart(
    metric_names, baseline_values, finetuned_values,
    title='Baseline vs. Fine-Tuned Model Performance by Metric',
    figsize=(12, 7),
    rotate_labels=True,
)
plt.show()

# Per-user ranking metrics only
key_per_user_metrics = ['MAP', 'NDCG@3', 'NDCG@5', 'NDCG@10']
values_by_metric = comparison_df.set_index('Metric')
key_baseline_values = [values_by_metric.loc[m, 'Baseline'] for m in key_per_user_metrics]
key_finetuned_values = [values_by_metric.loc[m, 'Fine-Tuned'] for m in key_per_user_metrics]
num_key_metrics = len(key_per_user_metrics)

fig_key, ax_key, rects1_key, rects2_key, index_key = _grouped_bar_chart(
    key_per_user_metrics, key_baseline_values, key_finetuned_values,
    title='Key Per-User Metrics: Baseline vs. Fine-Tuned',
    figsize=(10, 6),
)
plt.show()


## 7. Save & Load Model

In [None]:
# The model was already written to `output_model_dir` during fit
# (save_best_model=True), so nothing needs to be saved here.
# To load it, pass the same max_length that was used for training so inference
# truncates inputs exactly as the fine-tuned model saw them.
# NOTE(review): whether max_length is persisted with the checkpoint depends on
# the sentence-transformers version; passing it explicitly is harmless either way.
from sentence_transformers import CrossEncoder
loaded_model = CrossEncoder(output_model_dir, max_length=384)


## 8. Inference Example

In [None]:
# Given a single user and its candidates
user_ctx = "Favorite books: ..."  # fetched or precomputed

# Keep each candidate's id and text together so the two can never drift out of
# alignment (zip() would silently truncate mismatched lists). Replace these
# placeholder entries with real candidates; a bare `...` inside a list literal
# is the Ellipsis object, not a string, and cannot be scored.
candidate_books = [
    ("book1", "Title: ... Description: ..."),
    ("book2", "Title: ... Description: ..."),
]
candidates = [book_id for book_id, _ in candidate_books]
candidate_texts = [book_text for _, book_text in candidate_books]

# One [user context, candidate text] pair per candidate, scored in one batch.
pairs = [[user_ctx, txt] for txt in candidate_texts]
scores = loaded_model.predict(pairs)

# Rerank: highest score first, then show the top 10.
ranked = sorted(zip(candidates, scores), key=lambda x: x[1], reverse=True)
print(ranked[:10])