# Cross-Encoder Fine-Tuning & Evaluation Notebook

This notebook outlines the end-to-end process for fine-tuning a Sentence-Transformers `CrossEncoder`.

## 1. Install & Import Dependencies

In [2]:
import os
import random

import numpy as np
import pandas as pd
import torch
from sentence_transformers import CrossEncoder, InputExample
from sklearn.metrics import ndcg_score, roc_auc_score, average_precision_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


## 2. Configuration

In [3]:
# Paths
# Input: parquet file holding the pre-processed training pairs.
data_path = "data/processed_training_pairs_parts_0_to_12.parquet"
# Output: directory the fine-tuned reranker is written to (and later reloaded from).
output_model_dir = "reranker_model"

In [4]:
random_seed = 42

# Reproducibility: seed every RNG this notebook relies on, not just Python's.
# - `random`: Python-level randomness.
# - NumPy: global RNG used by libraries when no explicit `random_state` is passed.
# - PyTorch: drives `DataLoader(shuffle=True)` batch order and model
#   initialisation/dropout during fine-tuning.
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)

## 3. Load & Split Data

In [5]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Read every labelled pair; `book_id` is not used downstream, so drop it right away.
df = pd.read_parquet(data_path).drop('book_id', axis=1)

# The split is done on users rather than rows, so all pairs of a given user
# end up in exactly one of train / val / test.
users = df['user_id'].unique()

# 80% of users go to train; the remaining 20% are halved into val and test (10% each).
train_users, temp_users = train_test_split(users, test_size=0.2, random_state=random_seed)
val_users, test_users = train_test_split(temp_users, test_size=0.5, random_state=random_seed)

# Materialise one DataFrame per split, each with a fresh 0..n-1 index.
train_df, val_df, test_df = (
    df[df['user_id'].isin(split_users)].reset_index(drop=True)
    for split_users in (train_users, val_users, test_users)
)

print(f"Train pairs: {len(train_df)}, Val pairs: {len(val_df)}, Test pairs: {len(test_df)}")

Train pairs: 222360, Val pairs: 27792, Test pairs: 27804


In [6]:
# Preview the first five training pairs (user_id, user_ctx, book_text, label).
train_df.head(n=5)

Unnamed: 0,user_id,user_ctx,book_text,label
0,001af7947e217e17694c5a9c097afffb,Favorite books: Tao Te Ching by Lao Tzu and Gi...,Title: Down and Out in Paris and London | Genr...,1
1,001af7947e217e17694c5a9c097afffb,Favorite books: Tao Te Ching by Lao Tzu and Gi...,Title: The Virgin Suicides | Genres: coming-of...,0
2,001af7947e217e17694c5a9c097afffb,Favorite books: Tao Te Ching by Lao Tzu and Gi...,"Title: Seducing Cinderella (Fighting for Love,...",0
3,001af7947e217e17694c5a9c097afffb,Favorite books: Tao Te Ching by Lao Tzu and Gi...,Title: Highlander Untamed (MacLeods of Skye Tr...,0
4,001af7947e217e17694c5a9c097afffb,Favorite books: Tao Te Ching by Lao Tzu and Gi...,Title: A Woman in Berlin: Eight Weeks in the C...,1


## 4. Prepare InputExamples & DataLoaders

In [7]:
# Batch size for the DataLoaders below. It has to be defined here: the
# hyper-parameter cell that also assigns `batch_size` runs *after* this one, so
# on a fresh kernel (Restart & Run All) the name would otherwise be undefined.
batch_size = 16


def build_input_examples(pairs_df):
    """Convert a pairs DataFrame into a list of `InputExample` objects.

    Args:
        pairs_df: DataFrame with `user_ctx`, `book_text` and `label` columns.

    Returns:
        One `InputExample` per row, with `texts=[user_ctx, book_text]` and the
        label cast to float.
    """
    return [
        InputExample(texts=[row.user_ctx, row.book_text], label=float(row.label))
        for row in pairs_df.itertuples(index=False)
    ]


# Convert each split to InputExamples
train_examples = build_input_examples(train_df)
val_examples = build_input_examples(val_df)
test_examples = build_input_examples(test_df)

# DataLoaders: only the training loader is shuffled; val/test keep row order.
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=batch_size)
val_dataloader   = DataLoader(val_examples, shuffle=False, batch_size=batch_size)
test_dataloader  = DataLoader(test_examples, shuffle=False, batch_size=batch_size)


## 5. Baseline: Pre-Fine-Tuning Evaluation

In [10]:
# --- Baseline Model: Pre-trained CrossEncoder (No Fine-Tuning) ---
# Scores the held-out test pairs with the off-the-shelf MS MARCO cross-encoder
# and stores global (ROC AUC, AP) and per-user (MAP, NDCG@3/5/10) metrics in the
# `baseline_*` variables that the comparison cell reads later.
from sentence_transformers import CrossEncoder
from sklearn.metrics import roc_auc_score, average_precision_score, ndcg_score
import pandas as pd
import numpy as np

print("\n--- Starting Baseline Model: Pre-trained CrossEncoder ---")

# Metrics stay None when evaluation is skipped or fails part-way.
baseline_global_roc_auc = None
baseline_global_ap = None
baseline_map = None
baseline_ndcg_3 = None
baseline_ndcg_5 = None
baseline_ndcg_10 = None

# --- Configuration ---
PRETRAINED_MODEL_NAME = 'cross-encoder/ms-marco-MiniLM-L-6-v2'
MAX_LENGTH_PRETRAINED = 256

if 'test_df' not in locals() or not isinstance(test_df, pd.DataFrame) or test_df.empty:
    print("Error: `test_df` is not defined, not a DataFrame, or is empty.")
    print("Please ensure `test_df` is created and populated from your data splitting cells before running this baseline.")
else:
    print(f"Using test_df with shape: {test_df.shape} for pre-trained baseline evaluation.")
    baseline_model_instance = None
    try:
        baseline_model_instance = CrossEncoder(
            PRETRAINED_MODEL_NAME,
            max_length=MAX_LENGTH_PRETRAINED
        )
        print(f"Successfully loaded pre-trained model: {PRETRAINED_MODEL_NAME}")
    except Exception as e:
        print(f"Error loading pre-trained model {PRETRAINED_MODEL_NAME}: {e}")

    if baseline_model_instance is not None:
        if 'user_ctx' not in test_df.columns or 'book_text' not in test_df.columns:
            print("Error: `test_df` is missing 'user_ctx' or 'book_text' columns.")
        elif 'label' not in test_df.columns:
            # Checked before predicting so a missing column does not cost a full inference pass.
            print("Error: `test_df` is missing 'label' column for evaluation.")
        else:
            test_pairs_for_baseline = [
                [row.user_ctx, row.book_text] for row in test_df.itertuples(index=False)
            ]
            test_true_labels = test_df['label'].values

            print("Making predictions with the pre-trained model...")
            baseline_scores_pred = []
            try:
                baseline_scores_pred = baseline_model_instance.predict(test_pairs_for_baseline, show_progress_bar=True)

                if len(baseline_scores_pred) != len(test_true_labels):
                    print(f"Error: Mismatch in length between predicted scores ({len(baseline_scores_pred)}) and true labels ({len(test_true_labels)}). Evaluation skipped.")
                else:
                    print(f"\n--- Pre-trained CrossEncoder Baseline Results ---")

                    # Global metrics: every (user, book) pair pooled into one ranking.
                    baseline_global_roc_auc = roc_auc_score(test_true_labels, baseline_scores_pred)
                    baseline_global_ap = average_precision_score(test_true_labels, baseline_scores_pred)
                    print(f"Global ROC AUC: {baseline_global_roc_auc:.4f}")
                    print(f"Global Average Precision (across all items): {baseline_global_ap:.4f}")

                    # Per-user accumulators (suffix _bl for baseline).
                    user_ndcg_scores_at_3_bl = []
                    user_ndcg_scores_at_5_bl = []
                    user_ndcg_scores_at_10_bl = []
                    user_ap_scores_bl = []
                    ndcg_buckets_bl = {
                        3: user_ndcg_scores_at_3_bl,
                        5: user_ndcg_scores_at_5_bl,
                        10: user_ndcg_scores_at_10_bl,
                    }

                    if 'user_id' not in test_df.columns:
                        print("Error: 'user_id' column not found in test_df. Cannot calculate per-user metrics.")
                    else:
                        evaluation_df_baseline = pd.DataFrame({
                            'user_id': test_df['user_id'],
                            'label': test_true_labels,
                            'score': baseline_scores_pred
                        })

                        # One pass over the frame; sort=False keeps users in first-appearance
                        # order, matching iteration over `.unique()`.
                        for _, user_data in evaluation_df_baseline.groupby('user_id', sort=False):
                            y_true_user = user_data['label'].to_numpy()
                            if y_true_user.sum() <= 0:
                                continue  # AP / NDCG are undefined without a positive item

                            y_score_user = user_data['score'].to_numpy()
                            user_ap_scores_bl.append(average_precision_score(y_true_user, y_score_user))

                            n_candidates = len(user_data)
                            for k, bucket in ndcg_buckets_bl.items():
                                if n_candidates < 2:
                                    # Recent scikit-learn versions raise on single-document
                                    # queries; a lone relevant candidate is trivially ranked
                                    # perfectly, so record 1.0 instead of aborting everything.
                                    bucket.append(1.0)
                                else:
                                    bucket.append(
                                        ndcg_score([y_true_user], [y_score_user], k=min(k, n_candidates))
                                    )

                        if user_ap_scores_bl:
                            baseline_map = np.mean(user_ap_scores_bl)
                            print(f"Mean Average Precision (MAP): {baseline_map:.4f}")
                        else:
                            print("MAP could not be calculated (no valid user AP scores).")

                        mean_ndcg_bl = {}
                        for k, bucket in ndcg_buckets_bl.items():
                            if bucket:
                                mean_ndcg_bl[k] = np.mean(bucket)
                                print(f"Mean Per-User NDCG@{k}: {mean_ndcg_bl[k]:.4f}")
                            else:
                                print(f"Mean Per-User NDCG@{k} could not be calculated.")
                        baseline_ndcg_3 = mean_ndcg_bl.get(3)
                        baseline_ndcg_5 = mean_ndcg_bl.get(5)
                        baseline_ndcg_10 = mean_ndcg_bl.get(10)
            except Exception as e:
                print(f"Error during prediction or evaluation with pre-trained model: {e}")

print("--- Baseline Model: Pre-trained CrossEncoder Finished ---")

# Verification print for baseline metrics
print("\n--- Saved Metrics for Baseline Model (for later comparison) ---")
print(f"baseline_global_roc_auc: {baseline_global_roc_auc}")
print(f"baseline_global_ap: {baseline_global_ap}")
print(f"baseline_map: {baseline_map}")
print(f"baseline_ndcg_3: {baseline_ndcg_3}")
print(f"baseline_ndcg_5: {baseline_ndcg_5}")
print(f"baseline_ndcg_10: {baseline_ndcg_10}")


--- Starting Baseline Model: Pre-trained CrossEncoder ---
Using test_df with shape: (27804, 4) for pre-trained baseline evaluation.
Successfully loaded pre-trained model: cross-encoder/ms-marco-MiniLM-L-6-v2
Making predictions with the pre-trained model...


Batches: 100%|██████████| 869/869 [02:33<00:00,  5.68it/s]



--- Pre-trained CrossEncoder Baseline Results ---
Global ROC AUC: 0.5988
Global Average Precision (across all items): 0.4133
Mean Average Precision (MAP): 0.5921
Mean Per-User NDCG@3: 0.4806
Mean Per-User NDCG@5: 0.5731
Mean Per-User NDCG@10: 0.6940
--- Baseline Model: Pre-trained CrossEncoder Finished ---


## 6. Define Validation Evaluator

In [None]:
from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator

# `sentence_transformers.evaluation` has no `CrossEncoderEvaluator`, and no
# evaluator accepts a `main_score_function`, so the previous import failed.
# Cross-encoder evaluators live in `sentence_transformers.cross_encoder.evaluation`.
#
# With 0/1 labels, `CEBinaryClassificationEvaluator` is the matching choice: it
# scores the validation pairs and returns Average Precision as the value that
# `fit(..., save_best_model=True)` maximises. It also logs its metrics to a CSV
# prefixed with the evaluator class name
# (`CEBinaryClassificationEvaluator_val_results.csv`) in the output directory.
evaluator = CEBinaryClassificationEvaluator.from_input_examples(
    val_examples,
    name='val',
)

## 7. Instantiate & Fine-Tune CrossEncoder

cross-encoder/ms-marco-MiniLM-L-6-v2 is a 6-layer MiniLM distilled into a cross-encoder architecture and pretrained on the MS MARCO passage ranking task. It takes a paired input (e.g. user context + book text) and produces a single relevance score via full token-level attention.

**Justification**

- **Ranking-tuned pretraining:** Its MS MARCO heritage means it already knows how to judge fine-grained relevance patterns, which is crucial for matching nuanced book descriptions to user tastes.
- **Speed–quality sweet spot:** At ~60 MB and with inference under 15 ms per candidate, it delivers ~90–95% of full BERT-base accuracy, keeping end-to-end latency low.
- **Efficient fine-tuning:** Requires only 2–3 epochs over the ~220 K (user, book) training pairs to adapt deeply to book-domain language, making rapid iteration feasible.
- **Compact & deployable:** Its small footprint simplifies packaging, loading, and scaling in production environments with moderate memory and compute budgets.

In [11]:
# Training hyper-parameters.
epochs, batch_size, learning_rate = 3, 16, 2e-5

# Optimizer steps in one pass over the training DataLoader.
num_train_steps_per_epoch = len(train_dataloader)
print(f"Number of training steps per epoch: {num_train_steps_per_epoch}")

# Linear warmup over the first 10% of a single epoch (not of the whole run).
warmup_ratio = 0.10
actual_warmup_steps = int(warmup_ratio * num_train_steps_per_epoch)
print(f"Calculated warmup steps (10% of one epoch): {actual_warmup_steps}")

# With a very small training set the truncation above can give 0; use at least
# one warmup step whenever any training will actually happen.
needs_minimum_warmup = num_train_steps_per_epoch > 0 and actual_warmup_steps == 0
if needs_minimum_warmup:
    actual_warmup_steps = 1
    print("Adjusted warmup steps to 1 as calculated value was 0.")




Number of training steps per epoch: 13898
Calculated warmup steps (10% of one epoch): 1389


In [None]:
from sentence_transformers import CrossEncoder

# Start from the same MS MARCO checkpoint as the baseline, with a single
# regression-style output (`num_labels=1`) used as the relevance score.
# NOTE(review): the baseline cell evaluates with max_length=256 while fine-tuning
# uses 384 - confirm the comparison is meant to use different truncation lengths.
model = CrossEncoder(
    'cross-encoder/ms-marco-MiniLM-L-6-v2',
    num_labels=1,
    max_length=384,
)

# `CrossEncoder.fit` has no `learning_rate`, `early_stopping` or
# `early_stopping_patience` parameters; passing them raises TypeError before any
# training starts. The learning rate is supplied through `optimizer_params`.
# Early stopping is not offered by `fit`; with one evaluation per epoch over
# 3 epochs a patience of 3 could never have triggered anyway. The best-scoring
# checkpoint is still kept via `save_best_model=True`.
model.fit(
    train_dataloader=train_dataloader,
    evaluator=evaluator,
    epochs=epochs,
    optimizer_params={'lr': learning_rate},
    scheduler='WarmupLinear',
    warmup_steps=actual_warmup_steps,
    evaluation_steps=num_train_steps_per_epoch,  # evaluate once per epoch's worth of steps
    output_path=output_model_dir,
    save_best_model=True,
    use_amp=True,
)

In [None]:
import glob
import os
import pandas as pd
import matplotlib.pyplot as plt

# --- Plotting Validation Performance for CrossEncoder ---
# Reads the CSV the validation evaluator wrote during `model.fit()` and plots the
# validation score against training steps and against epochs.
print("\n--- Plotting CrossEncoder Validation Performance ---")

# Same directory that was passed as `output_path` to model.fit().
output_model_dir_from_training = output_model_dir

# The evaluator was named 'val' in the evaluator cell.
evaluator_name = 'val'
eval_filename = f"{evaluator_name}_results.csv"

# Locate the results file. It may sit directly in the output directory or in an
# `eval/` sub-folder, and sentence-transformers evaluators usually prefix the
# file with their class name (e.g. `CEBinaryClassificationEvaluator_val_results.csv`),
# so both the bare name and any `*_val_results.csv` are accepted.
candidate_eval_files = []
for search_dir in (
    output_model_dir_from_training,
    os.path.join(output_model_dir_from_training, "eval"),
):
    exact_path = os.path.join(search_dir, eval_filename)
    if os.path.exists(exact_path):
        candidate_eval_files.append(exact_path)
    prefixed_pattern = os.path.join(glob.escape(search_dir), f"*_{eval_filename}")
    candidate_eval_files.extend(sorted(glob.glob(prefixed_pattern)))

eval_filepath = candidate_eval_files[0] if candidate_eval_files else None

# Column holding the main validation score; the first name present in the CSV wins.
# Extend this list after inspecting the CSV header if none of them match.
potential_score_column_names = [
    f'{evaluator_name}_ndcg',
    'score',
    'ndcg',
    'Average_Precision',  # written by CEBinaryClassificationEvaluator
]

if eval_filepath is None:
    print(f"Evaluation results file not found. Checked paths ending with {eval_filename}")
    print("Plotting skipped. Ensure `output_path` was set in `model.fit()` and training completed at least one evaluation step.")
else:
    try:
        eval_results_df = pd.read_csv(eval_filepath)
        print(f"Successfully loaded evaluation results from: {eval_filepath}")
        print("Columns in evaluation CSV:", eval_results_df.columns.tolist())

        score_column = next(
            (name for name in potential_score_column_names if name in eval_results_df.columns),
            None,
        )

        missing_cols = [name for name in ('epoch', 'steps') if name not in eval_results_df.columns]
        if score_column is None:
            missing_cols.append(f"one of {potential_score_column_names}")

        if missing_cols:
            print(f"Required columns {missing_cols} not found in {eval_filepath}. Cannot plot.")
        else:
            score_label = score_column.replace("_", " ").title()
            val_fig, (ax_steps, ax_epoch) = plt.subplots(1, 2, figsize=(12, 6))

            # Left panel: score at every logged evaluation, by training step.
            ax_steps.plot(eval_results_df['steps'], eval_results_df[score_column], marker='o', linestyle='-')
            ax_steps.set_title(f'Validation {score_label} vs. Training Steps')
            ax_steps.set_xlabel('Training Steps')
            ax_steps.set_ylabel(score_label)
            ax_steps.grid(True)

            # Right panel: one point per epoch, taken from the row with the
            # largest step count logged within that epoch.
            epoch_end_evals = eval_results_df.loc[eval_results_df.groupby('epoch')['steps'].idxmax()]
            ax_epoch.plot(epoch_end_evals['epoch'], epoch_end_evals[score_column], marker='o', linestyle='-')
            ax_epoch.set_title(f'Validation {score_label} vs. Epoch')
            ax_epoch.set_xlabel('Epoch')
            ax_epoch.set_ylabel(score_label)
            ax_epoch.set_xticks(epoch_end_evals['epoch'].unique())
            ax_epoch.grid(True)

            val_fig.tight_layout()

            # Save the figure next to the model artefacts.
            plot_save_path = os.path.join(output_model_dir_from_training, f'{evaluator_name}_performance_plot.png')
            val_fig.savefig(plot_save_path)
            print(f"Validation plot saved to: {plot_save_path}")
            plt.show()
    except Exception as e:
        print(f"An error occurred during plotting: {e}")


## 8. Evaluation on Test Set

In [None]:
# --- Fine-Tuned Model: Test-Set Evaluation ---
# Scores the held-out test pairs with the fine-tuned `model` and stores global
# (ROC AUC, AP) and per-user (MAP, NDCG@3/5/10) metrics in the `finetuned_*`
# variables that the comparison cell reads.
from sklearn.metrics import roc_auc_score, average_precision_score, ndcg_score
import pandas as pd
import numpy as np

print("\n--- Evaluating Fine-Tuned CrossEncoder Model on Test Set ---")

# Metrics stay None when evaluation is skipped or fails part-way.
finetuned_global_roc_auc = None
finetuned_global_ap = None
finetuned_map = None
finetuned_ndcg_3 = None
finetuned_ndcg_5 = None
finetuned_ndcg_10 = None

if 'test_df' not in locals() or not isinstance(test_df, pd.DataFrame) or test_df.empty:
    print("Error: `test_df` is not defined, not a DataFrame, or is empty.")
    print("Please ensure `test_df` is available for final model evaluation.")
elif 'user_ctx' not in test_df.columns or 'book_text' not in test_df.columns or 'label' not in test_df.columns:
    print("Error: `test_df` is missing one or more required columns: 'user_ctx', 'book_text', 'label'.")
else:
    # 1. Prepare test pairs for prediction
    test_pairs_for_model = [
        [row.user_ctx, row.book_text] for row in test_df.itertuples(index=False)
    ]

    print("Making predictions with the fine-tuned model...")
    # 2. Get predictions from the fine-tuned model.
    # NOTE(review): `model` is the in-memory instance left by the training cell; if
    # the best checkpoint written to `output_model_dir` should be evaluated instead,
    # reload it before this cell - confirm which one is intended.
    try:
        model_scores = model.predict(test_pairs_for_model, show_progress_bar=True)
        true_labels = test_df['label'].values

        if len(model_scores) != len(true_labels):
            print(f"Error: Mismatch in length between model_scores ({len(model_scores)}) and true_labels ({len(true_labels)}). Evaluation skipped.")
        else:
            print(f"\n--- Fine-Tuned Model Test Set Results ---")

            # 3. Global metrics: every (user, book) pair pooled into one ranking.
            finetuned_global_roc_auc = roc_auc_score(true_labels, model_scores)
            finetuned_global_ap = average_precision_score(true_labels, model_scores)
            print(f"Global ROC AUC: {finetuned_global_roc_auc:.4f}")
            print(f"Global Average Precision (across all items): {finetuned_global_ap:.4f}")

            # 4. Per-user metrics (MAP, NDCG@3, NDCG@5, NDCG@10)
            user_ap_scores_ft = []
            user_ndcg_scores_at_3_ft = []
            user_ndcg_scores_at_5_ft = []
            user_ndcg_scores_at_10_ft = []
            ndcg_buckets_ft = {
                3: user_ndcg_scores_at_3_ft,
                5: user_ndcg_scores_at_5_ft,
                10: user_ndcg_scores_at_10_ft,
            }

            if 'user_id' not in test_df.columns:
                print("Error: 'user_id' column not found in test_df. Cannot calculate per-user metrics.")
            else:
                evaluation_df_ft = pd.DataFrame({
                    'user_id': test_df['user_id'],
                    'label': true_labels,
                    'score': model_scores
                })

                # One pass over the frame; sort=False keeps users in first-appearance
                # order, matching iteration over `.unique()`.
                for _, user_data in evaluation_df_ft.groupby('user_id', sort=False):
                    y_true_user = user_data['label'].to_numpy()
                    if y_true_user.sum() <= 0:
                        continue  # AP / NDCG are undefined without a positive item

                    y_score_user = user_data['score'].to_numpy()
                    user_ap_scores_ft.append(average_precision_score(y_true_user, y_score_user))

                    n_candidates = len(user_data)
                    for k, bucket in ndcg_buckets_ft.items():
                        if n_candidates < 2:
                            # Recent scikit-learn versions raise on single-document
                            # queries; a lone relevant candidate is trivially ranked
                            # perfectly, so record 1.0 instead of aborting everything.
                            bucket.append(1.0)
                        else:
                            bucket.append(
                                ndcg_score([y_true_user], [y_score_user], k=min(k, n_candidates))
                            )

                if user_ap_scores_ft:
                    finetuned_map = np.mean(user_ap_scores_ft)
                    print(f"Mean Average Precision (MAP): {finetuned_map:.4f}")
                else:
                    print("MAP could not be calculated.")

                mean_ndcg_ft = {}
                for k, bucket in ndcg_buckets_ft.items():
                    if bucket:
                        mean_ndcg_ft[k] = np.mean(bucket)
                        print(f"Mean Per-User NDCG@{k}: {mean_ndcg_ft[k]:.4f}")
                    else:
                        print(f"Mean Per-User NDCG@{k} could not be calculated.")
                finetuned_ndcg_3 = mean_ndcg_ft.get(3)
                finetuned_ndcg_5 = mean_ndcg_ft.get(5)
                finetuned_ndcg_10 = mean_ndcg_ft.get(10)
    except Exception as e:
        print(f"Error during prediction or evaluation with the fine-tuned model: {e}")

print("--- Fine-Tuned Model Evaluation Finished ---")

# You can now verify the variables:
print("\n--- Saved Metrics for Fine-Tuned Model (for later comparison) ---")
print(f"finetuned_global_roc_auc: {finetuned_global_roc_auc}")
print(f"finetuned_global_ap: {finetuned_global_ap}")
print(f"finetuned_map: {finetuned_map}")
print(f"finetuned_ndcg_3: {finetuned_ndcg_3}")
print(f"finetuned_ndcg_5: {finetuned_ndcg_5}")
print(f"finetuned_ndcg_10: {finetuned_ndcg_10}")

## 9. Comparison

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# --- Metrics from Baseline Model ---
baseline_metrics = {
    'Global ROC AUC': baseline_global_roc_auc,
    'Global AP': baseline_global_ap,
    'MAP': baseline_map,
    'NDCG@3': baseline_ndcg_3,
    'NDCG@5': baseline_ndcg_5,
    'NDCG@10': baseline_ndcg_10
}

# --- Metrics from Fine-Tuned Model ---
finetuned_metrics = {
    'Global ROC AUC': finetuned_global_roc_auc,
    'Global AP': finetuned_global_ap,
    'MAP': finetuned_map,
    'NDCG@3': finetuned_ndcg_3,
    'NDCG@5': finetuned_ndcg_5,
    'NDCG@10': finetuned_ndcg_10
}

# 1. Create a Comparison Table
metrics_data = {
    'Metric': list(baseline_metrics.keys()),
    'Baseline': list(baseline_metrics.values()),
    'Fine-Tuned': list(finetuned_metrics.values())
}
comparison_df = pd.DataFrame(metrics_data)
# A metric is None when its evaluation cell failed or was skipped; coerce to NaN
# so the arithmetic and the plots below degrade gracefully instead of raising.
for score_col in ('Baseline', 'Fine-Tuned'):
    comparison_df[score_col] = pd.to_numeric(comparison_df[score_col], errors='coerce')
comparison_df['Improvement'] = comparison_df['Fine-Tuned'] - comparison_df['Baseline']
comparison_df['Improvement (%)'] = (comparison_df['Improvement'] / comparison_df['Baseline']) * 100
comparison_df['Improvement (%)'] = comparison_df['Improvement (%)'].round(2)

print("--- Model Performance Comparison ---")
print(comparison_df.to_string(index=False))
print("\n" + "="*50 + "\n")


# 2. Generate Bar Charts for Visual Comparison
metric_names = list(baseline_metrics.keys())
baseline_values = comparison_df['Baseline'].tolist()
finetuned_values = comparison_df['Fine-Tuned'].tolist()
bar_width = 0.35


def autolabel(rects):
    """Write each bar's height above it, on the axes that bar belongs to.

    Uses `rect.axes` rather than a global axes object so labels always land on
    the chart that owns the bars. Bars with a non-finite height (missing metric)
    are left unlabeled.
    """
    for rect in rects:
        height = rect.get_height()
        if not np.isfinite(height):
            continue
        rect.axes.annotate(f'{height:.4f}',
                           xy=(rect.get_x() + rect.get_width() / 2, height),
                           xytext=(0, 3),
                           textcoords="offset points",
                           ha='center', va='bottom',
                           fontsize=8)


def plot_metric_comparison(labels, baseline_scores, finetuned_scores, title, figsize, tick_label_kwargs=None):
    """Draw a grouped bar chart of baseline vs. fine-tuned scores.

    Args:
        labels: Metric names, one per bar group.
        baseline_scores: Baseline values aligned with `labels`.
        finetuned_scores: Fine-tuned values aligned with `labels`.
        title: Chart title.
        figsize: Figure size passed to `plt.subplots`.
        tick_label_kwargs: Optional keyword arguments for `set_xticklabels`
            (e.g. rotation).

    Returns:
        The `(figure, axes)` pair of the chart.
    """
    positions = np.arange(len(labels))
    figure, axes = plt.subplots(figsize=figsize)
    baseline_bars = axes.bar(positions - bar_width/2, baseline_scores, bar_width, label='Baseline', color='skyblue')
    finetuned_bars = axes.bar(positions + bar_width/2, finetuned_scores, bar_width, label='Fine-Tuned', color='coral')

    axes.set_ylabel('Scores')
    axes.set_title(title)
    axes.set_xticks(positions)
    axes.set_xticklabels(labels, **(tick_label_kwargs or {}))
    axes.legend()

    autolabel(baseline_bars)
    autolabel(finetuned_bars)

    figure.tight_layout()
    return figure, axes


# All six metrics side by side.
fig, ax = plot_metric_comparison(
    metric_names,
    baseline_values,
    finetuned_values,
    'Baseline vs. Fine-Tuned Model Performance by Metric',
    (12, 7),
    {'rotation': 45, 'ha': 'right'},
)
plt.show()

# Per-user ranking metrics only.
key_per_user_metrics = ['MAP', 'NDCG@3', 'NDCG@5', 'NDCG@10']
baseline_by_metric = dict(zip(metric_names, baseline_values))
finetuned_by_metric = dict(zip(metric_names, finetuned_values))
key_baseline_values = [baseline_by_metric[m] for m in key_per_user_metrics]
key_finetuned_values = [finetuned_by_metric[m] for m in key_per_user_metrics]

fig_key, ax_key = plot_metric_comparison(
    key_per_user_metrics,
    key_baseline_values,
    key_finetuned_values,
    'Key Per-User Metrics: Baseline vs. Fine-Tuned',
    (10, 6),
)
plt.show()


## 10. Save & Load Model

In [None]:
# Model is already saved during fit: the training cell passes
# `output_path=output_model_dir` together with `save_best_model=True`.
# To load it back from that directory:
# NOTE(review): training constructed the model with max_length=384; confirm
# whether the same max_length should be passed here so inference truncates
# inputs the same way.
from sentence_transformers import CrossEncoder
loaded_model = CrossEncoder(output_model_dir)


## 11. Inference Example

In [None]:
# Rerank the candidate books of a single user with the reloaded model.
# The placeholder `...` values that used to be here put a literal Ellipsis object
# into the lists, which makes `predict` fail; a real user from the held-out test
# split is used instead so the cell runs end-to-end.
example_user_rows = test_df[test_df['user_id'] == test_df['user_id'].iloc[0]]

# presumably every row of a user carries the same `user_ctx` - verify against the data
user_ctx = example_user_rows['user_ctx'].iloc[0]
candidate_texts = example_user_rows['book_text'].tolist()
pairs = [[user_ctx, txt] for txt in candidate_texts]
scores = loaded_model.predict(pairs)

# Rerank
# `book_id` was dropped when the data was loaded, so each candidate is identified
# by the leading part of its text. Building `candidates` from `candidate_texts`
# keeps it aligned one-to-one with `scores`.
candidates = [txt[:80] for txt in candidate_texts]
ranked = sorted(zip(candidates, scores), key=lambda x: x[1], reverse=True)
print(ranked[:10])