# Cross-Encoder Fine-Tuning & Evaluation Notebook

This notebook outlines the end-to-end process for fine-tuning a Sentence-Transformers `CrossEncoder`.

## 1. Install & Import Dependencies

In [None]:
import os
import random
import pandas as pd
from sentence_transformers import CrossEncoder, InputExample
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score, roc_auc_score, average_precision_score

In [1]:
import os
import boto3
from botocore.config import Config

def download_from_r2(object_name, local_path, bucket_name="bookdbio"):
    """Download one object from the Cloudflare R2 bucket to a local file.

    Credentials are read from the environment rather than being embedded in
    the notebook: ``R2_ACCESS_KEY_ID`` and ``R2_SECRET_ACCESS_KEY`` are
    required, ``R2_ENDPOINT_URL`` is optional and falls back to the account
    endpoint used by this project. Any key pair that was previously committed
    in this cell should be considered leaked and rotated.

    Args:
        object_name: Key of the object inside the bucket.
        local_path: Destination file path; missing parent directories are
            created.
        bucket_name: Bucket to read from.

    Returns:
        None. The outcome of the transfer itself is reported via ``print``
        (a failed download is best-effort and does not raise).

    Raises:
        RuntimeError: If a required credential environment variable is unset.
    """
    # Validate configuration first so a missing secret fails loudly and early,
    # before any directory is created or any network client is built.
    required_vars = ("R2_ACCESS_KEY_ID", "R2_SECRET_ACCESS_KEY")
    missing = [var for var in required_vars if not os.environ.get(var)]
    if missing:
        raise RuntimeError(
            "Missing R2 credentials; set environment variable(s): " + ", ".join(missing)
        )

    endpoint_url = os.environ.get(
        "R2_ENDPOINT_URL",
        "https://a9a190ee80813000e18bacf626b1281b.r2.cloudflarestorage.com/",
    )

    # Ensure the parent directory exists (exist_ok makes a pre-check unnecessary).
    parent_dir = os.path.dirname(local_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    s3 = boto3.client(
        's3',
        endpoint_url=endpoint_url,
        aws_access_key_id=os.environ["R2_ACCESS_KEY_ID"],
        aws_secret_access_key=os.environ["R2_SECRET_ACCESS_KEY"],
        config=Config(signature_version='s3v4'),
    )

    try:
        s3.download_file(bucket_name, object_name, local_path)
        print(f"Successfully downloaded {object_name} to {local_path}")
    except Exception as e:
        # Deliberately best-effort: report the failure and let the notebook continue.
        print(f"Download failed for {object_name}: {e}")

In [2]:
# Fetch the zipped training-pairs archive from the bucket into ./data.
# NOTE(review): this saves a .zip file, while the loading cells further down
# read from the directory data/training_pairs.parquet/ — presumably the archive
# is extracted by hand in between; confirm and add an explicit unzip step.
download_from_r2("data/training_pairs.parquet.zip", "data/training_pairs.parquet.zip")

Successfully downloaded data/training_pairs.parquet.zip to data/training_pairs.parquet.zip


In [10]:
import dask.dataframe as dd

# Directory holding the partitioned Parquet dataset. It must end with a slash
# because the part paths below are built by plain string concatenation.
base_path = 'data/training_pairs.parquet/'

# Number of leading partitions to read: part.0.parquet ... part.12.parquet.
NUM_PARTS_TO_LOAD = 13
parts_to_load = [f"{base_path}part.{part_idx}.parquet" for part_idx in range(NUM_PARTS_TO_LOAD)]

# Build the Dask DataFrame over the selected part files only.
df_dd = dd.read_parquet(parts_to_load)

# Materialise everything as a single pandas DataFrame; the cells that follow
# operate on `training_pairs_df`.
training_pairs_df = df_dd.compute()

print(f"Dask DataFrame loaded with {df_dd.npartitions} partitions.")

: 

In [3]:
# Preview the first rows of the combined frame (displayed as the cell's last expression).
training_pairs_df.head(20)

Unnamed: 0,user_id,book_id,user_ctx,book_text,label
0,001af7947e217e17694c5a9c097afffb,57854,Favorite books: Tao Te Ching by Lao Tzu and Gi...,"Title: Tao Te Ching | Genres: history, literat...",1
1,001af7947e217e17694c5a9c097afffb,15808287,Favorite books: Tao Te Ching by Lao Tzu and Gi...,Title: Mrs. Lincoln's Dressmaker | Genres: bio...,0
2,001af7947e217e17694c5a9c097afffb,3692,Favorite books: Tao Te Ching by Lao Tzu and Gi...,Title: The Heart of the Matter | Genres: conte...,0
3,001af7947e217e17694c5a9c097afffb,603515,Favorite books: Tao Te Ching by Lao Tzu and Gi...,"Title: The Hound of Rowan (The Tapestry, #1) |...",0
4,001af7947e217e17694c5a9c097afffb,34,Favorite books: Tao Te Ching by Lao Tzu and Gi...,Title: The Fellowship of the Ring (The Lord of...,1
5,001af7947e217e17694c5a9c097afffb,73965,Favorite books: Tao Te Ching by Lao Tzu and Gi...,Title: Drinking: A Love Story | Genres: biogra...,0
6,001af7947e217e17694c5a9c097afffb,1215919,Favorite books: Tao Te Ching by Lao Tzu and Gi...,Title: Highlander Untamed (MacLeods of Skye Tr...,0
7,001af7947e217e17694c5a9c097afffb,218038,Favorite books: Tao Te Ching by Lao Tzu and Gi...,"Title: All About Love (Cynster, #6) | Genres: ...",0
8,001af7947e217e17694c5a9c097afffb,7332,Favorite books: Tao Te Ching by Lao Tzu and Gi...,"Title: The Silmarillion | Genres: adventure, a...",1
9,001af7947e217e17694c5a9c097afffb,455930,Favorite books: Tao Te Ching by Lao Tzu and Gi...,"Title: Echo Burning (Jack Reacher, #5) | Genre...",0


In [4]:
# Report the (rows, columns) shape of the combined frame.
print(f"Shape of loaded data: {training_pairs_df.shape}")

Shape of loaded data: (2862473, 5)


In [5]:
import pandas as pd
import dask.dataframe as dd

def _show(heading, value):
    """Print a heading line followed by the value it introduces."""
    print(heading)
    print(value)


# Read only the first partition and materialise it in pandas for a quick look.
training_pairs = dd.read_parquet('data/training_pairs.parquet/part.0.parquet')
training_pairs_pd = training_pairs.compute()

# Size and schema.
print("Shape of the dataset:", training_pairs_pd.shape)
print("\nColumns:", training_pairs_pd.columns.tolist())

# A few rows, the class balance, and one raw example of each text field.
_show("\nSample of the data:", training_pairs_pd.head())
_show("\nLabel distribution:", training_pairs_pd['label'].value_counts())
_show("\nExample user context:", training_pairs_pd['user_ctx'].iloc[0])
_show("\nExample book text:", training_pairs_pd['book_text'].iloc[0])

Shape of the dataset: (936681, 5)

Columns: ['user_id', 'book_id', 'user_ctx', 'book_text', 'label']

Sample of the data:
                            user_id   book_id  \
0  001af7947e217e17694c5a9c097afffb     57854   
1  001af7947e217e17694c5a9c097afffb  15808287   
2  001af7947e217e17694c5a9c097afffb      3692   
3  001af7947e217e17694c5a9c097afffb    603515   
4  001af7947e217e17694c5a9c097afffb        34   

                                            user_ctx  \
0  Favorite books: Tao Te Ching by Lao Tzu and Gi...   
1  Favorite books: Tao Te Ching by Lao Tzu and Gi...   
2  Favorite books: Tao Te Ching by Lao Tzu and Gi...   
3  Favorite books: Tao Te Ching by Lao Tzu and Gi...   
4  Favorite books: Tao Te Ching by Lao Tzu and Gi...   

                                           book_text  label  
0  Title: Tao Te Ching | Genres: history, literat...      1  
1  Title: Mrs. Lincoln's Dressmaker | Genres: bio...      0  
2  Title: The Heart of the Matter | Genres: conte...      0 

In [7]:
# Tally, for every user, how many of their pairs carry label == 1.
positive_counts = (
    training_pairs_df
    .loc[training_pairs_df['label'].eq(1)]
    .groupby('user_id')
    .size()
)
print("\nNumber of positive examples per user:")
print(positive_counts)

# Summary statistics of that per-user tally.
positive_summary = positive_counts.describe()
print("\nStatistics about positive examples per user:")
print(positive_summary)

# Users whose positive tally is below 5.
users_with_few_positives = positive_counts.loc[positive_counts.lt(5)]
print(f"\nNumber of users with less than 5 positive examples: {users_with_few_positives.size}")
print("\nExample users with few positives:")
print(users_with_few_positives.head())



Number of positive examples per user:
user_id
0005f52944ea1992e95d61f287acaea9     65
0006260f85929db85eddee3a0bd0e504     20
0006db397ebf02b2e891d1048fb70dbc    166
0006de2967df1ec4432c51090803966e     76
000883382802f2d95a3dd545bb953882    154
                                   ... 
1d364492146d00ceebf9b7ec4e7d45af    298
1d4a2185b490d26a3ab0faedc4adf6c7     42
1d6a5b005de5e4c27945dca1c13d47f2    121
1de44842d3080ec55181e46b0cf16ed1     78
1dfed2c58f01fe899666bd9a6ce319e1    155
Length: 5357, dtype: int64

Statistics about positive examples per user:
count    5357.000000
mean      136.083816
std       101.274552
min         1.000000
25%        77.000000
50%       109.000000
75%       165.000000
max      1478.000000
dtype: float64

Number of users with less than 5 positive examples: 30

Example users with few positives:
user_id
008c374625966c32477ebab37e835a4e    1
00e8157279aa30f4b919aea0a887f49a    2
01e2d286d0361edf8c62bc580d3baa18    1
02ac01d9ebc7165e80d8967f075adbd3    3
0378ae

In [None]:
import dask.dataframe as dd

# Get all unique user IDs
# NOTE(review): this reads `training_pairs_pd`, which was loaded from
# part.0.parquet only, so the result covers that single partition rather than
# the combined `training_pairs_df` — confirm this is intended.
user_ids = training_pairs_pd['user_id'].unique()

print(user_ids)

<ArrowStringArray>
['001af7947e217e17694c5a9c097afffb', '0006260f85929db85eddee3a0bd0e504',
 '000bcda59ab565512f51f9e1f531b5e5', '0005f52944ea1992e95d61f287acaea9',
 '000883382802f2d95a3dd545bb953882', '0006db397ebf02b2e891d1048fb70dbc',
 '0009b61b9879bb2e5b84ce24f43450c8', '00281bdc3b8dd584ca6c5cb867de959f',
 '0006de2967df1ec4432c51090803966e', '002c10ebc541a4303b4d2c0aa2bff335',
 ...
 '090ebce33f677e84d5ee0e8510996a15', '093a06eb1563ef6d2a6c443b5189db47',
 '0a97f788f5707a7f116f5cc16875597e', '0951f343eed8911a4451ae2fa80dc1f3',
 '08c70632f3c4ca2793d221f9d47037fb', '096eb5757df185c7793fac23085a5b62',
 '08c0a7ae8992a65d7792dd9c69b41369', '090fd9cea80dbfb06cf11885af8a1e38',
 '093ef1f64c3baa83e54d9e63a550369d', '089c7ad67ccf2f81c8dc476db53f3235']
Length: 1785, dtype: string


In [9]:
import pandas as pd
import random

# Build a compact, balanced training set from `training_pairs_df`
# (columns used: 'user_id', 'label'; all columns are carried through).
#
# For every user with enough positives, keep a capped number of positives and
# draw NEGATIVES_PER_POSITIVE distinct negatives per kept positive.

# --- Configuration ---
MIN_POSITIVES_TO_KEEP_USER = 3  # users with fewer positives are dropped entirely
MAX_POSITIVES_TO_SAMPLE = 3     # cap on positives kept per user
NEGATIVES_PER_POSITIVE = 3      # negatives drawn for each kept positive
RANDOM_SEED = 42                # for reproducibility

random.seed(RANDOM_SEED)
print(f"Original DataFrame shape: {training_pairs_df.shape}")

# --- Step 1: Identify positive and negative interactions ---
df_positives = training_pairs_df[training_pairs_df['label'] == 1]
df_negatives = training_pairs_df[training_pairs_df['label'] == 0]

# Group the negatives once up front; filtering the full frame per user inside
# the loop would rescan every row for every user.
negatives_by_user = {
    uid: user_negs for uid, user_negs in df_negatives.groupby('user_id', sort=False)
}

# --- Step 2: Per user, select positives and matching negatives ---
selected_positive_samples = []  # positives only (used for the summary prints)
sampled_frames = []             # positives and negatives, in final row order

for user_id, user_positive_df in df_positives.groupby('user_id', sort=False):
    if len(user_positive_df) < MIN_POSITIVES_TO_KEEP_USER:
        continue  # too few positives: drop the user

    if len(user_positive_df) > MAX_POSITIVES_TO_SAMPLE:
        user_positive_df = user_positive_df.sample(
            n=MAX_POSITIVES_TO_SAMPLE, random_state=RANDOM_SEED
        )
    selected_positive_samples.append(user_positive_df)
    sampled_frames.append(user_positive_df)

    user_negative_df = negatives_by_user.get(user_id)
    if user_negative_df is None:
        continue  # user has no negatives to draw from

    # Draw all of this user's negatives in ONE call, without replacement.
    # Sampling 3 rows separately for each positive with the same random_state
    # returns the same 3 rows every time, i.e. duplicated negatives.
    num_negs_to_sample = min(
        NEGATIVES_PER_POSITIVE * len(user_positive_df), len(user_negative_df)
    )
    if num_negs_to_sample > 0:
        sampled_frames.append(
            user_negative_df.sample(n=num_negs_to_sample, random_state=RANDOM_SEED)
        )

# --- Step 3: Assemble the result frames ---
if selected_positive_samples:
    final_positives_df = pd.concat(selected_positive_samples).reset_index(drop=True)
    processed_df = pd.concat(sampled_frames, ignore_index=True)
else:
    # No user met the criteria: keep the schema so the summaries below still work.
    final_positives_df = pd.DataFrame(columns=training_pairs_df.columns)
    processed_df = pd.DataFrame(columns=training_pairs_df.columns)

print(f"Number of positive samples after filtering/sampling: {len(final_positives_df)}")
print(f"Number of unique users after filtering/sampling positives: {final_positives_df['user_id'].nunique()}")

if not processed_df.empty:
    # Ensure correct dtypes, especially for label
    processed_df['label'] = processed_df['label'].astype(int)
    print(f"Final processed DataFrame shape: {processed_df.shape}")
    print(f"Label distribution in final DataFrame:\n{processed_df['label'].value_counts(normalize=True)}")
    print(f"Number of unique users in final DataFrame: {processed_df['user_id'].nunique()}")
else:
    print("No samples met the criteria. The processed DataFrame is empty.")

# `processed_df` is the sampled dataset; split it by user (not by row) when
# building train/validation/test sets so no user appears in more than one split.

Original DataFrame shape: (2862473, 5)
Number of positive samples after filtering/sampling: 16017
Number of unique users after filtering/sampling positives: 5339
Final processed DataFrame shape: (64068, 5)
Label distribution in final DataFrame:
label
0    0.75
1    0.25
Name: proportion, dtype: float64
Number of unique users in final DataFrame: 5339


## 2. Configuration

In [None]:
# Paths
data_path = 'data/training_pairs.parquet'
# Directory the fine-tuned model is written to (passed as `output_path` to
# `model.fit` and later used to reload the model).
# The original assignment had no right-hand side, which is a SyntaxError; this
# value is a placeholder — TODO confirm the intended location.
output_model_dir = 'models/cross_encoder_finetuned'

# Training hyper-parameters
epochs = 2
batch_size = 16
learning_rate = 2e-5
warmup_steps = 100
random_seed = 42

In [None]:
# Reproducibility: seed every random number generator the pipeline touches.
# Seeding only Python's `random` leaves NumPy (used by pandas / scikit-learn)
# and PyTorch (DataLoader shuffling, dropout, weight init) unseeded.
import numpy as np
import torch

random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)

## 3. Load & Split Data

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd

# NOTE(review): this reads the full dataset at `data_path`, not the sampled
# `processed_df` built earlier in the notebook — confirm which one is meant to
# be used for training.

# 1) Read every pair; `book_id` is not needed downstream.
df = pd.read_parquet(data_path).drop(columns='book_id')

# 2) Splitting is done on users, so all of a user's pairs land in one split.
users = df['user_id'].unique()

# 3) 80% of users for training, the remaining 20% held out ...
train_users, temp_users = train_test_split(users, test_size=0.2, random_state=random_seed)

# 4) ... and the held-out users halved into validation (10%) and test (10%).
val_users, test_users = train_test_split(temp_users, test_size=0.5, random_state=random_seed)


def _pairs_for(user_subset):
    """Return the rows of `df` whose user_id is in `user_subset`, re-indexed from 0."""
    belongs = df['user_id'].isin(user_subset)
    return df[belongs].reset_index(drop=True)


# 5) One DataFrame per split.
train_df, val_df, test_df = (
    _pairs_for(subset) for subset in (train_users, val_users, test_users)
)

print(f"Train pairs: {len(train_df)}, Val pairs: {len(val_df)}, Test pairs: {len(test_df)}")

## 4. Prepare InputExamples & DataLoaders

In [None]:
def _to_input_examples(pairs_df):
    """Wrap each row as InputExample(texts=[user_ctx, book_text], label=float(label))."""
    columns = zip(pairs_df['user_ctx'], pairs_df['book_text'], pairs_df['label'])
    return [
        InputExample(texts=[ctx, book], label=float(target))
        for ctx, book, target in columns
    ]


# One list of InputExample per split.
train_examples = _to_input_examples(train_df)
val_examples = _to_input_examples(val_df)
test_examples = _to_input_examples(test_df)

# Only the training loader shuffles; validation and test keep row order.
train_dataloader = DataLoader(train_examples, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_examples, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(test_examples, batch_size=batch_size, shuffle=False)


## 5. Define Validation Evaluator, Instantiate & Fine-Tune CrossEncoder

cross-encoder/ms-marco-MiniLM-L-6-v2 is a 6-layer MiniLM distilled into a cross-encoder architecture and pretrained on the MS MARCO passage ranking task. It takes a paired input (e.g. user context + book text) and produces a single relevance score via full token-level attention.

**Justification**

- **Ranking-Tuned Pretraining:** Its MS MARCO heritage means it already knows how to judge fine-grained relevance patterns, which is crucial for matching nuanced book descriptions to user tastes.
- **Speed-Quality Sweet Spot:** At ~60 MB and with inference under 15 ms per candidate, it delivers ~90–95% of full BERT-base accuracy, keeping end-to-end latency low.
- **Efficient Fine-Tuning:** It requires only 2–3 epochs over ~150K (user, book) pairs to adapt deeply to book-domain language, making rapid iteration feasible.
- **Compact & Deployable:** Its small footprint simplifies packaging, loading, and scaling in production environments with moderate memory and compute budgets.

In [None]:
from sentence_transformers import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator

# num_labels=1 gives a single relevance logit per (user_ctx, book_text) pair.
# The loss is NOT a constructor argument: `fit` accepts an optional `loss_fct`
# callable and, when it is omitted with num_labels == 1, uses a binary
# cross-entropy-with-logits loss, which matches the 0/1 labels used here.
model = CrossEncoder(
    'cross-encoder/ms-marco-MiniLM-L-6-v2',
    num_labels=1,
    max_length=256,
)

# Validation evaluator for binary labels, built from the validation
# InputExamples. Its main score is average precision; `fit` uses that score to
# decide when to overwrite the saved best model.
# NOTE(review): in sentence-transformers >= 4 this class is kept as a
# deprecated alias of CrossEncoderClassificationEvaluator — confirm against the
# installed version.
evaluator = CEBinaryClassificationEvaluator.from_input_examples(val_examples, name='val')

# The hyper-parameters from the configuration cell are passed explicitly;
# otherwise `fit` falls back to its own defaults (e.g. a single epoch).
# `fit` has no early-stopping option. It evaluates after every epoch, so no
# `evaluation_steps` is needed, and with `save_best_model=True` the checkpoint
# with the best validation score is the one left in `output_path`.
model.fit(
    train_dataloader=train_dataloader,
    evaluator=evaluator,
    epochs=epochs,
    warmup_steps=warmup_steps,
    optimizer_params={'lr': learning_rate},
    use_amp=True,             # mixed-precision training
    output_path=output_model_dir,
    save_best_model=True,
)

## 6. Evaluation on Test Set

In [None]:
# Score every (user_ctx, book_text) pair of the held-out test users.
test_pairs = [[row.user_ctx, row.book_text] for row in test_df.itertuples()]
scores = model.predict(test_pairs)
labels = test_df['label'].values

# Pair-level classification metrics over the whole test set.
auc = roc_auc_score(labels, scores)
ap = average_precision_score(labels, scores)

# NDCG@10 is a per-query ranking metric: compute it separately for each user's
# candidate list and average, instead of ranking all users' pairs together as
# one giant list. Users with a single candidate (ndcg_score rejects them) or
# with no positive label (nothing to rank) are skipped.
scored_test_df = test_df.assign(score=scores)
per_user_ndcg = [
    ndcg_score([user_rows['label'].to_numpy()], [user_rows['score'].to_numpy()], k=10)
    for _, user_rows in scored_test_df.groupby('user_id', sort=False)
    if len(user_rows) > 1 and user_rows['label'].sum() > 0
]
ndcg = sum(per_user_ndcg) / len(per_user_ndcg) if per_user_ndcg else float('nan')

print(f"ROC AUC: {auc:.4f}")
print(f"Average Precision: {ap:.4f}")
print(f"NDCG@10: {ndcg:.4f}")


## 7. Save & Load Model

In [None]:
# Model is already saved during fit
# (the fit call receives `output_model_dir` as its `output_path`).
# To load: build a new CrossEncoder from that same directory.
from sentence_transformers import CrossEncoder
loaded_model = CrossEncoder(output_model_dir)


## 8. Inference Example

In [None]:
# Given a single user and its candidates
# NOTE: this cell is a template. The bare `...` entries inside the two lists are
# literal Ellipsis objects, not strings — replace them (and the "..." text) with
# real values before running.
user_ctx = "Favorite books: ..."  # fetched or precomputed
candidate_texts = ["Title: ... Description: ...", ...]
# One [user_ctx, candidate_text] pair per candidate, scored by the reloaded model.
pairs = [[user_ctx, txt] for txt in candidate_texts]
scores = loaded_model.predict(pairs)

# Rerank
# `candidates` is zipped positionally with `scores`, so it must list the
# candidates in the same order as `candidate_texts`.
candidates = ['book1', 'book2', ...]
# Highest score first; show the top 10.
ranked = sorted(zip(candidates, scores), key=lambda x: x[1], reverse=True)
print(ranked[:10])