# Ranking with XGBoost


- Load Amazon Beauty train/valid/test interaction splits plus item metadata
- Perform 1:1 negative sampling, categorical encoding, and timestamp normalization
- Train/evaluate an XGBoost ranker with automatic GPU/CPU selection
- Persist the trained booster and metrics under a configurable output directory




In [2]:
import json
import os
import random
import subprocess
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.preprocessing import LabelEncoder

try:
    import xgboost as xgb
except ImportError as exc:
    raise ImportError(
        "Install xgboost (CPU or GPU build) before running."
    ) from exc

print("Using XGBoost version:", xgb.__version__)



Using XGBoost version: 3.1.2


In [3]:

# Configurable paths & hyperparameters 

DATA_DIR = os.environ.get("DATA_DIR", "dataset/amazon-beauty")
INTER_PREFIX = os.path.join(DATA_DIR, "amazon-beauty")
ITEM_FILE = os.environ.get("ITEM_FILE", f"{INTER_PREFIX}.item")
OUTPUT_DIR = os.environ.get("OUTPUT_DIR", "saved_models")
NUM_NEG = int(os.environ.get("NUM_NEG", 1))
SEED = int(os.environ.get("SEED", 42))
VALID_SEED = SEED + 1
TEST_SEED = SEED + 2
GPU_ID = int(os.environ.get("GPU_ID", 0))

os.makedirs(OUTPUT_DIR, exist_ok=True)
random.seed(SEED)
np.random.seed(SEED)

print(f"DATA_DIR: {DATA_DIR}")
print(f"ITEM_FILE: {ITEM_FILE}")
print(f"OUTPUT_DIR: {OUTPUT_DIR}")
print(f"NUM_NEG: {NUM_NEG}")
print(f"SEED: {SEED}")



DATA_DIR: dataset/amazon-beauty
ITEM_FILE: dataset/amazon-beauty/amazon-beauty.item
OUTPUT_DIR: saved_models
NUM_NEG: 1
SEED: 42


In [4]:
"""Read RecBole interaction split (.inter)."""
def load_interactions(split: str) -> pd.DataFrame:
    
    path = f"{INTER_PREFIX}-{split}.inter"
    df = pd.read_csv(path, sep="\t")
    if "label" not in df.columns:
        raise ValueError(f"'label' column not found in {path}")
    return df

"""Load item metadata and keep the fields used by the ranker."""
def load_item_features() -> pd.DataFrame:
    
    item_df = pd.read_csv(ITEM_FILE, sep="\t")
    rename_map = {
        "item_id:token": "item_id",
        "sales_rank:float": "sales_rank",
        "price:float": "price",
        "brand:token": "brand",
        "categories:token_seq": "categories",
    }
    item_df = item_df.rename(columns=rename_map)
    return item_df["item_id sales_rank price brand categories".split()]


def extract_primary_category(cat_str: str) -> str:
    if pd.isna(cat_str) or not str(cat_str).strip():
        return "Unknown"
    tokens = [c.strip().strip("'\"") for c in str(cat_str).split(",")]
    return tokens[0] if tokens else "Unknown"

"""Cache positive items per user across splits for neg sampling."""
def build_user_pos_items(df_list):
    
    user_pos = defaultdict(set)
    for df in df_list:
        for row in df[["user_id", "item_id"]].itertuples(index=False):
            user_pos[row.user_id].add(row.item_id)
    return user_pos

"""Uniform negative sampling per interaction (1 negative per positive)."""
def sample_negatives(df, user_pos_items, all_items, num_neg=1, seed=42):
    
    rng = random.Random(seed)
    negatives = []
    for row in df.itertuples(index=False):
        user = row.user_id
        for _ in range(num_neg):
            while True:
                neg_item = rng.choice(all_items)
                if neg_item not in user_pos_items[user]:
                    negatives.append(
                        {
                            "user_id": user,
                            "item_id": neg_item,
                            "timestamp": row.timestamp,
                            "label": 0,
                        }
                    )
                    break
    neg_df = pd.DataFrame(negatives)
    return pd.concat([df, neg_df], ignore_index=True)



In [5]:
def encode_features(train_df, valid_df, test_df, item_df):
    item_df = item_df.copy()
    item_df["primary_category"] = item_df["categories"].apply(extract_primary_category)
    item_df["price"] = pd.to_numeric(item_df["price"], errors="coerce")
    item_df["sales_rank"] = pd.to_numeric(item_df["sales_rank"], errors="coerce")
    item_df["price"] = item_df["price"].fillna(item_df["price"].median()) # why median: Price distributions are often right-skewed, mean is pulled up by outliers; median is not 
    item_df["sales_rank"] = item_df["sales_rank"].fillna(item_df["sales_rank"].median())
    item_df["brand"] = item_df["brand"].fillna("Unknown")
    item_df["primary_category"] = item_df["primary_category"].fillna("Unknown")

    def merge_item_feat(df):
        return df.merge(item_df, on="item_id", how="left")

    train_df = merge_item_feat(train_df)
    valid_df = merge_item_feat(valid_df)
    test_df = merge_item_feat(test_df)

    cat_cols = ["user_id", "item_id", "brand", "primary_category"]
    # Ensure every categorical column is string before concatenation/encoding
    for df in (train_df, valid_df, test_df):
        for col in cat_cols:
            df[col] = df[col].astype(str)

    encoders = {col: LabelEncoder() for col in cat_cols}
    combined = pd.concat([train_df[cat_cols], valid_df[cat_cols], test_df[cat_cols]])
    for col in cat_cols:
        # Step 1: FIT - Learn the mapping: This builds a vocabulary: {"user_123": 0, "user_456": 1, ...}
        encoders[col].fit(combined[col])
        # Step 2: TRANSFORM - Apply the mapping: This converts: "user_123" → 0, "user_456" → 1, ...
        train_df[col + "_idx"] = encoders[col].transform(train_df[col])
        valid_df[col + "_idx"] = encoders[col].transform(valid_df[col])
        test_df[col + "_idx"] = encoders[col].transform(test_df[col])

    for df in (train_df, valid_df, test_df):
        df["timestamp"] = pd.to_numeric(df["timestamp"], errors="coerce").fillna(0)
        df["timestamp_days"] = (df["timestamp"] / 86400).astype(np.float32) #converts Unix timestamps (seconds since epoch) to days, Other features (e.g., price, sales_rank) are on different scales(price: 25.99 small scale, if use timestamps is large scale:1388534400) 

    feature_cols = [
        "user_id_idx",
        "item_id_idx",
        "price",
        "sales_rank",
        "brand_idx",
        "primary_category_idx",
        "timestamp_days",
    ]

    return train_df, valid_df, test_df, feature_cols

"""Auto-select GPU or CPU device (XGBoost 3.1+ API)."""
def detect_device():
    try:
        result = subprocess.run(
            ["nvidia-smi"], capture_output=True, text=True, timeout=2, check=False
        )
        gpu_present = result.returncode == 0
    except FileNotFoundError:
        gpu_present = False
    except Exception:
        gpu_present = False

    if gpu_present:
        device = f"cuda:{GPU_ID}"
        print(f"GPU detected - using device={device}")
        return device
    else:
        print("No GPU detected - using device=cpu")
        return "cpu"


def train_xgboost(train_df, valid_df, feature_cols):
    train_dmatrix = xgb.DMatrix(train_df[feature_cols], label=train_df["label"])
    valid_dmatrix = xgb.DMatrix(valid_df[feature_cols], label=valid_df["label"])

    params = {
        "objective": "binary:logistic", # binary classification with logistic regression output (probabilities 0 to 1 via sigmoid)
        "eval_metric": ["auc", "logloss"], # area under ROC curve (AUC) (higher is better, 0-1), LogLoss (lower is better)
        "device": detect_device(), 
        "tree_method": "hist", # histogram method (works for both CPU and GPU in 3.1+)
        "eta": 0.05, # learning rate
        "max_depth": 8, # Maximum depth of each tree
        "subsample": 0.8, # Fraction of training samples used per tree (Uses 80% of rows per tree)
        "colsample_bytree": 0.8, # Fraction of features used per tree
        "min_child_weight": 3, #Minimum sum of instance weights (Hessian) in a child node, for binary classification, roughly minimum samples per leaf, prevents splits that create very small leaves
        "lambda": 1.0, # L2 regularization on leaf weights, penalizes large leaf values, prevents overfitting
    }
    evals = [(train_dmatrix, "train"), (valid_dmatrix, "valid")]
    model = xgb.train(
        params,
        train_dmatrix,
        num_boost_round=500,
        evals=evals,
        early_stopping_rounds=30,
        verbose_eval=50, # Print metrics every 50 rounds
    )
    return model


def evaluate_model(model, df, feature_cols):
    dmatrix = xgb.DMatrix(df[feature_cols])
    preds = model.predict(dmatrix)
    # Clip predictions to avoid log(0) issues
    preds_clipped = np.clip(preds, 1e-15, 1 - 1e-15)
    auc = roc_auc_score(df["label"], preds)
    ll = log_loss(df["label"], preds_clipped)
    return {"AUC": float(auc), "LogLoss": float(ll)}



In [6]:
def run_pipeline():
    print("Loading interactions + metadata")
    train_df = load_interactions("train")
    valid_df = load_interactions("valid")
    test_df = load_interactions("test")
    item_df = load_item_features()

    print(f"Sampling {NUM_NEG}:1 negatives per positive")
    all_items = item_df["item_id"].unique().tolist()
    user_pos_items = build_user_pos_items([train_df, valid_df, test_df])
    train_df_ns = sample_negatives(train_df, user_pos_items, all_items, NUM_NEG, SEED)
    valid_df_ns = sample_negatives(valid_df, user_pos_items, all_items, NUM_NEG, VALID_SEED)
    test_df_ns = sample_negatives(test_df, user_pos_items, all_items, NUM_NEG, TEST_SEED)

    print("Encoding categorical + numerical features")
    train_df_enc, valid_df_enc, test_df_enc, feature_cols = encode_features(
        train_df_ns, valid_df_ns, test_df_ns, item_df
    )

    print("Training XGBoost")
    model = train_xgboost(train_df_enc, valid_df_enc, feature_cols)

    print("Evaluating")
    train_metrics = evaluate_model(model, train_df_enc, feature_cols)
    valid_metrics = evaluate_model(model, valid_df_enc, feature_cols)
    test_metrics = evaluate_model(model, test_df_enc, feature_cols)

    results = {
        "train": train_metrics,
        "valid": valid_metrics,
        "test": test_metrics,
        "feature_cols": feature_cols,
        "params": {
            "num_neg": NUM_NEG,
            "seed": SEED,
            "tree_method": model.attributes().get("tree_method", "unknown"),
        },
    }

    model_path = os.path.join(OUTPUT_DIR, "ranking_xgboost.model")
    metrics_path = os.path.join(OUTPUT_DIR, "ranking_xgboost_results.json")

    model.save_model(model_path)
    with open(metrics_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)

    print(json.dumps(results, indent=2))
    print(f"\nModel saved to: {model_path}")
    print(f"Metrics saved to: {metrics_path}")
    return results



In [7]:
%%time
results = run_pipeline()



Loading interactions + metadata
Sampling 1:1 negatives per positive
Encoding categorical + numerical features
Training XGBoost
GPU detected - using device=cuda:0
[0]	train-auc:1.00000	train-logloss:0.64440	valid-auc:1.00000	valid-logloss:0.64440
[50]	train-auc:1.00000	train-logloss:0.03915	valid-auc:1.00000	valid-logloss:0.03915
[100]	train-auc:1.00000	train-logloss:0.00315	valid-auc:1.00000	valid-logloss:0.00315
[150]	train-auc:1.00000	train-logloss:0.00026	valid-auc:1.00000	valid-logloss:0.00026
[200]	train-auc:1.00000	train-logloss:0.00002	valid-auc:1.00000	valid-logloss:0.00002
[250]	train-auc:1.00000	train-logloss:0.00000	valid-auc:1.00000	valid-logloss:0.00000
[286]	train-auc:1.00000	train-logloss:0.00000	valid-auc:1.00000	valid-logloss:0.00000
Evaluating


  model.save_model(model_path)


{
  "train": {
    "AUC": 1.0,
    "LogLoss": 2.384188633191098e-06
  },
  "valid": {
    "AUC": 1.0,
    "LogLoss": 2.384188633191085e-06
  },
  "test": {
    "AUC": 1.0,
    "LogLoss": 2.3841886331910874e-06
  },
  "feature_cols": [
    "user_id_idx",
    "item_id_idx",
    "price",
    "sales_rank",
    "brand_idx",
    "primary_category_idx",
    "timestamp_days"
  ],
  "params": {
    "num_neg": 1,
    "seed": 42,
    "tree_method": "unknown"
  }
}

Model saved to: saved_models/ranking_xgboost.model
Metrics saved to: saved_models/ranking_xgboost_results.json
CPU times: user 33.4 s, sys: 2.11 s, total: 35.5 s
Wall time: 35.8 s


In [8]:
# AUC = 1.0 on train, valid, AND test don't seem correct, perfect scores.

# The model might memorize (user_id, item_id) pairs rather than learning preferences:
# Positive samples: Real interactions — specific (user, item) combinations
# Negative samples: Random items the user never interacted with
# XGBoost could trivially learn: These exact (user_id_idx, item_id_idx) combos are positive; everything else is negative. 
# since validation/test use the same random negative strategy, it generalizes perfectly.

In [9]:
## Experiment: Remove user_id_idx and item_id_idx

# The previous run showed **AUC = 1.0** on all splits, which indicates data leakage. The model memorized (user, item) pairs instead of learning generalizable patterns.

# **Hypothesis**: Removing `user_id_idx` and `item_id_idx` will force the model to learn from content features only (price, sales_rank, brand, category, timestamp), resulting in realistic performance.

In [10]:
# Experiment: Content-only features (no user/item IDs)
def run_experiment_no_ids():
    print("=" * 60)
    print("EXPERIMENT: XGBoost with content features only (no IDs)")
    print("=" * 60)
    
    print("\nLoading interactions + metadata")
    train_df = load_interactions("train")
    valid_df = load_interactions("valid")
    test_df = load_interactions("test")
    item_df = load_item_features()

    print(f"Sampling {NUM_NEG}:1 negatives per positive")
    all_items = item_df["item_id"].unique().tolist()
    user_pos_items = build_user_pos_items([train_df, valid_df, test_df])
    train_df_ns = sample_negatives(train_df, user_pos_items, all_items, NUM_NEG, SEED)
    valid_df_ns = sample_negatives(valid_df, user_pos_items, all_items, NUM_NEG, VALID_SEED)
    test_df_ns = sample_negatives(test_df, user_pos_items, all_items, NUM_NEG, TEST_SEED)

    print("Encoding features")
    train_df_enc, valid_df_enc, test_df_enc, _ = encode_features(
        train_df_ns, valid_df_ns, test_df_ns, item_df
    )

    # Content-only features (REMOVED user_id_idx and item_id_idx)
    feature_cols_no_ids = [
        "price",
        "sales_rank",
        "brand_idx",
        "primary_category_idx",
        "timestamp_days",
    ]
    
    print(f"\nFeatures used: {feature_cols_no_ids}")
    print(f"Number of features: {len(feature_cols_no_ids)}")

    print("\nTraining XGBoost (content features only)")
    model = train_xgboost(train_df_enc, valid_df_enc, feature_cols_no_ids)

    print("\nEvaluating")
    train_metrics = evaluate_model(model, train_df_enc, feature_cols_no_ids)
    valid_metrics = evaluate_model(model, valid_df_enc, feature_cols_no_ids)
    test_metrics = evaluate_model(model, test_df_enc, feature_cols_no_ids)

    results = {
        "experiment": "content_features_only",
        "train": train_metrics,
        "valid": valid_metrics,
        "test": test_metrics,
        "feature_cols": feature_cols_no_ids,
    }

    print("\n" + "=" * 60)
    print("RESULTS (Content Features Only)")
    print("=" * 60)
    print(json.dumps(results, indent=2))
    
    # Save experiment results
    exp_path = os.path.join(OUTPUT_DIR, "ranking_xgboost_no_ids_results.json")
    with open(exp_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print(f"\nSaved to: {exp_path}")
    
    return results


In [11]:
%%time
exp_results = run_experiment_no_ids()


EXPERIMENT: XGBoost with content features only (no IDs)

Loading interactions + metadata
Sampling 1:1 negatives per positive
Encoding features

Features used: ['price', 'sales_rank', 'brand_idx', 'primary_category_idx', 'timestamp_days']
Number of features: 5

Training XGBoost (content features only)
GPU detected - using device=cuda:0
[0]	train-auc:1.00000	train-logloss:0.64440	valid-auc:1.00000	valid-logloss:0.64440
[50]	train-auc:1.00000	train-logloss:0.03915	valid-auc:1.00000	valid-logloss:0.03915
[100]	train-auc:1.00000	train-logloss:0.00315	valid-auc:1.00000	valid-logloss:0.00315
[150]	train-auc:1.00000	train-logloss:0.00026	valid-auc:1.00000	valid-logloss:0.00026
[200]	train-auc:1.00000	train-logloss:0.00002	valid-auc:1.00000	valid-logloss:0.00002
[250]	train-auc:1.00000	train-logloss:0.00000	valid-auc:1.00000	valid-logloss:0.00000
[286]	train-auc:1.00000	train-logloss:0.00000	valid-auc:1.00000	valid-logloss:0.00000

Evaluating

RESULTS (Content Features Only)
{
  "experiment": "

In [12]:
## Experiment 2: Negatives from Items with Interactions Only

# The previous experiment still showed AUC = 1.0 because:
# - **Positive items** = Items someone bought (have good sales_rank, popular brands)
# - **Random negatives** = Many "dead" items nobody ever bought

# **Fix**: Sample negatives only from items that have at least one interaction. 
# This creates a more realistic task: Given two items that people actually buy, which one will this user prefer?


In [13]:
# Experiment 2: Sample negatives only from items with interactions
def sample_negatives_from_interacted(df, user_pos_items, interacted_items, num_neg=1, seed=42):
    """Sample negatives only from items that have at least one interaction.
    
    This creates realistic negatives: instead of random catalog items (many "dead"),
    we sample from items that OTHER users have bought.
    """
    rng = random.Random(seed)
    interacted_list = list(interacted_items)
    negatives = []
    
    for row in df.itertuples(index=False):
        user = row.user_id
        for _ in range(num_neg):
            attempts = 0
            while attempts < 100:  # Prevent infinite loop
                neg_item = rng.choice(interacted_list)
                if neg_item not in user_pos_items[user]:
                    negatives.append({
                        "user_id": user,
                        "item_id": neg_item,
                        "timestamp": row.timestamp,
                        "label": 0,
                    })
                    break
                attempts += 1
    
    neg_df = pd.DataFrame(negatives)
    return pd.concat([df, neg_df], ignore_index=True)


def run_experiment_hard_negatives():
    print("=" * 70)
    print("EXPERIMENT 2: Negatives from items with interactions only")
    print("=" * 70)
    
    print("\nLoading interactions + metadata")
    train_df = load_interactions("train")
    valid_df = load_interactions("valid")
    test_df = load_interactions("test")
    item_df = load_item_features()

    # Get items that have at least one interaction (across all splits)
    all_interacted_items = set(train_df["item_id"].unique()) | \
                           set(valid_df["item_id"].unique()) | \
                           set(test_df["item_id"].unique())
    
    all_catalog_items = set(item_df["item_id"].unique())
    
    print(f"\nItem statistics:")
    print(f"  Total items in catalog: {len(all_catalog_items):,}")
    print(f"  Items with ≥1 interaction: {len(all_interacted_items):,}")
    print(f"  Items with 0 interactions: {len(all_catalog_items - all_interacted_items):,}")
    print(f"  Negatives will be sampled from {len(all_interacted_items):,} 'real' items")

    user_pos_items = build_user_pos_items([train_df, valid_df, test_df])
    
    print(f"\nSampling {NUM_NEG}:1 hard negatives (from interacted items only)")
    train_df_ns = sample_negatives_from_interacted(train_df, user_pos_items, all_interacted_items, NUM_NEG, SEED)
    valid_df_ns = sample_negatives_from_interacted(valid_df, user_pos_items, all_interacted_items, NUM_NEG, VALID_SEED)
    test_df_ns = sample_negatives_from_interacted(test_df, user_pos_items, all_interacted_items, NUM_NEG, TEST_SEED)

    # convert item_id to string in all dataframes before merge
    for df in (train_df_ns, valid_df_ns, test_df_ns):
        df["item_id"] = df["item_id"].astype(str)
    item_df["item_id"] = item_df["item_id"].astype(str)

    print("Encoding features")
    train_df_enc, valid_df_enc, test_df_enc, _ = encode_features(
        train_df_ns, valid_df_ns, test_df_ns, item_df
    )

    # Test both feature sets
    feature_cols_with_ids = [
        "user_id_idx", "item_id_idx", "price", "sales_rank",
        "brand_idx", "primary_category_idx", "timestamp_days",
    ]
    feature_cols_no_ids = [
        "price", "sales_rank", "brand_idx", "primary_category_idx", "timestamp_days",
    ]

    results = {}
    
    # --- With IDs ---
    print("\n" + "-" * 50)
    print("Training XGBoost WITH user/item IDs")
    print("-" * 50)
    model_ids = train_xgboost(train_df_enc, valid_df_enc, feature_cols_with_ids)
    results["with_ids"] = {
        "train": evaluate_model(model_ids, train_df_enc, feature_cols_with_ids),
        "valid": evaluate_model(model_ids, valid_df_enc, feature_cols_with_ids),
        "test": evaluate_model(model_ids, test_df_enc, feature_cols_with_ids),
    }
    
    # --- Without IDs ---
    print("\n" + "-" * 50)
    print("Training XGBoost WITHOUT user/item IDs (content only)")
    print("-" * 50)
    model_no_ids = train_xgboost(train_df_enc, valid_df_enc, feature_cols_no_ids)
    results["without_ids"] = {
        "train": evaluate_model(model_no_ids, train_df_enc, feature_cols_no_ids),
        "valid": evaluate_model(model_no_ids, valid_df_enc, feature_cols_no_ids),
        "test": evaluate_model(model_no_ids, test_df_enc, feature_cols_no_ids),
    }

    print("\n" + "=" * 70)
    print("RESULTS COMPARISON (Hard Negatives)")
    print("=" * 70)
    print("\n WITH user/item IDs:")
    print(f"   Train AUC: {results['with_ids']['train']['AUC']:.4f}")
    print(f"   Valid AUC: {results['with_ids']['valid']['AUC']:.4f}")
    print(f"   Test AUC:  {results['with_ids']['test']['AUC']:.4f}")
    
    print("\n WITHOUT user/item IDs (content only):")
    print(f"   Train AUC: {results['without_ids']['train']['AUC']:.4f}")
    print(f"   Valid AUC: {results['without_ids']['valid']['AUC']:.4f}")
    print(f"   Test AUC:  {results['without_ids']['test']['AUC']:.4f}")
    
    # Save results
    exp_path = os.path.join(OUTPUT_DIR, "ranking_xgboost_hard_negatives.json")
    with open(exp_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print(f"\nSaved to: {exp_path}")
    
    return results


In [14]:
%%time
hard_neg_results = run_experiment_hard_negatives()


EXPERIMENT 2: Negatives from items with interactions only

Loading interactions + metadata

Item statistics:
  Total items in catalog: 259,204
  Items with ≥1 interaction: 249,274
  Items with 0 interactions: 259,204
  Negatives will be sampled from 249,274 'real' items

Sampling 1:1 hard negatives (from interacted items only)
Encoding features

--------------------------------------------------
Training XGBoost WITH user/item IDs
--------------------------------------------------
GPU detected - using device=cuda:0
[0]	train-auc:0.84247	train-logloss:0.67497	valid-auc:0.61984	valid-logloss:0.68855
[50]	train-auc:0.87601	train-logloss:0.48003	valid-auc:0.63635	valid-logloss:0.68322
[55]	train-auc:0.87654	train-logloss:0.47719	valid-auc:0.63624	valid-logloss:0.68435

--------------------------------------------------
Training XGBoost WITHOUT user/item IDs (content only)
--------------------------------------------------
GPU detected - using device=cuda:0
[0]	train-auc:0.50000	train-loglo

In [None]:
# WITH user/item IDs: AUC = 0.64 (Valid/Test)
# Train AUC = 0.877: Model memorizes some (user, item) patterns
# Valid/Test AUC = 0.636: Generalizes moderately to unseen data
# Gap (0.877 to 0.636): might be overfitting, but still useful
# the model learns collaborative filtering patterns: which users tend to buy which items
# This seems to be a reasonable baseline for a ranking model. Test AUC of 0.64 means the model ranks the true positive item higher than a random negative 64% of the time.

# WITHOUT user/item IDs: AUC = 0.500
# AUC = 0.5: Random guessing 
# Content features alone (price, sales_rank, brand, category, timestamp) provide zero predictive signal for user preferences
# Two items that different users bought have similar content features (both are "real" products people buy). Without knowing WHO the user is and WHICH item it is, content features can't distinguish preference.