# Transaction Prediction Model

This notebook focuses on building a predictive model for transaction counts per customer.

## Approach
- **Problem formulation**: Conditional count modeling (predicting number of transactions)
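- **Models**: GLM with Tweedie loss ($1 < p < 2$, i.e. the compound Poisson–Gamma family) used as an approximation to a Negative Binomial count model; compared below against a simple baseline and an XGBoost Poisson model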
- **Features**: Time-aware features (recency, frequency, rolling windows)
- **Temporal splitting**: Strict time-based train/test splits to avoid leakage

In [11]:
# Core imports
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import sys
from pathlib import Path

# Make the repository root importable so that `src.*` resolves from this notebook.
# The root is taken to be the parent of the current working directory.
# IMPORTANT: this variable exists ONLY for the import path; every filesystem path
# used below comes from src.config.
_BOOTSTRAP_ROOT = Path().resolve().parent
_bootstrap_entry = str(_BOOTSTRAP_ROOT)
if _bootstrap_entry not in sys.path:
    sys.path.insert(0, _bootstrap_entry)

# Project configuration: the single source of truth for paths and model defaults.
from src.config import (
    PROJECT_ROOT,
    FIGURES_DIR,
    OUTPUTS_DIR,
    MODELS_DIR,
    DATA_PROCESSED,
    DEFAULT_TWEEDIE_POWER,
    DEFAULT_XGB_VAL_OFFSET_MONTHS,
    DEFAULT_XGB_MAX_DEPTH,
    DEFAULT_XGB_EARLY_STOPPING_ROUNDS,
    ensure_directories,
)

# Run the project's directory-setup helper before any cell writes output.
ensure_directories()

# Short aliases for the configured output locations.
OUTPUTS, FIGURES = OUTPUTS_DIR, FIGURES_DIR

print("✓ Imports loaded", f"✓ Project root: {PROJECT_ROOT}", sep="\n")

✓ Imports loaded
✓ Project root: /mnt/c/Users/zaido/OneDrive/Bureau/Quod_THA


In [12]:
# Load processed data
import importlib

import src.data as data

# Reload the module BEFORE binding the loader. `importlib.reload` re-executes the
# module and updates the names inside the module object only; a function obtained
# earlier via `from src.data import ...` keeps pointing at the pre-reload code, so
# edits to src/data would silently not take effect in this cell.
data = importlib.reload(data)
load_and_process_transactions = data.load_and_process_transactions

df = load_and_process_transactions(force_reprocess=False)

# Quick sanity summary of what was loaded.
print(f"Loaded {len(df):,} transactions")
print(f"Date range: {df['date'].min()} to {df['date'].max()}")
print(f"Unique customers: {df['customer_id'].nunique():,}")
df.head()

Loading processed data from /mnt/c/Users/zaido/OneDrive/Bureau/Quod_THA/data/processed/transactions_cleaned.csv


Loaded 299,575 transactions
Date range: 2017-01-01 00:00:00 to 2020-03-17 00:00:00
Unique customers: 2,002


Unnamed: 0,customer_id,product_id,date
0,9447359,Nissan,2017-01-01
1,1435072,Fiat,2017-01-01
2,5391951,Opel,2017-01-01
3,5391951,Volkswagen,2017-01-01
4,1435072,Peugeot,2017-01-01


## Feature Engineering

Build time-aware features for customer $i$ at cutoff time $t$:

- $N_i(t-1, t)$: Transactions in last 1 month
- $N_i(t-3, t)$: Transactions in last 3 months
- $N_i(t-6, t)$: Transactions in last 6 months
- $N_i(t-3, t) - N_i(t-6, t-3)$: Change in transaction rate
- `days_since_last_tx(i, t)`: Recency feature
- `active_months(i, t)`: Number of active months
- `month_of_year(t)`: Seasonal feature

In [13]:
# Feature Engineering (imported from src)
from src.features import create_feature_matrix, create_feature_vector

# Smoke-check on a single (customer, cutoff) pair before building anything larger:
# the customer id of the first row of `df`, evaluated at a fixed cutoff date.
test_cutoff = pd.Timestamp("2019-01-31")
test_customer = df["customer_id"].iat[0]

print(f"Testing feature engineering for customer {test_customer} at {test_cutoff}")
features = create_feature_vector(df, test_customer, test_cutoff)
print("\nFeature vector:", features, sep="\n")


Testing feature engineering for customer 9447359 at 2019-01-31 00:00:00

Feature vector:
n_transactions_1m     11.0
n_transactions_3m     46.0
n_transactions_6m     74.0
change_rate_3m        18.0
days_since_last_tx     6.0
active_months         25.0
month_of_year          1.0
dtype: float64


In [14]:
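# NOTE: this cell is intentionally disabled and kept only as a reference example of building
# features for a single cutoff; the panel cell below precomputes features for every cutoff instead.
# # Create feature matrix for all customers at a specific cutoff date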
# # Example: features as of end of January 2019 (for predicting Feb-Apr 2019)
# cutoff_date = pd.Timestamp('2019-01-31')

# print(f"Creating features at cutoff {cutoff_date}")
# print("This may take a moment...")
# print("(Customers without purchase history before cutoff will be automatically dropped)\n")

# # Create feature matrix - single cutoff date for all customers
# # Only customers with at least one transaction before cutoff_date will be included
# feature_matrix = create_feature_matrix(df, cutoff_date)

# print(f"\nFeature matrix shape: {feature_matrix.shape}")
# print("\nFeature summary statistics:")
# print(feature_matrix.describe())
# print("\nFirst few rows:")
# feature_matrix.head(10)

In [15]:
# Comprehensive no-leakage panel dataset (precompute ONCE for all possible cutoffs)
#
# Cutoff rule (no leakage):
# - Features X_{i,t} use ONLY transactions strictly before the cutoff: date < t
# - Labels   y_{i,t} count transactions in the future horizon:        [t, t + horizon)
#
# We build a comprehensive panel for every valid month-end cutoff from the earliest
# possible prediction month (requires MIN_HISTORY_MONTHS of history) up to
# (last_date - horizon). Then for any cutoff, we slice the already-built panel
# instead of recomputing features.

from src.panel import build_or_load_full_panel_dataset
from src.builder import (
    baseline_a_predict,
    train_tweedie_model,
    tweedie_predict,
    train_or_load_xgb_poisson_for_cutoff,
    evaluate_xgb_poisson_on_cutoff,
)

HORIZON_MONTHS = 3
# Named constant so the value passed to the builder and the value encoded in the cache
# filename cannot drift apart (a mismatch would silently reuse a panel cached under
# different settings).
MIN_HISTORY_MONTHS = 6

# The cache filename encodes both settings; for the current values this resolves to
# the same file as before: data/processed/panel_h3_mh6.joblib
panel = build_or_load_full_panel_dataset(
    df,
    horizon_months=HORIZON_MONTHS,
    min_history_months=MIN_HISTORY_MONTHS,
    cache_path=PROJECT_ROOT / f"data/processed/panel_h{HORIZON_MONTHS}_mh{MIN_HISTORY_MONTHS}.joblib",
    force_rebuild=False,
)

print(
    f"Full panel built/loaded: X={panel.X.shape}, #cutoffs={len(panel.cutoffs)} "
    f"({panel.cutoffs[0].date()} .. {panel.cutoffs[-1].date()})"
)


Full panel built/loaded: X=(52121, 7), #cutoffs=30 (2017-07-31 .. 2019-12-31)


In [16]:
# Run a few time-based "tests" in 2019 using the prebuilt full panel.
# For each test cutoff T:
# - training uses only panel cutoffs <= (T - horizon)
# - XGBoost early stopping uses a separate validation cutoff V = T - XGB_VAL_OFFSET_MONTHS
#
# NOTE(review): panel labels are documented as counting transactions in [t, t + horizon).
# If V is closer to T than the horizon (e.g. V = T - 2 months with a 3-month horizon), the
# validation labels would extend past T and overlap the test label window — confirm that
# this is acceptable for early stopping before relying on the XGBoost numbers.

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_tweedie_deviance

POWER = DEFAULT_TWEEDIE_POWER
# Floor applied to predictions before computing the deviance: scikit-learn requires
# y_pred > 0 for Tweedie powers in [1, 2).
EPS_DEV = 1e-9

cutoffs_2019 = [
    pd.Timestamp("2019-01-31"),
    pd.Timestamp("2019-03-31"),
    pd.Timestamp("2019-07-31"),
]

# XGB configuration (passed into the training function)
XGB_VAL_OFFSET_MONTHS = DEFAULT_XGB_VAL_OFFSET_MONTHS
XGB_MAX_DEPTH = DEFAULT_XGB_MAX_DEPTH
XGB_N_ESTIMATORS_MAX = 5000
XGB_EARLY_STOPPING_ROUNDS = DEFAULT_XGB_EARLY_STOPPING_ROUNDS


def _rmse_mae(y_true, y_pred):
    """Return ``(rmse, mae)`` for one set of predictions."""
    rmse = mean_squared_error(y_true, y_pred) ** 0.5
    mae = mean_absolute_error(y_true, y_pred)
    return rmse, mae


# One record per cutoff; turned into a comparison table at the end of the cell.
results = []

for raw_cutoff in cutoffs_2019:
    # Snap to midnight at month end so the cutoff is comparable with `panel.cutoffs`.
    cutoff = raw_cutoff.normalize() + pd.offsets.MonthEnd(0)
    last_train_cutoff = (cutoff - pd.DateOffset(months=HORIZON_MONTHS)).normalize() + pd.offsets.MonthEnd(0)
    train_cutoffs = [c for c in panel.cutoffs if c <= last_train_cutoff]
    if not train_cutoffs:
        # Fail loudly instead of handing an empty cutoff list to the panel / model code.
        raise ValueError(
            f"No training cutoffs at or before {last_train_cutoff.date()} "
            f"for test cutoff {cutoff.date()}"
        )

    X_train, y_train, _ = panel.for_cutoffs(train_cutoffs)
    X_test, y_test, _ = panel.for_cutoff(cutoff)

    # --- Baseline A ---
    yb = baseline_a_predict(X_test)
    rmse_b, mae_b = _rmse_mae(y_test, yb)

    # --- Tweedie (optional reference) ---
    tw_model = train_tweedie_model(X_train, y_train, power=POWER)
    y_tw = tweedie_predict(tw_model, X_test)
    rmse_tw, mae_tw = _rmse_mae(y_test, y_tw)
    dev_tw = mean_tweedie_deviance(y_test, np.maximum(y_tw, EPS_DEV), power=POWER)

    # --- XGBoost Poisson with early stopping on the validation cutoff, with caching ---
    xgb_model, meta = train_or_load_xgb_poisson_for_cutoff(
        panel,
        cutoff,
        horizon_months=HORIZON_MONTHS,
        val_offset_months=XGB_VAL_OFFSET_MONTHS,
        max_depth=XGB_MAX_DEPTH,
        n_estimators_max=XGB_N_ESTIMATORS_MAX,
        early_stopping_rounds=XGB_EARLY_STOPPING_ROUNDS,
        model_dir=MODELS_DIR,
    )
    m = evaluate_xgb_poisson_on_cutoff(xgb_model, panel, cutoff)

    print("\n" + "=" * 90)
    print(f"CUTOFF={cutoff.date()} | horizon={HORIZON_MONTHS} | train_cutoffs={len(train_cutoffs)}")
    print(f"BaselineA      RMSE={rmse_b:.4f} | MAE={mae_b:.4f}")
    print(f"Tweedie(power={POWER}) RMSE={rmse_tw:.4f} | MAE={mae_tw:.4f} | dev={dev_tw:.4f}")
    print(
        f"XGBPoisson     RMSE={m['rmse']:.4f} | MAE={m['mae']:.4f} | "
        f"val={pd.Timestamp(meta['val_cutoff']).date()} | n_estimators={meta['n_estimators']}"
    )
    print(f"Saved model: {meta['saved_to']}")

    # Keep the numbers, not just the printed text, so they can be compared or plotted later.
    results.append(
        {
            "cutoff": cutoff.date(),
            "n_train_cutoffs": len(train_cutoffs),
            "baseline_rmse": rmse_b,
            "baseline_mae": mae_b,
            "tweedie_rmse": rmse_tw,
            "tweedie_mae": mae_tw,
            "tweedie_dev": dev_tw,
            "xgb_rmse": m["rmse"],
            "xgb_mae": m["mae"],
            "xgb_val_cutoff": pd.Timestamp(meta["val_cutoff"]).date(),
            "xgb_n_estimators": meta["n_estimators"],
        }
    )

# Side-by-side comparison across cutoffs (rich display as the cell's last expression).
results_df = pd.DataFrame(results).set_index("cutoff")
results_df



CUTOFF=2019-01-31 | horizon=3 | train_cutoffs=16
BaselineA      RMSE=22.4675 | MAE=8.4606
Tweedie(power=1.5) RMSE=28.2593 | MAE=15.7937 | dev=10.8096
XGBPoisson     RMSE=17.0857 | MAE=7.1328 | val=2018-11-30 | n_estimators=23
Saved model: /mnt/c/Users/zaido/OneDrive/Bureau/Quod_THA/models/xgb_poisson_h3_cutoff=2019-01-31_val=2018-11-30.joblib


  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights



CUTOFF=2019-03-31 | horizon=3 | train_cutoffs=18
BaselineA      RMSE=14.3557 | MAE=6.1181
Tweedie(power=1.5) RMSE=30.4362 | MAE=16.2750 | dev=11.4321
XGBPoisson     RMSE=16.1882 | MAE=7.9236 | val=2019-01-31 | n_estimators=16
Saved model: /mnt/c/Users/zaido/OneDrive/Bureau/Quod_THA/models/xgb_poisson_h3_cutoff=2019-03-31_val=2019-01-31.joblib

CUTOFF=2019-07-31 | horizon=3 | train_cutoffs=22
BaselineA      RMSE=13.1439 | MAE=5.1737
Tweedie(power=1.5) RMSE=28.0062 | MAE=15.3644 | dev=11.4096
XGBPoisson     RMSE=13.5263 | MAE=5.5828 | val=2019-05-31 | n_estimators=23
Saved model: /mnt/c/Users/zaido/OneDrive/Bureau/Quod_THA/models/xgb_poisson_h3_cutoff=2019-07-31_val=2019-05-31.joblib


  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights


In [17]:
# (moved) XGBoost training is now implemented in `src/builder/xgb_poisson.py` and exercised in the cell above.



In [18]:
# (moved) See the 2019 cutoff loop cell above; models are cached under `models/` per cutoff.
