In [1]:
import numpy as np
import pandas as pd

In [2]:
import seaborn as sns
import matplotlib.pylab as plt

In [3]:
import torch
import torch.nn.functional as F
import pyro

In [4]:
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm

In [5]:
from monteloanco import model, guide, GroupedBatchSampler, TransitionMatrixNet

### A deep state-space model for a consumer credit risk portfolio

This notebook outlines the development of a deep state-space model for consumer credit risk, built using [pyro.ai](https://pyro.ai/). At its core, the model employs Monte Carlo simulations for each loan, progressing through monthly timesteps. The hidden state at each step represents the loan’s status, with all accounts initially starting as current. From there, loans may transition to early payoff, arrears, or more commonly, remain current and advance to the next month.

The model requires 5 inputs: 
- `loan_amnt` the initial advance to the customer.
- `int_rate` the annual interest rate (as a percentage).
- `installment` the monthly payment according to the initial schedule.
- `total_pre_chargeoff` the total value of payments made against the account excluding recoveries.
- `num_timesteps` the number of months observed to date if training, or the desired length of the simulation.

The output used for validation is a simulation of hidden states (loan statuses) and payments, plus how those payments are attributed to principal and interest. Behind the scenes, the model also trains an embedding based on the loan account identifier, which effectively captures the performance characteristics of each specific loan. This embedding may serve several purposes, including:
- Simulating the performance of the existing portfolio.
- Extending the installment schedule to maturity to estimate the portfolio’s value if allowed to run off.
- Providing a low-dimensional representation of loan performance, enabling broader analysis beyond traditional good/bad account classifications for training applicant-level models.
- Reducing to a single risk dimension that represents the probability of default over any given time horizon.

We take a subset of the 2+ million accounts available here for speed.

In [6]:
# Read the training extract (gzip-compressed JSON Lines, one record per line).
df_train = pd.read_json(path_or_buf='training.jsonl.gz', lines=True)

# Sanity check: the frame must carry a plain 0..N-1 index.
expected_index = pd.RangeIndex(start=0, stop=len(df_train))
pd.testing.assert_index_equal(df_train.index, expected_index)

The model has been designed such that it can train / simulate a large number of accounts in parallel on a GPU. If you don't have a suitable GPU installed on your machine, simply replace `cuda:0` with `cpu` in the configuration cell below.

The data must be fed into the model in batches, where all sequences in a batch have the same length. We use the custom `GroupedBatchSampler` to define these batches.

In [7]:
# Dimensionality of the embedding size passed to the model and transition network.
embedding_size = 3

# Seed for the random number generators used below.
seed = 0

# Prefer the first CUDA device, but fall back to the CPU so the notebook still
# runs end-to-end on machines without a usable GPU (training will be slower).
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Seed torch (CPU and CUDA generators) and numpy before any network is
# initialised or any sample is drawn, so repeated Restart & Run All executions
# start from the same random state. Exact bit-for-bit reproducibility on a GPU
# is still subject to the backend's own non-determinism.
torch.manual_seed(seed)
np.random.seed(seed)

In [8]:
# Upper bound on the number of accounts handed to the sampler per batch.
batch_size = 100_000

# Only these columns are carried into the training records.
model_columns = [
    'id',
    'loan_amnt',
    'int_rate',
    'installment',
    'n_report_d',
    'total_pre_chargeoff',
]

# One dict per loan; this list is what the DataLoader indexes into.
dataset = df_train[model_columns].to_dict(orient='records')

# Project-specific sampler that yields the batches of dataset indices.
grouped_batch_sampler = GroupedBatchSampler(dataset, batch_size)

### Train the model

With the batches defined, it's time to run the optimisation process and tune the parameters. The loss here is the difference between the total value of payments made on each account and the total from the MC simulation.

In [9]:
# Build the transition network for the chosen embedding size, then move it
# onto the target device in a separate, explicit step.
transition_net = TransitionMatrixNet(embedding_size, device=device)
transition_net = transition_net.to(device)

In [10]:
%%time

# Clear the param store in case we're in a REPL
pyro.clear_param_store()

# Create partial functions with their respective parameters
from functools import partial

model_with_config = partial(model, 
                            embedding_size=embedding_size,
                            device=device, 
                            transition_net=transition_net)

guide_with_config = partial(guide, 
                            device=device)

# Set up the optimizer and inference algorithm
optimizer = pyro.optim.Adam({"lr": 0.01})
svi = pyro.infer.SVI(model=model_with_config, guide=guide_with_config, 
                     optim=optimizer, loss=pyro.infer.Trace_ELBO())

# Run inference
num_iterations = 5_000

with tqdm(total=num_iterations, desc="Epochs", position=0) as epoch_pbar:
    for step in range(num_iterations):
        losses = []
        for batch_id, batch in enumerate(DataLoader(dataset, batch_sampler=grouped_batch_sampler, num_workers=1)):
            loss = svi.step(
                batch_id=batch_id,
                batch_idx=torch.arange(len(batch['id'])).to(device), 
                installments=batch['installment'].to(device), 
                loan_amnt=batch['loan_amnt'].to(device), 
                int_rate=batch['int_rate'].to(device),
                total_pre_chargeoff=batch['total_pre_chargeoff'].to(device),
                num_timesteps=batch['n_report_d'].unique().item()
            )
            losses.append(loss)
            
        if step % np.ceil(num_iterations/100) == 0:
            print(f"Step {step} : Loss = {np.sum(losses)}")
        epoch_pbar.update(1)

Epochs:   0%|          | 0/5000 [00:00<?, ?it/s]

Step 0 : Loss = 178973954.52637252
Step 50 : Loss = 165884450.95725507
Step 100 : Loss = 147935356.88372484
Step 150 : Loss = 130042577.39403778
Step 200 : Loss = 115917604.16053168
Step 250 : Loss = 108808760.92383014
Step 300 : Loss = 99997734.18320751
Step 350 : Loss = 97652751.18311207
Step 400 : Loss = 95596472.83802983
Step 450 : Loss = 90801084.32198785
Step 500 : Loss = 88055032.656288
Step 550 : Loss = 85157035.42829566
Step 600 : Loss = 84199837.82423073
Step 650 : Loss = 82577285.37509307
Step 700 : Loss = 80822230.63323434
Step 750 : Loss = 79271821.82688837
Step 800 : Loss = 78375398.55629423
Step 850 : Loss = 77687309.47699903
Step 900 : Loss = 76229973.46984458
Step 950 : Loss = 75410162.31318617
Step 1000 : Loss = 74642245.85624616
Step 1050 : Loss = 73876832.05166554
Step 1100 : Loss = 74006136.5971561
Step 1150 : Loss = 73021700.34380078
Step 1200 : Loss = 71672404.36228782
Step 1250 : Loss = 70905197.42338732
Step 1300 : Loss = 70885762.79535067
Step 1350 : Loss = 70

### Save the model

Save model parameters to a file for inference in another notebook.

In [11]:
# Write the contents of Pyro's global parameter store to disk so another
# notebook can load them for inference.
param_store = pyro.get_param_store()
param_store.save('param_store.pt')