In [1]:
import numpy as np
import pandas as pd

In [2]:
import seaborn as sns
import matplotlib.pylab as plt

In [3]:
import torch
import torch.nn.functional as F
import pyro

In [8]:
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm

In [4]:
from monteloanco import Model, Guide, GroupedBatchSampler, tmat_reshape, Template

### A deep state-space model for a consumer credit risk portfolio

This notebook outlines the development of a deep state-space model for consumer credit risk, built using [pyro.ai](https://pyro.ai/). At its core, the model employs Monte Carlo simulations for each loan, progressing through monthly timesteps. The hidden state at each step represents the loan’s status, with all accounts initially starting as current. From there, loans may transition to early payoff, arrears, or more commonly, remain current and advance to the next month.

The model requires 5 inputs: 
- `loan_amnt` the initial advance to the customer.
- `int_rate` the annual interest rate (as a percentage).
- `installment` the monthly payment according to the initial schedule.
- `total_pre_chargeoff` the total value of payments made against the account excluding recoveries.
- `num_timesteps` the number of months observed to date if training, or the desired length of the simulation.

The output used for validation is a simulation of hidden states (loan statuses) and payments, plus how those payments are attributed to principal and interest. Behind the scenes, the model also trains an embedding based on the loan account identifier, which effectively captures the performance characteristics of each specific loan. This embedding may serve several purposes, including:
- Simulating the performance of the existing portfolio.
- Extending the installment schedule to maturity to estimate the portfolio’s value if allowed to run off.
- Providing a low-dimensional representation of loan performance, enabling broader analysis beyond traditional good/bad account classifications for training applicant-level models.
- Reducing to a single risk dimension that represents the probability of default over any given time horizon.

We take a subset of the 2+ million accounts available here for speed.

In [5]:
# Read the JSON-lines training extract (one record per line); pandas infers
# the gzip compression from the file extension.
df_train = pd.read_json('training.jsonl.gz', lines=True)

# Fail fast unless the frame carries the default 0..n-1 index.
# NOTE(review): presumably later steps rely on positional row numbers — confirm.
expected_index = pd.RangeIndex(len(df_train))
pd.testing.assert_index_equal(df_train.index, expected_index)

The model has been designed such that it can train / simulate a large number of accounts in parallel on a GPU. If you don't have a suitable GPU installed on your machine simply replace `cuda:0` here with `cpu`.

The data must be fed into the model in batches, where all sequences in a batch have the same length. We use the custom `GroupedBatchSampler` to define these batches.

In [7]:
# Use the first CUDA GPU when torch can see one, otherwise fall back to the CPU
# so the notebook still runs end to end on machines without a GPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Embedding width passed to Model as its first constructor argument.
embedding_size = 3

In [9]:
# Upper bound handed to the sampler for the number of loans per batch.
# NOTE(review): exact semantics live in GroupedBatchSampler — confirm there.
batch_size = 100_000

# The six per-loan fields the training loop reads from every batch.
feature_columns = [
    'id',
    'loan_amnt',
    'int_rate',
    'installment',
    'n_report_d',
    'total_pre_chargeoff',
]

# One plain dict per loan, so the DataLoader can collate each field separately.
dataset = df_train.loc[:, feature_columns].to_dict(orient='records')
grouped_batch_sampler = GroupedBatchSampler(dataset, batch_size)

### Train the model

With the batches defined, it's time to run the optimisation process and tune the parameters. The loss here is the difference between the total value of payments made on each account and the total produced by the MC simulation.

In [10]:
%%time

# clear the param store in case we're in a REPL
pyro.clear_param_store()

# Initialize the model and guide
model = Model(embedding_size, device).to(device)
guide = Guide(device).to(device)

# Set up the optimizer and inference algorithm
optimizer = pyro.optim.Adam({"lr": 0.01})
svi = pyro.infer.SVI(model=model, guide=guide, optim=optimizer, loss=pyro.infer.Trace_ELBO())

# Run inference
#num_batches = grouped_batch_sampler.__len__()
num_iterations = 5_000
with tqdm(total=num_iterations, desc="Epochs", position=0) as epoch_pbar:
    for step in range(num_iterations):
    #with tqdm(total=num_batches, desc=f"Epoch {step + 1}", position=1, leave=False) as batch_pbar:
        losses = []
        for batch_id, batch in enumerate(DataLoader(dataset, batch_sampler=grouped_batch_sampler, num_workers=1)):
            losses.append(svi.step(
                batch_id=batch_id,
                batch_idx=torch.arange(len(batch['id'])).to(device), 
                installments=batch['installment'].to(device), 
                loan_amnt=batch['loan_amnt'].to(device), 
                int_rate=batch['int_rate'].to(device),
                total_pre_chargeoff=batch['total_pre_chargeoff'].to(device),
                num_timesteps=batch['n_report_d'].unique().item()))
            #batch_pbar.update(1)
        if step % np.ceil(num_iterations/100) == 0:
            print(f"Step {step} : Loss = {np.sum(losses)}")
        epoch_pbar.update(1)


Epochs:   0%|          | 0/5000 [00:00<?, ?it/s]

Step 0 : Loss = 236963754.91642624
Step 50 : Loss = 192751894.2455988
Step 100 : Loss = 161739093.68177637
Step 150 : Loss = 135718182.55855253
Step 200 : Loss = 113090497.49296676
Step 250 : Loss = 91331901.84729068
Step 300 : Loss = 74420162.71889558
Step 350 : Loss = 61824871.32401747
Step 400 : Loss = 51087492.47244795
Step 450 : Loss = 43997793.39645255
Step 500 : Loss = 40677793.904976904
Step 550 : Loss = 37262899.06724005
Step 600 : Loss = 33629424.19730986
Step 650 : Loss = 32274402.494653746
Step 700 : Loss = 29523214.26887231
Step 750 : Loss = 29835465.682323065
Step 800 : Loss = 28089517.179473046
Step 850 : Loss = 26732068.76644954
Step 900 : Loss = 25053745.5753332
Step 950 : Loss = 24046864.25321989
Step 1000 : Loss = 24835330.618126992
Step 1050 : Loss = 22814595.059687074
Step 1100 : Loss = 22674479.962866303
Step 1150 : Loss = 22386586.50549647
Step 1200 : Loss = 21530594.96972373
Step 1250 : Loss = 21898232.159778025
Step 1300 : Loss = 20976210.39945049
Step 1350 : L

### Save the model

Save model parameters to a file for inference in another notebook.

In [11]:
# Ensure the model is on `device` before saving. The training cell already
# built it with `.to(device)`, so this is expected to change nothing; as the
# last expression in the cell, its return value is what gets displayed.
model.to(device)

Model()

In [12]:
# Persist pyro's global parameter store.
pyro.get_param_store().save('param_store.pt')

# Persist the state dict of each module to its own file.
for module, path in ((model, 'model_params.pt'), (guide, "guide_params.pt")):
    torch.save(module.state_dict(), path)