In [20]:
import pandas as pd
import torch
from torch import nn
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
import torchmetrics as tm
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [5]:
# Load raw data
df = pd.read_csv("../azureml/data/lending_club.csv", delimiter=",")

# Create binary label
if "loan_status" in df:
    print("loan_status column exists, create binary label loan_repaid")
    df["loan_repaid"] = np.where(df["loan_status"] == "Fully Paid", 1, 0)
    df = df.drop(labels="loan_status", axis=1)
else:
    print("loan_status column does not exist, ignore")

# Drop features
df = df.drop(labels=["grade", "issue_d"], axis=1)

# Fill-in mort acc
mortmeans = df.groupby(by="total_acc")["mort_acc"].mean()


def myfill(total, mort):
    if pd.isna(mort):
        return mortmeans[total]
    else:
        return mort


df["mort_acc"] = df.apply(lambda x: myfill(x.total_acc, x.mort_acc), axis=1)

# Keep top 30 most frequent values for emp_title
top_values = (
    df["emp_title"]
    .groupby(df["emp_title"])
    .count()
    .sort_values(ascending=False)
    .head(30)
    .index.tolist()
)
df["emp_title"] = df["emp_title"].apply(lambda x: "Other" if x not in top_values else x)

# Keep top 30 most frequent values for title
top_values = (
    df["title"]
    .groupby(df["title"])
    .count()
    .sort_values(ascending=False)
    .head(30)
    .index.tolist()
)
df["title"] = df["title"].apply(lambda x: "Other" if x not in top_values else x)

# Drop rows with missing data
df = df.dropna()


# Convert term feature to number
def convert_term(x):
    if x == " 36 months":
        return 36
    if x == " 60 months":
        return 60


df["term"] = df["term"].apply(lambda x: convert_term(x))


# Home ownership feature - join categories
def homeownership(x):
    if (x == "NONE") or (x == "ANY"):
        return "OTHER"
    else:
        return x


df["home_ownership"].apply(lambda x: homeownership(x)).unique()

# Parse zip and create dummy vars
df["address"] = df["address"].str[-5:]

# Parse year
df["earliest_cr_year"] = pd.to_numeric(df["earliest_cr_line"].str[-4:])
df = df.drop(labels="earliest_cr_line", axis=1)

# Get dummies (OHE)
df = pd.get_dummies(
    columns=[
        "sub_grade",
        "verification_status",
        "application_type",
        "initial_list_status",
        "purpose",
        "home_ownership",
        "address",
        "emp_title",
        "title",
        "emp_length",
    ],
    data=df,
    drop_first=True,
)

loan_status column exists, create binary label loan_repaid


In [6]:
# Train test split
X = df.drop(labels="loan_repaid", axis=1)
y = df["loan_repaid"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [7]:
# Scale (note this is just for debug, we should first split test and train and fit just to train)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [8]:
# Create tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.from_numpy(y_train.to_numpy(dtype=np.float32)).reshape(-1, 1)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.from_numpy(y_test.to_numpy(dtype=np.float32)).reshape(-1, 1)

In [35]:
# Define model
class LandingClub(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(149, 75)
        self.layer2 = nn.Linear(75, 37)
        self.layer3 = nn.Linear(37, 18)
        self.layer4 = nn.Linear(18, 1)
        self.layer_norm = nn.LayerNorm(75)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.layer1(x)
        x = self.layer_norm(x)
        x = self.relu(x)
        x = self.layer2(x)
        x = self.relu(x)
        x = self.layer3(x)
        x = self.relu(x)
        x = self.layer4(x)
        x = self.sigmoid(x)
        return x

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_pred = self.forward(x)
        loss = nn.BCELoss()(y_pred, y)
        acc = tm.functional.accuracy(y_pred, y, task="binary")
        auc = tm.functional.auroc(y_pred, y.long(), task="binary")
        self.log('train_loss', loss)
        self.log('train_accuracy', acc)
        self.log('train_auc', auc)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_pred = self.forward(x)
        loss = nn.BCELoss()(y_pred, y)
        acc = tm.functional.accuracy(y_pred, y, task="binary")
        auc = tm.functional.auroc(y_pred, y.long(), task="binary")
        self.log('val_loss', loss)
        self.log('val_accuracy', acc)
        self.log('val_auc', auc)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer

In [10]:
# Loaders
train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=1000, shuffle=True)
test_dataset = torch.utils.data.TensorDataset(X_test, y_test)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1000)

In [38]:
# Early stopping
early_stop_callback = EarlyStopping(monitor="val_auc", min_delta=0.00, patience=3, verbose=True, mode="max")

In [42]:
# Training
model = LandingClub()
trainer = pl.Trainer(max_epochs=10, callbacks=[early_stop_callback])
trainer.fit(model, train_loader, test_loader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name       | Type      | Params
-----------------------------------------
0 | layer1     | Linear    | 11.2 K
1 | layer2     | Linear    | 2.8 K 
2 | layer3     | Linear    | 684   
3 | layer4     | Linear    | 19    
4 | layer_norm | LayerNorm | 150   
5 | relu       | ReLU      | 0     
6 | sigmoid    | Sigmoid   | 0     
-----------------------------------------
14.9 K    Trainable params
0         Non-trainable params
14.9 K    Total params
0.060     Total estimated model params size (MB)


Epoch 0: 100%|██████████| 302/302 [00:15<00:00, 19.73it/s, v_num=15]       

Monitored metric val_auc did not improve in the last 4 records. Best score: 0.906. Signaling Trainer to stop.


Epoch 0: 100%|██████████| 302/302 [00:15<00:00, 19.71it/s, v_num=15]


In [40]:
trainer.logged_metrics

{'train_loss': tensor(0.2404),
 'train_accuracy': tensor(0.9006),
 'train_auc': tensor(0.9158),
 'val_loss': tensor(0.2565),
 'val_accuracy': tensor(0.8883),
 'val_auc': tensor(0.9053)}

In [41]:
%reload_ext tensorboard
%tensorboard --logdir=lightning_logs/

Reusing TensorBoard on port 6006 (pid 28952), started 19:58:42 ago. (Use '!kill 28952' to kill it.)

In [None]:
# plt.plot(accuracies_val, label="accuracy_val")
# plt.axvline(
#     x=accuracies_val.index(max(accuracies_val)),
#     color="r",
#     linestyle="--",
#     label="max accuracy on test",
# )
# plt.title("Accuracy in validation")
# plt.xlabel("epochs")
# plt.legend()
# plt.show()

# plt.plot(np.array([x.item() for x in losses_val]), label="loss_val")
# plt.axvline(
#     x=accuracies_val.index(max(accuracies_val)),
#     color="r",
#     linestyle="--",
#     label="max accuracy on test",
# )
# plt.title("Loss in validation")
# plt.xlabel("epochs")
# plt.legend()
# plt.show()

# plt.plot(accuracies, label="accuracy_train")
# plt.axvline(
#     x=accuracies.index(max(accuracies)),
#     color="r",
#     linestyle="--",
#     label="max accuracy on train",
# )
# plt.xlabel("batches")
# plt.title("Accuracy in training")
# plt.legend()
# plt.show()

# plt.plot(np.array([x.item() for x in losses]), label="loss_train")
# plt.axvline(
#     x=accuracies.index(max(accuracies)),
#     color="r",
#     linestyle="--",
#     label="max accuracy on train",
# )
# plt.xlabel("batches")
# plt.title("Loss in training")
# plt.xlabel("epochs")
# plt.legend()
# plt.show()

In [None]:
# Save model where accuracies_val is max
torch.save(best_model, "lending_club_pytorch.pt")

NameError: name 'best_model' is not defined