In [8]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from data import load_dataset, get_train_test_split
from optimization import gp_optimize_threshold
from evaluation import train_and_test_other_models

In [9]:
# Load the LendingClub export through the project's loader. The loader returns
# a pair; only the first element (`features`) is used by the cells below.
filename = "dataset/LoanStats3a.csv"
features, data = load_dataset(filename)

# Report the dimensions of the loaded feature table.
print("Data shape: %s" % (features.shape,))

Preprocessing...


Feature Engineering...


Data shape: (42535, 40)


In [10]:
# Split into train / test with the project helper, then carve a validation
# set out of the training portion. Both splits pass random_state=0.
# NOTE(review): get_train_test_split is project code; presumably test_size=0.3
# holds out 30% of the rows -- confirm in data.py.
X_train, X_test, y_train, y_test = get_train_test_split(features, test_size=0.3, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

# Temporarily use subset of data to debug faster
# TODO: Remove -- set the three caps below to None to run on the full splits
# (a slice bound of None keeps every row, so no other line needs to change).
MAX_TRAIN_ROWS = 1000
MAX_VAL_ROWS = 500
MAX_TEST_ROWS = 500
X_train, y_train = X_train[:MAX_TRAIN_ROWS, :], y_train[:MAX_TRAIN_ROWS]
X_val, y_val     = X_val[:MAX_VAL_ROWS, :], y_val[:MAX_VAL_ROWS]
X_test, y_test   = X_test[:MAX_TEST_ROWS, :], y_test[:MAX_TEST_ROWS]

# Normalize: the scaler is fitted on the training rows only and then reused,
# unchanged, for the validation and test rows.
X_scaler = MinMaxScaler()
X_train = X_scaler.fit_transform(X_train)
X_val = X_scaler.transform(X_val)
X_test = X_scaler.transform(X_test)

# Sanity check: shapes of every split after truncation and scaling.
print("X_train: %s, y_train: %s" % (str(X_train.shape), str(y_train.shape)))
print("X_val: %s, y_val: %s" % (str(X_val.shape), str(y_val.shape)))
print("X_test: %s, y_test: %s" % (str(X_test.shape), str(y_test.shape)))

X_train: (1000, 38), y_train: (1000,)
X_val: (500, 38), y_val: (500,)
X_test: (500, 38), y_test: (500,)


In [11]:
# Gaussian Process regression on the min-max scaled target
import GPy

# Normalize the target. The scaler works on 2-D input, so the 1-D target is
# reshaped into a single column before fitting and transforming.
y_scaler = MinMaxScaler()
y_train_scaled = y_scaler.fit_transform(y_train.reshape(-1, 1))

# Initialize GP Model: RBF kernel with one input dimension per feature column,
# starting from unit variance and unit lengthscale.
kernel = GPy.kern.RBF(input_dim=X_train.shape[1], variance=1., lengthscale=1.)
gp_model = GPy.models.GPRegression(X_train, y_train_scaled, kernel)

# Fit the model's hyperparameters; the optimizer object returned here is what
# the notebook shows as this cell's output.
gp_model.optimize()


<paramz.optimization.optimization.opt_lbfgsb at 0x1c3e56f3c88>

In [13]:
# Use X_val (validation set) to optimize threshold
# gp_model.predict returns a pair; the second element (bound to `conf`) is not
# used anywhere else in this cell.
y_hat, conf = gp_model.predict(X_val)
# Undo the target scaling with y_scaler and flatten the result to 1-D.
regressed_payment = y_scaler.inverse_transform(y_hat).reshape(-1)

# Debug output: container type and shape of the un-scaled predictions.
print(type(regressed_payment))
print(regressed_payment.shape)

# Optimize threshold for profits / profit_percentage
# NOTE(review): gp_optimize_threshold is project code (optimization module);
# what the returned threshold is compared against is not visible here.
threshold = gp_optimize_threshold(gp_model, X_val, y_val, X_scaler, y_scaler, optimize_for="profit_percentage")
# The triple-quoted string below is an inert expression statement used to
# disable the enclosed code (threshold report on X_val and evaluation on
# X_test). Nothing inside it is executed.
"""
print("\n---- Optimizing threshold using X_val ----")
print("Threshold: %f" % threshold)
lend_using_target_only(regressed_payment, X_val, y_val, X_scaler, threshold=1.0)
lend_using_target_only(regressed_payment, X_val, y_val, X_scaler, threshold=threshold)

# Test threshold value on test set
y_hat, conf = gp_model.predict(X_test)
regressed_payment = y_scaler.inverse_transform(y_hat).reshape(-1)
print("\n----------- Testing on X_test ------------")
lend_using_target_only(regressed_payment, X_test, y_test, X_scaler, threshold=1.0)
lend_using_target_only(regressed_payment, X_test, y_test, X_scaler, threshold=threshold)
"""
# Show the threshold selected on the validation set.
print(threshold)

<class 'numpy.ndarray'>
(500,)
1.15862234903


In [53]:
from simulation import *

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

simulate_time_period(gp_model, X_val, y_val, X_scaler, y_scaler, threshold,
                     fund_given=1e5, num_months=10, incoming_loans_per_time_period=10,
                     optimize_for="TODO", version="threshold_only", model_type="gp")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Approved 15000.000000
Approved 30000.000000
Portfolio funds: 55000.000000
Approved 2300.000000
Approved 6000.000000
Approved 23000.000000
Portfolio funds: 25288.020000
Approved 6300.000000
Approved 10000.000000
Approved 2000.000000
Portfolio funds: 9388.680000
Approved 9000.000000
Portfolio funds: 3428.280000
Portfolio funds: 6748.850000
Approved 10000.000000
Portfolio funds: 69.420000
Approved 2300.000000
Portfolio funds: 1302.470000
Approved 2250.000000
Portfolio funds: 2661.600000
Portfolio funds: 6352.360000
Portfolio funds: 10043.120000


In [None]:
# Hand the scaled train/test splits and the feature scaler to the project's
# evaluation helper. The validation split is not passed.
# NOTE(review): presumably this fits the non-GP baseline models and reports
# their test results -- confirm in evaluation.train_and_test_other_models.
train_and_test_other_models(X_train, y_train, X_test, y_test, X_scaler)