In [1]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from data import load_dataset, get_train_test_split
from optimization import gp_optimize_threshold
from evaluation import train_and_test_other_models

In [2]:
# Location of the raw loan-statistics CSV handed to the project loader.
filename = "dataset/LoanStats3a.csv"

# load_dataset yields a pair: `features` feeds the train/test split in the
# next cell, while `data` is not referenced again in the visible cells.
features, data = load_dataset(filename)

# Quick sanity check on the size of what was loaded.
print("Data shape: %s" % (features.shape,))

Preprocessing...
Feature Engineering...
Data shape: (42535, 40)


In [3]:
# First split: the project helper separates a test set (test_size=0.3).
X_train, X_test, y_train, y_test = get_train_test_split(
    features, test_size=0.3, random_state=0
)
# Second split: 20% of the remaining training rows become the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=0
)

# Temporarily use subset of data to debug faster
# TODO: Remove
X_train, y_train = X_train[:1000], y_train[:1000]
X_val, y_val = X_val[:500], y_val[:500]
X_test, y_test = X_test[:500], y_test[:500]

# Normalize: the min/max ranges are learned from the training rows only and
# then applied unchanged to the validation and test rows.
X_scaler = MinMaxScaler().fit(X_train)
X_train, X_val, X_test = (
    X_scaler.transform(split) for split in (X_train, X_val, X_test)
)

# Report the final shape of every split.
print("X_train: %s, y_train: %s" % (X_train.shape, y_train.shape))
print("X_val: %s, y_val: %s" % (X_val.shape, y_val.shape))
print("X_test: %s, y_test: %s" % (X_test.shape, y_test.shape))

X_train: (1000, 38), y_train: (1000,)
X_val: (500, 38), y_val: (500,)
X_test: (500, 38), y_test: (500,)


In [5]:
# Gaussian Process regression on the scaled training data.
import GPy

# Normalize the target: the scaler works on 2-D input, so y_train is turned
# into a single column before fitting and transforming in one step.
y_scaler = MinMaxScaler()
y_train_scaled = y_scaler.fit_transform(y_train.reshape(-1, 1))

# Initialize GP Model: RBF kernel with one input dimension per feature column,
# starting from unit variance and unit lengthscale.
kernel = GPy.kern.RBF(
    input_dim=X_train.shape[1],
    variance=1.0,
    lengthscale=1.0,
)
gp_model = GPy.models.GPRegression(X_train, y_train_scaled, kernel)

# Fit the model hyperparameters; the returned optimizer object is the cell's
# displayed value.
gp_model.optimize()

<paramz.optimization.optimization.opt_lbfgsb at 0x102243e80>

In [6]:
from simulation import *

%load_ext autoreload
%autoreload 2
%reload_ext autoreload

np.random.seed(1)
perf = simulate_N_time_periods(gp_model, X_val, y_val, X_scaler, y_scaler, threshold=1.1, num_periods=30,
                               fund_given=1e5, num_months=10, incoming_loans_per_time_period=100,
                               optimize_for="TODO", version="threshold_only", model_type="gp")

mean_profits = np.mean(perf, axis=0)[:,0]
stddev_profits = np.std(perf, axis=0)[:,0]
lowerb_profits = mean_profits - stddev_profits
upperb_profits = mean_profits + stddev_profits

print(np.cumsum(lowerb_profits)[-1])
print(np.cumsum(mean_profits)[-1])
print(np.cumsum(upperb_profits)[-1])


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
-8198.96804571
12216.7635721
32632.4951899


In [7]:
# Use X_val (validation set) to optimize threshold.
# The GP prediction is unpacked into two values; map the first one back to the
# original target scale and flatten it to 1-D.
y_hat, conf = gp_model.predict(X_val)
regressed_payment = y_scaler.inverse_transform(y_hat).ravel()

# Optimize threshold for profits / profit_percentage
threshold = gp_optimize_threshold(
    gp_model,
    X_val,
    y_val,
    X_scaler,
    y_scaler,
    optimize_for="profit_percentage",
)

# The reporting code below is parked inside a bare string literal, so it is
# never executed.
"""
print("\n---- Optimizing threshold using X_val ----")
print("Threshold: %f" % threshold)
lend_using_target_only(regressed_payment, X_val, y_val, X_scaler, threshold=1.0)
lend_using_target_only(regressed_payment, X_val, y_val, X_scaler, threshold=threshold)

# Test threshold value on test set
y_hat, conf = gp_model.predict(X_test)
regressed_payment = y_scaler.inverse_transform(y_hat).reshape(-1)
print("\n----------- Testing on X_test ------------")
lend_using_target_only(regressed_payment, X_test, y_test, X_scaler, threshold=1.0)
lend_using_target_only(regressed_payment, X_test, y_test, X_scaler, threshold=threshold)
"""

# Show the threshold that was selected.
print(threshold)

1.15862234903


In [None]:
# Hand the scaled train/test splits and the fitted feature scaler to the
# project's evaluation helper.
train_and_test_other_models(
    X_train,
    y_train,
    X_test,
    y_test,
    X_scaler,
)