In [6]:
import GPy
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from data import load_dataset, get_train_test_split
from evaluation import train_and_test_other_models
from kernels import test_kernel

In [7]:
# Load the loan statistics CSV through the project loader.
# The loader returns a (features, data) pair; the cells shown below only use `features`.
filename = "dataset/LoanStats3a.csv"
features, data = load_dataset(filename)

# Report the shape of the feature table as a sanity check.
print("Data shape: %s" % (features.shape,))

Preprocessing...


Feature Engineering...


Data shape: (42535, 40)


In [8]:
# Hold out 30% of the data for testing, then split 20% of the remaining rows off
# as a validation set. Both calls pass random_state=0.
X_train, X_test, y_train, y_test = get_train_test_split(features, test_size=0.3, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

# Temporarily use subset of data to debug faster
# TODO: Remove -- set a cap to None to keep every row of that split
# (slicing with [:None] returns the whole array).
N_TRAIN_DEBUG, N_VAL_DEBUG, N_TEST_DEBUG = 1000, 500, 500
X_train, y_train = X_train[:N_TRAIN_DEBUG], y_train[:N_TRAIN_DEBUG]
X_val, y_val     = X_val[:N_VAL_DEBUG], y_val[:N_VAL_DEBUG]
X_test, y_test   = X_test[:N_TEST_DEBUG], y_test[:N_TEST_DEBUG]

# Normalize: the scaler is fitted on the training features only and the same
# fitted transform is then applied to the validation and test features.
X_scaler = MinMaxScaler()
X_train = X_scaler.fit_transform(X_train)
X_val = X_scaler.transform(X_val)
X_test = X_scaler.transform(X_test)

# Shape summary of every split after subsetting and scaling.
print("X_train: %s, y_train: %s" % (str(X_train.shape), str(y_train.shape)))
print("X_val: %s, y_val: %s" % (str(X_val.shape), str(y_val.shape)))
print("X_test: %s, y_test: %s" % (str(X_test.shape), str(y_test.shape)))

X_train: (1000, 38), y_train: (1000,)
X_val: (500, 38), y_val: (500,)
X_test: (500, 38), y_test: (500,)


In [9]:
# Gaussian Process

# Normalize the targets with a scaler fitted on the training targets only.
# The reshape turns the 1-D target vector into the single-column 2-D array
# that the scaler is fitted on.
y_scaler = MinMaxScaler()
y_train_scaled = y_scaler.fit_transform(y_train.reshape(-1, 1))

# Input dimension for kernels
input_dim = X_train.shape[1]

Mean: None
Kernel:   [1mrbf.       [0;0m  |           value  |  constraints  |  priors
  [1mvariance   [0;0m  |  0.639194906688  |      +ve      |        
  [1mlengthscale[0;0m  |   9.65784744656  |      +ve      |        


----------- Testing on X_test ------------

---- Threshold: 1.000000 ----
Loans approved:    423/500
Loans given:       $ 4479.4
Payments received: $ 5096.7

Profits:           $ 617.3
Profit Percentage: 13.8%

---- Threshold: 0.983862 ----
Loans approved:    436/500
Loans given:       $ 4647.9
Payments received: $ 5283.8

Profits:           $ 635.9
Profit Percentage: 13.7%
0.983862380612


In [15]:
# Initialize GP Model
# RBF kernel with unit initial variance and lengthscale.
# NOTE(review): the printed kernel values differ from these initial ones, so
# test_kernel presumably optimizes the hyperparameters -- confirm in kernels.py.
kernel = GPy.kern.RBF(input_dim, variance=1., lengthscale=1.)
test_kernel(
    kernel,
    X_train, y_train_scaled,
    X_val, y_val,
    X_test, y_test,
    X_scaler, y_scaler,
    optimize_for="profits",
)


Mean: None
Kernel:   [1mrbf.       [0;0m  |           value  |  constraints  |  priors
  [1mvariance   [0;0m  |  0.639194906688  |      +ve      |        
  [1mlengthscale[0;0m  |   9.65784744656  |      +ve      |        


----------- Testing on X_test ------------

---- Threshold: 1.000000 ----
Loans approved:    423/500
Loans given:       $ 4479.4
Payments received: $ 5096.7

Profits:           $ 617.3
Profit Percentage: 13.8%

---- Threshold: 0.983862 ----
Loans approved:    436/500
Loans given:       $ 4647.9
Payments received: $ 5283.8

Profits:           $ 635.9
Profit Percentage: 13.7%
0.983862380612


In [12]:
# Matern 3/2 kernel with unit initial variance and lengthscale.
# Uses the plain Matern32 class, consistent with the RBF and Linear cells; the
# sde_ variant is GPy's state-space (SDE) flavour of the same covariance.
# NOTE(review): assumes test_kernel fits a standard GP regression model and
# does not rely on the kernel's SDE helpers -- confirm in kernels.py.
kernel = GPy.kern.Matern32(input_dim, variance=1., lengthscale=1.)
test_kernel(kernel, X_train, y_train_scaled, X_val, y_val,
            X_test, y_test, X_scaler, y_scaler, optimize_for="profits")

Mean: None
Kernel:   [1mMat32.     [0;0m  |          value  |  constraints  |  priors
  [1mvariance   [0;0m  |  8.06316687872  |      +ve      |        
  [1mlengthscale[0;0m  |  59.1098118972  |      +ve      |        


----------- Testing on X_test ------------

---- Threshold: 1.000000 ----
Loans approved:    421/500
Loans given:       $ 4469.2
Payments received: $ 5084.4

Profits:           $ 615.2
Profit Percentage: 13.8%

---- Threshold: 0.981915 ----
Loans approved:    438/500
Loans given:       $ 4673.4
Payments received: $ 5300.2

Profits:           $ 626.8
Profit Percentage: 13.4%
0.981915078206


In [16]:
# Matern 5/2 kernel with unit initial variance and lengthscale.
# Uses the plain Matern52 class, consistent with the RBF and Linear cells; the
# sde_ variant is GPy's state-space (SDE) flavour of the same covariance.
# NOTE(review): assumes test_kernel fits a standard GP regression model and
# does not rely on the kernel's SDE helpers -- confirm in kernels.py.
kernel = GPy.kern.Matern52(input_dim, variance=1., lengthscale=1.)
test_kernel(kernel, X_train, y_train_scaled, X_val, y_val,
            X_test, y_test, X_scaler, y_scaler, optimize_for="profits")


38


Mean: None
Kernel:   [1mlinear.  [0;0m  |  value  |  constraints  |  priors
  [1mvariances[0;0m  |  (38,)  |      +ve      |        


----------- Testing on X_test ------------

---- Threshold: 1.000000 ----
Loans approved:    438/500
Loans given:       $ 4560.7
Payments received: $ 5205.3

Profits:           $ 644.6
Profit Percentage: 14.1%

---- Threshold: 0.996267 ----
Loans approved:    443/500
Loans given:       $ 4619.8
Payments received: $ 5255.4

Profits:           $ 635.7
Profit Percentage: 13.8%
0.996266874302


In [18]:
# Linear kernel with ARD enabled: the printed summary shows a variances
# parameter of shape (38,), i.e. one entry per input feature.
kernel = GPy.kern.Linear(input_dim, ARD=True)
test_kernel(
    kernel,
    X_train, y_train_scaled,
    X_val, y_val,
    X_test, y_test,
    X_scaler, y_scaler,
    optimize_for="profits",
)

38


Mean: None
Kernel:   [1mlinear.  [0;0m  |  value  |  constraints  |  priors
  [1mvariances[0;0m  |  (38,)  |      +ve      |        


----------- Testing on X_test ------------

---- Threshold: 1.000000 ----
Loans approved:    438/500
Loans given:       $ 4560.7
Payments received: $ 5205.3

Profits:           $ 644.6
Profit Percentage: 14.1%

---- Threshold: 0.996267 ----
Loans approved:    443/500
Loans given:       $ 4619.8
Payments received: $ 5255.4

Profits:           $ 635.7
Profit Percentage: 13.8%
0.996266874302


In [None]:
# NOTE(review): the wildcard import hides which names come from `simulation`;
# the only one referenced in this cell is `simulate_time_period`, in the
# commented-out call below. Consider an explicit import once usage is settled.
from simulation import *

# Load IPython's autoreload extension and select mode 2, which reloads modules
# before each cell runs so edits to project .py files are picked up.
# NOTE(review): %reload_ext directly after %load_ext looks redundant -- confirm.
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

# Example invocation kept for reference; it is not executed. `gp_model` and
# `threshold` are not defined in any cell shown here, and optimize_for is still
# a "TODO" placeholder.
# simulate_time_period(gp_model, X_val, y_val, X_scaler, y_scaler, threshold,
#                      fund_given=1e5, num_months=10, incoming_loans_per_time_period=10,
#                      optimize_for="TODO", version="threshold_only", model_type="gp")


38


Mean: None
Kernel:   [1mlinear.  [0;0m  |  value  |  constraints  |  priors
  [1mvariances[0;0m  |  (38,)  |      +ve      |        


----------- Testing on X_test ------------

---- Threshold: 1.000000 ----
Loans approved:    438/500
Loans given:       $ 4560.7
Payments received: $ 5205.3

Profits:           $ 644.6
Profit Percentage: 14.1%

---- Threshold: 0.996267 ----
Loans approved:    443/500
Loans given:       $ 4619.8
Payments received: $ 5255.4

Profits:           $ 635.7
Profit Percentage: 13.8%
0.996266874302


In [17]:
# Baseline comparison against the GP results: runs the project helper whose
# output below reports linear, Huber and SVM regressors.
# Unlike the GP cells, this passes the raw `y_train` (not `y_train_scaled`),
# no `y_scaler`, and no validation split.
train_and_test_other_models(X_train, y_train, X_test, y_test, X_scaler)


-- Linear Regression --


Mean absolute error: 2188.281
R^2 Score:           0.840

-- Huber Regressor --
Mean absolute error: 1659.147
R^2 Score:           0.836

-- Linear SVM --


Mean absolute error: 1753.697
R^2 Score:           0.833

-- Poly SVM 2 --
Mean absolute error: 2054.468
R^2 Score:           0.825

-- Poly SVM 5 --


Mean absolute error: 2400.171
R^2 Score:           0.777

-- RBF SVM --
Mean absolute error: 1854.323
R^2 Score:           0.831

---- Threshold: 1.000000 ----
Loans approved:    423/500
Loans given:       $ 4501.4
Payments received: $ 5135.2

Profits:           $ 633.9
Profit Percentage: 14.1%
