In [24]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from data import load_dataset, get_train_test_split
from optimization import gp_optimize_threshold
from evaluation import train_and_test_other_models
from kernels import get_predictions, get_optimized_model

In [25]:
# Location of the raw loan CSV, relative to the notebook's working directory.
filename = "dataset/LoanStats3a.csv"

# load_dataset hands back a (features, data) pair; the cells visible in this
# notebook only go on to use `features`.
features, data = load_dataset(filename)

# Report the size of the feature table as a quick sanity check.
print("Data shape: %s" % (features.shape,))

Preprocessing...
Feature Engineering...
Data shape: (42535, 40)


In [26]:
# Hold out 30% of the data as the test split, then carve 20% of what remains
# off as a validation split. Both splits are seeded for reproducibility.
X_train, X_test, y_train, y_test = get_train_test_split(features, test_size=0.3, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

# Temporarily use subset of data to debug faster
# TODO: Remove
# Row caps for each split. Set a cap to None to keep every row of that split
# (a slice of the form [:None] selects everything), so switching to the full
# run only requires editing these three values.
TRAIN_ROW_LIMIT = 1000
VAL_ROW_LIMIT = 500
TEST_ROW_LIMIT = 500

X_train, y_train = X_train[:TRAIN_ROW_LIMIT, :], y_train[:TRAIN_ROW_LIMIT]
X_val, y_val     = X_val[:VAL_ROW_LIMIT, :], y_val[:VAL_ROW_LIMIT]
X_test, y_test   = X_test[:TEST_ROW_LIMIT, :], y_test[:TEST_ROW_LIMIT]

# Normalize
# The scaler is fitted on the training rows only and then applied unchanged to
# the validation and test rows, so no statistics leak from the held-out data.
X_scaler = MinMaxScaler()
X_train = X_scaler.fit_transform(X_train)
X_val = X_scaler.transform(X_val)
X_test = X_scaler.transform(X_test)

# Print the final shapes so a mismatch between features and targets is obvious.
print("X_train: %s, y_train: %s" % (str(X_train.shape), str(y_train.shape)))
print("X_val: %s, y_val: %s" % (str(X_val.shape), str(y_val.shape)))
print("X_test: %s, y_test: %s" % (str(X_test.shape), str(y_test.shape)))

X_train: (1000, 38), y_train: (1000,)
X_val: (500, 38), y_val: (500,)
X_test: (500, 38), y_test: (500,)


In [38]:
# Gaussian Process
import GPy

# Normalize
# MinMaxScaler expects a 2-D array, so the 1-D target vector is reshaped into
# a single column before the scaler is fitted and applied in one step.
y_scaler = MinMaxScaler()
y_train_scaled = y_scaler.fit_transform(y_train.reshape(-1, 1))

# Initialize GP Model
# RBF kernel spanning every input feature. The variance and lengthscale given
# here are initial values; the kernel printed afterwards shows different ones,
# so get_optimized_model evidently adjusts them.
kernel = GPy.kern.RBF(
    input_dim=X_train.shape[1],
    variance=10.,
    lengthscale=10.,
)
gp_model = get_optimized_model(X_train, y_train_scaled, kernel)

# Show the resulting mean function and kernel parameters.
print("Mean: %s" % gp_model.mean_function)
print("Kernel: %s" % gp_model.kern)

Mean: None
Kernel:   rbf.         |          value  |  constraints  |  priors
  variance     |  0.63918383864  |      +ve      |        
  lengthscale  |  9.65771820014  |      +ve      |        


In [15]:
# Bring every public name exported by the project-local `simulation` module
# into the notebook namespace (this is where simulate_time_period, referenced
# below, is presumably defined — confirm against simulation.py).
from simulation import *

# Load IPython's autoreload extension and switch it to mode 2, which reloads
# all imported modules before each cell is executed, so edits to the project's
# .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# NOTE(review): the extension was already loaded by %load_ext above, so this
# reload looks redundant — confirm before removing.
%reload_ext autoreload

# Disabled simulation call, kept for reference. `threshold` is not assigned in
# any cell visible in this notebook, so it must be defined before re-enabling.
# NOTE(review): the saved output under this cell ("Approved ...", "Portfolio
# funds: ...") presumably comes from a run in which this call was active — confirm.
# simulate_time_period(gp_model, X_val, y_val, X_scaler, y_scaler, threshold,
#                      fund_given=1e5, num_months=10, incoming_loans_per_time_period=10,
#                      optimize_for="TODO", version="threshold_only", model_type="gp")

Approved 20000.000000
Approved 7500.000000
Approved 10000.000000
Approved 8000.000000
Portfolio funds: 54500.000000
Approved 35000.000000
Approved 12000.000000
Approved 5000.000000
Portfolio funds: 3816.630000
Portfolio funds: 6682.300000
Approved 7000.000000
Portfolio funds: 2547.970000
Portfolio funds: 5648.460000
Approved 2500.000000
Portfolio funds: 6248.950000
Portfolio funds: 9433.210000
Approved 10200.000000
Portfolio funds: 2417.470000
Portfolio funds: 5945.360000
Approved 2250.000000
Portfolio funds: 7223.250000


In [17]:
# Run the comparison models on the train/test splits; the saved output below
# reports a mean absolute error and R^2 score per model. The feature scaler is
# handed over as well (its use inside the helper is not visible from here).
train_and_test_other_models(
    X_train, y_train,
    X_test, y_test,
    X_scaler,
)


-- Linear Regression --
Mean absolute error: 2188.281
R^2 Score:           0.840

-- Huber Regressor --
Mean absolute error: 1659.147
R^2 Score:           0.836

-- Linear SVM --
Mean absolute error: 1753.697
R^2 Score:           0.833

-- Poly SVM 2 --
Mean absolute error: 2054.468
R^2 Score:           0.825

-- Poly SVM 5 --
Mean absolute error: 2400.171
R^2 Score:           0.777

-- RBF SVM --
Mean absolute error: 1854.323
R^2 Score:           0.831

---- Threshold: 1.000000 ----
Loans approved:    423/500
Loans given:       $ 4501.4
Payments received: $ 5135.2

Profits:           $ 633.9
Profit Percentage: 14.1%
