In [2]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from optimization import gp_optimize_threshold
from data import load_dataset, get_train_test_split
from evaluation import evaluate_model, lend_using_target_only

In [3]:
# Relative path of the raw loan-statistics CSV.
filename = "data/LoanStats3a.csv"

# load_dataset hands back two objects; of the two, only `features` is
# referenced by the cells that follow in this notebook.
features, data = load_dataset(filename)

# %s already applies str() to the shape, so it is passed in directly.
print("Data shape: %s" % (features.shape,))

Preprocessing...
Feature Engineering...
Data shape: (42535, 40)


In [4]:
# Split off a test set through the project helper (test_size=0.3), then split
# the remaining training rows 80/20 into train/validation with scikit-learn.
# Both calls are seeded (random_state=0) so the partition is repeatable.
X_train, X_test, y_train, y_test = get_train_test_split(features, test_size=0.3, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

# Temporarily use a subset of the data to debug faster.
# TODO: Remove -- until then, set a cap to None to keep every row of that split
# (a slice stop of None means "no upper bound").
N_TRAIN_DEBUG = 1000
N_VAL_DEBUG = 500
N_TEST_DEBUG = 500
X_train, y_train = X_train[:N_TRAIN_DEBUG, :], y_train[:N_TRAIN_DEBUG]
X_val, y_val     = X_val[:N_VAL_DEBUG, :], y_val[:N_VAL_DEBUG]
X_test, y_test   = X_test[:N_TEST_DEBUG, :], y_test[:N_TEST_DEBUG]

# Normalize: the scaler's min/max are learned from the training rows only and
# then re-used unchanged for the validation and test rows.
X_scaler = MinMaxScaler()
X_train = X_scaler.fit_transform(X_train)
X_val = X_scaler.transform(X_val)
X_test = X_scaler.transform(X_test)

# Report the final shapes of every split (%s applies str() to each shape).
print("X_train: %s, y_train: %s" % (X_train.shape, y_train.shape))
print("X_val: %s, y_val: %s" % (X_val.shape, y_val.shape))
print("X_test: %s, y_test: %s" % (X_test.shape, y_test.shape))

X_train: (1000, 38), y_train: (1000,)
X_val: (500, 38), y_val: (500,)
X_test: (500, 38), y_test: (500,)


In [5]:
# Regression models
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, HuberRegressor

# Every regressor is bound to its own cell-level name so that later cells can
# keep referring to a specific fitted model (e.g. lin_reg).
lin_reg = LinearRegression()
hub_reg = HuberRegressor(epsilon=1.)
svm_lin = SVR(kernel="linear", C=1e3)
svm_poly_2 = SVR(kernel="poly", degree=2, C=1e5)
svm_poly_5 = SVR(kernel="poly", degree=5, C=1e8)
svm_rbf = SVR(kernel="rbf", C=1e4)

# For each model, in order: print its banner, fit on the training split, and
# report its metrics on the test split via the project's evaluate_model helper.
for banner, regressor in (
    ("\n-- Linear Regression --", lin_reg),
    ("\n-- Huber Regressor --", hub_reg),
    ("\n-- Linear SVM --", svm_lin),
    ("\n-- Poly SVM 2 --", svm_poly_2),
    ("\n-- Poly SVM 5 --", svm_poly_5),
    ("\n-- RBF SVM --", svm_rbf),
):
    print(banner)
    regressor.fit(X_train, y_train)
    evaluate_model(regressor, X_test, y_test)



-- Linear Regression --
Mean absolute error: 2188.281
R^2 Score:           0.840

-- Huber Regressor --
Mean absolute error: 1659.147
R^2 Score:           0.836

-- Linear SVM --




Mean absolute error: 1753.697
R^2 Score:           0.833

-- Poly SVM 2 --
Mean absolute error: 2054.468
R^2 Score:           0.825

-- Poly SVM 5 --
Mean absolute error: 2400.171
R^2 Score:           0.777

-- RBF SVM --
Mean absolute error: 1854.323
R^2 Score:           0.831


In [6]:
# Lending simulation driven by the linear-regression predictions on the test
# split. Per the original note, a loan is made only to applicants expected to
# pay back at least `threshold` times the original amount; the exact rule lives
# in evaluation.lend_using_target_only and is not visible from this notebook.
regressed_payment = lin_reg.predict(X_test)

# Kept as the cell's last expression so the returned value is displayed.
lend_using_target_only(
    regressed_payment,
    X_test,
    y_test,
    X_scaler,
    threshold=1.0,
)


---- With model ----
Loans approved:    423/500
Loans given:       $ 4501.4
Payments received: $ 5135.2

Profits:           $ 633.9
Profit Percentage: 14.1%


633.87401227906503

In [7]:
# Gaussian Process
import GPy

# Normalize the targets. The 1-D target vector is reshaped into a single
# column before being handed to the scaler, and the fitted scaler is kept so
# predictions can be mapped back to the original scale later.
y_scaler = MinMaxScaler()
y_train_scaled = y_scaler.fit_transform(y_train.reshape(-1, 1))

# Initialize the GP model: RBF kernel over all input features, with unit
# starting values for both hyperparameters.
kernel = GPy.kern.RBF(
    input_dim=X_train.shape[1],
    variance=1.,
    lengthscale=1.,
)
gp_model = GPy.models.GPRegression(X_train, y_train_scaled, kernel)

# Kept as the last expression so the optimizer object's repr is displayed.
gp_model.optimize()


<paramz.optimization.optimization.opt_lbfgsb at 0x1087cedd8>

In [27]:
# --- Validation split: used to pick the lending threshold -------------------
# The second value returned by predict (bound to `conf`) is not used below.
y_hat, conf = gp_model.predict(X_val)
# Undo the target scaling and flatten the predictions to a 1-D vector.
regressed_payment = y_scaler.inverse_transform(y_hat).ravel()

# Optimize threshold for profits / profit_percentage
threshold = gp_optimize_threshold(
    gp_model,
    X_val,
    y_val,
    X_scaler,
    y_scaler,
    optimize_for="profit_percentage",
)
print("\n---- Optimizing threshold using X_val ----")
print("Threshold: %f" % threshold)

# Baseline threshold of 1.0 first, then the optimized threshold.
lend_using_target_only(regressed_payment, X_val, y_val, X_scaler, threshold=1.0)
lend_using_target_only(regressed_payment, X_val, y_val, X_scaler, threshold=threshold)

# --- Test split: check the chosen threshold on unseen rows ------------------
y_hat, conf = gp_model.predict(X_test)
regressed_payment = y_scaler.inverse_transform(y_hat).ravel()
print("\n----------- Testing on X_test ------------")
lend_using_target_only(regressed_payment, X_test, y_test, X_scaler, threshold=1.0)
# Kept as the cell's last expression so its return value is displayed.
lend_using_target_only(regressed_payment, X_test, y_test, X_scaler, threshold=threshold)


---- Optimizing threshold using X_val ----
Threshold: 1.158622

---- With model ----
Loans approved:    416/500
Loans given:       $ 4452.8
Payments received: $ 4973.7

Profits:           $ 520.9
Profit Percentage: 11.7%

---- With model ----
Loans approved:    128/500
Loans given:       $ 941.6
Payments received: $ 1161.6

Profits:           $ 220.0
Profit Percentage: 23.4%

----------- Testing on X_test ------------

---- With model ----
Loans approved:    423/500
Loans given:       $ 4479.4
Payments received: $ 5096.7

Profits:           $ 617.3
Profit Percentage: 13.8%

---- With model ----
Loans approved:    137/500
Loans given:       $ 1017.6
Payments received: $ 1173.8

Profits:           $ 156.2
Profit Percentage: 15.3%


156.15602991533774