In [1]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from data import load_dataset, get_train_test_split
from evaluation import evaluate_model, lend_using_target_only

In [2]:
# Location of the raw loan statistics export, relative to the notebook.
filename = "data/LoanStats3a.csv"

# load_dataset returns two objects; only `features` is inspected in this cell.
features, data = load_dataset(filename)

# %s already applies str() to the shape tuple, so no explicit conversion is
# needed; the 1-tuple wrapper stops % from unpacking the shape itself.
print("Data shape: %s" % (features.shape,))

Preprocessing...
Feature Engineering...
Data shape: (42535, 40)


In [3]:
# Row caps used to debug faster on a subset of the data. Set either one to
# None to keep the whole split (a slice bound of None selects every row).
# TODO: switch both to None for the final run.
N_TRAIN_DEBUG = 1000
N_TEST_DEBUG = 500

X_train, X_test, y_train, y_test = get_train_test_split(features)

# Truncate features and targets with the same bound so rows stay aligned.
X_train, y_train = X_train[:N_TRAIN_DEBUG, :], y_train[:N_TRAIN_DEBUG]
X_test, y_test = X_test[:N_TEST_DEBUG, :], y_test[:N_TEST_DEBUG]

# Normalize: the scaler is fitted on the training rows only, and that same
# fitted transform is then applied to the test rows.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Sanity-check the resulting shapes of both splits.
print("X_train: %s, y_train: %s" % (X_train.shape, y_train.shape))
print("X_test: %s, y_test: %s" % (X_test.shape, y_test.shape))

X_train: (1000, 38), y_train: (1000,)
X_test: (500, 38), y_test: (500,)


In [8]:
# Regression models
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, HuberRegressor

# Every estimator keeps its own module-level name so that other cells can
# refer to a specific fitted model (e.g. `lin_reg`).
lin_reg = LinearRegression()
hub_reg = HuberRegressor(epsilon=1.)
svm_lin = SVR(kernel="linear", C=1e3)
svm_poly_2 = SVR(kernel="poly", degree=2, C=1e5)
svm_poly_5 = SVR(kernel="poly", degree=5, C=1e8)
svm_rbf = SVR(kernel="rbf", C=1e4)

# (report title, estimator) pairs, in the order they are fitted and reported.
regression_models = [
    ("Linear Regression", lin_reg),
    ("Huber Regressor", hub_reg),
    ("Linear SVM", svm_lin),
    ("Poly SVM 2", svm_poly_2),
    ("Poly SVM 5", svm_poly_5),
    ("RBF SVM", svm_rbf),
]

# Fit each model on the training split, then report its metrics on the
# held-out test split under its own header line.
for reg_title, reg_model in regression_models:
    print("\n-- %s --" % reg_title)
    reg_model.fit(X_train, y_train)
    evaluate_model(reg_model, X_test, y_test)



-- Linear Regression --
Mean absolute error: 1992.007
R^2 Score:           0.856

-- Huber Regressor --
Mean absolute error: 1716.457
R^2 Score:           0.819

-- Linear SVM --
Mean absolute error: 1740.245
R^2 Score:           0.830

-- Poly SVM 2 --
Mean absolute error: 2020.491
R^2 Score:           0.821

-- Poly SVM 5 --
Mean absolute error: 2378.822
R^2 Score:           0.773

-- RBF SVM --
Mean absolute error: 1800.284
R^2 Score:           0.830


In [5]:
# Lending simulation driven by the linear regression model: loans are made
# only to those predicted to pay back `threshold` times the original amount.
regressed_payment = lin_reg.predict(X_test)

# `scaler` is the feature scaler fitted on the training split; it is handed
# to the helper together with the scaled test features and the true targets.
lend_using_target_only(
    regressed_payment,
    X_test,
    y_test,
    scaler,
    threshold=1.0,
)


--- Without model ---
Loans approved:    500/500
Loans given:       $ 5408.2
Payments received: $ 5924.2

Profits:           $ 516.0
Profit Percentage: 9.5%

---- With model ----
Loans approved:    392/500
Loans given:       $ 4516.9
Payments received: $ 5159.8

Profits:           $ 642.9
Profit Percentage: 14.2%


In [6]:
# Gaussian Process
import GPy

# Normalize the targets: the scaler needs a 2-D column, so the 1-D target
# vector is reshaped to (n_samples, 1) before fitting and transforming.
y_train_column = y_train.reshape(-1, 1)
y_scaler = MinMaxScaler()
y_train_scaled = y_scaler.fit_transform(y_train_column)

# Initialize GP Model: RBF kernel over all input features, starting from
# unit variance and unit lengthscale.
n_features = X_train.shape[1]
kernel = GPy.kern.RBF(input_dim=n_features, variance=1., lengthscale=1.)
gp_model = GPy.models.GPRegression(X_train, y_train_scaled, kernel=kernel)

# Tune the model's hyperparameters on the training data.
gp_model.optimize()

# Predictions are in the scaled target space (the model was trained on
# y_train_scaled).
# NOTE(review): GPy documents the second return value of predict() as the
# predictive variance -- confirm `conf` is interpreted accordingly.
y_hat, conf = gp_model.predict(X_test)

In [7]:
# Lending simulation driven by the Gaussian process model: loans are made
# only to those predicted to pay back `threshold` times the original amount.
# The GP predictions are first mapped back through the target scaler and
# flattened to a 1-D vector.
regressed_payment = y_scaler.inverse_transform(y_hat).ravel()

lend_using_target_only(
    regressed_payment,
    X_test,
    y_test,
    scaler,
    threshold=1.0,
)


--- Without model ---
Loans approved:    500/500
Loans given:       $ 5408.2
Payments received: $ 5924.2

Profits:           $ 516.0
Profit Percentage: 9.5%

---- With model ----
Loans approved:    387/500
Loans given:       $ 4387.3
Payments received: $ 5011.0

Profits:           $ 623.7
Profit Percentage: 14.2%
