In [1]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from data import load_dataset, get_train_test_split

In [2]:
# Location of the raw loan statistics CSV, relative to the working directory.
filename = "data/LoanStats3a.csv"

# load_dataset returns two objects; only `features` is consumed by the cells
# visible below, `data` is kept for inspection.
features, data = load_dataset(filename)

# %s stringifies the shape tuple, so no explicit str() call is needed.
print("Data shape: %s" % (features.shape,))

Preprocessing...
Feature Engineering...
Data shape: (42535, 40)


In [28]:
# Partition the engineered features into train / test inputs and targets.
X_train, X_test, y_train, y_test = get_train_test_split(features)

# Temporarily use subset of data to debug faster
# TODO: Remove
X_train, X_test = X_train[:1000, :], X_test[:500, :]
y_train, y_test = y_train[:1000], y_test[:500]

# Normalize: the scaler learns its per-column ranges from the training rows
# only, and the same fitted transform is then applied to the test rows.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Report the final array shapes as a sanity check.
print("X_train: %s, y_train: %s" % (X_train.shape, y_train.shape))
print("X_test: %s, y_test: %s" % (X_test.shape, y_test.shape))

X_train: (1000, 38), y_train: (1000,)
X_test: (500, 38), y_test: (500,)


In [83]:
def evaluate_model(model, X, y):
    """
    Print two quality metrics for a fitted model on the samples (X, y).

    model: object exposing predict(X) and score(X, y).
    X:     inputs forwarded unchanged to the model.
    y:     true targets, compared element-wise against the predictions.

    The first line printed is the mean absolute difference between the
    predictions and y. The second line is whatever model.score(X, y)
    returns, shown under the label "R^2 Score".
    Nothing is returned.
    """
    predictions = model.predict(X)
    model_score = model.score(X, y)
    abs_residuals = np.abs(predictions - y)

    # (format template, value) pairs, printed in order.
    report = (
        ("Mean absolute error: %.3f", np.mean(abs_residuals)),
        ("R^2 Score:           %.3f", model_score),
    )
    for template, value in report:
        print(template % value)

def _summarize_portfolio(loan_amount, payments_received, approved):
    """Return (loans given, payments, profits, profit %) over the approved loans, amounts divided by 1000."""
    lent = np.sum(loan_amount[approved]) / 1000
    repaid = np.sum(payments_received[approved]) / 1000
    profits = repaid - lent
    # With nothing lent the percentage is undefined (0/0); report NaN
    # explicitly instead of letting NumPy emit a RuntimeWarning.
    profit_percentage = profits / lent * 100 if lent != 0 else float("nan")
    return lent, repaid, profits, profit_percentage


def _print_portfolio(title, n_approved, n_total, summary):
    """Print one report section (title line followed by the five figures)."""
    lent, repaid, profits, profit_percentage = summary
    print(title)
    print("Loans approved:    %d/%d" % (n_approved, n_total))
    print("Loans given:       $ %.1f" % lent)
    print("Payments received: $ %.1f\n" % repaid)
    print("Profits:           $ %.1f" % (profits))
    print("Profit Percentage: %.1f%%" % (profit_percentage))


def lend_using_target_only(model, X, y, scaler, threshold=1.0, loan_amount_col=0):
    """
    Simulate making loans with the trained model using only the target
    (the regressed total payment of the customer).

    A loan is approved only when the predicted total payment is strictly
    greater than (threshold * loan amount); otherwise it is rejected.
    The loan amount is read from column `loan_amount_col` of
    scaler.inverse_transform(X), i.e. from the un-normalized features.

    model:           fitted regressor exposing predict(X).
    X:               normalized feature matrix, one row per loan.
    y:               payments actually received for each row of X.
    scaler:          fitted scaler exposing inverse_transform(X).
    threshold:       required ratio of predicted payment to loan amount.
    loan_amount_col: index of the loan-amount column (default 0).

    Prints two report sections: lending to every applicant ("Without
    model") and lending only to the approved ones ("With model"). All
    monetary sums are divided by 1000 before printing. When no loan is
    approved the profit percentage is printed as nan. Returns None.
    """
    loan_amount = np.asarray(scaler.inverse_transform(X))[:, loan_amount_col]
    # Flatten so that column-vector predictions / targets of shape (n, 1)
    # compare element-wise with loan_amount instead of broadcasting to (n, n).
    payments_received = np.ravel(y)
    regressed_total_payment = np.ravel(model.predict(X))

    satisfactory_payment = threshold * loan_amount
    loans_approved = regressed_total_payment > satisfactory_payment
    n_total = loan_amount.shape[0]

    # Baseline: loaning to all applicants.
    everyone = np.ones(n_total, dtype=bool)
    _print_portfolio(
        "\n--- Without model ---",
        n_total,
        n_total,
        _summarize_portfolio(loan_amount, payments_received, everyone),
    )

    # Loan according to model and threshold.
    _print_portfolio(
        "\n---- With model ----",
        np.sum(loans_approved),
        n_total,
        _summarize_portfolio(loan_amount, payments_received, loans_approved),
    )
    

In [31]:
# Regression models
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, HuberRegressor

# Each candidate keeps its own module-level name so later cells can refer to
# a specific fitted model (e.g. lin_reg).
lin_reg = LinearRegression()
hub_reg = HuberRegressor(epsilon=1.)
svm_lin = SVR(kernel="linear", C=1e3)
svm_poly_2 = SVR(kernel="poly", degree=2, C=1e5)
svm_poly_5 = SVR(kernel="poly", degree=5, C=1e8)
svm_rbf = SVR(kernel="rbf", C=1e4)

# (heading printed before the metrics, estimator) in evaluation order.
candidates = [
    ("\n-- Linear Regression --", lin_reg),
    ("\n-- Huber Regressor --", hub_reg),
    ("\n-- Linear SVM --", svm_lin),
    ("\n-- Poly SVM 2 --", svm_poly_2),
    ("\n-- Poly SVM 5 --", svm_poly_5),
    ("\n-- RBF SVM --", svm_rbf),
]

# Fit every candidate on the training split, then report its metrics on the
# held-out test split.
for heading, regressor in candidates:
    print(heading)
    regressor.fit(X_train, y_train)
    evaluate_model(regressor, X_test, y_test)



-- Linear Regression --
Mean absolute error: 1989.449
R^2 Score:           0.856

-- Huber Regressor --
Mean absolute error: 1716.457
R^2 Score:           0.819

-- Linear SVM --
Mean absolute error: 1740.245
R^2 Score:           0.830

-- Poly SVM 2 --
Mean absolute error: 2020.491
R^2 Score:           0.821

-- Poly SVM 5 --
Mean absolute error: 2378.822
R^2 Score:           0.773

-- RBF SVM --
Mean absolute error: 1800.284
R^2 Score:           0.830


In [84]:
# Simulate lending on the test split with the fitted linear regression;
# threshold=1.0 approves a loan only when the predicted total payment
# exceeds the loan amount.
lend_using_target_only(
    model=lin_reg,
    X=X_test,
    y=y_test,
    scaler=scaler,
    threshold=1.0,
)


--- Without model ---
Loans approved:    500/500
Loans given:       $ 5408.2
Payments received: $ 5924.2

Profits:           $ 516.0
Profit Percentage: 9.5%

---- With model ----
Loans approved:    395/500
Loans given:       $ 4534.4
Payments received: $ 5179.5

Profits:           $ 645.1
Profit Percentage: 14.2%
