In [1]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from data import load_dataset, get_train_test_split

In [2]:
filename = "data/LoanStats3a.csv"
features, data = load_dataset(filename)
print("Data shape: %s" % str(features.shape))

Preprocessing...
Feature Engineering...
Data shape: (42535, 40)


In [3]:
print(features[0:2,:])

X_train, X_test, y_train, y_test = get_train_test_split(features)

# Add bias terms
N_train, N_test = X_train.shape[0], X_test.shape[0]
X_train = np.hstack([np.ones([N_train, 1]), X_train])
X_test  = np.hstack([np.ones([N_test, 1]), X_test])

# Temporarily use subset of data to debug faster
# TODO: Remove
X_train, y_train = X_train[:1000,:], y_train[:1000]
X_test, y_test   = X_test[:500,:], y_test[:500]

# Scale to mean zero and unit variance
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

print("X_train: %s, y_train: %s" % (str(X_train.shape), str(y_train.shape)))
print("X_test: %s, y_test: %s" % (str(X_test.shape), str(y_test.shape)))

[[  5.00000000e+03   1.06500000e+01   1.62870000e+02   5.86332000e+03
    1.00000000e+01   1.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   2.40000000e+04   1.00000000e+00
    0.00000000e+00   0.00000000e+00   1.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   2.76500000e+01   0.00000000e+00   1.00000000e+00
    6.90775628e+00   6.90775628e+00   3.00000000e+00   0.00000000e+00
    9.00000000e+00   0.00000000e+00   0.00000000e+00   5.86315519e+03]
 [  2.50000000e+03   1.52700000e+01   5.98300000e+01   3.58980000e+03
    5.00000000e-01   1.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   3.00000000e+04   0.00000000e+00
    1.00000000e+00   0.00000000e+00   0.00000000e+00   1.00000000e+00
    0.00000000e+00 

In [6]:
def evaluate_model(model, X, y):
    pred = model.predict(X)
    score = model.score(X, y)
    mean_abs_err = np.mean(np.abs(pred - y))

    print("Mean absolute error: %.3f" % mean_abs_err)
    print("R^2 Score:           %.3f" % score)

In [42]:
# Regression models
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, HuberRegressor

# Linear Regression
print("\n-- Linear Regression --")
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
evaluate_model(lin_reg, X_test, y_test)

# Huber Regressor
print("\n-- Huber Regressor --")
hub_reg = HuberRegressor(epsilon=1.)
hub_reg.fit(X_train, y_train)
evaluate_model(hub_reg, X_test, y_test)

# Linear SVM
print("\n-- Linear SVM --")
svm_lin = SVR(kernel="linear", C=1e3)
svm_lin.fit(X_train, y_train)
evaluate_model(svm_lin, X_test, y_test)

# Poly SVM
print("\n-- Poly SVM 2 --")
svm_poly_2 = SVR(kernel="poly", degree=2, C=1e5)
svm_poly_2.fit(X_train, y_train)
evaluate_model(svm_poly_2, X_test, y_test)

print("\n-- Poly SVM 5 --")
svm_poly_5 = SVR(kernel="poly", degree=5, C=1e8)
svm_poly_5.fit(X_train, y_train)
evaluate_model(svm_poly_5, X_test, y_test)

# RBF SVM
print("\n-- RBF SVM --")
svm_rbf = SVR(kernel="rbf", C=1e4)
svm_rbf.fit(X_train, y_train)
evaluate_model(svm_rbf, X_test, y_test)



-- Linear Regression --
Mean absolute error: 1987.934
R^2 Score:           0.855

-- Huber Regressor --
Mean absolute error: 1716.457
R^2 Score:           0.819

-- Linear SVM --
Mean absolute error: 1740.245
R^2 Score:           0.830

-- Poly SVM 2 --
Mean absolute error: 2026.289
R^2 Score:           0.821

-- Poly SVM 5 --
Mean absolute error: 2396.635
R^2 Score:           0.772

-- RBF SVM --
Mean absolute error: 1801.072
R^2 Score:           0.830
