In [1]:
from importlib import reload

In [2]:
import load_data

Loading up the loan data and exploring the columns

In [3]:
# Read the 'loan.csv' member of the zipped Lending Club archive through the
# project-local load_data helper, then list the column names that came back.
archive_path = './lending-club-loan-data.zip'
csv_member = 'loan.csv'
loans = load_data.read_zipped_csv(archive_path, csv_member)
print(loans.columns)

  if self.run_code(code, result):


Index(['id', 'member_id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv',
       'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title',
       'emp_length', 'home_ownership', 'annual_inc', 'verification_status',
       'issue_d', 'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose',
       'title', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs',
       'earliest_cr_line', 'inq_last_6mths', 'mths_since_last_delinq',
       'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal',
       'revol_util', 'total_acc', 'initial_list_status', 'out_prncp',
       'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
       'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'last_pymnt_d', 'last_pymnt_amnt',
       'next_pymnt_d', 'last_credit_pull_d', 'collections_12_mths_ex_med',
       'mths_since_last_major_derog', 'policy_code', 'application_type',
       'annual_inc_joint', 'dti_joint', 'verification_status_joint',
    

Now, create a table model with all available columns.

In [None]:
import table_model
reload(table_model)  # pick up any edits made to table_model since it was first imported

# Short aliases for the column classes provided by table_model.
Numeric = table_model.NumericColumn
Categorical = table_model.CategoricalColumn
Percentage = table_model.PercentageColumn
OutputLabel = table_model.OutputLabelColumn

# (column name, column class) pairs, kept in the order the columns are handed
# to the model. The final entry is the output label rather than a feature.
column_spec = [
    ('loan_amnt', Numeric),
    ('funded_amnt', Numeric),
    ('funded_amnt_inv', Numeric),
    ('term', Categorical),
    ('int_rate', Percentage),
    ('installment', Numeric),
    ('grade', Categorical),
    ('sub_grade', Categorical),
    ('emp_length', Categorical),
    ('home_ownership', Categorical),
    ('annual_inc', Numeric),
    ('annual_inc_joint', Numeric),
    ('dti', Numeric),
    ('dti_joint', Numeric),
    ('open_acc', Numeric),
    ('total_acc', Numeric),
    ('pub_rec', Numeric),
    ('tot_coll_amt', Numeric),
    ('total_bal_il', Numeric),
    ('tot_cur_bal', Numeric),
    ('revol_bal', Numeric),
    ('revol_util', Percentage),
    ('mths_since_last_delinq', Numeric),
    ('mths_since_last_record', Numeric),
    ('mths_since_last_major_derog', Numeric),
    ('inq_fi', Numeric),
    ('inq_last_12m', Numeric),
    # and the output
    ('loan_status', OutputLabel),
]

# Instantiate one fresh column object per entry so no two columns share an instance.
feature_columns = {name: make_column() for name, make_column in column_spec}

loan_model = table_model.TableModel(feature_columns, 'loan_status')

# Keep only the rows whose status is one of the two classes being modelled.
status_mask = loans.loan_status.isin(['Current', 'Default'])
filtered = loans.loc[status_mask]

# Fit the table model on the filtered rows, then transform those same rows
# into the feature matrix x and the label vector y.
loan_model.fit(filtered)
x, y = loan_model.transform(filtered)

In [None]:
# Inspect the features returned by loan_model.transform(filtered).
print(x)

In [None]:
# Inspect the labels returned by loan_model.transform(filtered).
print(y)

In [None]:
import keras

model = keras.models.Sequential()
# One-hot encode the class labels: one column (and one output unit) per class.
y_one_hot = keras.utils.to_categorical(y)
# Logistic regression is a one layer model. The targets are one-hot and the
# loss is categorical cross-entropy, so the output activation is softmax
# (a probability distribution over the classes) rather than independent
# per-unit sigmoids.
model.add(keras.layers.Dense(y_one_hot.shape[1], activation='softmax', input_dim=x.shape[1]))
model.compile(optimizer='adam', loss='categorical_crossentropy')
# Train briefly; verbose=1 shows per-epoch progress.
model.fit(x, y_one_hot, epochs=2, verbose=1)

In [None]:
import numpy as np

# Sequential.predict_classes is not available in newer Keras/TensorFlow
# releases. Taking the argmax over the per-class outputs of predict() yields
# the same predicted class index for each row.
predictions = np.argmax(model.predict(x), axis=-1)

In [None]:
import sklearn.metrics

# Accuracy of the one-layer model, scored on the same rows it was trained on.
baseline_accuracy = sklearn.metrics.accuracy_score(y, predictions)
print(baseline_accuracy)

Looks good -- but there is probably a catch here. Looking at the balance of classes, far more loans are 'Current' than 'Default', so our model probably just learned to guess 'Current'.

In [None]:
import numpy as np

# Count how many rows carry each distinct label value in y.
label_tally = np.unique(y, return_counts=True)
labels, counts = label_tally
print(labels, counts)

In [None]:
# Accuracy obtained by always guessing the most frequent class, i.e. that
# class's share of all rows. (1 - minority/majority is not a share of the
# dataset and understates this baseline.)
print(counts.max() / counts.sum())

So -- these highly imbalanced classes will make it hard to predict 'Default'. Let's see if that was the case.

In [None]:
# Per-class report for the one-layer model, labelled with the class names
# exposed by loan_model.
baseline_report = sklearn.metrics.classification_report(
    y,
    predictions,
    target_names=loan_model.classes,
)
print(baseline_report)

So -- clearly the model isn't actually very useful, even though it is accurate!
Real-world problems often require predicting relatively rare events, where accuracy alone is misleading.

In [None]:
# Re-import table_model so that edits made to the module since it was loaded take effect.
reload(table_model)

# Build and fit table_model's logistic regression wrapper on the same x and y.
# NOTE(review): the later variable name `balanced_predictions` suggests this
# wrapper compensates for class imbalance -- confirm in table_model.
logistic_regression_model = table_model.KerasLogisticRegressionModel()
logistic_regression_model.fit(x, y)

In [None]:
# Predict on the same features that logistic_regression_model was fitted on.
balanced_predictions = logistic_regression_model.predict(x)

In [None]:
# Accuracy of logistic_regression_model's predictions (balanced_predictions),
# not the earlier one-layer model's `predictions`.
print(sklearn.metrics.accuracy_score(y, balanced_predictions))

In [None]:
# How many rows logistic_regression_model assigned to each predicted class.
balanced_tally = np.unique(balanced_predictions, return_counts=True)
print(balanced_tally)

In [None]:
# Per-class report for logistic_regression_model's predictions
# (balanced_predictions), not the earlier one-layer model's `predictions`.
print(sklearn.metrics.classification_report(y, balanced_predictions, target_names=loan_model.classes))

Now - to try a deep learning model

In [None]:
# Build table_model's deep classifier with its default constructor arguments
# and fit it on the same x and y used for the earlier models.
deep_model = table_model.KerasDeepClassifierModel()
deep_model.fit(x, y)

In [None]:
# Predict on the same features that deep_model was fitted on.
deep_predictions = deep_model.predict(x)

In [None]:
# Accuracy of the deep model, scored on the rows it was trained on.
deep_accuracy = sklearn.metrics.accuracy_score(y, deep_predictions)
print(deep_accuracy)

In [None]:
# How many rows the deep model assigned to each predicted class.
deep_tally = np.unique(deep_predictions, return_counts=True)
print(deep_tally)

In [None]:
# Per-class report for the deep model, labelled with the class names exposed
# by loan_model.
deep_report = sklearn.metrics.classification_report(
    y,
    deep_predictions,
    target_names=loan_model.classes,
)
print(deep_report)

And now -- grid search to see if we can fine tune

In [None]:
import sklearn.model_selection

# Candidate values for the deep model's `hidden` and `depth` parameters.
# NOTE(review): presumably layer width and number of layers -- confirm against
# table_model.KerasDeepClassifierModel.
hyperparameters = dict(
    hidden=[32, 64, 128],
    depth=[2, 4, 8],
)

# Exhaustive search over the 3 x 3 grid with 10-fold cross-validation.
grid = sklearn.model_selection.GridSearchCV(
    estimator=deep_model,
    param_grid=hyperparameters,
    cv=10,
    verbose=4,
)
grid_result = grid.fit(x, y)

# Report the best cross-validated score and the parameters that produced it.
best_summary = (grid_result.best_score_, grid_result.best_params_)
print("Best: %f using %s" % best_summary)

In [None]:
# Predict on the training features through the fitted grid search object.
grid_predictions = grid.predict(x)

In [None]:
# Accuracy of the grid-searched model, scored on the rows it was trained on.
grid_accuracy = sklearn.metrics.accuracy_score(y, grid_predictions)
print(grid_accuracy)

In [None]:
# How many rows the grid-searched model assigned to each predicted class.
grid_tally = np.unique(grid_predictions, return_counts=True)
print(grid_tally)

In [None]:
# Per-class report for the grid-searched model, labelled with the class names
# exposed by loan_model.
grid_report = sklearn.metrics.classification_report(
    y,
    grid_predictions,
    target_names=loan_model.classes,
)
print(grid_report)