# Loan Prediction Challenge: Modeling

In [None]:
# Header
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

In [None]:
# Imports
# The usual suspects ...
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Cleaning
from modeling.cleaning import convert_to_datetime
from modeling.cleaning import convert_to_category_type
from modeling.cleaning import label_encoding

# Pipeline functions
from modeling.classification import train_model
from modeling.classification import predict
from modeling.classification import prediction_metrics
from modeling.classification import plot_confusion_matrix
from modeling.classification import plot_results

# Preprocessing
from sklearn.model_selection import train_test_split

# Models
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

# Hyper-parameter optimisation
from sklearn.model_selection import GridSearchCV

# Oversampling
from imblearn.over_sampling import ADASYN

# Utilities
from collections import Counter

## Pipeline Evaluation
### Preprocessing


In [None]:
# Import the data
# Read the feature-matrix CSV into a DataFrame. The path is relative, so it is
# resolved against the current working directory of the kernel.
df = pd.read_csv('../data/new_feature_matrix.csv')

In [None]:
# Drop `customerid`, `approveddate` and `creationdate` (in place)
dropped_columns = ['customerid', 'approveddate', 'creationdate']
df.drop(columns=dropped_columns, inplace=True)

# Categorical conversion, then label encoding. Both helpers are called only
# for their effect on `df` (their return values are discarded) -- presumably
# they modify the frame in place; confirm in modeling.cleaning.
convert_to_category_type(df)
label_encoding(df)

# Sanity check: no missing value may remain anywhere in the frame
assert not df.isnull().values.any()

In [None]:
# Separate the label column from the remaining (feature) columns
target_column = 'good_bad_flag'
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 25% of the rows as the test set; rows are shuffled with a fixed
# seed so the split is the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=True,
    random_state=42,
)

### Modeling

In [None]:
# Models
# Display names, index-aligned with `models` below: the two lists are zipped
# together in the training loops, so their order must match.
names = ['Extra Trees',
         'Gradient Boosted',
         'Random Forest',
         'Support Vector Machine']

# Instantiate models
# The three tree ensembles are randomised, so they are seeded with the same
# value as the train/test split to make the comparison repeatable between runs.
# SVC is left unseeded: its `random_state` only affects probability estimates,
# which are not enabled here.
models = [ExtraTreesClassifier(n_estimators=100, max_depth=5, random_state=42),
          GradientBoostingClassifier(n_estimators=100, random_state=42),
          RandomForestClassifier(n_estimators=100, random_state=42),
          SVC(C=.01, gamma='auto')]

# Target classes
# Labels intended for plot_confusion_matrix (that call is currently commented
# out in the training loops).
# NOTE(review): the order must match the encoded values of `good_bad_flag`
# produced by label_encoding -- confirm.
classes = np.array(['Good', 'Bad'])

In [None]:
# Train each model on the original training split and score it on the held-out
# test split.
for m, name in zip(models, names):
    # Train
    model = train_model(X_train, y_train, m)
    # Predict
    predictions = predict(X_test, model)
    # Evaluate
    # NOTE(review): `results` is rebound on every iteration, so only the value
    # returned for the last model survives the loop. Presumably
    # prediction_metrics accumulates one entry per model across calls (it is
    # given the full `names` list as well as the current `name`) -- confirm in
    # modeling.classification.
    results = prediction_metrics(y_test, predictions, names, name)
    # Confusion matrix (disabled)
    #plot_confusion_matrix(y_test, predictions, classes, title=name)

In [None]:
# Results
# Plot whatever `results` holds after the training loop on the original data.
plot_results(results)

### ADASYN (Adaptive Synthetic Sampling)

In [None]:
# ADASYN resampling
# Oversample the TRAINING portion only. Resampling the full (X, y) would put
# every held-out row -- plus synthetic points interpolated from held-out rows --
# into the data the models are fitted on, which leaks test information and
# inflates the scores measured on X_test / y_test.
# The sampler is seeded so the synthetic samples are the same on every run.
X_resampled, y_resampled = ADASYN(random_state=42).fit_resample(X_train, y_train)
# Class counts after resampling, sorted by class label
print(sorted(Counter(y_resampled).items()))

In [None]:
# Train-test split
# Split the resampled data 75/25. The split is seeded with the same value as
# the split of the original data so that reruns train on identical rows.
aX_train, aX_test, ay_train, ay_test = train_test_split(X_resampled, y_resampled, test_size=.25, random_state=42)

In [None]:
# Train each entry of the same `models` list on the ADASYN-resampled training
# data. Scoring uses X_test / y_test from the split of the original data;
# aX_test / ay_test are not used in this loop.
for m, name in zip(models, names):
    # Train
    model = train_model(aX_train, ay_train, m)
    # Predict
    predictions = predict(X_test, model)
    # Evaluate
    # NOTE(review): `results` is rebound on every iteration, so only the value
    # returned for the last model survives the loop. Presumably
    # prediction_metrics accumulates one entry per model across calls (it is
    # given the full `names` list as well as the current `name`) -- confirm in
    # modeling.classification.
    results = prediction_metrics(y_test, predictions, names, name)
    # Confusion matrix (disabled)
    #plot_confusion_matrix(y_test, predictions, classes, title=name)

In [None]:
# Results
# Plot whatever `results` holds after the training loop on the resampled data.
plot_results(results)