In [None]:
import pandas as pd

In [None]:
# Load only the columns used in this notebook, indexed by SK_ID_CURR, then
# drop rows with missing values and rows that are exact duplicates.
df = (
    pd.read_csv(
        'data/loans.csv',
        index_col='SK_ID_CURR',
        usecols=[
            'SK_ID_CURR',
            'TARGET',
            'NAME_CONTRACT_TYPE',
            'CODE_GENDER',
            'FLAG_OWN_CAR',
            'FLAG_OWN_REALTY',
            'CNT_CHILDREN',
            'AMT_INCOME_TOTAL',
            'AMT_CREDIT',
            'AMT_ANNUITY',
            'AMT_GOODS_PRICE',
        ],
    )
    .dropna()
    .drop_duplicates()
)
df.head()

In [None]:
# Relative frequency of each TARGET value, to see how balanced the classes are.
class_share = df['TARGET'].value_counts(normalize=True)
class_share

## Feature Extraction / Transformations

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Read the label without mutating `df`: the previous `df.pop('TARGET')` removed
# the column in place, so running this cell a second time raised a KeyError.
y = df['TARGET']

# Columns treated as categories are one-hot encoded; monetary amounts are
# standardised to zero mean and unit variance.
categorical_cols = ['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN']
numeric_cols = ['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE']

ct = ColumnTransformer(
    [('ohe', OneHotEncoder(), categorical_cols),
     ('scaler', StandardScaler(), numeric_cols)]
)

# The label column is dropped explicitly so it can never reach the features.
# NOTE(review): the encoder and scaler are fitted on the full dataset here,
# before any train/test split, so rows later used for testing already
# contribute to the fitted statistics.
X = ct.fit_transform(df.drop(columns='TARGET'))

## Conventional Stratified Splitting

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
# Hold out 33% of the rows for testing. Stratifying on `y` keeps the class
# ratio the same in both parts; the fixed seed makes the split reproducible
# when the notebook is re-run.
xtrain, xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.33, stratify=y, random_state=42
)

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision tree with default hyperparameters. The seed is fixed because the
# tree permutes features at each split, so ties between equally good splits
# could otherwise be broken differently from run to run.
dt = DecisionTreeClassifier(random_state=42)

In [None]:
# Train the tree on the training part of the split.
dt.fit(X=xtrain, y=ytrain)

### Scoring on training data

In [None]:
# ROC AUC measures how well the model ranks samples, so it needs a continuous
# score: use the predicted probability of the positive class (second column of
# predict_proba, i.e. dt.classes_[1]) instead of hard class labels.
roc_auc_score(ytrain, dt.predict_proba(xtrain)[:, 1])

### Scoring on testing data

In [None]:
# ROC AUC measures how well the model ranks samples, so it needs a continuous
# score: use the predicted probability of the positive class (second column of
# predict_proba, i.e. dt.classes_[1]) instead of hard class labels.
roc_auc_score(ytest, dt.predict_proba(xtest)[:, 1])

In [None]:
# Draw a fresh stratified split (shuffle=True is the default) and refit, to see
# how much the scores move with the partition. The seed is fixed so that this
# particular re-split is reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.33, stratify=y, shuffle=True, random_state=1
)
dt.fit(xtrain, ytrain)
# Score with positive-class probabilities: ROC AUC is a ranking metric and is
# not meaningful when computed from hard class labels.
print('Train score:', roc_auc_score(ytrain, dt.predict_proba(xtrain)[:, 1]))
print('Test score:', roc_auc_score(ytest, dt.predict_proba(xtest)[:, 1]))

In [None]:
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

### Let's do it ten times

In [None]:
# Repeat the split/fit/score cycle ten times to visualise how much the scores
# depend on the particular train/test partition.
train_scores = []
test_scores = []
# The loop index doubles as the seed: every iteration gets a different split,
# yet the whole experiment is reproducible when the cell is re-run.
for seed in range(10):
    xtrain, xtest, ytrain, ytest = train_test_split(
        X, y, test_size=0.33, stratify=y, shuffle=True, random_state=seed
    )
    dt.fit(xtrain, ytrain)
    # ROC AUC needs continuous scores, so use the positive-class probability
    # (second column of predict_proba) rather than hard class labels.
    train_scores.append(roc_auc_score(ytrain, dt.predict_proba(xtrain)[:, 1]))
    test_scores.append(roc_auc_score(ytest, dt.predict_proba(xtest)[:, 1]))
plt.plot(train_scores, '-bo', label='Train ROC')
plt.plot(test_scores, '-go', label='Test ROC')
plt.xlabel('Split')
plt.ylabel('ROC AUC')
plt.legend()

## K-Fold Cross Validation

In [None]:
from IPython.display import Image
# Display the image stored at assets/kfold.png (presumably an illustration of
# k-fold cross validation, given the section heading — confirm).
Image(filename='assets/kfold.png')

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
# 4-fold stratified cross validation: each fold serves once as the test set
# while the remaining folds are used for training.
skf = StratifiedKFold(n_splits=4)
train_scores = []
test_scores = []
for train_ix, test_ix in skf.split(X, y):
    # X is indexed positionally; y is a pandas Series, hence .iloc.
    xtrain, xtest = X[train_ix], X[test_ix]
    ytrain, ytest = y.iloc[train_ix], y.iloc[test_ix]
    dt.fit(xtrain, ytrain)
    # ROC AUC needs continuous scores, so use the positive-class probability
    # (second column of predict_proba) rather than hard class labels.
    train_scores.append(roc_auc_score(ytrain, dt.predict_proba(xtrain)[:, 1]))
    test_scores.append(roc_auc_score(ytest, dt.predict_proba(xtest)[:, 1]))
plt.plot(train_scores, '-bo', label='Train ROC')
plt.plot(test_scores, '-go', label='Test ROC')
plt.xlabel('Fold')
plt.ylabel('ROC AUC')
plt.legend()

### Remember that we haven't touched the model yet!

In [None]:
from IPython.display import Image
# Display the image stored at assets/tree.png (presumably a rendering of the
# untuned decision tree — confirm).
Image(filename='assets/tree.png') 

## Grid Search

In [None]:
# List the tree's tunable hyperparameters and their current values. This
# replaces the interactive `DecisionTreeClassifier?` lookup, which opens a
# pager and leaves no output in the saved notebook; see the scikit-learn
# documentation of DecisionTreeClassifier for the meaning of each parameter.
dt.get_params()

In [None]:
# Hyperparameter grid for the decision tree search.
# 'auto' was removed from `max_features`: for DecisionTreeClassifier it was an
# alias of 'sqrt' (so it only duplicated work) and it has been deprecated and
# then removed in recent scikit-learn releases, where it makes those fits fail.
# None (consider every feature at each split) takes its place so the grid
# still contains three distinct options.
grid = {
    'criterion': ['gini', 'entropy'],
    'splitter' : ['best', 'random'],
    'max_depth': [10, 20, 30, 40, 50],
    'max_features': [None, 'sqrt', 'log2'],
    'class_weight': ['balanced', None]
}

In [None]:
from IPython.display import Image
# Display the image stored at assets/grid_search.png (presumably an
# illustration of grid search, given the section heading — confirm).
Image(filename='assets/grid_search.png') 

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Exhaustive search over every combination in `grid`, each evaluated with
# 5-fold cross validation and ranked by ROC AUC.
gcv = GridSearchCV(
    estimator=dt,
    param_grid=grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,   # use all available processors
    verbose=2,   # print progress for each fit
)

In [None]:
# Run the search on the full feature matrix and labels.
gcv.fit(X=X, y=y)

In [None]:
# Hyperparameter combination that achieved the best mean cross-validated score.
gcv.best_params_

In [None]:
# Estimator refitted by the search with the best hyperparameters.
# NOTE: this is a reference to the object held by `gcv`, not a copy, so any
# later `dt_best.fit(...)` call also refits `gcv.best_estimator_` in place.
dt_best = gcv.best_estimator_

In [None]:
# Re-evaluate the tuned tree with 10-fold stratified cross validation; the
# estimator is refitted from scratch on the training part of every fold.
skf = StratifiedKFold(n_splits=10)
train_scores = []
test_scores = []
for train_ix, test_ix in skf.split(X, y):
    # X is indexed positionally; y is a pandas Series, hence .iloc.
    xtrain, xtest = X[train_ix], X[test_ix]
    ytrain, ytest = y.iloc[train_ix], y.iloc[test_ix]
    dt_best.fit(xtrain, ytrain)
    # Score with positive-class probabilities so the numbers are comparable to
    # the 'roc_auc' scorer used by the grid search; hard class labels would
    # collapse the ROC curve to a single threshold.
    train_scores.append(roc_auc_score(ytrain, dt_best.predict_proba(xtrain)[:, 1]))
    test_scores.append(roc_auc_score(ytest, dt_best.predict_proba(xtest)[:, 1]))
plt.plot(train_scores, '-.bo', label='Train ROC')
plt.plot(test_scores, '-.go', label='Test ROC')
plt.xlabel('Fold')
plt.ylabel('ROC AUC')
# Show the full 0-1 range so the train/test gap is not visually exaggerated.
plt.ylim(0, 1)
plt.legend()

In [None]:
from IPython.display import Image
# Display the image stored at assets/best-tree.png (presumably a rendering of
# the tuned decision tree — confirm).
Image(filename='assets/best-tree.png') 