### Credit Scoring Database - Model Construction

First we import the libraries we will need. In addition we will use the first code cell to activate the *inline* mode for the graphics generated by *matplotlib*. We also initialize the seed of the random generator.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

np.random.seed(19)

## Data Load

In [None]:
# Load the training set and drop its first column (labelled "Unnamed: 0"),
# which only holds the row number.
data = pd.read_csv('./datasets/give_me_some_credit/cs-training.csv').drop('Unnamed: 0', axis = 1)

# Strip hyphens from the column names so that every column can be reached
# with attribute access (e.g. data.SeriousDlqin2yrs).
cleanNames = [column_name.replace('-', '') for column_name in data.columns]
data.columns = cleanNames

data.head(10)

## Data Description

In [None]:
# Raise the per-cell display width to 200 characters before showing the
# data dictionary, whose description column contains long sentences.
pd.set_option('display.max_colwidth', 200)
description = pd.read_excel('./datasets/give_me_some_credit/Data Dictionary.xls')
description

In [None]:
# Summary statistics of every numeric column of the raw data
data.describe()

## Data Exploration

In [None]:
# Target column and the human-readable label of each of its values
# (position in the list == class value, i.e. 0 and 1).
class_column = 'SeriousDlqin2yrs'
classes_names = ['no financial distress', 'financial distress']

# Every remaining column is used as a predictive attribute.
attribute_columns = data.columns.tolist()
attribute_columns.remove(class_column)

print(class_column, attribute_columns, sep='\n')

In [None]:
def add_freq(data, ax=None):
    """Annotate a count plot with class percentages and add a percent axis.

    Every bar (patch) of ``ax`` is labelled with its height expressed as a
    percentage of ``len(data)``. A twin y-axis is added on the left showing
    the same scale in percent, while the original count axis is moved to the
    right.

    Parameters
    ----------
    data : sized
        The data the bars were counted from; only ``len(data)`` is used.
    ax : matplotlib Axes, optional
        Axes holding the bars. Defaults to the current axes (``plt.gca()``),
        which is the count plot when this is called right after drawing it.
        Taking it as an argument removes the dependency on a global ``ax``.
    """
    if ax is None:
        ax = plt.gca()
    ncount = len(data)

    ax2 = ax.twinx()

    # Percent scale on the left, raw counts on the right.
    ax2.yaxis.tick_left()
    ax.yaxis.tick_right()

    ax.yaxis.set_label_position('right')
    ax2.yaxis.set_label_position('left')
    ax2.set_ylabel('Class Frequency (%)')

    if ncount:
        for p in ax.patches:
            x = p.get_bbox().get_points()[:, 0]   # left/right edges of the bar
            y = p.get_bbox().get_points()[1, 1]   # top of the bar (its count)
            ax.annotate('{:.1f}%'.format(100. * y / ncount), (x.mean(), y),
                        ha='center', va='bottom')

        # Derive the percent limits from the count limits so that both axes
        # describe the same bar heights (a fixed 0-100 range is only right
        # when the count axis happens to end exactly at len(data)).
        count_lo, count_hi = ax.get_ylim()
        ax2.set_ylim(100. * count_lo / ncount, 100. * count_hi / ncount)
    else:
        # Nothing to annotate; fall back to a plain 0-100 scale.
        ax2.set_ylim(0, 100)

    # grid(None) *toggles* the grid, so the result depended on the active
    # style; False switches the twin grid off unconditionally.
    ax2.grid(False)


# Bar chart of the class distribution in the raw data.
ax = sns.countplot(x=data[class_column], palette="Set3")
sns.set(font_scale=1.5)

fig = ax.figure
fig.set_size_inches(7, 5)

# Blank axis labels: the title and tick labels already say what is shown.
ax.set(xlabel=' ', ylabel=' ', title=class_column)
ax.set_ylim(top=160000)
tick_labels = ["0 ({0})".format(classes_names[0]),
               "1 ({0})".format(classes_names[1])]
ax.set_xticklabels(tick_labels)

# Percentage annotations and percent axis.
add_freq(data)

plt.show()

In [None]:
# Histograms of every attribute in the raw data. The constant condition is a
# manual switch between two layouts: pandas' own grid of histograms (active)
# or a hand-built two-column grid of subplots.
if True:
    fontsize = 10
    hist_axes = data[attribute_columns].hist(bins = 20, figsize = (12, 12),
                                             xlabelsize=fontsize,
                                             ylabelsize=fontsize)
    # DataFrame.hist returns an array of Axes; shrink each subplot title.
    for hist_ax in hist_axes.ravel():
        hist_ax.title.set_size(fontsize)
else:
    nrows = (len(attribute_columns)+1) // 2
    plt.figure(figsize=(12,25))
    for position, name in enumerate(attribute_columns, start=1):
        plt.subplot(nrows, 2, position)
        data[name].hist()
        plt.title(name, fontsize=14)

plt.show()

### Outlier detection and processing

In [None]:
# We consider two alternative methods:

def n_stds_outlier_detector(x, threshold=3.):
    """Flag values lying more than ``threshold`` standard deviations from the mean.

    ``x`` must provide ``mean()`` and ``std()`` and support element-wise
    arithmetic (e.g. a pandas Series or a NumPy array). Returns a boolean
    mask of the same shape, True where the value is an outlier.
    """
    deviation = np.abs(x - x.mean())
    cutoff = threshold * x.std()
    return deviation > cutoff

def percentile_outlier_detector(x, threshold=95.):
    """Flag values outside the central ``threshold`` percent of ``x``.

    The excluded ``100 - threshold`` percent is split evenly between both
    tails: with the default 95, anything below the 2.5th or above the 97.5th
    percentile is marked. Returns a boolean mask, True where the value is an
    outlier.
    """
    tail = (100 - threshold) / 2.
    lower, upper = np.percentile(x, [tail, 100. - tail])
    below = x < lower
    above = x > upper
    return below | above

In [None]:
# Number of missing values in each column of the raw data
data.isnull().sum()

In [None]:
# Impute every missing value with the median of its column, leaving the raw
# `data` frame untouched (dropping the incomplete rows would be the
# alternative). The medians are kept because they are saved with the model.
medians = data.median()
clean_data = data.fillna(medians)

# Check: no column should report missing values any more.
clean_data.isna().sum()

In [None]:
# Switch for the optional outlier removal step (disabled by default).
process_outliers = False

if process_outliers:
    # Of the two detectors defined above, the percentile-based one is used;
    # n_stds_outlier_detector is the alternative. Note that the threshold
    # passed below (98.) is meant as a percentile.
    outlier_detector = percentile_outlier_detector

    # Every attribute is screened except the ones removed here.
    # (NumberOfOpenCreditLinesAndLoans and NumberRealEstateLoansOrLines are
    # further candidates for exclusion.)
    attributes_outliers_processing = list(attribute_columns)
    for excluded_column in ('NumberOfDependents', 'age'):
        attributes_outliers_processing.remove(excluded_column)

    # Drop any example with an outlier in at least one screened column.
    # Rows are filtered column by column, so each column's percentiles are
    # computed on the rows that survived the previous columns.
    for colname in attributes_outliers_processing:
        keep_row = ~outlier_detector(clean_data[colname], threshold=98.)
        clean_data = clean_data[keep_row]

clean_data.describe()

In [None]:
# Class distribution AFTER cleaning. This cell previously plotted (and
# computed percentages from) the raw `data` frame, so it merely repeated the
# earlier chart and could not show the effect of imputation / outlier
# removal; it now uses `clean_data`, like the histograms that follow.
ax = sns.countplot(x = clean_data[class_column], palette="Set3")
sns.set(font_scale=1.5)
ax.set_xlabel(' ')
ax.set_ylabel(' ')
fig = plt.gcf()
fig.set_size_inches(7,5)
ax.set_ylim(top=160000)
ax.set_xticklabels(["0 ({0})".format(classes_names[0]), "1 ({0})".format(classes_names[1])])
ax.set_title(class_column)

# Percentages must be relative to the number of rows actually plotted.
add_freq(clean_data)

plt.show()

In [None]:
# Histograms of every attribute after cleaning. The constant condition is a
# manual switch between two layouts: pandas' own grid of histograms (active)
# or a hand-built two-column grid of subplots.
if True:
    fontsize = 10
    hist_axes = clean_data[attribute_columns].hist(bins = 20, figsize = (12, 12),
                                                   xlabelsize=fontsize,
                                                   ylabelsize=fontsize)
    # DataFrame.hist returns an array of Axes; shrink each subplot title.
    for hist_ax in hist_axes.ravel():
        hist_ax.title.set_size(fontsize)
else:
    nrows = (len(attribute_columns)+1) // 2
    plt.figure(figsize=(12,25))
    for position, name in enumerate(attribute_columns, start=1):
        plt.subplot(nrows, 2, position)
        clean_data[name].hist()
        plt.title(name, fontsize=14)

plt.show()

# Model construction

In [None]:
from my_library import train_val_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
#from graphviz import Source

In [None]:
# Build the feature matrix / target vector from the cleaned data and split
# them into training, validation and test sets.
attributes_names = attribute_columns

X = np.array(clean_data[attribute_columns])
y = np.array(clean_data[class_column])

# train_val_test_split is a project helper; presumably val_size / test_size
# are fractions of the whole data set -- TODO confirm against my_library.
(X_train, X_val, X_test,
 y_train, y_val, y_test) = train_val_test_split(X, y, val_size=.15,
                                                test_size=0.3, random_state=10)

In [None]:
from libreria_aux_arboles import tree_to_code, tree_to_pseudo

# A depth-2 Gini tree whose nodes need at least 200 samples to be split and
# whose leaves keep at least 100 samples.
clf = DecisionTreeClassifier(
    criterion='gini',
    max_depth=2,
    min_samples_split=200,
    min_samples_leaf=100,
)
print(clf)

# fit() returns the estimator itself, so no re-assignment is needed.
clf.fit(X_train, y_train)

# Render the fitted tree through the project helper.
tree_to_code(clf, attributes_names)

#Source( export_graphviz(clf, out_file=None,
#                        feature_names=attributes_names,
#                        class_names=classes_names,
#                        filled=True, rounded=True,
#                        special_characters=True,
#                        impurity=False,
#                        leaves_parallel=True,
#                        rotate=False,
#                        node_ids=True))

In [None]:
# NOTE(review): hard-coded counts -- presumably copied by hand from one node
# of the tree shown above (1460 cases of one class out of 58779 + 1460);
# confirm, and prefer deriving the ratio from the fitted model.
1460/(58779+1460)

In [None]:
# Predictive quality of the model

from sklearn.metrics import confusion_matrix, classification_report

# Mean accuracy on the training and test sets.
print("Score training = %f" % (clf.score(X_train, y_train)))
print("Score test = %f" % (clf.score(X_test, y_test)))

print("\nConfusion matrix in test:\n")
y_test_predicted = clf.predict(X_test)
# Fix the label order to 0..len(classes_names)-1 so that the matrix always
# has one row/column per entry of classes_names, even if a class happens to
# be absent from both y_test and the predictions.
class_values = list(range(len(classes_names)))
confusion_matrix_test = confusion_matrix(y_test, y_test_predicted,
                                         labels=class_values)
print(confusion_matrix_test) # row: real class; column: predicted class

# Plotting of the Confusion matrix:
plt.figure(figsize=(5, 5))
plt.imshow(confusion_matrix_test, interpolation='nearest', cmap=plt.cm.rainbow)
plt.title("Confusion matrix in test")
plt.colorbar()
# Label both axes with the class names. `tick_marks` used to be assigned the
# names and then never used, leaving bare numeric positions on the axes.
tick_marks = np.arange(len(classes_names))
plt.xticks(tick_marks, classes_names, rotation=45)
plt.yticks(tick_marks, classes_names)
plt.ylabel('True class')
plt.xlabel('Predicted class')
plt.show()

In [None]:
# Per-class text report for the test-set predictions
print(classification_report(y_test, y_test_predicted))

In [None]:
from sklearn.metrics import roc_curve, auc

# ROC curve on the test set, treating `target_class` as the positive class.
target_class = "financial distress"

# The position of a name in classes_names doubles as the class value.
positive_class = classes_names.index(target_class)

# Predicted class probabilities and the column of the positive class.
y_train_proba_predict = clf.predict_proba(X_train)
y_test_proba_predict = clf.predict_proba(X_test)
positive_class_scores_train = y_train_proba_predict[:, positive_class]
positive_class_scores_test = y_test_proba_predict[:, positive_class]

# Boolean ground truth: True where the example belongs to the positive class.
is_positive_class_train = (y_train == positive_class)
is_positive_class_test = (y_test == positive_class)

fpr, tpr, thresholds = roc_curve(is_positive_class_test,
                                 positive_class_scores_test,
                                 pos_label=1)
roc_auc = auc(fpr, tpr)

roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr, color='darkorange',
            lw=2, label='model (AUC = %0.2f)' % roc_auc)
# Diagonal = performance of a random ranking.
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label = 'random')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC at test, target = '+'"'+target_class+'"')
roc_ax.legend(loc="lower right")
plt.show()

In [None]:
# Now we analyze the N cases with highest likelihood of being
# of positive class according to the model

N = 100
# Test-set indices ordered by decreasing predicted probability of the
# positive class. Examples with identical probabilities are ordered
# arbitrarily by argsort.
inds = y_test_proba_predict[:,positive_class].argsort()[::-1]

# The message reports a percentage, so divide by the number of examples
# actually inspected: printing the raw count was only right for N == 100.
n_top = min(N, len(inds))
n_hits = (y_test[inds[:n_top]] == positive_class).sum()
pct_hits = 100. * n_hits / n_top if n_top else float('nan')

print("from the", n_top, "examples with more likelihood of being positive class")
print("according to the model,", "%.1f" % pct_hits,
      "% have true positive class")

In [None]:
# Predicted positive-class probability and true class of the ten
# highest-ranked test examples.
top10 = inds[:10]
print(y_test_proba_predict[top10, positive_class])
print(y_test[top10])

## Obtaining the set of rules equivalent to the tree

In [None]:
from my_library import get_rules_from_tree

# Rules equivalent to the tree, scored on the training set.
rules = get_rules_from_tree(clf, attributes_names, classes_names, X_train, y_train)

target_class = 'financial distress'
# Looked up once instead of on every comparison made by the sort.
target_index = classes_names.index(target_class)

# Rules listed from highest to lowest score of the target class.
# NOTE(review): the sort key assumes each rule value stores, at
# [1][class index][2], the probability of that class -- confirm against
# get_rules_from_tree.
print(len(rules), "rules\n")
ranked_rules = sorted(rules.items(),
                      key=lambda rule: rule[1][1][target_index][2],
                      reverse=True)
for item in ranked_rules:
    print(item, "\n")

In [None]:
# Rules equivalent to the tree, scored on the validation set. This rebinds
# `rules`, so these validation-scored rules are what the pickle cell at the
# end of the notebook saves.
rules = get_rules_from_tree(clf, attributes_names, classes_names, X_val, y_val)

target_class = 'financial distress'
# Looked up once instead of on every comparison made by the sort.
target_index = classes_names.index(target_class)

# Rules listed from highest to lowest score of the target class.
# NOTE(review): the sort key assumes each rule value stores, at
# [1][class index][2], the probability of that class -- confirm against
# get_rules_from_tree.
print(len(rules), "rules\n")
ranked_rules = sorted(rules.items(),
                      key=lambda rule: rule[1][1][target_index][2],
                      reverse=True)
for item in ranked_rules:
    print(item, "\n")

## Saving the tree and rules for future use

In [None]:
# Column medians of the raw data (the values used to impute missing entries);
# displayed here because they are saved together with the model below
medians

In [None]:
import pickle
# Medians of the attributes only. Series.drop returns a new Series, so the
# notebook-level `medians` is left intact and this cell can be re-run; the
# previous in-place `medians.pop(...)` raised KeyError on a second execution
# and did so after the output file had already been opened for writing.
attribute_medians = medians.drop(class_column)

# Everything needed to reuse the model is written to a single file as
# consecutive pickles; a reader must call pickle.load in this same order.
# `rules` is whichever rule set was computed last in the notebook.
with open("credit_scoring_tree.b", 'wb') as f:
    pickle.dump(class_column, f)
    pickle.dump(classes_names, f)
    pickle.dump(attributes_names, f)
    pickle.dump(clf, f)
    pickle.dump(rules, f)
    pickle.dump(attribute_medians, f)
    pickle.dump(process_outliers, f)