### Credit Scoring Database - Model Exploitation

First, we import the libraries we will need. In the same code cell we also activate the *inline* mode for the graphics generated by *matplotlib* and initialize the seed of the random number generator.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

np.random.seed(19)

## Data Load

In [None]:
# Load the exploitation dataset and drop its first column
# (labelled "Unnamed: 0"), which is just the row number.
data = pd.read_csv('./datasets/give_me_some_credit/cs-exploitation.csv').drop(columns='Unnamed: 0')

# Strip the hyphens out of the column names.
cleanNames = [name.replace('-', '') for name in data.columns]
data.columns = cleanNames
data.head(10)

## Data Description

In [None]:
# Summary statistics of the loaded exploitation data, before any cleaning.
data.describe()

## Loading the constructed model

In [None]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from graphviz import Source

In [None]:
import pickle

# NOTE: unpickling can execute arbitrary code, so only load a model file that
# comes from a trusted source.
# The file holds several objects pickled one after another; they have to be
# read back in exactly the order in which they were written.
with open("credit_scoring_tree.b", 'rb') as model_file:
    class_column = pickle.load(model_file)
    classes_names = pickle.load(model_file)
    attribute_names = pickle.load(model_file)
    clf = pickle.load(model_file)
    rules = pickle.load(model_file)
    # Medians used below to fill missing values in the exploitation data.
    medians = pickle.load(model_file)
    # Flag that switches the outlier-removal step below on or off.
    process_outliers = pickle.load(model_file)

In [None]:
# Alias for the attribute list (same list object, not a copy); the outlier
# step below copies it before removing entries.
attribute_columns = attribute_names

# Show the metadata that was stored together with the model.
print("* Class column                :", class_column, "\n")
print("* Classes names               :", classes_names, "\n")
print("* number of attrs             :", len(attribute_names), "\n")
print("* attribute names             :", attribute_names)
print("\n* medians in construction set :", medians)

In [None]:
from libreria_aux_arboles import tree_to_code, tree_to_pseudo
# NOTE(review): tree_to_code is a project helper; presumably it renders the
# loaded tree as code using the attribute names — confirm in libreria_aux_arboles.
tree_to_code(clf, attribute_names)

#Source( export_graphviz(clf, out_file=None,
#                        feature_names=attribute_names,
#                        class_names=classes_names,
#                        filled=True, rounded=True,
#                        special_characters=True,
#                        impurity=False,
#                        leaves_parallel=True,
#                        rotate=False,
#                        node_ids=True))

In [None]:
target_class = 'financial distress'

# List every rule, ordered by the probability it gives to the target class
# (highest first).
print(len(rules), "rules\n")
ranked_rules = sorted(
    rules.items(),
    key=lambda leaf_rule: leaf_rule[1][1][classes_names.index(target_class)][2],
    reverse=True,
)
for item in ranked_rules:
    print(item, "\n")

## Data Cleaning

In [None]:
# Fill missing values with the medians that were loaded together with the
# model, not with medians recomputed from this dataset (data.median()).
# fillna returns a new frame, so `data` itself is left untouched.
clean_data = data.fillna(medians)
clean_data.isnull().sum()

In [None]:
# We consider two alternative methods:

def n_stds_outlier_detector(x, threshold=3.):
    """Flag the values of ``x`` that lie far from its mean.

    A value is flagged when its absolute distance to ``x.mean()`` is strictly
    greater than ``threshold`` times ``x.std()``.

    Parameters
    ----------
    x : object with ``mean()`` and ``std()`` methods that supports elementwise
        arithmetic (the notebook passes a DataFrame column).
    threshold : float
        Number of standard deviations allowed before a value is flagged.

    Returns
    -------
    Boolean mask, elementwise over ``x``; True marks an outlier.
    """
    distance_to_mean = np.abs(x - x.mean())
    cutoff = threshold * x.std()
    return distance_to_mean > cutoff

def percentile_outlier_detector(x, threshold=95.):
    """Flag the values of ``x`` that fall outside its central percentile band.

    The band keeps the central ``threshold`` percent of the values: its limits
    are the percentiles ``(100 - threshold) / 2`` and
    ``100 - (100 - threshold) / 2`` of ``x``. Values strictly below the lower
    limit or strictly above the upper limit are flagged.

    Parameters
    ----------
    x : array-like of numbers (the notebook passes a DataFrame column).
    threshold : float
        Width of the central band, as a percentage.

    Returns
    -------
    Boolean mask, elementwise over ``x``; True marks an outlier.
    """
    tail = (100 - threshold) / 2.
    lower_bound, upper_bound = np.percentile(x, [tail, 100. - tail])
    too_low = x < lower_bound
    too_high = x > upper_bound
    return too_low | too_high

# NOTE(review): this counts missing values in the raw `data` frame, not in
# `clean_data` — confirm that is the frame meant to be inspected here.
data.isnull().sum()

In [None]:
if process_outliers:
    # The percentile-based detector is the one in use. n_stds_outlier_detector
    # is the alternative; swapping it in would also need a different threshold
    # in the loop below.
    outlier_detector = percentile_outlier_detector

    # Screen every model attribute except the two excluded here.
    attributes_outliers_processing = attribute_columns.copy()
    for excluded in ('NumberOfDependents', 'age'):
        attributes_outliers_processing.remove(excluded)

    # Drop every example that has an outlier value in at least one screened
    # column. Columns are handled one after another, so each column's
    # percentiles are computed on the rows that survived the previous columns.
    for column in attributes_outliers_processing:
        outlier_mask = outlier_detector(clean_data[column], threshold=98.)
        clean_data = clean_data[~outlier_mask]

clean_data.describe()

## Predictions on the exploitation dataset

In [None]:
# Feature matrix for the model: the cleaned data restricted to the attribute
# columns, in the order given by attribute_names.
X_exploit = np.array(clean_data[attribute_names])
print("shape of X_exploit:", X_exploit.shape)

In [None]:
target_class = "financial distress"

# Position of the target class within classes_names; used both as the
# probability column and as the index into each rule's per-class entries.
aux = classes_names.index(target_class)

# Model probability of the target class for every example.
predicted_prob = clf.predict_proba(X_exploit)[:, aux]

# Value returned by clf.apply for every example; it is the key used to look
# up the matching entry in `rules` and read that rule's target-class score.
classification_leaf = clf.apply(X_exploit)
scores = [rules[leaf_id][1][aux][2] for leaf_id in classification_leaf]

In [None]:
# One row per example that survived cleaning. Reusing clean_data's index keeps
# every prediction traceable to its row in the exploitation dataset, even when
# outlier removal has dropped rows; a fresh 0..n-1 index would lose that link.
# When no rows were dropped the index is the same as before.
df = pd.DataFrame({'leaf': classification_leaf,
                   'predicted_prob': predicted_prob,
                   'score': scores},
                  index=clean_data.index)
df.head()

### Now we save the predictions made by the model

In [None]:
# Write the predictions to a CSV file in the working directory; the frame's
# index is written as the first column (to_csv default).
df.to_csv("credit_scoring_exploit_predictions.csv")