In [294]:
from warnings import filterwarnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Display settings: print arrays of up to 10000 elements in full and
# never switch to scientific notation for small values.
np.set_printoptions(suppress=True, threshold=10000)

# Silence every warning for the whole notebook.
# NOTE(review): this also hides any convergence or deprecation warnings
# raised by the libraries used below - confirm that this is intended.
filterwarnings("ignore")

# Location of the credit-scoring dataset, relative to the notebook.
CREDIT_SCORE = "../data/credit_scoring.csv"

# I) Apprentissage supervisé : Feature engineering et Classification

## **1) Chargement des données et préparation :**

In [295]:
from sklearn.model_selection import train_test_split


# Load the raw table (semicolon-separated CSV) and keep a plain array view.
pd_credit = pd.read_csv(CREDIT_SCORE, sep=";")
np_credit = pd_credit.values

# Every column except the last is an input feature; the last column is the label.
X = np_credit[:, :-1]
Y = np_credit[:, -1]

# Hold out half of the records for testing; the seed keeps the split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    Y,
    test_size=0.5,
    random_state=1,
)

# Quick overview of the dataset: size, number of features and class balance.
txt = f"""Le jeu de donnée présente:
    {X.shape[0]:<6d} enregistrements
    {X.shape[1]:<6d} variables explicatives
    {1:<6d} variable label/classification

Concernant la classification, elle est binaire (0 ou 1) avec:
    {int(Y.sum()):<6d} classe 1 = solvable
    {int(len(Y) - Y.sum()):<6d} classe 0 = non solvable 
"""
print(txt)

Le jeu de donnée présente:
    4375   enregistrements
    13     variables explicatives
    1      variable label/classification

Concernant la classification, elle est binaire (0 ou 1) avec:
    3159   classe 1 = solvable
    1216   classe 0 = non solvable 



## **2) Apprentissage et évaluation de modèles :**

In [296]:
from sklearn.metrics import (accuracy_score, confusion_matrix, precision_score,
                             recall_score)

# Columns of the comparison table, in the order test_model fills them.
# VP/VN/FP/FN are the French abbreviations for true positives, true
# negatives, false positives and false negatives.
COLS_EVAL = [
    "method",
    "comment",
    "precision",
    "accuracy",
    "recall",
    "VP",
    "VN",
    "FP",
    "FN",
    "args",
]

def test_model(x_train, y_train, x_test, y_test, sk_fun, comment="", **kwargs):
    """Fit a classifier on the training set and score it on the test set.

    Parameters
    ----------
    x_train, y_train
        Training features and labels passed to ``model.fit``.
    x_test, y_test
        Evaluation features and the true labels the predictions are scored
        against. Labels are expected to be binary, encoded as 0 / 1.
    sk_fun
        Estimator class (not an instance); it is instantiated as
        ``sk_fun(**kwargs)`` and must expose ``fit`` and ``predict``.
    comment : str, optional
        Free-text tag stored in the ``"comment"`` column of the result row.
    **kwargs
        Constructor arguments for ``sk_fun``; their ``str()`` is recorded in
        the ``"args"`` column.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        A one-row frame with the ``COLS_EVAL`` columns, and the 2x2 confusion
        matrix with rows ``"is 0"`` / ``"is 1"`` and columns
        ``"predicted 0"`` / ``"predicted 1"``.
    """
    model = sk_fun(**kwargs)        # build the estimator
    model.fit(x_train, y_train)     # train on the training split
    y_pred = model.predict(x_test)  # predict on the held-out split

    # Metrics for the positive class (label 1).
    pr = precision_score(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)

    # Passing labels=[0, 1] guarantees a 2x2 matrix even when one class is
    # absent from both y_test and y_pred; without it the matrix would shrink
    # and the indexing / DataFrame wrapping below would fail.
    Conf = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = Conf.ravel()

    # One result row, in COLS_EVAL order.
    line = [sk_fun.__name__, comment, pr, acc, rec, tp, tn, fp, fn, str(kwargs)]
    line_df = pd.DataFrame([line], columns=COLS_EVAL)

    conf_df = pd.DataFrame(
        Conf,
        index=["is 0", "is 1"],
        columns=["predicted 0", "predicted 1"],
    )
    return line_df, conf_df

# Running results table: starts empty with the COLS_EVAL columns; the
# evaluation cells below append one row per evaluated configuration.
comparative_df = pd.DataFrame(columns=COLS_EVAL)

### Méthode avec arbre CART (arbre de décision)

In [297]:

from sklearn.tree import DecisionTreeClassifier

# Baseline: CART decision tree trained on the raw (unscaled) features.
newline, confuse = test_model(
    xtrain, ytrain, xtest, ytest,
    DecisionTreeClassifier,
    random_state=1,
)
comparative_df = pd.concat([comparative_df, newline], ignore_index=True)

# Show the test-set confusion matrix as the cell output.
confuse

Unnamed: 0,predicted 0,predicted 1
is 0,325,279
is 1,318,1266


### Méthode k plus proches voisins (k-neighbors)

In [298]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline: 5-nearest-neighbours classifier trained on the raw (unscaled) features.
newline, confuse = test_model(
    xtrain, ytrain, xtest, ytest,
    KNeighborsClassifier,
    n_neighbors=5,
)
comparative_df = pd.concat([comparative_df, newline], ignore_index=True)

# Show the test-set confusion matrix as the cell output.
confuse

Unnamed: 0,predicted 0,predicted 1
is 0,189,415
is 1,187,1397


### Méthode avec perceptron à deux couches de tailles respectives 40 et 20

In [299]:
from sklearn.neural_network import MLPClassifier

# Baseline: multilayer perceptron with two hidden layers (40 then 20 units)
# trained on the raw (unscaled) features.
newline, confuse = test_model(
    xtrain, ytrain, xtest, ytest,
    MLPClassifier,
    hidden_layer_sizes=(40, 20),
    random_state=1,
)
comparative_df = pd.concat([comparative_df, newline], ignore_index=True)

# Show the test-set confusion matrix as the cell output.
confuse

Unnamed: 0,predicted 0,predicted 1
is 0,361,243
is 1,482,1102


In [300]:
# Rank the configurations evaluated so far, best precision first.
comparative_df.sort_values(by="precision", ascending=False)

Unnamed: 0,method,comment,precision,accuracy,recall,VP,VN,FP,FN,args
0,DecisionTreeClassifier,,0.819417,0.727148,0.799242,1266,325,279,318,{'random_state': 1}
2,MLPClassifier,,0.819331,0.668647,0.695707,1102,361,243,482,"{'hidden_layer_sizes': (40, 20), 'random_state..."
1,KNeighborsClassifier,,0.770971,0.724863,0.881944,1397,189,415,187,{'n_neighbors': 5}


On définit la **précision** (ou **positive predictive value**) : $\frac{VP}{VP+FP}$ « Parmi les prédictions positives, lesquelles sont justes ».

On définit l'**accuracy** : $\frac{VP+VN}{VP+FP+VN+FN}$ « Parmi l'ensemble des prédictions, lesquelles sont justes ».

On définit le **recall** (ou **sensibilité** ou **true positive rate**) : $\frac{VP}{VP+FN}$ « Parmi les cas positifs, lesquels sont correctement détectés ».

Dans notre situation, nous représentons l'assureur et nous voulons éviter les impayés : il faut donc minimiser autant que possible les **faux positifs** (c'est-à-dire éviter d'attribuer un prêt à un client en réalité non solvable, alors qu'un agent ne l'aurait pas fait).
Ainsi, le critère de **précision** est plus important que le recall, puisque son calcul prend en compte les faux positifs (la précision augmente à mesure que les FP diminuent).

## 3) **Normalisation des variables continues**

In [301]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Standard scaling: the scaler is fitted on the training split only,
# then the same fitted transform is applied to both splits.
stdScale = StandardScaler().fit(xtrain)
xtrain_ss = stdScale.transform(xtrain)
xtest_ss = stdScale.transform(xtest)

# Min-max scaling: same procedure, fitted on the training split only.
mmScale = MinMaxScaler().fit(xtrain)
xtrain_mm = mmScale.transform(xtrain)
xtest_mm = mmScale.transform(xtest)

In [302]:
# Re-evaluate the three classifiers on the scaled features.
# Rows are appended classifier by classifier (CART tree, k-neighbours,
# 40-20 multilayer perceptron), each with StdScale first and MinMaxScale second.
_scaled_inputs = (
    ("StdScale", xtrain_ss, xtest_ss),
    ("MinMaxScale", xtrain_mm, xtest_mm),
)
_classifier_specs = (
    (DecisionTreeClassifier, {"random_state": 1}),
    (KNeighborsClassifier, {"n_neighbors": 5}),
    (MLPClassifier, {"hidden_layer_sizes": (40, 20), "random_state": 1}),
)

for _sk_class, _params in _classifier_specs:
    for _label, _x_tr, _x_te in _scaled_inputs:
        newline, _ = test_model(
            _x_tr, ytrain, _x_te, ytest,
            _sk_class,
            comment=_label,
            **_params,
        )
        comparative_df = pd.concat([comparative_df, newline], ignore_index=True)

In [303]:
# Rank all configurations (raw and scaled features), best precision first.
comparative_df.sort_values(by="precision", ascending=False)

Unnamed: 0,method,comment,precision,accuracy,recall,VP,VN,FP,FN,args
7,MLPClassifier,StdScale,0.838172,0.776508,0.856692,1357,342,262,227,"{'hidden_layer_sizes': (40, 20), 'random_state..."
8,MLPClassifier,MinMaxScale,0.832346,0.788848,0.886995,1405,321,283,179,"{'hidden_layer_sizes': (40, 20), 'random_state..."
4,DecisionTreeClassifier,MinMaxScale,0.819767,0.728519,0.801136,1269,325,279,315,{'random_state': 1}
0,DecisionTreeClassifier,,0.819417,0.727148,0.799242,1266,325,279,318,{'random_state': 1}
2,MLPClassifier,,0.819331,0.668647,0.695707,1102,361,243,482,"{'hidden_layer_sizes': (40, 20), 'random_state..."
3,DecisionTreeClassifier,StdScale,0.818182,0.727148,0.801136,1269,322,282,315,{'random_state': 1}
5,KNeighborsClassifier,StdScale,0.810602,0.752742,0.859217,1361,286,318,223,{'n_neighbors': 5}
6,KNeighborsClassifier,MinMaxScale,0.807669,0.74543,0.85101,1348,283,321,236,{'n_neighbors': 5}
1,KNeighborsClassifier,,0.770971,0.724863,0.881944,1397,189,415,187,{'n_neighbors': 5}


# TEXT A REVOIR

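Le meilleur résultat en précision est obtenu par le perceptron multicouche (MLP) avec StdScale (0,838).
À l'exception de CART + StdScale, dont la précision baisse très légèrement (0,818 contre 0,819), la normalisation améliore la précision de tous les modèles.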

## 4) Création de nouvelles variables caractéristiques par combinaisons linéaires des variables initiales 

In [304]:
from sklearn.decomposition import PCA

# Fit the PCA on the standardised training features only.
pca = PCA().fit(xtrain_ss)

# Project both splits onto the principal axes learned from the training split.
xtrain_pca = pca.transform(xtrain_ss)
xtest_pca = pca.transform(xtest_ss)

# Append the first three principal components as extra columns
# to the standardised features of each split.
xtrain_extend = np.column_stack((xtrain_ss, xtrain_pca[:, :3]))
xtest_extend = np.column_stack((xtest_ss, xtest_pca[:, :3]))

In [305]:
# Evaluate the three classifiers on the standardised features extended with
# the first three principal components. Rows are appended in this order:
# CART tree, k-neighbours, 40-20 multilayer perceptron.
_pca_classifier_specs = (
    (DecisionTreeClassifier, {"random_state": 1}),
    (KNeighborsClassifier, {"n_neighbors": 5}),
    (MLPClassifier, {"hidden_layer_sizes": (40, 20), "random_state": 1}),
)

for _sk_class, _params in _pca_classifier_specs:
    newline, _ = test_model(
        xtrain_extend, ytrain, xtest_extend, ytest,
        _sk_class,
        comment="StdScale + PCA",
        **_params,
    )
    comparative_df = pd.concat([comparative_df, newline], ignore_index=True)

In [306]:
# Rank all configurations, including the PCA-extended ones, best precision first.
comparative_df.sort_values(by="precision", ascending=False)

Unnamed: 0,method,comment,precision,accuracy,recall,VP,VN,FP,FN,args
7,MLPClassifier,StdScale,0.838172,0.776508,0.856692,1357,342,262,227,"{'hidden_layer_sizes': (40, 20), 'random_state..."
9,DecisionTreeClassifier,StdScale + PCA,0.832783,0.736289,0.795455,1260,351,253,324,{'random_state': 1}
8,MLPClassifier,MinMaxScale,0.832346,0.788848,0.886995,1405,321,283,179,"{'hidden_layer_sizes': (40, 20), 'random_state..."
11,MLPClassifier,StdScale + PCA,0.826981,0.77011,0.863005,1367,318,286,217,"{'hidden_layer_sizes': (40, 20), 'random_state..."
4,DecisionTreeClassifier,MinMaxScale,0.819767,0.728519,0.801136,1269,325,279,315,{'random_state': 1}
0,DecisionTreeClassifier,,0.819417,0.727148,0.799242,1266,325,279,318,{'random_state': 1}
2,MLPClassifier,,0.819331,0.668647,0.695707,1102,361,243,482,"{'hidden_layer_sizes': (40, 20), 'random_state..."
3,DecisionTreeClassifier,StdScale,0.818182,0.727148,0.801136,1269,322,282,315,{'random_state': 1}
5,KNeighborsClassifier,StdScale,0.810602,0.752742,0.859217,1361,286,318,223,{'n_neighbors': 5}
10,KNeighborsClassifier,StdScale + PCA,0.810396,0.756399,0.866162,1372,283,321,212,{'n_neighbors': 5}


# TEXT A REVOIR

## 5) 