420-A52-SF - Algorithmes d'apprentissage supervisé - Hiver 2020 - Spécialisation technique en Intelligence Artificielle<br/>
MIT License - Copyright (c) 2020 Mikaël Swawola
<br/>
![Travaux Pratiques - Arbres de décision](static/15-tp-banner.png)
<br/>
**Objectif :** cette séance de travaux pratiques a pour objectif la mise en œuvre des arbres de régression et de classification. Les jeux de données utilisés seront **Hitters** (Baseball) et **Heart**.

In [None]:
# (Re)load the autoreload extension so that edited modules are picked up without restarting the kernel.
%reload_ext autoreload
# Mode 2: reload modules before each cell execution.
%autoreload 2
# Render matplotlib figures directly in the notebook output.
%matplotlib inline

## Exercice 1 - Arbres de régression

### 1-1 Chargement du jeu de données Hitters et exploration sommaire

In [None]:
import pandas as pd

In [None]:
# Load the Hitters data set; the first CSV column is used as the row index.
HIT = pd.read_csv(
    '../../data/Hitters.csv',
    index_col=[0],
)

In [None]:
# Quick look at the first five rows.
HIT.head(n=5)

### 1-2 Suppression des observations ayant des valeurs manquantes

In [None]:
# Keep only the rows without any missing value.
HIT = HIT.dropna(axis=0, how='any')

### 1-3 Préparation de la structure de données

In [None]:
# One-hot encode the three categorical columns (first level dropped),
# then separate the predictors from the Salary target.
HIT_ind = pd.get_dummies(
    HIT,
    columns=['League', 'Division', 'NewLeague'],
    prefix=['lg', 'div', 'nlg'],
    drop_first=True,
)
X = HIT_ind.drop(columns=['Salary'])
y = HIT['Salary']

### 1-4 Validation croisée Holdout 

In [None]:
from sklearn.model_selection import train_test_split

# Holdout split: 70 % of the observations for training, the rest for validation.
# The fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=2020,
)

### 1-5 Mise en oeuvre de l'arbre de régression

In [None]:
from sklearn.tree import DecisionTreeRegressor

[class sklearn.tree.DecisionTreeRegressor(*, criterion='squared_error', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, ccp_alpha=0.0, monotonic_cst=None)](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html)

In [None]:
# Unpruned regression tree with default hyper-parameters and a fixed seed.
reg_tree = DecisionTreeRegressor(random_state=2020)
reg_tree.fit(X=X_train, y=y_train)

### 1-6 Évaluation du modèle

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Training RMSE of the unpruned tree. The square root is taken explicitly:
# the `squared` argument of mean_squared_error was deprecated in
# scikit-learn 1.4, so `squared=False` no longer works on recent releases.
mean_squared_error(y_train, reg_tree.predict(X_train)) ** 0.5

In [None]:
# Validation RMSE of the unpruned tree. The square root is taken explicitly:
# the `squared` argument of mean_squared_error was deprecated in
# scikit-learn 1.4, so `squared=False` no longer works on recent releases.
mean_squared_error(y_val, reg_tree.predict(X_val)) ** 0.5

### 1-7 Visualisation de l'arbre

In [None]:
from sklearn import tree

# Draw the fitted regression tree. feature_names is passed as a plain list
# because some scikit-learn releases reject a pandas Index for this argument.
# The trailing semicolon keeps the list of annotation objects out of the output.
tree.plot_tree(reg_tree, filled=True, rounded=True, feature_names=list(X.columns));

### 1-8 Application de l'élagage et ré-évaluation du modèle

In [None]:
# Refit the regression tree with cost-complexity pruning (ccp_alpha=20000),
# keeping the same seed as the unpruned model.
reg_tree = DecisionTreeRegressor(
    random_state=2020,
    ccp_alpha=20000,
)
reg_tree.fit(X=X_train, y=y_train)

In [None]:
# Training RMSE of the pruned tree. The square root is taken explicitly:
# the `squared` argument of mean_squared_error was deprecated in
# scikit-learn 1.4, so `squared=False` no longer works on recent releases.
mean_squared_error(y_train, reg_tree.predict(X_train)) ** 0.5

In [None]:
# Validation RMSE of the pruned tree. The square root is taken explicitly:
# the `squared` argument of mean_squared_error was deprecated in
# scikit-learn 1.4, so `squared=False` no longer works on recent releases.
mean_squared_error(y_val, reg_tree.predict(X_val)) ** 0.5

### 1-9 Visualisation de l'arbre élagué

In [None]:
# Draw the pruned regression tree. feature_names is passed as a plain list
# because some scikit-learn releases reject a pandas Index for this argument.
# The trailing semicolon keeps the list of annotation objects out of the output.
tree.plot_tree(reg_tree, filled=True, rounded=True, feature_names=list(X.columns));

## Exercice 2 - Arbres de classification

### 2-1 Chargement et préparation du jeu de données Heart

In [None]:
# Load the Heart data set (first CSV column as row index) and drop every
# observation that contains a missing value.
HRT = pd.read_csv(
    '../../data/Heart.csv',
    index_col=[0],
).dropna()

### 2-2 Préparation de la structure de données

In [None]:
# One-hot encode the two categorical predictors (first level dropped),
# then build the predictor matrix and a 0/1 target (1 when AHD == "Yes").
HRT_ind = pd.get_dummies(
    HRT,
    columns=['ChestPain', 'Thal'],
    prefix=['cp', 'thal'],
    drop_first=True,
)
X = HRT_ind.drop(columns=['AHD'])
y = HRT['AHD'].eq("Yes").astype(int)

### 2-3 Validation croisée Holdout 

In [None]:
from sklearn.model_selection import train_test_split

# Holdout split: 70 % of the observations for training, the rest for validation.
# The fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=2020,
)

### 2-4 Mise en oeuvre de l'arbre de classification

In [None]:
from sklearn.tree import DecisionTreeClassifier

[class sklearn.tree.DecisionTreeClassifier(*, criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0, monotonic_cst=None)](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html)

In [None]:
# Unpruned classification tree with default hyper-parameters and a fixed seed.
clf_tree = DecisionTreeClassifier(random_state=2020)
clf_tree.fit(X=X_train, y=y_train)

### 2-5 Évaluation du modèle

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, roc_auc_score

In [None]:
# Predicted probability of the positive class (column 1 of predict_proba)
# on the training and validation sets.
y_train_pred_proba_tree = clf_tree.predict_proba(X_train)[:, 1]
y_val_pred_proba_tree = clf_tree.predict_proba(X_val)[:, 1]

In [None]:
# ROC points for the validation set, then for the training set.
fpr_tree_val, tpr_tree_val, thresholds = roc_curve(y_val, y_val_pred_proba_tree)
fpr_tree_tr, tpr_tree_tr, thresholds = roc_curve(y_train, y_train_pred_proba_tree)

# Square figure; all drawing goes through its Axes explicitly.
fig = plt.figure(4, figsize=(6, 6))
ax = fig.gca()

# Dashed diagonal = chance level, used as a visual reference.
ax.plot([0, 1], [0, 1], 'k--')
ax.plot(fpr_tree_val, tpr_tree_val, label='Validation')
ax.plot(fpr_tree_tr, tpr_tree_tr, label='Train')
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.set_title('ROC curve')
ax.legend()

In [None]:
# Area under the ROC curve on the training set.
roc_auc_score(y_true=y_train, y_score=y_train_pred_proba_tree)

In [None]:
# Area under the ROC curve on the validation set.
roc_auc_score(y_true=y_val, y_score=y_val_pred_proba_tree)

### 2-6 Application de l'élagage et ré-évaluation du modèle

In [None]:
# Refit the classification tree with cost-complexity pruning (ccp_alpha=0.05),
# keeping the same seed as the unpruned model.
clf_tree = DecisionTreeClassifier(
    random_state=2020,
    ccp_alpha=0.05,
)
clf_tree.fit(X=X_train, y=y_train)

In [None]:
# Predicted probability of the positive class (column 1 of predict_proba)
# from the pruned tree, on the training and validation sets.
y_train_pred_proba_tree = clf_tree.predict_proba(X_train)[:, 1]
y_val_pred_proba_tree = clf_tree.predict_proba(X_val)[:, 1]

In [None]:
# ROC points for the validation set, then for the training set.
fpr_tree_val, tpr_tree_val, thresholds = roc_curve(y_val, y_val_pred_proba_tree)
fpr_tree_tr, tpr_tree_tr, thresholds = roc_curve(y_train, y_train_pred_proba_tree)

# Square figure; all drawing goes through its Axes explicitly.
fig = plt.figure(4, figsize=(6, 6))
ax = fig.gca()

# Dashed diagonal = chance level, used as a visual reference.
ax.plot([0, 1], [0, 1], 'k--')
ax.plot(fpr_tree_val, tpr_tree_val, label='Validation')
ax.plot(fpr_tree_tr, tpr_tree_tr, label='Train')
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.set_title('ROC curve')
ax.legend()

In [None]:
# Area under the ROC curve on the training set.
roc_auc_score(y_true=y_train, y_score=y_train_pred_proba_tree)

In [None]:
# Area under the ROC curve on the validation set.
roc_auc_score(y_true=y_val, y_score=y_val_pred_proba_tree)

### 2-7 Visualisation de l'arbre

In [None]:
from sklearn import tree

# Draw the fitted classification tree. feature_names is passed as a plain list
# because some scikit-learn releases reject a pandas Index for this argument.
# class_names follows the ascending order of the encoded target: 0 -> 'Normal', 1 -> 'AHD'.
# The trailing semicolon keeps the list of annotation objects out of the output.
tree.plot_tree(clf_tree, filled=True, rounded=True, feature_names=list(X.columns), class_names=['Normal','AHD']);