420-A52-SF - Algorithmes d'apprentissage supervisé - Hiver 2020 - Spécialisation technique en Intelligence Artificielle<br/>
MIT License - Copyright (c) 2020 Mikaël Swawola
<br/>
![Travaux Pratiques - Bagging, forêts aléatoires et boosting](static/16-tp-banner.png)
<br/>
**Objectif:** cette séance de travaux pratiques a pour objectif la mise en œuvre des techniques suivantes:
* Bagging
* Forêts aléatoires
* Gradient Boosting
* AdaBoost
* XGBoost
* LightGBM

Le jeu de données utilisé sera **Heart**

In [None]:
# Load the autoreload extension and switch it to mode 2 so imported modules
# are reloaded before code is executed.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## Exercice 1 - Chargement et préparation des données

In [None]:
import pandas as pd

In [None]:
# Read the Heart dataset, using the first CSV column as the index, and keep
# only the rows that have no missing values.
HRT = pd.read_csv('../../data/Heart.csv', index_col=[0]).dropna()

In [None]:
# Binary target: 1 where AHD equals "Yes", 0 otherwise.
y = HRT['AHD'].eq("Yes").astype(int)

# One-hot encode the two categorical columns, dropping the first level of each.
HRT_onehot = pd.get_dummies(
    HRT,
    columns=['ChestPain', 'Thal'],
    prefix=['cp', 'thal'],
    drop_first=True,
)

# Feature matrix: every encoded column except the target.
X = HRT_onehot.drop(columns=['AHD'])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Use 70% of the rows for training and the remainder for validation;
# the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=2020,
    train_size=0.7,
)

## Exercice 2 - Arbres de classification (avec élagage)

In [None]:
from sklearn.tree import DecisionTreeClassifier

#### Définition du modèle et entraînement

In [None]:
# Single decision tree with cost-complexity pruning (ccp_alpha=0.05).
clf_tree = DecisionTreeClassifier(
    ccp_alpha=0.05,
    random_state=2020,
)
clf_tree.fit(X=X_train, y=y_train)

#### Prédictions (train et val)

In [None]:
# Positive-class probability (column 1 of predict_proba) for each split.
y_train_pred_proba_tree, y_val_pred_proba_tree = (
    clf_tree.predict_proba(features)[:, 1] for features in (X_train, X_val)
)

#### Aire sous la courbe

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# Area under the ROC curve of the decision tree on both splits.
print(f'AUC Train = {roc_auc_score(y_true=y_train, y_score=y_train_pred_proba_tree)}')
print(f'AUC Val = {roc_auc_score(y_true=y_val, y_score=y_val_pred_proba_tree)}')

## Exercice 3 - Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier

[class sklearn.ensemble.BaggingClassifier(base_estimator=None, n_estimators=10, max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, oob_score=False, warm_start=False, n_jobs=None, random_state=None, verbose=0)](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html)

#### Définition du modèle et entraînement

In [None]:
# Bagging ensemble of 1000 decision trees, each pruned with ccp_alpha=0.01.
# The base estimator is passed positionally on purpose: its keyword was
# renamed from `base_estimator` to `estimator` in scikit-learn 1.2 and the old
# name was removed in 1.4, whereas the first positional argument is the base
# estimator in every version.
base_tree = DecisionTreeClassifier(random_state=2020, ccp_alpha=0.01)
clf_bag = BaggingClassifier(base_tree, n_estimators=1000, random_state=2020)
clf_bag.fit(X_train, y_train)

#### Prédictions (train et val)

In [None]:
# Positive-class probability (column 1 of predict_proba) for each split.
y_train_pred_proba_bag, y_val_pred_proba_bag = (
    clf_bag.predict_proba(features)[:, 1] for features in (X_train, X_val)
)

#### Aire sous la courbe

In [None]:
# Area under the ROC curve of the bagging ensemble on both splits.
print(f'AUC Train = {roc_auc_score(y_true=y_train, y_score=y_train_pred_proba_bag)}')
print(f'AUC Val = {roc_auc_score(y_true=y_val, y_score=y_val_pred_proba_bag)}')

## Exercice 4 - Forêts aléatoires

In [None]:
from sklearn.ensemble import RandomForestClassifier

[class sklearn.ensemble.RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None)](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)

#### Définition du modèle et entraînement

In [None]:
# Random forest with the default number of trees and ccp_alpha=0.001 pruning.
clf_rf = RandomForestClassifier(
    ccp_alpha=0.001,
    random_state=2020,
)
clf_rf.fit(X=X_train, y=y_train)

#### Prédictions (train et val)

In [None]:
# Positive-class probability (column 1 of predict_proba) for each split.
y_train_pred_proba_rf, y_val_pred_proba_rf = (
    clf_rf.predict_proba(features)[:, 1] for features in (X_train, X_val)
)

#### Aire sous la courbe

In [None]:
# Area under the ROC curve of the random forest on both splits.
print(f'AUC Train = {roc_auc_score(y_true=y_train, y_score=y_train_pred_proba_rf)}')
print(f'AUC Val = {roc_auc_score(y_true=y_val, y_score=y_val_pred_proba_rf)}')

## Exercice 5 - AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

[class sklearn.ensemble.AdaBoostClassifier(base_estimator=None, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=None)](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html)

#### Définition du modèle et entraînement

In [None]:
# AdaBoost with 100 boosting rounds on the library's default base estimator.
clf_ada = AdaBoostClassifier(
    random_state=2020,
    n_estimators=100,
)
clf_ada.fit(X=X_train, y=y_train)

#### Prédictions (train et val)

In [None]:
# Positive-class probability (column 1 of predict_proba) for each split.
y_train_pred_proba_ada, y_val_pred_proba_ada = (
    clf_ada.predict_proba(features)[:, 1] for features in (X_train, X_val)
)

#### Aire sous la courbe

In [None]:
# Area under the ROC curve of AdaBoost on both splits.
print(f'AUC Train = {roc_auc_score(y_true=y_train, y_score=y_train_pred_proba_ada)}')
print(f'AUC Val = {roc_auc_score(y_true=y_val, y_score=y_val_pred_proba_ada)}')

## Exercice 6 - Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

[class sklearn.ensemble.GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0, min_impurity_split=None, init=None, random_state=None, max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False, presort='deprecated', validation_fraction=0.1, n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0)](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html)

#### Définition du modèle et entraînement

In [None]:
# Gradient boosting: 100 depth-3 trees with a learning rate of 1.0.
clf_gb = GradientBoostingClassifier(
    learning_rate=1.0,
    n_estimators=100,
    max_depth=3,
    random_state=2020,
)
clf_gb.fit(X=X_train, y=y_train)

#### Prédictions (train et val)

In [None]:
# Positive-class probability (column 1 of predict_proba) for each split.
y_train_pred_proba_gb, y_val_pred_proba_gb = (
    clf_gb.predict_proba(features)[:, 1] for features in (X_train, X_val)
)

#### Aire sous la courbe

In [None]:
# Area under the ROC curve of gradient boosting on both splits.
print(f'AUC Train = {roc_auc_score(y_true=y_train, y_score=y_train_pred_proba_gb)}')
print(f'AUC Val = {roc_auc_score(y_true=y_val, y_score=y_val_pred_proba_gb)}')

## Exercice 7 - XGBoost

In [None]:
#!pip install xgboost
import xgboost as xgb

[XGBoost Scikit-learn API](https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn)

#### Définition du modèle et entraînement

In [None]:
# XGBoost classifier with a binary logistic objective, built through the
# scikit-learn style wrapper.
clf_xgb = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=1.1,
    colsample_bytree=0.3,
    reg_alpha=0.1,
    objective='binary:logistic',
)
clf_xgb.fit(X_train, y_train)

#### Prédictions (train et val)

In [None]:
# Positive-class probability (column 1 of predict_proba) for each split.
y_train_pred_proba_xgb, y_val_pred_proba_xgb = (
    clf_xgb.predict_proba(features)[:, 1] for features in (X_train, X_val)
)

#### Aire sous la courbe

In [None]:
# Area under the ROC curve of XGBoost on both splits.
print(f'AUC Train = {roc_auc_score(y_true=y_train, y_score=y_train_pred_proba_xgb)}')
print(f'AUC Val = {roc_auc_score(y_true=y_val, y_score=y_val_pred_proba_xgb)}')

## Exercice 8 - LightGBM

In [None]:
!pip install lightgbm
import lightgbm as lgb

#### Définition du modèle et entraînement

In [None]:
# LightGBM classifier: 200 trees of at most 6 leaves, learning rate 0.1.
clf_lgbm = lgb.LGBMClassifier(
    n_estimators=200,
    num_leaves=6,
    learning_rate=0.1,
)
clf_lgbm.fit(X_train, y_train)

#### Prédictions (train et val)

In [None]:
# Positive-class probability (column 1 of predict_proba) for each split.
y_train_pred_proba_lgbm, y_val_pred_proba_lgbm = (
    clf_lgbm.predict_proba(features)[:, 1] for features in (X_train, X_val)
)

#### Aire sous la courbe

In [None]:
# Area under the ROC curve of LightGBM on both splits.
print(f'AUC Train = {roc_auc_score(y_true=y_train, y_score=y_train_pred_proba_lgbm)}')
print(f'AUC Val = {roc_auc_score(y_true=y_val, y_score=y_val_pred_proba_lgbm)}')

## Exercice 9 - Évaluation des modèles

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

In [None]:
# Compare all models on the validation set: one ROC curve per classifier,
# drawn over the dashed chance diagonal. Each curve is computed right before
# it is plotted so a model's two steps sit together.
fig = plt.figure(1, figsize=(12, 12))
plt.plot([0, 1], [0, 1], 'k--')

fpr_tree, tpr_tree, thresholds = roc_curve(y_true=y_val, y_score=y_val_pred_proba_tree)
plt.plot(fpr_tree, tpr_tree, label='Decision Tree')

fpr_bag, tpr_bag, thresholds = roc_curve(y_true=y_val, y_score=y_val_pred_proba_bag)
plt.plot(fpr_bag, tpr_bag, label='Bagging')

fpr_rf, tpr_rf, thresholds = roc_curve(y_true=y_val, y_score=y_val_pred_proba_rf)
plt.plot(fpr_rf, tpr_rf, label='Random Forest')

fpr_ada, tpr_ada, thresholds = roc_curve(y_true=y_val, y_score=y_val_pred_proba_ada)
plt.plot(fpr_ada, tpr_ada, label='AdaBoost')

fpr_gb, tpr_gb, thresholds = roc_curve(y_true=y_val, y_score=y_val_pred_proba_gb)
plt.plot(fpr_gb, tpr_gb, label='Gradient Boosting')

fpr_xgb, tpr_xgb, thresholds = roc_curve(y_true=y_val, y_score=y_val_pred_proba_xgb)
plt.plot(fpr_xgb, tpr_xgb, label='XGBoost')

fpr_lgbm, tpr_lgbm, thresholds = roc_curve(y_true=y_val, y_score=y_val_pred_proba_lgbm)
plt.plot(fpr_lgbm, tpr_lgbm, label='LightGBM')

# Axis labels, title and legend for the combined figure.
plt.title('ROC curve')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend()

## Exercice 10 - Importance des variables explicatives

In [None]:
# Feature importance scores exposed by the fitted XGBoost classifier.
imp = clf_xgb.feature_importances_

In [None]:
# Horizontal bar chart of the XGBoost importances, one bar per feature column.
fig = plt.figure(num=2, figsize=(12, 12))
plt.barh(y=X.columns, width=imp)