420-A52-SF - Algorithmes d'apprentissage supervisé - Hiver 2020 - Spécialisation technique en Intelligence Artificielle<br/>
MIT License - Copyright (c) 2020 Mikaël Swawola
<br/>
![Travaux Pratiques - Ensembles](static/19-tp-banner.png)
<br/>
**Objectif :** cette séance de travaux pratiques a pour objectif la mise en œuvre de différentes techniques d'ensembles. Le jeu de données utilisé sera **Titanic**.

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

## Exercice 1 - Chargement et préparation des données

In [None]:
import pandas as pd

In [None]:
# Load the Titanic training set, using the passenger identifier as row index.
titanic = pd.read_csv(
    '../../data/titanic_train.csv',
    index_col='PassengerId',
)

In [None]:
# Preview the first rows of the loaded table.
titanic.head()

In [None]:
# Number of passengers whose age is missing.
titanic['Age'].isna().sum()

In [None]:
# Age: keep a boolean flag marking the rows whose age was missing, then fill
# the missing ages with the mean age. Both new columns are computed from the
# original 'Age' column before the frame is rebuilt.
titanic = titanic.assign(
    imp_age=titanic['Age'].isna(),
    Age=titanic['Age'].fillna(titanic['Age'].mean()),
)

In [None]:
# Embarked: one-hot encode the port of embarkation with the 'emb' prefix.
# The first level is dropped and missing values get their own indicator column.
titanic = pd.get_dummies(
    titanic,
    columns=['Embarked'],
    prefix=['emb'],
    drop_first=True,
    dummy_na=True,
)

In [None]:
# Sex: binary encoding, 1 for 'female' and 0 for any other value.
titanic['Sex'] = titanic['Sex'].eq('female').astype(int)

In [None]:
# List the columns available after the preprocessing steps above.
titanic.columns

In [None]:
# Feature matrix: the numeric / encoded columns prepared above.
X_train = titanic.loc[
    :,
    ['Age', 'Sex', 'Pclass', 'SibSp', 'Parch', 'Fare', 'emb_Q', 'emb_S', 'imp_age'],
]
# Target: the 'Survived' column.
y_train = titanic.loc[:, 'Survived']

#### Vérification de la proportion des classes positives (Survived) et négatives (Died)

In [None]:
# Share of positive labels in the target (sum of the labels over the row count).
y_train.sum() / y_train.shape[0]

#### Importation de quelques librairies

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
# `loguniform` is provided by SciPy; the `sklearn.utils.fixes` shim is
# deprecated and absent from recent scikit-learn releases, so it is only kept
# as a fallback for environments whose SciPy does not expose `loguniform`.
try:
    from scipy.stats import loguniform
except ImportError:
    from sklearn.utils.fixes import loguniform
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import resample
from sklearn.model_selection import train_test_split

#### Séparation des données

In [None]:
# Hold out 30% of the rows for validation (70% kept for training), with a
# fixed seed. Note that X_train / y_train are rebound to the training part.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    train_size=0.7,
    random_state=2020,
)

## Exercice 2 - Régression logistique

In [None]:
from sklearn.linear_model import LogisticRegression

[class sklearn.linear_model.LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=None, solver='lbfgs', max_iter=100, multi_class='auto', verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)

In [None]:
# Hyper-parameter grid: regularisation parameter C and elastic-net mixing ratio.
parameters = {'C': [1, 2, 3],
              'l1_ratio': [0, 0.5, 1]}

# Logistic regression with an elastic-net penalty, fitted with the 'saga' solver.
clf_logreg = LogisticRegression(penalty='elasticnet',
                                max_iter=10000,
                                solver='saga',
                                random_state=2020,
                                n_jobs=-1)

# Grid search scored by ROC AUC with 5-fold cross-validation.
clf_logreg_grid = GridSearchCV(clf_logreg, parameters, cv=5, scoring="roc_auc", verbose=1, n_jobs=-1)

# Fit on a stratified subsample (half) of the training set.
# Rows are drawn WITHOUT replacement: the `resample` default is a bootstrap,
# which duplicates rows, and a duplicated passenger could then sit on both
# sides of a cross-validation fold and inflate the score.
# The seed makes the subsample reproducible, like the other seeded steps.
ratio = 0.5
Xs, ys = resample(X_train, y_train,
                  n_samples=int(ratio * len(X_train)),
                  replace=False,
                  stratify=y_train,
                  random_state=2020)
clf_logreg_grid.fit(Xs, ys)

In [None]:
# Best cross-validated ROC AUC found by the logistic regression grid search.
clf_logreg_grid.best_score_

In [None]:
# Score board keyed by model name; it starts with the logistic regression's
# best cross-validation score.
history = {'LogReg': {'CV': clf_logreg_grid.best_score_}}
history['LogReg']['CV']

## Exercice 3 - K plus proches voisins

In [None]:
from sklearn.neighbors import KNeighborsClassifier

[class sklearn.neighbors.KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None, **kwargs)](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)

In [None]:
# Hyper-parameter grid: neighbourhood size, Minkowski power (1 or 2) and
# neighbour weighting scheme.
parameters = {'n_neighbors': [1, 2, 4, 8, 16, 32, 64, 128],
              'p': [1, 2],
              'weights': ['uniform', 'distance']}

# K-nearest-neighbours classifier.
clf_knn = KNeighborsClassifier(n_jobs=-1)

# Grid search scored by ROC AUC with 5-fold cross-validation.
clf_knn_grid = GridSearchCV(clf_knn, parameters, cv=5, scoring="roc_auc", verbose=1, n_jobs=-1)

# Fit on a stratified subsample (half) of the training set.
# Rows are drawn WITHOUT replacement: the `resample` default is a bootstrap,
# which duplicates rows. A duplicated passenger could then appear in both the
# training and validation part of a fold, where it is its own nearest
# neighbour at distance zero, which inflates the cross-validated score.
# The seed makes the subsample reproducible.
ratio = 0.5
Xs, ys = resample(X_train, y_train,
                  n_samples=int(ratio * len(X_train)),
                  replace=False,
                  stratify=y_train,
                  random_state=2020)
clf_knn_grid.fit(Xs, ys)

In [None]:
# Best cross-validated ROC AUC found by the k-NN grid search.
clf_knn_grid.best_score_

In [None]:
# Record the k-NN search's best cross-validation score on the score board.
history['KNN'] = dict(CV=clf_knn_grid.best_score_)
history['KNN']['CV']

## Exercice 4 - Arbres de décision

In [None]:
from sklearn.tree import DecisionTreeClassifier

[class sklearn.tree.DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, class_weight=None, presort='deprecated', ccp_alpha=0.0)](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html)

In [None]:
# Parameter distributions sampled by the random search: split criterion,
# cost-complexity pruning strength (log-uniform) and maximum depth.
distributions = dict(
    criterion=['gini', 'entropy'],
    ccp_alpha=loguniform(1e-3, 1e3),
    max_depth=randint(2, 128))

# Base estimator.
clf_tree = DecisionTreeClassifier(random_state=2020)

# Random search scored by ROC AUC with 5-fold cross-validation.
clf_tree_rnd = RandomizedSearchCV(clf_tree, distributions, n_iter=10000, cv=5, scoring="roc_auc", verbose=1, n_jobs=-1, random_state=2020)

# Fit on a stratified subsample (half) of the training set.
# Rows are drawn WITHOUT replacement: the `resample` default is a bootstrap,
# which duplicates rows, and a duplicated passenger could then sit on both
# sides of a cross-validation fold and inflate the score.
# The seed makes the subsample reproducible, like the seeded search itself.
ratio = 0.5
Xs, ys = resample(X_train, y_train,
                  n_samples=int(ratio * len(X_train)),
                  replace=False,
                  stratify=y_train,
                  random_state=2020)
clf_tree_rnd.fit(Xs, ys)

In [None]:
# Best cross-validated ROC AUC found by the decision tree random search.
clf_tree_rnd.best_score_

In [None]:
# Record the decision tree search's best cross-validation score.
history['Tree'] = dict(CV=clf_tree_rnd.best_score_)
history['Tree']['CV']

## Exercice 5 - SVM

In [None]:
from sklearn.svm import SVC

[class sklearn.svm.SVC(C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', break_ties=False, random_state=None)](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html)

In [None]:
# Hyper-parameter grid: kernel type only.
parameters = {'kernel': ['linear', 'rbf']}

# Support vector classifier. probability=True is required because the later
# cells call predict_proba on the fitted model (soft voting and stacking).
clf_svc = SVC(probability=True,
              random_state=2020)

# Grid search scored by ROC AUC with 5-fold cross-validation.
clf_svc_grid = GridSearchCV(clf_svc, parameters, cv=5, scoring="roc_auc", verbose=1, n_jobs=-1)

# Fit on a stratified subsample (half) of the training set.
# Rows are drawn WITHOUT replacement: the `resample` default is a bootstrap,
# which duplicates rows, and a duplicated passenger could then sit on both
# sides of a cross-validation fold and inflate the score.
# The seed makes the subsample reproducible, like the seeded classifier.
ratio = 0.5
Xs, ys = resample(X_train, y_train,
                  n_samples=int(ratio * len(X_train)),
                  replace=False,
                  stratify=y_train,
                  random_state=2020)
clf_svc_grid.fit(Xs, ys)

In [None]:
# Best cross-validated ROC AUC found by the SVM grid search.
clf_svc_grid.best_score_

In [None]:
# Record the SVM search's best cross-validation score on the score board.
history['SVM'] = dict(CV=clf_svc_grid.best_score_)
history['SVM']['CV']

In [None]:
# Show the cross-validation scores collected so far for every model.
history

## Exercice 6 - VotingClassifier

In [None]:
from sklearn.ensemble import VotingClassifier

[class sklearn.ensemble.VotingClassifier(estimators, voting='hard', weights=None, n_jobs=None, flatten_transform=True)](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html#sklearn.ensemble.VotingClassifier)

In [None]:
# Pair each short label with the best estimator selected by its search.
estimators = [
    (label, search.best_estimator_)
    for label, search in (
        ('lr', clf_logreg_grid),
        ('knn', clf_knn_grid),
        ('tree', clf_tree_rnd),
        ('svc', clf_svc_grid),
    )
]

# Soft voting combines the predicted class probabilities of the base models.
clf_vote = VotingClassifier(
    estimators=estimators,
    voting='soft',
    n_jobs=-1,
)

# Fit the ensemble on the full training split.
clf_vote.fit(X_train, y_train)

In [None]:
# 5-fold cross-validated ROC AUC of the voting ensemble on the training split;
# the cell displays the mean over the folds.
cv_score = cross_val_score(
    clf_vote,
    X_train,
    y_train,
    cv=5,
    scoring="roc_auc",
    verbose=1,
    n_jobs=-1,
)
cv_score.mean()

## Exercice 7 - Stacking

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression

In [None]:
# Meta-features for stacking: one column per base model, holding that model's
# predicted probability of the positive class on the validation set.
X_stack = np.column_stack([
    search.best_estimator_.predict_proba(X_val)[:, 1]
    for search in (clf_logreg_grid, clf_knn_grid, clf_tree_rnd, clf_svc_grid)
])

In [None]:
# Shape of the meta-feature matrix built from the base models' predictions.
X_stack.shape

In [None]:
# Hyper-parameter grid for the meta-learner: regularisation parameter C and
# elastic-net mixing ratio.
parameters = dict(
    C=[1, 2, 3],
    l1_ratio=[0, 0.5, 1],
)

# Meta-learner: elastic-net logistic regression fitted with the 'saga' solver.
clf_meta = LogisticRegression(
    penalty='elasticnet',
    max_iter=10000,
    solver='saga',
    random_state=2020,
    n_jobs=-1,
)

# Grid search scored by ROC AUC with 5-fold cross-validation.
clf_meta_grid = GridSearchCV(
    clf_meta,
    parameters,
    cv=5,
    scoring="roc_auc",
    verbose=1,
    n_jobs=-1,
)

# Train the meta-learner on the base models' validation-set predictions.
clf_meta_grid.fit(X_stack, y_val)

In [None]:
# Best cross-validated ROC AUC found by the meta-learner grid search.
clf_meta_grid.best_score_