420-A52-SF - Algorithmes d'apprentissage supervisé - Hiver 2020 - Spécialisation technique en Intelligence Artificielle<br/>
MIT License - Copyright (c) 2020 Mikaël Swawola
<br/>
![Travaux Pratiques - SVM](static/18-tp-banner.png)
<br/>
**Objectif:** cette séance de travaux pratiques a pour objectif la mise en œuvre des machines à vecteurs de support (SVM). La recherche des meilleurs hyperparamètres sera du type recherche sur grille et le jeu de données utilisé sera **Heart**.

In [None]:
# Load (or reload) IPython's autoreload extension.
%reload_ext autoreload
# Mode 2: re-import modules before each cell runs, so edits to imported
# source files are picked up without restarting the kernel.
%autoreload 2
# Render matplotlib figures inline, below the cell that produces them.
%matplotlib inline

## Exercice 1 - Chargement et préparation des données

In [None]:
import pandas as pd

In [None]:
# Read the Heart dataset, using the first CSV column as the row index, and
# discard every row that contains at least one missing value.
HRT = pd.read_csv('../../data/Heart.csv', index_col=[0]).dropna()

In [None]:
# One-hot encode the two categorical columns; drop_first=True omits the first
# level of each so the resulting dummy columns are not redundant.
HRT_onehot = pd.get_dummies(
    HRT,
    columns=['ChestPain', 'Thal'],
    prefix=['cp', 'thal'],
    drop_first=True,
)

# Features: every column except the AHD target.
X = HRT_onehot.drop(columns=['AHD'])

# Target: 1 when AHD is "Yes", 0 otherwise.
y = HRT['AHD'].eq("Yes").astype(int)

In [None]:
# Share of positive samples (y == 1); per the original author's note, the
# classes are balanced.
y.mean()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Use 70% of the rows for training and the remainder for validation; the fixed
# seed keeps the split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=2020,
)

## Exercice 2 - SVM

In [None]:
from sklearn.svm import SVC

[class sklearn.svm.SVC(C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', break_ties=False, random_state=None)](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html)

#### Définition du modèle et entraînement

In [None]:
# Baseline SVM: every hyperparameter left at its scikit-learn default, with a
# fixed seed for reproducibility.
clf_svc = SVC(random_state=2020)
clf_svc.fit(X=X_train, y=y_train)

#### Prédictions (train et val)

In [None]:
# Predict class labels on the training and validation splits with the
# baseline model.
y_train_pred_svc, y_val_pred_svc = (
    clf_svc.predict(features) for features in (X_train, X_val)
)

#### Accuracy et score F1

In [None]:
from sklearn.metrics import accuracy_score, f1_score

In [None]:
# F1 score of the baseline model on each split, one line per split.
print(
    f'F1 Train = {f1_score(y_train, y_train_pred_svc)}',
    f'F1 Val = {f1_score(y_val, y_val_pred_svc)}',
    sep='\n',
)

In [None]:
# Accuracy of the baseline model on each split, one line per split.
print(
    f'Accuracy Train = {accuracy_score(y_train, y_train_pred_svc)}',
    f'Accuracy Val = {accuracy_score(y_val, y_val_pred_svc)}',
    sep='\n',
)

## Exercice 3 - GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV

#### Définition du modèle et entraînement

In [None]:
# Number of model fits performed by the cross-validated grid search below:
# 3 values of C x 3 kernels x 2 gamma settings x 5 cross-validation folds.
3*3*2*5

In [None]:
# Hyperparameter grid: 3 values of C x 3 kernels x 2 gamma settings.
# NOTE(review): per the scikit-learn SVC documentation, gamma is a kernel
# coefficient for 'rbf', 'poly' and 'sigmoid' only, so the two 'linear'
# candidates at a given C should be equivalent -- confirm before pruning.
parameters = {'C': [0.01, 0.1, 1],
              'kernel': ['linear', 'rbf', 'sigmoid'],
              'gamma': ['scale', 'auto']}

# Fresh, unfitted estimator used as the template for the search. This rebinds
# the name clf_svc.
clf_svc = SVC(random_state=2020)

# Grid search with 5-fold cross-validation, selecting on the F1 score.
# n_jobs=-1 uses all available processors instead of a hard-coded worker
# count, so the cell adapts to the machine it runs on.
clf_svc_grid = GridSearchCV(clf_svc, parameters, cv=5, scoring="f1", verbose=1, n_jobs=-1)
clf_svc_grid.fit(X_train, y_train)

In [None]:
# Report the winning hyperparameter combination and its mean cross-validated
# F1 score.
print(
    f'Meilleurs paramètres: {clf_svc_grid.best_params_}',
    f'Meilleur score (F1 mean CV): {clf_svc_grid.best_score_}',
    sep='\n',
)

#### Prédictions (train et val)

In [None]:
# Predict class labels on both splits through the fitted grid-search object.
y_train_pred_best_svc, y_val_pred_best_svc = (
    clf_svc_grid.predict(features) for features in (X_train, X_val)
)

#### Accuracy et score F1

In [None]:
# F1 score of the grid-search model on each split, one line per split.
print(
    f'F1 Train = {f1_score(y_train, y_train_pred_best_svc)}',
    f'F1 Val = {f1_score(y_val, y_val_pred_best_svc)}',
    sep='\n',
)

In [None]:
# Accuracy of the grid-search model on each split, one line per split.
print(
    f'Accuracy Train = {accuracy_score(y_train, y_train_pred_best_svc)}',
    f'Accuracy Val = {accuracy_score(y_val, y_val_pred_best_svc)}',
    sep='\n',
)

## Exercice 4 - Probability=True

#### Définition du modèle et entraînement

In [None]:
# Refit the configuration selected by the grid search, this time with
# probability=True so that predict_proba is available for the ROC/AUC
# analysis. The hyperparameters are read from clf_svc_grid.best_params_
# rather than copied by hand, so this model stays in sync with the search if
# the grid or the data changes.
clf_svc_proba = SVC(**clf_svc_grid.best_params_, probability=True, random_state=2020)
clf_svc_proba.fit(X_train, y_train)

#### Prédictions (train et val)

In [None]:
# Predicted probabilities on both splits. Only column index 1 of the
# predict_proba output is kept, i.e. the second class in
# clf_svc_proba.classes_.
y_train_pred_proba_best_svc, y_val_pred_proba_best_svc = (
    clf_svc_proba.predict_proba(features)[:, 1] for features in (X_train, X_val)
)

#### ROC et AUC

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve

In [None]:
# Area under the ROC curve on each split, computed from the predicted
# probabilities.
print(
    f'AUC Train = {roc_auc_score(y_train, y_train_pred_proba_best_svc)}',
    f'AUC Val = {roc_auc_score(y_val, y_val_pred_proba_best_svc)}',
    sep='\n',
)

In [None]:
# ROC curve points on the validation split (the thresholds are returned by
# roc_curve but not used for the plot).
fpr_svc, tpr_svc, thresholds = roc_curve(y_val, y_val_pred_proba_best_svc)

fig = plt.figure(num=1, figsize=(12, 12))

# Dashed black diagonal from (0, 0) to (1, 1) as a visual reference.
plt.plot([0, 1], [0, 1], color='k', linestyle='--')
plt.plot(fpr_svc, tpr_svc, label='SVM')

# Axis labels and title set in one call on the current axes.
plt.gca().set(
    xlabel='False positive rate',
    ylabel='True positive rate',
    title='ROC curve',
)
plt.legend()