# Import des modules


In [80]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from collections import Counter
from sklearn.pipeline import Pipeline

#Selection
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, KFold

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.inspection import permutation_importance

#Preprocess
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MultiLabelBinarizer, MinMaxScaler

#Modèles
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

#Metriques
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report


In [81]:
fc = pd.read_csv('fc_after_feature_engineering.csv')
print(fc.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 40 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   a_quitte_l_entreprise                      1470 non-null   bool   
 1   age                                        1470 non-null   int64  
 2   annees_dans_l_entreprise                   1470 non-null   int64  
 3   annees_dans_le_poste_actuel                1470 non-null   int64  
 4   annees_depuis_la_derniere_promotion        1470 non-null   int64  
 5   annees_experience_totale                   1470 non-null   int64  
 6   annes_sous_responsable_actuel              1470 non-null   int64  
 7   augmentation_salaire_precedente            1470 non-null   int64  
 8   distance_domicile_travail                  1470 non-null   int64  
 9   domaine_etude_Entrepreunariat              1470 non-null   float64
 10  domaine_etude_Infra & Cl

# Séparation train test simple
- Des métriques d’évaluation calculées pour chaque modèle, sur le jeu d’apprentissage et le jeu de test.

In [82]:
#Un homme , Célibataire , Entre 30 et 40 ans , Consultant , Entre 1 et 7 années dans l’entreprise, Un revenu compris entre 2500€ et 6000€, Domaine d’etude Infra & Cloud, Qui a moins de 5 années sous son responsable actuel, Qui a une distance domicile travail entre 3 et 17km

columns_base = ['genre', 'statut_marital', 'age', 'annees_dans_l_entreprise', 'revenu_mensuel', 'distance_domicile_travail', 'satisfaction_globale']


columns_domaine_etude = [col for col in fc.columns if col.startswith('domaine_etude')]
columns_poste = [col for col in fc.columns if col.startswith('poste')]

all_columns = columns_base + columns_domaine_etude + columns_poste

X = fc[all_columns]
y = fc['a_quitte_l_entreprise']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.70, random_state=666)

# Validation croisée simple
- cross_validate

In [83]:
def perform_cross_validation(
    X: pd.DataFrame,
    y: pd.Series,
    model,
    cross_val_type, # La variante de validation croisée que nous souhaitons utiliser
    scoring_metrics: tuple, # Metriques de notre choix
    return_estimator=False, # Si nous souhaitons stocker les modèles de chaque fold
    groups=None, # Nous verrons l’utilité de cet argument juste après
):
    scores = cross_validate(
        model,
        X.to_numpy(),
        y.to_numpy(),
        cv=cross_val_type,
        return_train_score=True,
        return_estimator=return_estimator,
        scoring=scoring_metrics,
        groups=groups,
    )

    for metric in scoring_metrics:
        # la moyenne des scores (performance moyenne du modèle)
        print(
            "{metric} Train Average : {metric_value}".format(
                metric=metric,
                metric_value=round(np.mean(scores["train_" + metric]),2),
            )
        )
        # la standard deviation des scores (stabilité/variance du modèle)
        print(
            "{metric} Train Standard Deviation : {metric_value}".format(
                metric=metric, metric_value=round(np.std(scores["train_" + metric]),2)
            )
        )
        print(
            "{metric} Test Average : {metric_value}".format(
                metric=metric, metric_value=round(np.mean(scores["test_" + metric]),2)
            )
        )
        print(
            "{metric} Test Standard Deviation : {metric_value}".format(
                metric=metric, metric_value=round(np.std(scores["test_" + metric]),2)
            )
        )
        print("------")

    return scores

# Fonctions

In [84]:
list_model = []

def perform_model_class(model_name, model, X_train, y_train, X_test, y_test):

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average="macro")
        #F1-score : compromis entre précision et rappel.
        #average="macro" : moyenne simple entre classes (équilibre toutes les classes, même si elles sont rares).
        #average="weighted" : pondérée par le nombre d’exemples par classe.
        #average="micro" : global (utilise les VP/FP/FN de toutes les classes confondues).

        model_results = {
                'Model': model_name,
                'Accuracy': accuracy,
                'F1': f1
            }

        # Matrice de confusion
        print("Matrice de confusion :")
        print(confusion_matrix(y_test, y_pred))
        # Rapport complet (precision, recall, f1, support)
        print("\nRapport de classification :")
        print(classification_report(y_test, y_pred))

        list_model.append(model_results)

# Modele DUMMY
- DummyClassifier

In [85]:
# DummyClassifier (baseline)
print('############### MODELE DummyClassifier ################\n')
model_name = 'DummyClassifier'
model = DummyClassifier(strategy="most_frequent")
perform_model_class(model_name, model, X_train, y_train, X_test, y_test)

###################################

classification_scoring_metrics = ("accuracy", "recall_macro", "precision_macro", "f1_macro")

scores = perform_cross_validation(
        X=X,
        y=y,
        model=DummyClassifier(),
        cross_val_type=KFold(n_splits=5, shuffle=True, random_state=666), #Par défaut, le nombre de folds est 5
        scoring_metrics=classification_scoring_metrics,
    )


############### MODELE DummyClassifier ################

Matrice de confusion :
[[372   0]
 [ 69   0]]

Rapport de classification :
              precision    recall  f1-score   support

       False       0.84      1.00      0.92       372
        True       0.00      0.00      0.00        69

    accuracy                           0.84       441
   macro avg       0.42      0.50      0.46       441
weighted avg       0.71      0.84      0.77       441

accuracy Train Average : 0.84
accuracy Train Standard Deviation : 0.0
accuracy Test Average : 0.84
accuracy Test Standard Deviation : 0.02
------
recall_macro Train Average : 0.5
recall_macro Train Standard Deviation : 0.0
recall_macro Test Average : 0.5
recall_macro Test Standard Deviation : 0.0
------
precision_macro Train Average : 0.42
precision_macro Train Standard Deviation : 0.0
precision_macro Test Average : 0.42
precision_macro Test Standard Deviation : 0.01
------
f1_macro Train Average : 0.46
f1_macro Train Standard Deviatio

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


 # Modele LINEAIRE

# Modele NON LINEAIRE
- RandomForest, XGBoost ou CatBoost
- Métriques d’évaluation en classification : matrice de confusion, rappel et précision.
- Scores (présence d’overfit ou non, capacité d’éviter les faux positifs ou faux négatifs)

In [86]:
# RandomForest
print('############### MODELE RandomForestClassifier DEFAULT ################\n')
model_name = 'RandomForestClassifier'
model = RandomForestClassifier()
perform_model_class(model_name, model, X_train, y_train, X_test, y_test)

###################################

classification_scoring_metrics = ("accuracy", "recall_macro", "precision_macro", "f1_macro")

scores = perform_cross_validation(
        X=X,
        y=y,
        model=RandomForestClassifier(),
        cross_val_type=KFold(n_splits=5, shuffle=True, random_state=666), #Par défaut, le nombre de folds est 5
        scoring_metrics=classification_scoring_metrics,
    )

############### MODELE RandomForestClassifier DEFAULT ################

Matrice de confusion :
[[365   7]
 [ 62   7]]

Rapport de classification :
              precision    recall  f1-score   support

       False       0.85      0.98      0.91       372
        True       0.50      0.10      0.17        69

    accuracy                           0.84       441
   macro avg       0.68      0.54      0.54       441
weighted avg       0.80      0.84      0.80       441

accuracy Train Average : 1.0
accuracy Train Standard Deviation : 0.0
accuracy Test Average : 0.84
accuracy Test Standard Deviation : 0.02
------
recall_macro Train Average : 1.0
recall_macro Train Standard Deviation : 0.0
recall_macro Test Average : 0.57
recall_macro Test Standard Deviation : 0.02
------
precision_macro Train Average : 1.0
precision_macro Train Standard Deviation : 0.0
precision_macro Test Average : 0.72
precision_macro Test Standard Deviation : 0.03
------
f1_macro Train Average : 1.0
f1_macro Train Sta

In [87]:
# XGBoost
print('############### MODELE XGBClassifier DEFAULT ################\n')
model_name = 'XGBClassifier'
model = XGBClassifier()
perform_model_class(model_name, model, X_train, y_train, X_test, y_test)

###################################

classification_scoring_metrics = ("accuracy", "recall_macro", "precision_macro", "f1_macro")

scores = perform_cross_validation(
        X=X,
        y=y,
        model=XGBClassifier(),
        cross_val_type=KFold(n_splits=5, shuffle=True, random_state=666), #Par défaut, le nombre de folds est 5
        scoring_metrics=classification_scoring_metrics,
    )

############### MODELE XGBClassifier DEFAULT ################

Matrice de confusion :
[[356  16]
 [ 55  14]]

Rapport de classification :
              precision    recall  f1-score   support

       False       0.87      0.96      0.91       372
        True       0.47      0.20      0.28        69

    accuracy                           0.84       441
   macro avg       0.67      0.58      0.60       441
weighted avg       0.80      0.84      0.81       441

accuracy Train Average : 1.0
accuracy Train Standard Deviation : 0.0
accuracy Test Average : 0.83
accuracy Test Standard Deviation : 0.03
------
recall_macro Train Average : 1.0
recall_macro Train Standard Deviation : 0.0
recall_macro Test Average : 0.59
recall_macro Test Standard Deviation : 0.04
------
precision_macro Train Average : 1.0
precision_macro Train Standard Deviation : 0.0
precision_macro Test Average : 0.65
precision_macro Test Standard Deviation : 0.05
------
f1_macro Train Average : 1.0
f1_macro Train Standard Dev

In [88]:
# CatBoost
print('############### MODELE CatBoostClassifier DEFAULT ################\n')
model_name = 'CatBoostClassifier'
model = CatBoostClassifier()
perform_model_class(model_name, model, X_train, y_train, X_test, y_test)

###################################

classification_scoring_metrics = ("accuracy", "recall_macro", "precision_macro", "f1_macro")

scores = perform_cross_validation(
        X=X,
        y=y,
        model=CatBoostClassifier(),
        cross_val_type=KFold(n_splits=5, shuffle=True, random_state=666), #Par défaut, le nombre de folds est 5
        scoring_metrics=classification_scoring_metrics,
    )


############### MODELE CatBoostClassifier DEFAULT ################

Learning rate set to 0.010429
0:	learn: 0.6858800	total: 1.62ms	remaining: 1.62s
1:	learn: 0.6787029	total: 2.07ms	remaining: 1.03s
2:	learn: 0.6715280	total: 2.54ms	remaining: 844ms
3:	learn: 0.6648268	total: 2.93ms	remaining: 730ms
4:	learn: 0.6574097	total: 3.3ms	remaining: 657ms
5:	learn: 0.6507738	total: 3.67ms	remaining: 607ms
6:	learn: 0.6444605	total: 4.16ms	remaining: 590ms
7:	learn: 0.6386157	total: 4.58ms	remaining: 568ms
8:	learn: 0.6328629	total: 5ms	remaining: 550ms
9:	learn: 0.6268734	total: 5.41ms	remaining: 535ms
10:	learn: 0.6207001	total: 5.8ms	remaining: 522ms
11:	learn: 0.6152192	total: 6.23ms	remaining: 513ms
12:	learn: 0.6101056	total: 6.62ms	remaining: 503ms
13:	learn: 0.6050583	total: 7.02ms	remaining: 494ms
14:	learn: 0.6007146	total: 7.28ms	remaining: 478ms
15:	learn: 0.5958048	total: 7.66ms	remaining: 471ms
16:	learn: 0.5910365	total: 8.12ms	remaining: 469ms
17:	learn: 0.5873010	total: 8.38m

# Amélioration de la classification
- demandez-vous si éviter des faux positifs est plus important qu’éviter des faux négatifs.