# Import des modules


In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from collections import Counter
from sklearn.pipeline import Pipeline

#Selection
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, KFold

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.inspection import permutation_importance

#Preprocess
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MultiLabelBinarizer, MinMaxScaler

#Modèles
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

#Metriques
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report


In [33]:
fc = pd.read_csv('fc_after_feature_engineering.csv')
print(fc.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 39 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   a_quitte_l_entreprise                      1470 non-null   bool   
 1   age                                        1470 non-null   float64
 2   annee_experience_totale                    1470 non-null   int64  
 3   annees_dans_l_entreprise                   1470 non-null   int64  
 4   annees_dans_le_poste_actuel                1470 non-null   int64  
 5   annees_depuis_la_derniere_promotion        1470 non-null   int64  
 6   annes_sous_responsable_actuel              1470 non-null   int64  
 7   augementation_salaire_precedente           1470 non-null   int64  
 8   distance_domicile_travail                  1470 non-null   int64  
 9   domaine_etude_Entrepreunariat              1470 non-null   float64
 10  domaine_etude_Infra & Cl

# Séparation train test simple
- Des métriques d’évaluation calculées pour chaque modèle, sur le jeu d’apprentissage et le jeu de test.

In [34]:
#Un homme , Célibataire , Entre 30 et 40 ans , Consultant , Entre 1 et 7 années dans l’entreprise, Un revenu compris entre 2500€ et 6000€, Domaine d’etude Infra & Cloud, Qui a moins de 5 années sous son responsable actuel, Qui a une distance domicile travail entre 3 et 17km

columns_base = ['genre', 'statut_marital', 'age', 'annees_dans_l_entreprise', 'revenu_mensuel', 'annes_sous_responsable_actuel', 'distance_domicile_travail', 'niveau_hierarchique_poste', 'satisfaction_globale']
columns_domaine_etude = [col for col in fc.columns if col.startswith('domaine_etude')]
columns_poste = [col for col in fc.columns if col.startswith('poste')]

all_columns = columns_base + columns_domaine_etude + columns_poste

X = fc[all_columns]
y = fc['a_quitte_l_entreprise']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.70, random_state=666)

# Validation croisée simple
- cross_validate

In [35]:
def perform_cross_validation(
    X: pd.DataFrame,
    y: pd.Series,
    model,
    cross_val_type, # La variante de validation croisée que nous souhaitons utiliser
    scoring_metrics: tuple, # Metriques de notre choix
    return_estimator=False, # Si nous souhaitons stocker les modèles de chaque fold
    groups=None, # Nous verrons l’utilité de cet argument juste après
):
    scores = cross_validate(
        model,
        X.to_numpy(),
        y.to_numpy(),
        cv=cross_val_type,
        return_train_score=True,
        return_estimator=return_estimator,
        scoring=scoring_metrics,
        groups=groups,
    )

    for metric in scoring_metrics:
        # la moyenne des scores (performance moyenne du modèle)
        print(
            "{metric} Train Average : {metric_value}".format(
                metric=metric,
                metric_value=round(np.mean(scores["train_" + metric]),2),
            )
        )
        # la standard deviation des scores (stabilité/variance du modèle)
        print(
            "{metric} Train Standard Deviation : {metric_value}".format(
                metric=metric, metric_value=round(np.std(scores["train_" + metric]),2)
            )
        )
        print(
            "{metric} Test Average : {metric_value}".format(
                metric=metric, metric_value=round(np.mean(scores["test_" + metric]),2)
            )
        )
        print(
            "{metric} Test Standard Deviation : {metric_value}".format(
                metric=metric, metric_value=round(np.std(scores["test_" + metric]),2)
            )
        )
        print("------")

    return scores

# Fonctions

In [36]:
list_model = []

def perform_model_class(model_name, model, X_train, y_train, X_test, y_test):

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average="macro")
        #F1-score : compromis entre précision et rappel.
        #average="macro" : moyenne simple entre classes (équilibre toutes les classes, même si elles sont rares).
        #average="weighted" : pondérée par le nombre d’exemples par classe.
        #average="micro" : global (utilise les VP/FP/FN de toutes les classes confondues).

        model_results = {
                'Model': model_name,
                'Accuracy': accuracy,
                'F1': f1
            }

        # Matrice de confusion
        print("Matrice de confusion :")
        print(confusion_matrix(y_test, y_pred))
        # Rapport complet (precision, recall, f1, support)
        print("\nRapport de classification :")
        print(classification_report(y_test, y_pred))

        list_model.append(model_results)

# Modele DUMMY
- DummyClassifier

In [37]:
# DummyClassifier (baseline)
print('############### MODELE DummyClassifier ################\n')
model_name = 'DummyClassifier'
model = DummyClassifier(strategy="most_frequent")
perform_model_class(model_name, model, X_train, y_train, X_test, y_test)

###################################
'''
classification_scoring_metrics = ("accuracy", "f1_macro")

scores_DummyClassifier = perform_cross_validation(
        X=X,
        y=y,
        model=DummyClassifier(),
        cross_val_type=KFold(n_splits=5, shuffle=True, random_state=666), #Par défaut, le nombre de folds est 5
        scoring_metrics=classification_scoring_metrics,
    )
'''

############### MODELE DummyClassifier ################

Matrice de confusion :
[[372   0]
 [ 69   0]]

Rapport de classification :
              precision    recall  f1-score   support

       False       0.84      1.00      0.92       372
        True       0.00      0.00      0.00        69

    accuracy                           0.84       441
   macro avg       0.42      0.50      0.46       441
weighted avg       0.71      0.84      0.77       441



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


'\nclassification_scoring_metrics = ("accuracy", "f1_macro")\n\nscores_DummyClassifier = perform_cross_validation(\n        X=X,\n        y=y,\n        model=DummyClassifier(),\n        cross_val_type=KFold(n_splits=5, shuffle=True, random_state=666), #Par défaut, le nombre de folds est 5\n        scoring_metrics=classification_scoring_metrics,\n    )\n'

 # Modele LINEAIRE

# Modele NON LINEAIRE
- RandomForest, XGBoost ou CatBoost
- Métriques d’évaluation en classification : matrice de confusion, rappel et précision.
- Scores (présence d’overfit ou non, capacité d’éviter les faux positifs ou faux négatifs)

In [38]:
# RandomForest
print('############### MODELE RandomForestClassifier DEFAULT ################\n')
model_name = 'DummyClassifier'
model = RandomForestClassifier()
perform_model_class(model_name, model, X_train, y_train, X_test, y_test)


############### MODELE RandomForestClassifier DEFAULT ################

Matrice de confusion :
[[366   6]
 [ 55  14]]

Rapport de classification :
              precision    recall  f1-score   support

       False       0.87      0.98      0.92       372
        True       0.70      0.20      0.31        69

    accuracy                           0.86       441
   macro avg       0.78      0.59      0.62       441
weighted avg       0.84      0.86      0.83       441



In [39]:
# XGBoost
print('############### MODELE XGBClassifier DEFAULT ################\n')
model_name = 'XGBClassifier'
model = XGBClassifier()
perform_model_class(model_name, model, X_train, y_train, X_test, y_test)

############### MODELE XGBClassifier DEFAULT ################

Matrice de confusion :
[[361  11]
 [ 48  21]]

Rapport de classification :
              precision    recall  f1-score   support

       False       0.88      0.97      0.92       372
        True       0.66      0.30      0.42        69

    accuracy                           0.87       441
   macro avg       0.77      0.64      0.67       441
weighted avg       0.85      0.87      0.84       441



In [40]:
# CatBoost
print('############### MODELE CatBoostClassifier DEFAULT ################\n')
model_name = 'CatBoostClassifier'
model = CatBoostClassifier()
perform_model_class(model_name, model, X_train, y_train, X_test, y_test)


############### MODELE CatBoostClassifier DEFAULT ################

Learning rate set to 0.010429
0:	learn: 0.6856487	total: 1.13ms	remaining: 1.13s
1:	learn: 0.6786354	total: 1.7ms	remaining: 846ms
2:	learn: 0.6704579	total: 2.33ms	remaining: 774ms
3:	learn: 0.6635663	total: 2.95ms	remaining: 734ms
4:	learn: 0.6560514	total: 3.34ms	remaining: 664ms
5:	learn: 0.6496902	total: 3.82ms	remaining: 633ms
6:	learn: 0.6421048	total: 4.22ms	remaining: 599ms
7:	learn: 0.6355446	total: 4.73ms	remaining: 587ms
8:	learn: 0.6288505	total: 5.33ms	remaining: 587ms
9:	learn: 0.6226395	total: 5.78ms	remaining: 572ms
10:	learn: 0.6158402	total: 6.23ms	remaining: 560ms
11:	learn: 0.6104464	total: 6.6ms	remaining: 544ms
12:	learn: 0.6029325	total: 7.04ms	remaining: 534ms
13:	learn: 0.5966841	total: 7.46ms	remaining: 525ms
14:	learn: 0.5909789	total: 7.83ms	remaining: 514ms
15:	learn: 0.5855055	total: 8.24ms	remaining: 507ms
16:	learn: 0.5798243	total: 8.64ms	remaining: 499ms
17:	learn: 0.5745550	total: 9.

# Amélioration de la classification
- demandez-vous si éviter des faux positifs est plus important qu’éviter des faux négatifs.