## AI28 - Projet - Prédiction du niveau de revenus.

In [None]:
# pip install ucimlrepo

In [None]:
# OS
import sys


# WARNINGS
import warnings

from sklearn.ensemble import AdaBoostClassifier

warnings.filterwarnings('ignore')
warnings.filterwarnings('error', category=DeprecationWarning)

# NUMPY
import numpy as np

# STATS
import scipy.stats as stats
from scipy.stats import norm, skew

# MATPLOTLIB
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
# plt.style.use('fivethirtyeight') 

params = {'axes.labelsize': 8, # 12
          'font.size': 8, # 12
          'legend.fontsize': 8, # 12
          'xtick.labelsize': 8, # 10
          'ytick.labelsize': 8, # 10
          'text.usetex': True,
          'figure.figsize': (10, 8)}
plt.rcParams.update(params)


# PANDAS
import pandas as pd 
pd.set_option("display.max_rows", None, "display.max_columns", None) 

# SEABORN 
import seaborn as sns
# sns.set_context("poster")
# sns.set_style("ticks")

# SCIKIT-LEARN: PRE-PROCESSING
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder # Encodage des variables catégorielles ordinales
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder # Encodage des variables catégorielles nominales
from sklearn.preprocessing import StandardScaler # Standardisation des variables numériques
from sklearn.preprocessing import MinMaxScaler # Normalisation MinMax
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer # Imputation
from sklearn.impute import KNNImputer 


# SCIKIT-LEARN: MODELES
from sklearn import linear_model # Classe Modèle linéaire 
from sklearn.linear_model import LinearRegression 
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso 
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR 
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier


# SCIKIT-LEARN: VALIDATION CROISEE + OPTIMISATION
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score # Validation croisée pour comparaison entre modèles
from sklearn.model_selection import validation_curve # Courbe de validation: visualiser les scores lors du choix d'un hyperparamétre
from sklearn.model_selection import GridSearchCV # Tester plusieurs hyperparamètres
from sklearn.model_selection import RandomizedSearchCV # Tester plusieurs hyperparamètres
from sklearn.model_selection import learning_curve # Courbe d'apprentissage: visualisation des scores du train et du validation sets en fonction des quantités des données
import optuna
 
## EVALUATION

from sklearn.metrics import r2_score, accuracy_score, recall_score, precision_score  # Coefficient R2
from sklearn.metrics import mean_absolute_error #  MAE
from sklearn.metrics import mean_squared_error  # RMSE
from sklearn.metrics import median_absolute_error # Erreur de la médiane absolue

# SCIKIT-LEARN: PIPELINE AND TRANSFORMATEUR
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

# SOURCE DATA
from ucimlrepo import fetch_ucirepo 

In [None]:
# Use the loading commands provided by UCI (ucimlrepo package).

# Download the dataset registered under id=2 in the UCI repository.
data_adult = fetch_ucirepo(id=2) 

In [None]:
# data (as pandas dataframes) 
x = data_adult.data.features # explanatory variables (features)
y = data_adult.data.targets # target variable(s)

In [None]:
# Print the metadata attached to the fetched dataset object.
print(data_adult.metadata) 

In [None]:
# Print the per-variable information table attached to the fetched dataset.
print(data_adult.variables) 

## Analyse exploratoire des données

Pour réaliser l'AED, nous utilisons les données des fichiers téléchargés depuis le site web.

In [None]:
# Load the locally downloaded data file. header=None: the file has no header
# row, so pandas numbers the columns; real names are assigned in a later cell.
# NOTE(review): if the file separates fields with ", ", string values keep a
# leading space; `skipinitialspace=True` would strip it — confirm on the file.
df_adult = pd.read_csv("./Data/adult.data", header=None)

In [None]:
# Sanity check: the loaded object is a pandas DataFrame (cell displays True/False).
isinstance(df_adult, pd.DataFrame)

Le fichier ne contient pas les noms des colonnes. Nous les ajoutons dans le dataframe adult.

In [None]:
# Column names are taken from the 'name' column of the UCI variables table.
adult_columns = data_adult.variables['name']

# NOTE(review): assumes the table lists the variables in the same order (and
# number) as the columns of the CSV file — confirm.
df_adult.columns = adult_columns

# Display the first rows to check the renamed columns.
df_adult.head()

In [None]:
# (number of rows, number of columns)
df_adult.shape

In [None]:
# Column labels after renaming.
df_adult.columns

In [None]:
# Summary statistics (describe() defaults to the numeric columns).
df_adult.describe()

In [None]:
# Number of NaN values per column.
df_adult.isna().sum()

In [None]:
# Remove exact duplicate rows (the first occurrence is kept).
df_adult = df_adult.drop_duplicates()
# Display the rows still flagged as duplicates; after the drop above this
# should be an empty frame.
df_adult[df_adult.duplicated()]

In [None]:
# Column dtypes and non-null counts.
df_adult.info()

Problème : La description des données indique qu'elles contiennent des valeurs inconnues. Ces valeurs ne sont pas en NaN ou null. Nous pouvons déjà remarquer la présence de '?' en leur lieu et place.

## Traitement des données

In [None]:
# Unknown values are stored as '?' rather than NaN. Replace every cell that
# contains a literal question mark with NaN.
# The pattern is a raw string: in a normal string literal `\?` is an invalid
# escape sequence, which Python reports as a DeprecationWarning/SyntaxWarning
# (and this notebook turns DeprecationWarning into an error).
df_adult = df_adult.replace(r'\?', np.nan, regex=True)
# Visual overview of where the missing values are (one row per observation).
sns.heatmap(df_adult.isna(), cbar=False, cmap='viridis')
# Per-column count of missing values.
print(f"Nombre de valeurs manquantes : {df_adult.isna().sum()}")

In [None]:
# Report, for each column of interest, its most frequent value and the share
# of rows holding it. NaN is counted as a value of its own (dropna=False).
# NOTE: the original intent was to impute with the mode only when it exceeds
# 85% of the rows; no such threshold is enforced by the code below.
wrkclass_proportion = df_adult["workclass"].value_counts(dropna=False) / df_adult.shape[0]
print("workclass : ", wrkclass_proportion.idxmax(), f"{wrkclass_proportion.max() * 100:.2f}% ({len(wrkclass_proportion)} valeurs uniques)")
occupation_proportion = df_adult["occupation"].value_counts(dropna=False) / df_adult.shape[0]
print("occupation : ", occupation_proportion.idxmax(), f"{occupation_proportion.max() * 100:.2f}% ({len(occupation_proportion)} valeurs uniques)")
native_country_proportion = df_adult["native-country"].value_counts(dropna=False) / df_adult.shape[0]
print("native_country : ", native_country_proportion.idxmax(), f"{native_country_proportion.max() * 100:.2f}% ({len(native_country_proportion)} valeurs uniques)")
income_proportion = df_adult["income"].value_counts(dropna=False) / df_adult.shape[0]
print("income : ", income_proportion.idxmax(), f"{income_proportion.max() * 100:.2f}% ({len(income_proportion)} valeurs uniques)")

# For now, fill the missing values with each column's most frequent value.
# The result is assigned back to the column: `df[col].fillna(..., inplace=True)`
# is a chained in-place call on a selection, which pandas deprecates and which
# silently leaves `df_adult` unchanged when Copy-on-Write is enabled.
for col in ["workclass", "occupation", "native-country"]:
    df_adult[col] = df_adult[col].fillna(df_adult[col].mode()[0])

In [None]:
# Columns stored as Python objects (strings) are treated as categorical.
categorical_columns = df_adult.select_dtypes(include=['object']).columns
ordinal_encoder = OrdinalEncoder()

# Work on a copy so that `df_adult` keeps the original labels.
df_adult_enco = df_adult.copy()
# Dropping unused columns is currently disabled:
# print(df_adult_enco.columns)
# df_adult_enco = df_adult_enco.drop(columns=["education-num"])

# Encode every categorical column as integer codes (stored as floats).
# The encoder is fitted once on all columns, so `ordinal_encoder.categories_`
# keeps the mapping of each column; refitting the same encoder column by
# column would only retain the mapping of the last column processed.
df_adult_enco[categorical_columns] = ordinal_encoder.fit_transform(df_adult[categorical_columns])

print(df_adult_enco.head())


> On pourrait normaliser les variables numériques, mais pour l'instant on ne le fait pas. On va se concentrer sur la création du modèle de prédiction.

## Modèle de prédiction

### Setup

> On pourrait faire un pipeline de prétraitement pour chaque modèle (LabelEncoder, normalisation et suppression des valeurs NaN)

In [None]:
# Creation de différents sets
X_train, X_test, y_train, y_test = train_test_split(df_adult_enco.drop(columns=["income"]), df_adult_enco["income"], test_size=0.2, random_state=69)

# On normalise les variables quantitatives sur les données de train (StandardScaler pour l'instant)
numerical_columns = df_adult.select_dtypes(include=['int64', 'float64']).columns
numerical_scaler = StandardScaler()
numerical_scaler.fit(X_train[numerical_columns])
X_train[numerical_columns] = numerical_scaler.transform(X_train[numerical_columns])
X_test[numerical_columns] = numerical_scaler.transform(X_test[numerical_columns])

def eval_model(model, X_test, y_test):
    """Print evaluation metrics for a fitted model and return its predictions.

    Args:
        model: fitted estimator exposing ``predict``.
        X_test: feature rows to predict on.
        y_test: true labels matching ``X_test``.

    Returns:
        The predictions returned by ``model.predict(X_test)``.
    """
    y_pred = model.predict(X_test)
    print(f'=== Evaluation du modèle {model.__class__.__name__} ===')
    print(f"Accuracy : {accuracy_score(y_test, y_pred):.4f} (the higher the better)")
    print(f"Recall : {recall_score(y_test, y_pred):.4f} (the higher the better) (best for us)")
    # R2 / MAE / RMSE are regression metrics; here they are computed on the
    # predicted class labels.
    print(f"R2 : {r2_score(y_test, y_pred):.4f} (the higher the better)")
    print(f"MAE : {mean_absolute_error(y_test, y_pred):.4f} (the lower the better)")
    # RMSE is the square root of the MSE; printing the raw MSE under the
    # "RMSE" label was misleading.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"RMSE : {rmse:.4f} (the lower the better)")
    return y_pred

### Arbre de décision

In [None]:
from sklearn.tree import DecisionTreeClassifier

def opt_decision_tree(trial):
    """Build an unfitted DecisionTreeClassifier from hyperparameters drawn on `trial`.

    Args:
        trial: object exposing Optuna's ``suggest_int`` / ``suggest_categorical``.

    Returns:
        A DecisionTreeClassifier configured with the suggested values.
    """
    # Keys double as the Optuna parameter names and the estimator keyword
    # arguments; the dict preserves the order in which values are suggested.
    tree_params = {
        'max_depth': trial.suggest_int('max_depth', 2, 32),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
    }
    return DecisionTreeClassifier(**tree_params)

> Pas de très bonnes performances, on va essayer avec un autre modèle.

### Random forest

In [None]:

def opt_random_forest(trial):
    """Build an unfitted RandomForestClassifier from hyperparameters drawn on `trial`.

    Optuna parameter names carry an ``rf_`` prefix.

    Args:
        trial: object exposing Optuna's ``suggest_int`` / ``suggest_categorical``.

    Returns:
        A RandomForestClassifier configured with the suggested values.
    """
    # Values are suggested in this order; keys are the estimator keyword names.
    forest_params = {
        'max_depth': trial.suggest_int('rf_max_depth', 2, 32),
        'min_samples_split': trial.suggest_int('rf_min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('rf_min_samples_leaf', 1, 4),
        'max_features': trial.suggest_categorical('rf_max_features', ['sqrt', 'log2']),
        'criterion': trial.suggest_categorical('rf_criterion', ['gini', 'entropy']),
        'n_estimators': trial.suggest_int('rf_n_estimators', 50, 200),
    }
    return RandomForestClassifier(**forest_params)

### Régression logistique

In [None]:
def opt_logistic_regression(trial):
    """Build an unfitted LogisticRegression from hyperparameters drawn on `trial`.

    The penalty is chosen among the options listed for the selected solver;
    each option list uses its own Optuna parameter name.

    Args:
        trial: object exposing Optuna's ``suggest_*`` methods.

    Returns:
        A ``linear_model.LogisticRegression`` configured with the suggested
        solver, penalty, ``C``, ``max_iter`` and, for elastic-net, ``l1_ratio``.
    """
    solver = trial.suggest_categorical('solver', ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'])

    # Restrict the penalty choices according to the solver.
    penalty = "l2"  # fallback if the solver matches none of the branches
    if solver in ['newton-cg', 'lbfgs', 'sag']:
        penalty = trial.suggest_categorical('penalty', ['l2', None])
    elif solver == 'liblinear':
        penalty = trial.suggest_categorical('penalty-liblinear', ['l1', 'l2'])
    elif solver == 'saga':
        penalty = trial.suggest_categorical('penalty-saga', ['l1', 'l2', 'elasticnet', None])

    # scikit-learn requires `l1_ratio` when penalty='elasticnet'; without it
    # the estimator raises at fit time and the trial fails. It stays None
    # (the estimator default) for every other penalty.
    l1_ratio = None
    if penalty == 'elasticnet':
        l1_ratio = trial.suggest_float('l1_ratio', 0.0, 1.0)

    C = trial.suggest_float('C', 0.01, 10.0, log=True)
    max_iter = trial.suggest_int('max_iter', 50, 500)
    return linear_model.LogisticRegression(
        penalty=penalty,
        C=C,
        solver=solver,
        max_iter=max_iter,
        l1_ratio=l1_ratio
    )

### AdaBoosting

In [None]:
def opt_adaboost(trial):
    """Build an unfitted AdaBoostClassifier from hyperparameters drawn on `trial`.

    The boosted estimator is itself built from the same trial, either by
    ``opt_decision_tree`` or by ``opt_random_forest``.

    Args:
        trial: object exposing Optuna's ``suggest_*`` methods.

    Returns:
        An AdaBoostClassifier wrapping the selected base estimator.
    """
    base_builders = {
        'DecisionTree': opt_decision_tree,
        'RandomForest': opt_random_forest,
    }
    chosen = trial.suggest_categorical('base_estimator', ['DecisionTree', 'RandomForest'])
    build_base = base_builders.get(chosen)
    # An unrecognised choice leaves the estimator as None.
    weak_learner = build_base(trial) if build_base is not None else None

    boosting_rounds = trial.suggest_int('n_estimators', 50, 200)
    shrinkage = trial.suggest_float('learning_rate', 0.01, 0.3)

    return AdaBoostClassifier(
        estimator=weak_learner,
        n_estimators=boosting_rounds,
        learning_rate=shrinkage,
    )

### Gradient boosting

In [None]:
from xgboost import XGBClassifier

def opt_gradient_boosting(trial):
    """Build an unfitted XGBClassifier from hyperparameters drawn on `trial`.

    Only parameters that XGBoost actually uses are tuned. The scikit-learn
    tree options previously passed (``min_samples_split``,
    ``min_samples_leaf``, ``max_features``, ``criterion``) are not XGBoost
    parameters: they were forwarded to the booster and ignored, so those
    search dimensions had no effect on the model. They are replaced by
    XGBoost regularisation parameters: ``min_child_weight``, ``subsample``
    and ``colsample_bytree``.

    Args:
        trial: object exposing Optuna's ``suggest_int`` / ``suggest_float``.

    Returns:
        An XGBClassifier configured with the suggested values.
    """
    max_depth = trial.suggest_int('max_depth', 2, 32)
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)
    # Search ranges below are starting points, not tuned values.
    min_child_weight = trial.suggest_int('min_child_weight', 1, 10)
    subsample = trial.suggest_float('subsample', 0.5, 1.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0)
    return XGBClassifier(
        max_depth=max_depth,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        min_child_weight=min_child_weight,
        subsample=subsample,
        colsample_bytree=colsample_bytree
    )



### Stacking

In [None]:
from sklearn.ensemble import StackingClassifier

def opt_stacking(trial):
    """Build an unfitted StackingClassifier (random forest + XGBoost).

    The random forest is built by ``opt_random_forest`` (same ``rf_*`` Optuna
    parameters as before, no duplicated search space). The XGBoost member is
    tuned only on parameters XGBoost actually uses: the scikit-learn tree
    options previously passed to it (``min_samples_split``,
    ``min_samples_leaf``, ``max_features``, ``criterion``) are not XGBoost
    parameters and were ignored by the booster.

    Args:
        trial: object exposing Optuna's ``suggest_*`` methods.

    Returns:
        A StackingClassifier whose final estimator is an
        ``XGBClassifier(random_state=69)``.
    """
    random_forest = opt_random_forest(trial)

    xgb_max_depth = trial.suggest_int('xgb_max_depth', 2, 32)
    xgb_n_estimators = trial.suggest_int('xgb_n_estimators', 50, 200)
    # Search ranges below are starting points, not tuned values.
    xgb_min_child_weight = trial.suggest_int('xgb_min_child_weight', 1, 10)
    xgb_subsample = trial.suggest_float('xgb_subsample', 0.5, 1.0)
    xgb_colsample_bytree = trial.suggest_float('xgb_colsample_bytree', 0.5, 1.0)

    return StackingClassifier(
        estimators=[
            ('rf', random_forest),
            ('xgb', XGBClassifier(
                max_depth=xgb_max_depth,
                n_estimators=xgb_n_estimators,
                min_child_weight=xgb_min_child_weight,
                subsample=xgb_subsample,
                colsample_bytree=xgb_colsample_bytree
            ))
        ],
        final_estimator=XGBClassifier(random_state=69)
    )

## Optimisation des hyperparamètres avec `optuna`

In [None]:
def objective(trial):
    """Optuna objective: recall of a candidate classifier on a validation split.

    The classifier family is itself a hyperparameter; the matching ``opt_*``
    builder draws the remaining hyperparameters from the same trial.

    The candidate is scored on a validation split carved out of the
    module-level ``X_train`` / ``y_train``, never on ``X_test``. Tuning on the
    test set would leak it into model selection and make the final
    evaluation on that same set optimistically biased.

    Args:
        trial: Optuna trial used to draw hyperparameters.

    Returns:
        Recall of the fitted classifier on the validation split.
    """
    # 'Stacking' has a builder but is currently left out of the search list.
    classifier_name = trial.suggest_categorical('classifier', ['DecisionTree', 'RandomForest', 'LogisticRegression', 'AdaBoosting', 'GradientBoosting'])#, 'Stacking'])
    builders = {
        'DecisionTree': opt_decision_tree,
        'RandomForest': opt_random_forest,
        'LogisticRegression': opt_logistic_regression,
        'AdaBoosting': opt_adaboost,
        'GradientBoosting': opt_gradient_boosting,
        'Stacking': opt_stacking,
    }
    classifier_obj = builders[classifier_name](trial)

    # Fixed seed: every trial is scored on the same validation rows, so the
    # scores of different trials are comparable.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.25, random_state=69
    )
    classifier_obj.fit(X_fit, y_fit)
    y_pred = classifier_obj.predict(X_val)
    return recall_score(y_val, y_pred)

# Single-objective study: maximise the value returned by `objective`.
# (A two-objective variant would be created with
#  optuna.create_study(directions=["maximize", "maximize"]).)
study = optuna.create_study(direction="maximize", study_name="Adult Income Prediction")
study.optimize(objective, n_trials=150)
print("Best hyperparameters: ", study.best_params)

# Rebuild the winning model: replay the best parameters through the matching
# builder by means of a FixedTrial, which returns the stored value for each
# suggested parameter name.
_model_builders = {
    'DecisionTree': opt_decision_tree,
    'RandomForest': opt_random_forest,
    'LogisticRegression': opt_logistic_regression,
    'AdaBoosting': opt_adaboost,
    'GradientBoosting': opt_gradient_boosting,
    'Stacking': opt_stacking,
}
_best_builder = _model_builders.get(study.best_params['classifier'])
best_model = (
    _best_builder(optuna.trial.FixedTrial(study.best_params))
    if _best_builder is not None
    else None
)

# Refit on the training set and report the metrics on the test set.
best_model.fit(X_train, y_train)
eval_model(best_model, X_test, y_test)


In [None]:
# Re-evaluate the selected model on 10 random (unseeded) 80/20 splits.
# The numeric column list does not depend on the split, so it is computed once.
numerical_columns = df_adult.select_dtypes(include=['int64', 'float64']).columns
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(
        df_adult_enco.drop(columns=["income"]),
        df_adult_enco["income"],
        test_size=0.2,
    )
    # Standardise the quantitative variables with statistics from the current
    # training rows only (a fresh scaler for every split).
    numerical_scaler = StandardScaler()
    X_train[numerical_columns] = numerical_scaler.fit_transform(X_train[numerical_columns])
    X_test[numerical_columns] = numerical_scaler.transform(X_test[numerical_columns])

    best_model.fit(X_train, y_train)
    print(f"\n=== Evaluation num {i+1} du modèle ===")
    eval_model(best_model, X_test, y_test)