In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from lazypredict.Supervised import LazyClassifier
from sklearn.utils import all_estimators 
from sklearn.base import ClassifierMixin

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [2]:
def load_df(col_cible_type):
    df = pd.read_csv(
        f"../data/_df_ready_{col_cible_type}.csv", sep=";", low_memory=False
    )
    # Contrôle
    # display(df.head(3))
    # display(df.info())
    return df

In [None]:
import copy

cols_cible_type = ["TurnoutTimeSeconds", "TravelTimeSeconds", "PumpSecondsOnSite"]
cols_Data = [
    ["CalYear", "HourOfCall", "Postcode_district", "Month", "DayOfWeek"],
    ["CalYear", "HourOfCall", "Postcode_district", "Month", "DayOfWeek"],
    ["CalYear", "PropertyType", "StopCode"],
]
# utilise -1 sinon crée bizarrement des Nan pour les valeurs à 0 aua lieu de mettre 0
cols_cible_bins = [
    {
        "bins": np.array([-1, 1, 2, 3, 5, 8, 10, 15, 20000]) * 60,
        "labels": [1, 2, 3, 5, 8, 10, 15, 30],
    },
    {
        "bins": np.array([-1, 1, 2, 3, 5, 8, 10, 15, 20000]) * 60,
        "labels": [1, 2, 3, 5, 8, 10, 15, 30],
    },
    {
        "bins": np.array([-1, 5, 10, 15, 30, 45, 60, 120, 180, 360, 1000000]) * 60,
        "labels": [5, 10, 15, 30, 45, 60, 120, 180, 360, 1000],
    },
]
cols_cible = [
    ["TurnoutTimeSeconds_min", "TurnoutTimeSeconds_mean", "TurnoutTimeSeconds_max"],
    ["TravelTimeSeconds_min", "TravelTimeSeconds_mean", "TravelTimeSeconds_max"],
    ["PumpSecondsOnSite_min", "PumpSecondsOnSite_mean", "PumpSecondsOnSite_max"],
    # NumPumpsAttending??
]
cols_cible_filter = [
    "TurnoutTimeSeconds_mean",
    "TravelTimeSeconds_mean",
    "PumpSecondsOnSite_mean",
]

# copie profonde, sinon la simple copie fait une copie des références des ss tableaux, et leur modif modifie l'original
cols_cible_minutes = copy.deepcopy(cols_cible)

ignore_classifiers = [
    "CalibratedClassifierCV",
    "CalibratedClassifierCV",
    "CategoricalNB",
    "ClassifierChain",
    "ComplementNB",
    "FixedThresholdClassifier",
    "GaussianProcessClassifier",
    "GradientBoostingClassifier",
    "HistGradientBoostingClassifier",
    "LabelPropagation",
    "LabelSpreading",
    "LinearSVC",
    "LogisticRegressionCV",
    "SVC",
]

CLASSIFIERS = [
    est
    for est in all_estimators()
    if (issubclass(est[1], ClassifierMixin) and (not (est[0] in ignore_classifiers)))
]

all_results = pd.DataFrame()


for index, name in enumerate(cols_cible_type):
    df = load_df(name)
    # Crée 3 nouvelles target pour chque min/mean/max, par tranche en minutes, pour la cible type actuelle
    for index_cible, col_cible in enumerate(cols_cible[index]):
        bins = cols_cible_bins[index]["bins"]
        print(bins)
        labels = cols_cible_bins[index]["labels"]
        print(labels)
        new_name = col_cible.replace("Seconds", "Minutes")
        print(new_name, col_cible)
        df[new_name] = pd.cut(x=df[col_cible], bins=bins, labels=labels)
        cols_cible_minutes[index][index_cible] = new_name
        print("cols_cible", cols_cible)
        print("cols_cible_minutes", cols_cible_minutes)
    display(df.head(10))

    # df = df[df.CalYear > 6]

    # pd.DataFrame(pd.cut(X['Age'], bins = [0, 12, 18, 30, 50, 65, np.max(df.Age)], labels=['Kid','Adolescent','Adult-','Adult','Adult+','Senior']))
    # df['age_cat'] = pd.cut(x = df['age'], bins = [19,24,28,32,36,40], labels = ['19-24', '25-28', '29-32', '33-36', '37-40'],
    #                    include_lowest = True)

    # X = df[["PumpSecondsOnSite_min", "PumpSecondsOnSite_mean", "PumpSecondsOnSite_max", "TurnoutTimeSeconds_min", "TurnoutTimeSeconds_mean",
    #               "TurnoutTimeSeconds_max", "TravelTimeSeconds_min", "TravelTimeSeconds_mean", "TravelTimeSeconds_max", "NumPumpsAttending"]]

    # plt.boxplot([df.TurnoutTimeSeconds_min, df.TurnoutTimeSeconds_mean, df.TurnoutTimeSeconds_max])
    # plt.show()
    # plt.boxplot([df.TravelTimeSeconds_min, df.TravelTimeSeconds_mean, df.TravelTimeSeconds_max])
    # plt.show()
    # plt.figure(figsize=(5, 10))
    # plt.boxplot([df.PumpSecondsOnSite_min, df.PumpSecondsOnSite_mean, df.PumpSecondsOnSite_max])
    # plt.show()

    # Auto ML
    df_limited = df[df.CalYear > 6]
    print(df_limited.shape)
    # Crée X sans les target
    cols_to_remove = (
        [item for sublist in cols_cible for item in sublist]
        + cols_cible_minutes[index]
        + ["NumPumpsAttending"]
    )
    print("cols_to_remove", cols_to_remove)
    X = df_limited.drop(cols_to_remove, axis=1)
    # Ne conserve que certaines colonnes explicatives
    # Crée une liste des colonnes à conserver
    cols_to_keep = [
        col
        for col in X.columns
        if any(substring in col for substring in cols_Data[index])
    ]
    print("cols_to_keep", cols_to_keep)
    # Conserver uniquement ces colonnes
    X = X[cols_to_keep]

    display(X.head(2))
    print(X.shape)

    for index_cible, col_cible in enumerate(cols_cible[index]):
        if not (col_cible in cols_cible_filter):
            continue
        minute_col = cols_cible_minutes[index][index_cible]
        print(col_cible, ">>", minute_col)
        y = df_limited[minute_col]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        myCLASSIFIERS = CLASSIFIERS
        print(myCLASSIFIERS)
        # myCLASSIFIERS = myCLASSIFIERS[4:6]
        reg = LazyClassifier(
            verbose=2,
            ignore_warnings=False,
            custom_metric=None,
            classifiers=myCLASSIFIERS,
        )
        models, predictions = reg.fit(X_train, X_test, y_train, y_test)
        # Ajoute le nom de la colonne
        models["Target"] = minute_col
        # Concaténe dans les résultats
        all_results = pd.concat([all_results, models], axis=0)
        display(models)
        models.to_csv(f"../data/_autoML_classifier_{minute_col}.csv", sep=";", index=True)
        # break
    # break

all_results.to_csv(f"../data/_autoML_classifier.csv", sep=";", index=True)

# tester automl avant et après PCA et aussi gridsearch pour comparer
# save du PCA

[    -60      60     120     180     300     480     600     900 1200000]
[1, 2, 3, 5, 8, 10, 15, 30]
TurnoutTimeMinutes_min TurnoutTimeSeconds_min
cols_cible [['TurnoutTimeSeconds_min', 'TurnoutTimeSeconds_mean', 'TurnoutTimeSeconds_max'], ['TravelTimeSeconds_min', 'TravelTimeSeconds_mean', 'TravelTimeSeconds_max'], ['PumpSecondsOnSite_min', 'PumpSecondsOnSite_mean', 'PumpSecondsOnSite_max']]
cols_cible_minutes [['TurnoutTimeMinutes_min', 'TurnoutTimeSeconds_mean', 'TurnoutTimeSeconds_max'], ['TravelTimeSeconds_min', 'TravelTimeSeconds_mean', 'TravelTimeSeconds_max'], ['PumpSecondsOnSite_min', 'PumpSecondsOnSite_mean', 'PumpSecondsOnSite_max']]
[    -60      60     120     180     300     480     600     900 1200000]
[1, 2, 3, 5, 8, 10, 15, 30]
TurnoutTimeMinutes_mean TurnoutTimeSeconds_mean
cols_cible [['TurnoutTimeSeconds_min', 'TurnoutTimeSeconds_mean', 'TurnoutTimeSeconds_max'], ['TravelTimeSeconds_min', 'TravelTimeSeconds_mean', 'TravelTimeSeconds_max'], ['PumpSecondsOnSite_min',

Unnamed: 0,CalYear,HourOfCall_0,HourOfCall_1,HourOfCall_2,HourOfCall_3,HourOfCall_4,PropertyType_0,PropertyType_1,PropertyType_2,PropertyType_3,PropertyType_4,PropertyType_5,PropertyType_6,PropertyType_7,PropertyType_8,Postcode_district_0,Postcode_district_1,Postcode_district_2,Postcode_district_3,Postcode_district_4,NumPumpsAttending,StopCode_0,StopCode_1,StopCode_2,StopCode_3,StopCode_4,Month_0,Month_1,Month_2,Month_3,DayOfWeek_0,DayOfWeek_1,DayOfWeek_2,PumpSecondsOnSite_min,PumpSecondsOnSite_mean,PumpSecondsOnSite_max,TurnoutTimeSeconds_min,TurnoutTimeSeconds_mean,TurnoutTimeSeconds_max,TravelTimeSeconds_min,TravelTimeSeconds_mean,TravelTimeSeconds_max,TurnoutTimeMinutes_min,TurnoutTimeMinutes_mean,TurnoutTimeMinutes_max
0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,2.0,0,0,0,0,1,0,0,0,1,0,0,1,240.0,390.0,540.0,253.0,253.0,253.0,89.0,89.0,89.0,5,5,5
1,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1.0,0,0,0,1,0,0,0,0,1,0,0,1,420.0,420.0,420.0,151.0,151.0,151.0,157.0,157.0,157.0,3,3,3
2,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,1,1.0,0,0,0,1,0,0,0,0,1,0,0,1,720.0,720.0,720.0,108.0,108.0,108.0,102.0,102.0,102.0,2,2,2
3,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,2.0,0,0,0,1,0,0,0,0,1,0,0,1,120.0,120.0,120.0,114.0,128.0,142.0,108.0,113.5,119.0,2,3,3
4,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,1,2.0,0,0,0,1,1,0,0,0,1,0,0,1,360.0,360.0,360.0,83.0,89.0,95.0,89.0,108.0,127.0,2,2,2
5,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1.0,0,0,0,0,1,0,0,0,1,0,0,1,420.0,420.0,420.0,119.0,119.0,119.0,403.0,403.0,403.0,2,2,2
6,1,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,1,2.0,0,0,0,1,0,0,0,0,1,0,0,1,1440.0,1440.0,1440.0,178.0,180.5,183.0,164.0,165.0,166.0,3,5,5
7,1,0,0,0,0,1,0,0,0,0,0,0,1,1,1,0,0,0,0,1,1.0,0,0,0,1,0,0,0,0,1,0,0,1,420.0,420.0,420.0,121.0,121.0,121.0,134.0,134.0,134.0,3,3,3
8,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1.0,0,0,0,1,1,0,0,0,1,0,0,1,780.0,780.0,780.0,110.0,110.0,110.0,187.0,187.0,187.0,2,2,2
9,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,1.0,0,0,1,0,0,0,0,0,1,0,0,1,600.0,600.0,600.0,129.0,129.0,129.0,567.0,567.0,567.0,3,3,3


(1016665, 45)
cols_to_remove ['TurnoutTimeSeconds_min', 'TurnoutTimeSeconds_mean', 'TurnoutTimeSeconds_max', 'TravelTimeSeconds_min', 'TravelTimeSeconds_mean', 'TravelTimeSeconds_max', 'PumpSecondsOnSite_min', 'PumpSecondsOnSite_mean', 'PumpSecondsOnSite_max', 'TurnoutTimeMinutes_min', 'TurnoutTimeMinutes_mean', 'TurnoutTimeMinutes_max', 'NumPumpsAttending']
cols_to_keep ['CalYear', 'HourOfCall_0', 'HourOfCall_1', 'HourOfCall_2', 'HourOfCall_3', 'HourOfCall_4', 'Postcode_district_0', 'Postcode_district_1', 'Postcode_district_2', 'Postcode_district_3', 'Postcode_district_4', 'Month_0', 'Month_1', 'Month_2', 'Month_3', 'DayOfWeek_0', 'DayOfWeek_1', 'DayOfWeek_2']


Unnamed: 0,CalYear,HourOfCall_0,HourOfCall_1,HourOfCall_2,HourOfCall_3,HourOfCall_4,Postcode_district_0,Postcode_district_1,Postcode_district_2,Postcode_district_3,Postcode_district_4,Month_0,Month_1,Month_2,Month_3,DayOfWeek_0,DayOfWeek_1,DayOfWeek_2
574862,7,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1
574863,7,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1


(1016665, 18)
TurnoutTimeSeconds_mean >> TurnoutTimeMinutes_mean
[('AdaBoostClassifier', <class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>), ('BaggingClassifier', <class 'sklearn.ensemble._bagging.BaggingClassifier'>), ('BernoulliNB', <class 'sklearn.naive_bayes.BernoulliNB'>), ('DecisionTreeClassifier', <class 'sklearn.tree._classes.DecisionTreeClassifier'>), ('DummyClassifier', <class 'sklearn.dummy.DummyClassifier'>), ('ExtraTreeClassifier', <class 'sklearn.tree._classes.ExtraTreeClassifier'>), ('ExtraTreesClassifier', <class 'sklearn.ensemble._forest.ExtraTreesClassifier'>), ('GaussianNB', <class 'sklearn.naive_bayes.GaussianNB'>), ('KNeighborsClassifier', <class 'sklearn.neighbors._classification.KNeighborsClassifier'>), ('LinearDiscriminantAnalysis', <class 'sklearn.discriminant_analysis.LinearDiscriminantAnalysis'>), ('LogisticRegression', <class 'sklearn.linear_model._logistic.LogisticRegression'>), ('MLPClassifier', <class 'sklearn.neural_network._multilayer_perce

  3%|▎         | 1/31 [00:26<13:26, 26.89s/it]

ROC AUC couldn't be calculated for AdaBoostClassifier
multi_class must be in ('ovo', 'ovr')
{'Model': 'AdaBoostClassifier', 'Accuracy': 0.5668091259165998, 'Balanced Accuracy': np.float64(0.12257026673399839), 'ROC AUC': None, 'F1 Score': 0.4385301390884828, 'Time taken': 26.89079737663269}


  6%|▋         | 2/31 [00:48<11:27, 23.71s/it]

ROC AUC couldn't be calculated for BaggingClassifier
multi_class must be in ('ovo', 'ovr')
{'Model': 'BaggingClassifier', 'Accuracy': 0.5751156969109785, 'Balanced Accuracy': np.float64(0.1363400634772959), 'ROC AUC': None, 'F1 Score': 0.5205440772916469, 'Time taken': 21.488060474395752}


 10%|▉         | 3/31 [00:49<06:14, 13.37s/it]

ROC AUC couldn't be calculated for BernoulliNB
multi_class must be in ('ovo', 'ovr')
{'Model': 'BernoulliNB', 'Accuracy': 0.5845780075049304, 'Balanced Accuracy': np.float64(0.125), 'ROC AUC': None, 'F1 Score': 0.43132170866932984, 'Time taken': 1.0546727180480957}


 13%|█▎        | 4/31 [00:52<04:07,  9.17s/it]

ROC AUC couldn't be calculated for DecisionTreeClassifier
multi_class must be in ('ovo', 'ovr')
{'Model': 'DecisionTreeClassifier', 'Accuracy': 0.5755337303831646, 'Balanced Accuracy': np.float64(0.1362583914763559), 'ROC AUC': None, 'F1 Score': 0.522021274599274, 'Time taken': 2.7248427867889404}


 16%|█▌        | 5/31 [00:52<02:39,  6.13s/it]

ROC AUC couldn't be calculated for DummyClassifier
multi_class must be in ('ovo', 'ovr')
{'Model': 'DummyClassifier', 'Accuracy': 0.5845780075049304, 'Balanced Accuracy': np.float64(0.125), 'ROC AUC': None, 'F1 Score': 0.43132170866932984, 'Time taken': 0.7378199100494385}


 19%|█▉        | 6/31 [00:54<01:53,  4.55s/it]

ROC AUC couldn't be calculated for ExtraTreeClassifier
multi_class must be in ('ovo', 'ovr')
{'Model': 'ExtraTreeClassifier', 'Accuracy': 0.5755337303831646, 'Balanced Accuracy': np.float64(0.1362583914763559), 'ROC AUC': None, 'F1 Score': 0.522021274599274, 'Time taken': 1.4903266429901123}
