In [17]:
# ------------------------- Préparation des informations de réduction des valeurs catégorielles -----------------------------

import pandas as pd
import statsmodels.api
from statsmodels.api import stats
import statsmodels.formula.api as smf

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [18]:
def load_incidents():
    df = pd.read_csv("../data/PreProcess.csv", sep=";", low_memory=False)
    display(df.head())
    display(df.info())
    return df


In [19]:
# Traitmeent des catégories StopCode, PropertyType, Postcode_district
# Pour conserver les valeurs catégorielles en fonction d'un 
# Test de Anova, pour chacune des cibles

cols_cible_type = ["PumpSecondsOnSite", "TurnoutTimeSeconds", "TravelTimeSeconds"]

def calc_anova_for_all_targets(df):
    # Catégories
    cols_categ = [
        "StopCode",
        "PropertyType",
        "Postcode_district",
    ]
    # Cibles à caluler
    cols_cible = [
        ["PumpSecondsOnSite_min", "PumpSecondsOnSite_mean", "PumpSecondsOnSite_max"],
        ["TurnoutTimeSeconds_min", "TurnoutTimeSeconds_mean", "TurnoutTimeSeconds_max"],
        ["TravelTimeSeconds_min", "TravelTimeSeconds_mean", "TravelTimeSeconds_max"],
    ]

    # Nb de valeurs actuelles par catégories
    for col in cols_categ:
        print(col, len(df[col].unique()))
    print()

    # Test de Anova sur les données après 2016, pour réduire le temps de calcul
    df_temp = df[df.CalYear.astype("int") > 2016]

    # Test de Anova sur chaque colonne cible avec toutes les colonnes catégorielles
    all_result = []
    # Formatage des colonnes catégorielles pour le test de Anova
    cols_to_check = "+".join(cols_categ)
    # Pour chaque tableau, par type de cible 
    # (exemple PumpSecondsOnSite, calcul pour min, mean, max)
    for cols in cols_cible:
        cible_results = []
        # Pour chaque valeur à calculer (min, max, mean)
        for col in cols:
            print("Test de", col)
            # Test de Anova pour une cible, par exemple PumpSecondsOnSite_min
            results = smf.ols(f"{col} ~ {cols_to_check}", data=df_temp).fit()
            # table = stats.anova_lm(result)
            # display(table)
            # Création d'un Dataframe du résultat avec le coefficient et la pvalue
            results_df = pd.DataFrame(
                {
                    f"{col} coefficients": round(results.params, 3),
                    f"{col} pvalues": round(results.pvalues, 3),
                }
            )
            # Ajoute le résultat au tableau par type de cible
            cible_results.append(results_df)
            # display(results_df.head(30))
            # print("Test de", col, "ok")
        # Ajoute le résultat par type de cible aux résultats globaux
        all_result.append(cible_results)

    # Contrôle
    for cible_results in all_result:
        for result in cible_results:
            display(result.head(5))

    # Merge les resultats dans un dataFrame, en gardant un Dataframe par type de cible
    # et en mettant les résultats pvalue et codef par min, mean et max en colonne
    compiled_results = []
    for cible_results in all_result:
        cible_result = cible_results[0]
        i = 0
        for result in cible_results:
            if i == 0:
                print("continue")
                i += 1
                continue
            print("merge")
            cible_result = pd.merge(
                cible_result, result, left_index=True, right_index=True, how="outer"
            )
        compiled_results.append(cible_result)

    return compiled_results



# Ajoute 2 colonnes Discord et Keep dans les DF pour finaliser le choix 
# des valeurs catégorielles à conserver 
def calculate_keeps_by_target(compiled_results):
    print("Calcule Discords")

    # Regarde si la pvalue est la même par valeur catégorielle pour min, max et mean 
    # Ajout de la colonne Discord : True si il y a des delta
    # Filtre les lignes avec que des pvalue > 0.05 ou que que des pvalue < 0.05
    # pour voir si certaines interviennent que dans certains cas
    # Utiliser uniquement pour contrôle 
    def TreatDiscordRow(columns, row):
        keeps = []
        nb = 0
        for i in range(1, len(columns), 2):
            # print("col", columns[i])
            # print(row)
            # Crée un tablea à 3 bool qui indique si <= 0.05 ou non 
            keeps.append(row[columns[i]] <= 0.05)
            nb += 1
        # Somme des 3 bool, si le résultat donne 3 ou 0 alors les 3 pvalue
        # sont uniformes, sinon non, donc Discord
        val = not (sum(keeps) == 0) | (sum(keeps) == nb)  # sum(keeps)
        # print(val)
        return val

    # Calcul final pour savoir si la valeur caétorielle est conservée pour 
    # le type de cible. Oui si toutes les pvalues sont >= 0.05 ou si 
    # meam et max le sont
    def TreatKeepRow(columns, row):
        keeps = []
        nb = 0
        for i in range(1, len(columns), 2):
            # print("col", columns[i])
            # print(row[columns[i]])
            keeps.append(row[columns[i]] <= 0.05)
            nb += 1
        val = sum(keeps)
        # si toutes les pvalues sont non significatives
        if val == 0:
            return False
        # si toutes les pvalues sont significatives
        elif val == nb:
            return True
        # sinon, conserve si c'est significatif sur la moyenne ou max
        else:
            mean_col = [s for s in columns if "mean pvalues" in s][0]
            max_col = [s for s in columns if "max pvalues" in s][0]            
            mean = row[mean_col]
            max = row[max_col]
            keep = True if (mean <= 0.05) | (max <= 0.05) else False
            print(mean_col, mean, max_col, max, keep)
            # print("mean_col", mean_col)
            # print("mean_col", val)
            return keep

    # Pour chaque type de cible (ex : PumpSecondsOnSite) on a un DF avec les 
    # pvalue par cible min, max, mean et une ligne par valezur de catégorie
    for cible_results in compiled_results:
        # display(cible_results.head())
        # Ajout d'une colonne Discord qui indique si les pvalue sont toutes
        # <0.05 ou > 0.05, donc un Keep évident
        cible_results["Discord"] = cible_results.apply(
            lambda row: TreatDiscordRow(cible_results.columns, row), axis=1
        )
        # Calcul si on garde ou non la veleur catégorielle pour un type de cible
        cible_results["Keep"] = cible_results.apply(
            lambda row: TreatKeepRow(cible_results.columns, row), axis=1
        )
        # Contrôle
        display(cible_results[cible_results.Discord == True].head())
    # Résumé des données conserver par type de cible
    for i in range(0, len(compiled_results)):
        print(
            f"{cols_cible_type[i]} {compiled_results[i]["Discord"].sum()} discords, {compiled_results[i]["Keep"].sum()} keeps, sur {len(compiled_results[i])}, "
            f"conserve {round(compiled_results[i]["Keep"].sum() / len(compiled_results[i]) * 100, 0)}%"
        )
    return compiled_results

In [20]:
def save_categorical_reduction(compiled_results):
    def SplitCatValue(s, return_name):
        result = s.split("[")
        return result[0] if return_name else result[1][2:-1]

    # Sauve les infos des valeurs catégorielles 
    for i in range(0, len(compiled_results)):
        # Conserve uniquement la valeur Keep, qui est le résultat final
        # L'index étant la valeur catégorielle
        df_to_save = compiled_results[i][["Keep"]]
        # supprime la ligne Intercept
        df_to_save = df_to_save.drop(
            df_to_save[df_to_save.index.str.upper() == "INTERCEPT"].index
        )
        # Les valeurs de df_keep ont le format
        #           AddressQualifier[T.IN STREET CLOSE TO GAZETTEER LOCATION]	True (colonne Keep)
        # Crée 2 colonnes, nom de colonne et valeur
        # Prend l'index
        df_to_save["Column"] = df_to_save.index
        # Split l'index et retourne la 1ere partie qui est le nom de la catégorie
        df_to_save["Column"] = df_to_save["Column"].apply(lambda s: SplitCatValue(s, True))
        df_to_save["Value"] = df_to_save.index
        # Split l'index et retourne la 2eme partie qui est la valeur catégorielle
        df_to_save["Value"] = df_to_save["Value"].apply(lambda s: SplitCatValue(s, False))
        display(df_to_save.head())
        # Sauvegarde le DF dans un fichier
        name = f"keep {cols_cible_type[i]}"
        print(name)
        df_to_save.to_csv(f"../data/{name}.csv", sep=";", index=False)

In [21]:
df = load_incidents()
compiled_results = calc_anova_for_all_targets(df)
compiled_results = calculate_keeps_by_target(compiled_results)
save_categorical_reduction(compiled_results)
# 10 minutes

Unnamed: 0,IncidentNumber,CalYear,HourOfCall,PropertyType,Postcode_district,FirstPumpArriving_DeployedFromStation,NumPumpsAttending,StopCode,Month,DayOfWeek,PumpSecondsOnSite_min,PumpSecondsOnSite_mean,PumpSecondsOnSite_max,TurnoutTimeSeconds_min,TurnoutTimeSeconds_mean,TurnoutTimeSeconds_max,TravelTimeSeconds_min,TravelTimeSeconds_mean,TravelTimeSeconds_max
0,235138081,2009,0,CAR,SW11,BATTERSEA,2.0,SST-RTC,1,4,240.0,390.0,540.0,253.0,253.0,253.0,89.0,89.0,89.0
1,2091,2009,0,ROAD SURFACE/PAVEMENT,N9,EDMONTON,1.0,SECONDARY FIRE,1,4,420.0,420.0,420.0,151.0,151.0,151.0,157.0,157.0,157.0
2,3091,2009,0,DOMESTIC GARDEN (VEGETATION NOT EQUIPMENT),UB10,HILLINGDON,1.0,SECONDARY FIRE,1,4,720.0,720.0,720.0,108.0,108.0,108.0,102.0,102.0,102.0
3,5091,2009,0,CYCLE PATH/PUBLIC FOOTPATH/BRIDLEWAY,N7,HOLLOWAY,2.0,SECONDARY FIRE,1,4,120.0,120.0,120.0,114.0,128.0,142.0,108.0,113.5,119.0
4,6091,2009,0,PURPOSE BUILT FLATS/MAISONETTES - UP TO 3 STOREYS,NW5,KENTISH TOWN,2.0,AFA,1,4,360.0,360.0,360.0,83.0,89.0,95.0,89.0,108.0,127.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1591519 entries, 0 to 1591518
Data columns (total 19 columns):
 #   Column                                 Non-Null Count    Dtype  
---  ------                                 --------------    -----  
 0   IncidentNumber                         1591519 non-null  object 
 1   CalYear                                1591519 non-null  int64  
 2   HourOfCall                             1591519 non-null  int64  
 3   PropertyType                           1591519 non-null  object 
 4   Postcode_district                      1591519 non-null  object 
 5   FirstPumpArriving_DeployedFromStation  1591519 non-null  object 
 6   NumPumpsAttending                      1591519 non-null  float64
 7   StopCode                               1591519 non-null  object 
 8   Month                                  1591519 non-null  int64  
 9   DayOfWeek                              1591519 non-null  int64  
 10  PumpSecondsOnSite_min                  159

None

StopCode 28
PropertyType 292
Postcode_district 324

Test de PumpSecondsOnSite_min
Test de PumpSecondsOnSite_mean
Test de PumpSecondsOnSite_max
Test de TurnoutTimeSeconds_min
Test de TurnoutTimeSeconds_mean
Test de TurnoutTimeSeconds_max
Test de TravelTimeSeconds_min
Test de TravelTimeSeconds_mean
Test de TravelTimeSeconds_max


Unnamed: 0,PumpSecondsOnSite_min coefficients,PumpSecondsOnSite_min pvalues
Intercept,720.31,0.245
StopCode[T.CHIMNEY FIRE],2909.027,0.0
StopCode[T.FALSE ALARM - GOOD INTENT],168.508,0.0
StopCode[T.FALSE ALARM - MALICIOUS],107.859,0.0
StopCode[T.LATE CALL],1278.401,0.0


Unnamed: 0,PumpSecondsOnSite_mean coefficients,PumpSecondsOnSite_mean pvalues
Intercept,539.713,0.405
StopCode[T.CHIMNEY FIRE],3075.949,0.0
StopCode[T.FALSE ALARM - GOOD INTENT],209.92,0.0
StopCode[T.FALSE ALARM - MALICIOUS],122.294,0.0
StopCode[T.LATE CALL],1189.866,0.0


Unnamed: 0,PumpSecondsOnSite_max coefficients,PumpSecondsOnSite_max pvalues
Intercept,212.292,0.771
StopCode[T.CHIMNEY FIRE],3220.292,0.0
StopCode[T.FALSE ALARM - GOOD INTENT],254.143,0.0
StopCode[T.FALSE ALARM - MALICIOUS],136.048,0.0
StopCode[T.LATE CALL],1086.063,0.0


Unnamed: 0,TurnoutTimeSeconds_min coefficients,TurnoutTimeSeconds_min pvalues
Intercept,86.776,0.0
StopCode[T.CHIMNEY FIRE],2.49,0.402
StopCode[T.FALSE ALARM - GOOD INTENT],1.652,0.0
StopCode[T.FALSE ALARM - MALICIOUS],3.136,0.0
StopCode[T.LATE CALL],16.542,0.003


Unnamed: 0,TurnoutTimeSeconds_mean coefficients,TurnoutTimeSeconds_mean pvalues
Intercept,103.41,0.0
StopCode[T.CHIMNEY FIRE],1.262,0.676
StopCode[T.FALSE ALARM - GOOD INTENT],1.369,0.0
StopCode[T.FALSE ALARM - MALICIOUS],2.909,0.0
StopCode[T.LATE CALL],5.942,0.299


Unnamed: 0,TurnoutTimeSeconds_max coefficients,TurnoutTimeSeconds_max pvalues
Intercept,121.902,0.0
StopCode[T.CHIMNEY FIRE],-0.46,0.898
StopCode[T.FALSE ALARM - GOOD INTENT],1.235,0.0
StopCode[T.FALSE ALARM - MALICIOUS],2.864,0.0
StopCode[T.LATE CALL],-5.344,0.433


Unnamed: 0,TravelTimeSeconds_min coefficients,TravelTimeSeconds_min pvalues
Intercept,445.025,0.0
StopCode[T.CHIMNEY FIRE],2.059,0.838
StopCode[T.FALSE ALARM - GOOD INTENT],12.353,0.0
StopCode[T.FALSE ALARM - MALICIOUS],4.146,0.002
StopCode[T.LATE CALL],38.344,0.045


Unnamed: 0,TravelTimeSeconds_mean coefficients,TravelTimeSeconds_mean pvalues
Intercept,499.387,0.0
StopCode[T.CHIMNEY FIRE],-2.072,0.842
StopCode[T.FALSE ALARM - GOOD INTENT],14.355,0.0
StopCode[T.FALSE ALARM - MALICIOUS],5.931,0.0
StopCode[T.LATE CALL],-2.839,0.885


Unnamed: 0,TravelTimeSeconds_max coefficients,TravelTimeSeconds_max pvalues
Intercept,547.525,0.0
StopCode[T.CHIMNEY FIRE],-6.108,0.615
StopCode[T.FALSE ALARM - GOOD INTENT],17.2,0.0
StopCode[T.FALSE ALARM - MALICIOUS],8.682,0.0
StopCode[T.LATE CALL],-44.424,0.054


continue
merge
merge
continue
merge
merge
continue
merge
merge
Calcule Discords
PumpSecondsOnSite_mean pvalues 0.079 PumpSecondsOnSite_max pvalues 0.14 False
PumpSecondsOnSite_mean pvalues 0.142 PumpSecondsOnSite_max pvalues 0.753 False
PumpSecondsOnSite_mean pvalues 0.055 PumpSecondsOnSite_max pvalues 0.298 False
PumpSecondsOnSite_mean pvalues 0.263 PumpSecondsOnSite_max pvalues 0.016 True
PumpSecondsOnSite_mean pvalues 0.305 PumpSecondsOnSite_max pvalues 0.924 False
PumpSecondsOnSite_mean pvalues 0.127 PumpSecondsOnSite_max pvalues 0.672 False
PumpSecondsOnSite_mean pvalues 0.275 PumpSecondsOnSite_max pvalues 0.973 False
PumpSecondsOnSite_mean pvalues 0.175 PumpSecondsOnSite_max pvalues 0.699 False
PumpSecondsOnSite_mean pvalues 0.138 PumpSecondsOnSite_max pvalues 0.588 False
PumpSecondsOnSite_mean pvalues 0.293 PumpSecondsOnSite_max pvalues 0.962 False
PumpSecondsOnSite_mean pvalues 0.058 PumpSecondsOnSite_max pvalues 0.191 False
PumpSecondsOnSite_mean pvalues 0.048 PumpSecondsOnSit

Unnamed: 0,PumpSecondsOnSite_min coefficients,PumpSecondsOnSite_min pvalues,PumpSecondsOnSite_mean coefficients,PumpSecondsOnSite_mean pvalues,PumpSecondsOnSite_max coefficients,PumpSecondsOnSite_max pvalues,Discord,Keep
Postcode_district[T.KT22],1421.2,0.036,1246.963,0.079,1178.194,0.14,True,False
PropertyType[T.AIRFIELD/RUNWAY],-1064.185,0.007,-606.963,0.142,146.913,0.753,True,False
PropertyType[T.BARBECUE],-948.195,0.007,-706.137,0.055,-430.398,0.298,True,False
PropertyType[T.BARGE],-170.091,0.635,418.967,0.263,1012.678,0.016,True,True
PropertyType[T.BEACH],-816.967,0.049,-445.139,0.305,-46.733,0.924,True,False


TurnoutTimeSeconds_mean pvalues 0.015 TurnoutTimeSeconds_max pvalues 0.001 True
TurnoutTimeSeconds_mean pvalues 0.003 TurnoutTimeSeconds_max pvalues 0.001 True
TurnoutTimeSeconds_mean pvalues 0.057 TurnoutTimeSeconds_max pvalues 0.026 True
TurnoutTimeSeconds_mean pvalues 0.004 TurnoutTimeSeconds_max pvalues 0.0 True
TurnoutTimeSeconds_mean pvalues 0.002 TurnoutTimeSeconds_max pvalues 0.0 True
TurnoutTimeSeconds_mean pvalues 0.001 TurnoutTimeSeconds_max pvalues 0.0 True
TurnoutTimeSeconds_mean pvalues 0.009 TurnoutTimeSeconds_max pvalues 0.001 True
TurnoutTimeSeconds_mean pvalues 0.003 TurnoutTimeSeconds_max pvalues 0.0 True
TurnoutTimeSeconds_mean pvalues 0.048 TurnoutTimeSeconds_max pvalues 0.011 True
TurnoutTimeSeconds_mean pvalues 0.006 TurnoutTimeSeconds_max pvalues 0.001 True
TurnoutTimeSeconds_mean pvalues 0.055 TurnoutTimeSeconds_max pvalues 0.011 True
TurnoutTimeSeconds_mean pvalues 0.035 TurnoutTimeSeconds_max pvalues 0.005 True
TurnoutTimeSeconds_mean pvalues 0.003 TurnoutTim

Unnamed: 0,TurnoutTimeSeconds_min coefficients,TurnoutTimeSeconds_min pvalues,TurnoutTimeSeconds_mean coefficients,TurnoutTimeSeconds_mean pvalues,TurnoutTimeSeconds_max coefficients,TurnoutTimeSeconds_max pvalues,Discord,Keep
PropertyType[T.AIRPORT - HANGAR],-9.466,0.372,-26.232,0.015,-42.42,0.001,True,True
PropertyType[T.AIRPORT BUILDING (NOT TERMINAL OR HANGAR)],-18.43,0.067,-30.268,0.003,-40.649,0.001,True,True
PropertyType[T.ANIMAL BOARDING/BREEDING ESTABLISHMENT - CATS],-17.759,0.25,-29.904,0.057,-41.776,0.026,True,True
PropertyType[T.ANIMAL PRODUCTS PROCESSING PLANT],-13.426,0.211,-31.173,0.004,-48.819,0.0,True,True
PropertyType[T.ART GALLERY],-16.059,0.108,-31.125,0.002,-45.886,0.0,True,True


TravelTimeSeconds_mean pvalues 0.035 TravelTimeSeconds_max pvalues 0.039 True
TravelTimeSeconds_mean pvalues 0.028 TravelTimeSeconds_max pvalues 0.026 True
TravelTimeSeconds_mean pvalues 0.023 TravelTimeSeconds_max pvalues 0.028 True
TravelTimeSeconds_mean pvalues 0.029 TravelTimeSeconds_max pvalues 0.074 True
TravelTimeSeconds_mean pvalues 0.11 TravelTimeSeconds_max pvalues 0.332 False
TravelTimeSeconds_mean pvalues 0.039 TravelTimeSeconds_max pvalues 0.036 True
TravelTimeSeconds_mean pvalues 0.033 TravelTimeSeconds_max pvalues 0.049 True
TravelTimeSeconds_mean pvalues 0.956 TravelTimeSeconds_max pvalues 0.03 True
TravelTimeSeconds_mean pvalues 0.046 TravelTimeSeconds_max pvalues 0.06 True
TravelTimeSeconds_mean pvalues 0.037 TravelTimeSeconds_max pvalues 0.043 True
TravelTimeSeconds_mean pvalues 0.024 TravelTimeSeconds_max pvalues 0.029 True
TravelTimeSeconds_mean pvalues 0.03 TravelTimeSeconds_max pvalues 0.037 True
TravelTimeSeconds_mean pvalues 0.048 TravelTimeSeconds_max pvalues 

Unnamed: 0,TravelTimeSeconds_min coefficients,TravelTimeSeconds_min pvalues,TravelTimeSeconds_mean coefficients,TravelTimeSeconds_mean pvalues,TravelTimeSeconds_max coefficients,TravelTimeSeconds_max pvalues,Discord,Keep
Postcode_district[T.BR2],-85.331,0.084,-107.061,0.035,-122.666,0.039,True,True
Postcode_district[T.BR5],-86.223,0.081,-112.04,0.028,-132.425,0.026,True,True
Postcode_district[T.BR6],-95.624,0.053,-115.992,0.023,-130.986,0.028,True,True
Postcode_district[T.CM13],151.751,0.015,140.744,0.029,134.428,0.074,True,True
Postcode_district[T.CM14],115.225,0.027,85.604,0.11,60.684,0.332,True,False


PumpSecondsOnSite 90 discords, 87 keeps, sur 618, conserve 14.0%
TurnoutTimeSeconds 142 discords, 288 keeps, sur 618, conserve 47.0%
TravelTimeSeconds 149 discords, 432 keeps, sur 618, conserve 70.0%


Unnamed: 0,Keep,Column,Value
Postcode_district[T.BR1],False,Postcode_district,BR1
Postcode_district[T.BR2],False,Postcode_district,BR2
Postcode_district[T.BR3],False,Postcode_district,BR3
Postcode_district[T.BR4],False,Postcode_district,BR4
Postcode_district[T.BR5],False,Postcode_district,BR5


keep PumpSecondsOnSite


Unnamed: 0,Keep,Column,Value
Postcode_district[T.BR1],False,Postcode_district,BR1
Postcode_district[T.BR2],False,Postcode_district,BR2
Postcode_district[T.BR3],False,Postcode_district,BR3
Postcode_district[T.BR4],False,Postcode_district,BR4
Postcode_district[T.BR5],False,Postcode_district,BR5


keep TurnoutTimeSeconds


Unnamed: 0,Keep,Column,Value
Postcode_district[T.BR1],True,Postcode_district,BR1
Postcode_district[T.BR2],True,Postcode_district,BR2
Postcode_district[T.BR3],True,Postcode_district,BR3
Postcode_district[T.BR4],False,Postcode_district,BR4
Postcode_district[T.BR5],True,Postcode_district,BR5


keep TravelTimeSeconds
