In [8]:
import pandas as pd

# Lift pandas' display limits so that rendered frames show every row and column.
for _display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(_display_option, None)



In [9]:
# ------------------------- Incidents -----------------------------

def load_incidents(data_dir="../data"):
    """Load and concatenate the two LFB incident extracts.

    Reads the "2009 - 2017" and the "2018 onwards" incident CSV files from
    ``data_dir``, stacks them into one frame with a fresh 0..n-1 index and
    removes a trailing ".00" from string values of ``IncidentNumber``.
    The shapes of both inputs and of the result are printed.

    Parameters
    ----------
    data_dir : str or path-like, default "../data"
        Directory containing the two CSV files.

    Returns
    -------
    pandas.DataFrame
        The concatenated incidents.
    """
    from pathlib import Path

    data_dir = Path(data_dir)
    # Load the two datasets
    df1 = pd.read_csv(
        data_dir / "LFB Incident data from 2009 - 2017.csv", sep=",", low_memory=False
    )
    df2 = pd.read_csv(
        data_dir / "LFB Incident data from 2018 onwards.csv", sep=",", low_memory=False
    )
    # Concatenate and rebuild a continuous index
    incident_df = pd.concat([df1, df2], axis=0).reset_index(drop=True)

    def _strip_trailing_00(value):
        # Only strings can carry the textual ".00" suffix. Non-string values
        # (an int when one extract is purely numeric, or NaN) are returned
        # untouched instead of raising TypeError on the `in` test.
        if isinstance(value, str) and value.endswith(".00"):
            return value[:-3]
        return value

    # Remove the ".00" suffix present on some "IncidentNumber" values
    incident_df["IncidentNumber"] = incident_df["IncidentNumber"].map(_strip_trailing_00)
    print(df1.shape, df2.shape, incident_df.shape)
    return incident_df

def TreatText(s):
    """Normalise a text value: upper-case it, then strip surrounding whitespace.

    Values that ``pd.isna`` reports as missing are returned unchanged.
    """
    if pd.isna(s):
        return s
    return s.upper().strip()

def preprocess_incidents_types(incident_df):
    """Parse the date/time columns and normalise the text columns of the incidents.

    ``DateOfCall`` is parsed with the "%d-%b-%y" format and ``TimeOfCall`` with
    "%H:%M:%S"; the listed text columns are passed through ``TreatText``
    (upper-case + strip, missing values kept). The frame is modified in place
    and also returned. A preview and the frame summary are shown as a side effect.

    Parameters
    ----------
    incident_df : pandas.DataFrame
        Raw incidents, as returned by ``load_incidents``.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with converted columns.
    """
    # Date formatting
    incident_df["DateOfCall"] = pd.to_datetime(incident_df["DateOfCall"], format="%d-%b-%y")
    # Only a time of day is parsed here, so the resulting timestamps carry the
    # placeholder date 1900-01-01 (as seen in the rendered preview).
    incident_df["TimeOfCall"] = pd.to_datetime(incident_df["TimeOfCall"], format="%H:%M:%S")

    # Text formatting
    text_columns = [
        "StopCodeDescription",
        "SpecialServiceType",
        "PropertyType",
        "AddressQualifier",
        "Postcode_district",
        "IncGeo_BoroughCode",
        "IncGeo_WardCode",
        "FirstPumpArriving_DeployedFromStation",
    ]
    for column in text_columns:
        incident_df[column] = incident_df[column].apply(TreatText)

    display(incident_df.head())
    # DataFrame.info() prints its report itself and returns None; wrapping it
    # in display() only added a stray "None" to the cell output.
    incident_df.info()
    return incident_df

def preprocess_incidents_StopCodeDescription(incident_df):
    """Build the ``StopCode`` column from the stop code and the special service type.

    * Rows whose ``StopCodeDescription`` equals "SPECIAL SERVICE" get
      "SST-" + ``SpecialServiceType``; if that type is missing, ``StopCode``
      is left missing (the previous row-wise version raised TypeError).
    * Every other row keeps its ``StopCodeDescription``.
    * "AFA" and the two "FALSE ALARM - ..." codes are then merged into "ALARM".

    The frame is modified in place and returned; a preview is displayed.

    Parameters
    ----------
    incident_df : pandas.DataFrame
        Incidents with upper-cased ``StopCodeDescription`` and
        ``SpecialServiceType`` columns.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with the added ``StopCode`` column.
    """
    # Merge SpecialService into the stop code
    special_service = "Special Service".upper().strip()
    description = incident_df["StopCodeDescription"]
    is_special = description == special_service
    # Vectorised concatenation: missing types propagate as missing values.
    # astype(object) keeps this valid even when the column holds only NaN.
    detailed = "SST-" + incident_df["SpecialServiceType"].astype(object)
    incident_df["StopCode"] = description.mask(is_special, detailed)

    # In the stop code, group all the false alarms under a single "ALARM" value
    incident_df["StopCode"] = incident_df["StopCode"].replace(
        to_replace=[
            "AFA",
            "False alarm - Good intent".upper(),
            "False alarm - Malicious".upper(),
        ],
        value="ALARM",
    )
    display(incident_df.head())
    return incident_df

def preprocess_incidents_DateOfCall(incident_df):
    """Add the ``Month`` and ``DayOfWeek`` columns derived from ``DateOfCall``.

    ``DateOfCall`` must already be a datetime column. The frame is modified in
    place and returned.
    """
    call_date = incident_df["DateOfCall"].dt
    incident_df["Month"] = call_date.month
    # pandas numbers weekdays from 0 (Monday); shift so the week runs 1..7
    incident_df["DayOfWeek"] = call_date.weekday + 1
    return incident_df

def preprocess_incidents_drop_cols(incident_df):
    """Return a copy of the incidents without the columns that are not kept.

    The removed columns are those considered useless, redundant from a
    business point of view, or not available at prediction time. The input
    frame is left untouched; a KeyError is raised by pandas if one of the
    listed columns is absent.
    """
    unwanted_columns = [
        "DateOfCall",
        "TimeOfCall",
        "IncidentGroup",
        "StopCodeDescription",
        "SpecialServiceType",
        "PropertyCategory",
        "AddressQualifier",
        # Geography
        "Postcode_full",
        "UPRN",
        "USRN",
        "IncGeo_BoroughCode",
        "IncGeo_BoroughName",
        "ProperCase",
        "IncGeo_WardCode",
        "IncGeo_WardName",
        "IncGeo_WardNameNew",
        "Easting_m",
        "Northing_m",
        "Easting_rounded",
        "Northing_rounded",
        "Latitude",
        "Longitude",
        # Stations, pumps and cost
        "FRS",
        "IncidentStationGround",
        "FirstPumpArriving_AttendanceTime",
        "FirstPumpArriving_DeployedFromStation",
        "SecondPumpArriving_AttendanceTime",
        "SecondPumpArriving_DeployedFromStation",
        "NumStationsWithPumpsAttending",
        "PumpCount",
        "PumpMinutesRounded",
        "Notional Cost (£)",
        "NumCalls",
    ]
    return incident_df.drop(columns=unwanted_columns)

In [10]:
# Incident pipeline: load the raw extracts, normalise the types, build the
# StopCode, derive the calendar features, then drop the columns not kept.
incident_df = (
    load_incidents()
    .pipe(preprocess_incidents_types)
    .pipe(preprocess_incidents_StopCodeDescription)
    .pipe(preprocess_incidents_DateOfCall)
    .pipe(preprocess_incidents_drop_cols)
)

# Save an intermediate file
incident_df.to_csv("../data/PreIncidents.csv", sep=";", index=False)

(988279, 39) (783134, 39) (1771413, 39)


Unnamed: 0,IncidentNumber,DateOfCall,CalYear,TimeOfCall,HourOfCall,IncidentGroup,StopCodeDescription,SpecialServiceType,PropertyCategory,PropertyType,AddressQualifier,Postcode_full,Postcode_district,UPRN,USRN,IncGeo_BoroughCode,IncGeo_BoroughName,ProperCase,IncGeo_WardCode,IncGeo_WardName,IncGeo_WardNameNew,Easting_m,Northing_m,Easting_rounded,Northing_rounded,Latitude,Longitude,FRS,IncidentStationGround,FirstPumpArriving_AttendanceTime,FirstPumpArriving_DeployedFromStation,SecondPumpArriving_AttendanceTime,SecondPumpArriving_DeployedFromStation,NumStationsWithPumpsAttending,NumPumpsAttending,PumpCount,PumpMinutesRounded,Notional Cost (£),NumCalls
0,235138081,2009-01-01,2009,1900-01-01 00:00:37,0,Special Service,SPECIAL SERVICE,RTC,Road Vehicle,CAR,IN STREET CLOSE TO GAZETTEER LOCATION,SW11 4LB,SW11,,,E09000032,WANDSWORTH,Wandsworth,E05014010,Battersea Park,Battersea Park,528652.0,176830.0,528650,176850,51.475812,-0.148894,London,Battersea,319.0,BATTERSEA,342.0,Clapham,2.0,2.0,2,60,255,1.0
1,1091,2009-01-01,2009,1900-01-01 00:00:46,0,Special Service,SPECIAL SERVICE,ASSIST OTHER AGENCIES,Outdoor,LAKE/POND/RESERVOIR,OPEN LAND/WATER - NEAREST GAZETTEER LOCATION,SE1 7SG,SE1,,,E09000022,LAMBETH,Lambeth,E05014118,Waterloo & South Bank,Waterloo & South Bank,530485.0,179007.0,530450,179050,51.494957,-0.121712,London,Lambeth,,,,,,,1,60,255,1.0
2,2091,2009-01-01,2009,1900-01-01 00:03:00,0,Fire,SECONDARY FIRE,,Outdoor,ROAD SURFACE/PAVEMENT,IN STREET OUTSIDE GAZETTEER LOCATION,N9 9EL,N9,,,E09000010,ENFIELD,Enfield,E05013682,Haselbury,Haselbury,533773.0,194492.0,533750,194450,51.633342,-0.068488,London,Edmonton,308.0,EDMONTON,,,1.0,1.0,1,60,255,2.0
3,3091,2009-01-01,2009,1900-01-01 00:04:27,0,Fire,SECONDARY FIRE,,Outdoor,DOMESTIC GARDEN (VEGETATION NOT EQUIPMENT),ON LAND ASSOCIATED WITH BUILDING,UB10 0DG,UB10,100021500000.0,21401491.0,E09000017,HILLINGDON,Hillingdon,E05013571,Hillingdon East,Hillingdon East,507738.0,182805.0,507750,182850,51.533882,-0.448089,London,Hillingdon,210.0,HILLINGDON,,,1.0,1.0,1,60,255,2.0
4,5091,2009-01-01,2009,1900-01-01 00:05:39,0,Fire,SECONDARY FIRE,,Outdoor,CYCLE PATH/PUBLIC FOOTPATH/BRIDLEWAY,IN STREET OUTSIDE GAZETTEER LOCATION,N7 8HG,N7,,,E09000019,ISLINGTON,Islington,E05013708,Laycock,Laycock,531058.0,185307.0,531050,185350,51.551441,-0.11112,London,Holloway,233.0,HOLLOWAY,250.0,Holloway,1.0,2.0,2,60,255,1.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1771413 entries, 0 to 1771412
Data columns (total 39 columns):
 #   Column                                  Dtype         
---  ------                                  -----         
 0   IncidentNumber                          object        
 1   DateOfCall                              datetime64[ns]
 2   CalYear                                 int64         
 3   TimeOfCall                              datetime64[ns]
 4   HourOfCall                              int64         
 5   IncidentGroup                           object        
 6   StopCodeDescription                     object        
 7   SpecialServiceType                      object        
 8   PropertyCategory                        object        
 9   PropertyType                            object        
 10  AddressQualifier                        object        
 11  Postcode_full                           object        
 12  Postcode_district                       ob

None

Unnamed: 0,IncidentNumber,DateOfCall,CalYear,TimeOfCall,HourOfCall,IncidentGroup,StopCodeDescription,SpecialServiceType,PropertyCategory,PropertyType,AddressQualifier,Postcode_full,Postcode_district,UPRN,USRN,IncGeo_BoroughCode,IncGeo_BoroughName,ProperCase,IncGeo_WardCode,IncGeo_WardName,IncGeo_WardNameNew,Easting_m,Northing_m,Easting_rounded,Northing_rounded,Latitude,Longitude,FRS,IncidentStationGround,FirstPumpArriving_AttendanceTime,FirstPumpArriving_DeployedFromStation,SecondPumpArriving_AttendanceTime,SecondPumpArriving_DeployedFromStation,NumStationsWithPumpsAttending,NumPumpsAttending,PumpCount,PumpMinutesRounded,Notional Cost (£),NumCalls,StopCode
0,235138081,2009-01-01,2009,1900-01-01 00:00:37,0,Special Service,SPECIAL SERVICE,RTC,Road Vehicle,CAR,IN STREET CLOSE TO GAZETTEER LOCATION,SW11 4LB,SW11,,,E09000032,WANDSWORTH,Wandsworth,E05014010,Battersea Park,Battersea Park,528652.0,176830.0,528650,176850,51.475812,-0.148894,London,Battersea,319.0,BATTERSEA,342.0,Clapham,2.0,2.0,2,60,255,1.0,SST-RTC
1,1091,2009-01-01,2009,1900-01-01 00:00:46,0,Special Service,SPECIAL SERVICE,ASSIST OTHER AGENCIES,Outdoor,LAKE/POND/RESERVOIR,OPEN LAND/WATER - NEAREST GAZETTEER LOCATION,SE1 7SG,SE1,,,E09000022,LAMBETH,Lambeth,E05014118,Waterloo & South Bank,Waterloo & South Bank,530485.0,179007.0,530450,179050,51.494957,-0.121712,London,Lambeth,,,,,,,1,60,255,1.0,SST-ASSIST OTHER AGENCIES
2,2091,2009-01-01,2009,1900-01-01 00:03:00,0,Fire,SECONDARY FIRE,,Outdoor,ROAD SURFACE/PAVEMENT,IN STREET OUTSIDE GAZETTEER LOCATION,N9 9EL,N9,,,E09000010,ENFIELD,Enfield,E05013682,Haselbury,Haselbury,533773.0,194492.0,533750,194450,51.633342,-0.068488,London,Edmonton,308.0,EDMONTON,,,1.0,1.0,1,60,255,2.0,SECONDARY FIRE
3,3091,2009-01-01,2009,1900-01-01 00:04:27,0,Fire,SECONDARY FIRE,,Outdoor,DOMESTIC GARDEN (VEGETATION NOT EQUIPMENT),ON LAND ASSOCIATED WITH BUILDING,UB10 0DG,UB10,100021500000.0,21401491.0,E09000017,HILLINGDON,Hillingdon,E05013571,Hillingdon East,Hillingdon East,507738.0,182805.0,507750,182850,51.533882,-0.448089,London,Hillingdon,210.0,HILLINGDON,,,1.0,1.0,1,60,255,2.0,SECONDARY FIRE
4,5091,2009-01-01,2009,1900-01-01 00:05:39,0,Fire,SECONDARY FIRE,,Outdoor,CYCLE PATH/PUBLIC FOOTPATH/BRIDLEWAY,IN STREET OUTSIDE GAZETTEER LOCATION,N7 8HG,N7,,,E09000019,ISLINGTON,Islington,E05013708,Laycock,Laycock,531058.0,185307.0,531050,185350,51.551441,-0.11112,London,Holloway,233.0,HOLLOWAY,250.0,Holloway,1.0,2.0,2,60,255,1.0,SECONDARY FIRE


In [11]:
# ------------------------- Mobilisations -----------------------------

def load_mobilisations(data_dir="../data"):
    """Load and concatenate the three LFB mobilisation extracts.

    The "January 2009 - 2014" and "2015 - 2020" files are read with ";" as
    separator, the "2021 - 2024" file with ","; the ``BoroughName`` and
    ``WardName`` columns of that last file are skipped. The three frames are
    stacked and the index is rebuilt as 0..n-1.

    Parameters
    ----------
    data_dir : str or path-like, default "../data"
        Directory containing the three CSV files.

    Returns
    -------
    pandas.DataFrame
        The concatenated mobilisations.
    """
    from pathlib import Path

    data_dir = Path(data_dir)
    # Load the three datasets
    df1 = pd.read_csv(
        data_dir / "LFB Mobilisation data from January 2009 - 2014.csv",
        sep=";",
        low_memory=False,
    )
    df2 = pd.read_csv(
        data_dir / "LFB Mobilisation data from 2015 - 2020.csv",
        sep=";",
        low_memory=False,
    )
    df3 = pd.read_csv(
        data_dir / "LFB Mobilisation data from 2021 - 2024.csv",
        sep=",",
        low_memory=False,
        # Skip the two columns that are excluded from this extract
        usecols=lambda column: column not in ["BoroughName", "WardName"],
    )
    # Concatenate and rebuild a continuous index
    mobilisations_df = pd.concat([df1, df2, df3], axis=0)
    mobilisations_df = mobilisations_df.reset_index(drop=True)
    return mobilisations_df


def preprocess_mobilisations_types(mobilisations_df):
    """Convert the five ``DateAndTime*`` columns of the mobilisations to datetimes.

    Each column is parsed with the "%d/%m/%Y %H:%M" format (minute
    resolution). The frame is modified in place and returned; a preview and
    the frame summary are shown as a side effect.

    Parameters
    ----------
    mobilisations_df : pandas.DataFrame
        Raw mobilisations, as returned by ``load_mobilisations``.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with datetime columns.
    """
    # Date formatting
    datetime_columns = [
        "DateAndTimeMobilised",
        "DateAndTimeMobile",
        "DateAndTimeArrived",
        "DateAndTimeLeft",
        "DateAndTimeReturned",
    ]
    for column in datetime_columns:
        mobilisations_df[column] = pd.to_datetime(
            mobilisations_df[column], format="%d/%m/%Y %H:%M"
        )

    display(mobilisations_df.head())
    # DataFrame.info() prints its report itself and returns None; wrapping it
    # in display() only added a stray "None" to the cell output.
    mobilisations_df.info()
    return mobilisations_df

def preprocess_mobilisations_times(mobilisations_df):
    """Compute on-site durations and attach per-incident time statistics.

    Steps:

    1. ``PumpOnSiteSeconds`` = ``DateAndTimeLeft`` - ``DateAndTimeArrived``,
       expressed as the total number of seconds (added in place to the input).
    2. Per ``IncidentNumber``, the min / median / max of ``PumpOnSiteSeconds``,
       ``TurnoutTimeSeconds`` and ``TravelTimeSeconds`` are computed.
    3. These statistics are joined back onto every mobilisation row and the
       rows are sorted by ``IncidentNumber`` then ``PumpOrder``.

    Intermediate previews and the final shape are displayed/printed.

    NOTE(review): the ``*_mean`` columns are computed with "median", not the
    arithmetic mean. The names are kept because later steps select them by
    name; confirm which statistic is actually intended.

    Parameters
    ----------
    mobilisations_df : pandas.DataFrame
        Mobilisations with datetime ``DateAndTimeArrived`` /
        ``DateAndTimeLeft`` columns (one row per incident and pump).

    Returns
    -------
    pandas.DataFrame
        A new, sorted frame with the nine aggregated columns added.
    """
    # On-site time of the pump, in seconds, as the delta between departure and
    # arrival. total_seconds() is required here: `.dt.seconds` only returns the
    # seconds *component* of the timedelta (0..86399), so stays of 24 hours or
    # more wrapped around and negative deltas came out as large positive values.
    on_site = mobilisations_df["DateAndTimeLeft"] - mobilisations_df["DateAndTimeArrived"]
    mobilisations_df["PumpOnSiteSeconds"] = on_site.dt.total_seconds()
    display(mobilisations_df.head(3))

    # There is one row per incident and per pump (with an order number), so the
    # rows are grouped by incident. Since the goal is to predict durations, the
    # min, max and central value of each duration are computed per incident.
    # First-station and pump-count information is taken from the incidents data.
    aggregated = (
        mobilisations_df.groupby("IncidentNumber")
        .agg(
            PumpSecondsOnSite_min=("PumpOnSiteSeconds", "min"),
            PumpSecondsOnSite_mean=("PumpOnSiteSeconds", "median"),
            PumpSecondsOnSite_max=("PumpOnSiteSeconds", "max"),
            TurnoutTimeSeconds_min=("TurnoutTimeSeconds", "min"),
            TurnoutTimeSeconds_mean=("TurnoutTimeSeconds", "median"),
            TurnoutTimeSeconds_max=("TurnoutTimeSeconds", "max"),
            TravelTimeSeconds_min=("TravelTimeSeconds", "min"),
            TravelTimeSeconds_mean=("TravelTimeSeconds", "median"),
            TravelTimeSeconds_max=("TravelTimeSeconds", "max"),
        )
        .reset_index()
    )
    display(aggregated.head())

    # Join the aggregate back onto the mobilisations to add the time columns
    mobilisations_df = pd.merge(
        mobilisations_df, aggregated, on="IncidentNumber", how="left"
    ).sort_values(by=["IncidentNumber", "PumpOrder"])
    display(mobilisations_df.head(10))
    print(mobilisations_df.shape)
    return mobilisations_df

def preprocess_mobilisations_duplicates(mobilisations_df):
    """Keep a single row per incident: the first one in the current row order.

    In this notebook the function runs right after
    ``preprocess_mobilisations_times``, which sorts the rows by
    ``IncidentNumber`` then ``PumpOrder``; with any other ordering a different
    row of each incident would survive. The input frame is not modified.

    Parameters
    ----------
    mobilisations_df : pandas.DataFrame
        Mobilisations with an ``IncidentNumber`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with one row per ``IncidentNumber``.
    """
    # The previous version also computed `IncidentNumber.duplicated().sum()`
    # and discarded the result; that no-op statement has been removed.
    return mobilisations_df.drop_duplicates(subset=["IncidentNumber"], keep="first")

def preprocess_mobilisations_drop_cols(mobilisations_df):
    """Return a copy of the mobilisations without the columns that are not kept.

    The removed columns are those considered useless, redundant from a
    business point of view, or not available at prediction time. The input
    frame is left untouched; a KeyError is raised by pandas if one of the
    listed columns is absent.
    """
    unwanted_columns = [
        "CalYear",
        "HourOfCall",
        "ResourceMobilisationId",
        "Resource_Code",
        "PerformanceReporting",
        "DateAndTimeMobilised",
        "DateAndTimeMobile",
        "DateAndTimeArrived",
        "TurnoutTimeSeconds",
        "TravelTimeSeconds",
        "PumpOnSiteSeconds",
        "AttendanceTimeSeconds",
        "DateAndTimeLeft",
        "DateAndTimeReturned",
        "DeployedFromStation_Code",
        "DeployedFromStation_Name",
        "DeployedFromLocation",
        "PumpOrder",
        "PlusCode_Code",
        "PlusCode_Description",
        "DelayCodeId",
        "DelayCode_Description",
    ]
    return mobilisations_df.drop(columns=unwanted_columns)

In [12]:
# Mobilisation pipeline: load the raw extracts, parse the dates, compute the
# per-incident durations, keep one row per incident, drop the unused columns.
mobilisations_df = (
    load_mobilisations()
    .pipe(preprocess_mobilisations_types)
    .pipe(preprocess_mobilisations_times)
    .pipe(preprocess_mobilisations_duplicates)
    .pipe(preprocess_mobilisations_drop_cols)
)

# Save an intermediate file
mobilisations_df.to_csv("../data/PreMobilisations.csv", sep=";", index=False)

Unnamed: 0,IncidentNumber,CalYear,HourOfCall,ResourceMobilisationId,Resource_Code,PerformanceReporting,DateAndTimeMobilised,DateAndTimeMobile,DateAndTimeArrived,TurnoutTimeSeconds,TravelTimeSeconds,AttendanceTimeSeconds,DateAndTimeLeft,DateAndTimeReturned,DeployedFromStation_Code,DeployedFromStation_Name,DeployedFromLocation,PumpOrder,PlusCode_Code,PlusCode_Description,DelayCodeId,DelayCode_Description
0,235138081,2009,0,38426,H271,1,2009-01-01 00:02:00,NaT,2009-01-01 00:07:00,,,319,2009-01-01 00:16:00,2009-01-01 00:23:00,H27,Battersea,Home Station,1,Initial,Initial Mobilisation,,
1,235138081,2009,0,38427,H212,2,2009-01-01 00:02:00,2009-01-01 00:06:00,2009-01-01 00:08:00,253.0,89.0,342,2009-01-01 00:12:00,2009-01-01 00:12:00,H21,Clapham,Home Station,2,Initial,Initial Mobilisation,,
2,2091,2009,0,38429,A341,1,2009-01-01 00:04:00,2009-01-01 00:06:00,2009-01-01 00:09:00,151.0,157.0,308,2009-01-01 00:16:00,2009-01-01 00:17:00,A34,Edmonton,Home Station,1,Initial,Initial Mobilisation,,
3,3091,2009,0,38430,G232,1,2009-01-01 00:04:00,2009-01-01 00:06:00,2009-01-01 00:08:00,108.0,102.0,210,2009-01-01 00:20:00,2009-01-01 00:20:00,G23,Hillingdon,Home Station,1,Initial,Initial Mobilisation,,
4,5091,2009,0,38432,A311,1,2009-01-01 00:06:00,2009-01-01 00:07:00,2009-01-01 00:09:00,114.0,119.0,233,2009-01-01 00:11:00,2009-01-01 00:22:00,A31,Holloway,Home Station,1,Initial,Initial Mobilisation,,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2480004 entries, 0 to 2480003
Data columns (total 22 columns):
 #   Column                    Dtype         
---  ------                    -----         
 0   IncidentNumber            object        
 1   CalYear                   int64         
 2   HourOfCall                int64         
 3   ResourceMobilisationId    int64         
 4   Resource_Code             object        
 5   PerformanceReporting      object        
 6   DateAndTimeMobilised      datetime64[ns]
 7   DateAndTimeMobile         datetime64[ns]
 8   DateAndTimeArrived        datetime64[ns]
 9   TurnoutTimeSeconds        float64       
 10  TravelTimeSeconds         float64       
 11  AttendanceTimeSeconds     int64         
 12  DateAndTimeLeft           datetime64[ns]
 13  DateAndTimeReturned       datetime64[ns]
 14  DeployedFromStation_Code  object        
 15  DeployedFromStation_Name  object        
 16  DeployedFromLocation      object        
 17  PumpOrde

None

Unnamed: 0,IncidentNumber,CalYear,HourOfCall,ResourceMobilisationId,Resource_Code,PerformanceReporting,DateAndTimeMobilised,DateAndTimeMobile,DateAndTimeArrived,TurnoutTimeSeconds,TravelTimeSeconds,AttendanceTimeSeconds,DateAndTimeLeft,DateAndTimeReturned,DeployedFromStation_Code,DeployedFromStation_Name,DeployedFromLocation,PumpOrder,PlusCode_Code,PlusCode_Description,DelayCodeId,DelayCode_Description,PumpOnSiteSeconds
0,235138081,2009,0,38426,H271,1,2009-01-01 00:02:00,NaT,2009-01-01 00:07:00,,,319,2009-01-01 00:16:00,2009-01-01 00:23:00,H27,Battersea,Home Station,1,Initial,Initial Mobilisation,,,540.0
1,235138081,2009,0,38427,H212,2,2009-01-01 00:02:00,2009-01-01 00:06:00,2009-01-01 00:08:00,253.0,89.0,342,2009-01-01 00:12:00,2009-01-01 00:12:00,H21,Clapham,Home Station,2,Initial,Initial Mobilisation,,,240.0
2,2091,2009,0,38429,A341,1,2009-01-01 00:04:00,2009-01-01 00:06:00,2009-01-01 00:09:00,151.0,157.0,308,2009-01-01 00:16:00,2009-01-01 00:17:00,A34,Edmonton,Home Station,1,Initial,Initial Mobilisation,,,420.0


Unnamed: 0,IncidentNumber,PumpSecondsOnSite_min,PumpSecondsOnSite_mean,PumpSecondsOnSite_max,TurnoutTimeSeconds_min,TurnoutTimeSeconds_mean,TurnoutTimeSeconds_max,TravelTimeSeconds_min,TravelTimeSeconds_mean,TravelTimeSeconds_max
0,1092,,,,128.0,128.0,128.0,390.0,390.0,390.0
1,1111,120.0,120.0,120.0,71.0,71.0,71.0,480.0,480.0,480.0
2,1112,480.0,480.0,480.0,119.0,119.0,119.0,225.0,225.0,225.0
3,1121,240.0,300.0,360.0,65.0,66.0,67.0,192.0,213.5,235.0
4,1131,360.0,360.0,360.0,89.0,89.0,89.0,78.0,78.0,78.0


Unnamed: 0,IncidentNumber,CalYear,HourOfCall,ResourceMobilisationId,Resource_Code,PerformanceReporting,DateAndTimeMobilised,DateAndTimeMobile,DateAndTimeArrived,TurnoutTimeSeconds,TravelTimeSeconds,AttendanceTimeSeconds,DateAndTimeLeft,DateAndTimeReturned,DeployedFromStation_Code,DeployedFromStation_Name,DeployedFromLocation,PumpOrder,PlusCode_Code,PlusCode_Description,DelayCodeId,DelayCode_Description,PumpOnSiteSeconds,PumpSecondsOnSite_min,PumpSecondsOnSite_mean,PumpSecondsOnSite_max,TurnoutTimeSeconds_min,TurnoutTimeSeconds_mean,TurnoutTimeSeconds_max,TravelTimeSeconds_min,TravelTimeSeconds_mean,TravelTimeSeconds_max
25229,1092,2009,10,72977,G221,1,2009-02-23 10:02:00,2009-02-23 10:04:00,2009-02-23 10:11:00,128.0,390.0,518,NaT,2009-02-23 10:22:00,G22,Stanmore,Home Station,1,Initial,Initial Mobilisation,12.0,Not held up,,,,,128.0,128.0,128.0,390.0,390.0,390.0
338999,1111,2011,0,3610263,A411,1,2011-01-01 00:03:00,2011-01-01 00:04:00,2011-01-01 00:12:00,71.0,480.0,551,2011-01-01 00:14:00,2011-01-01 00:20:00,A41,West Hampstead,Home Station,1,Initial,Initial Mobilisation,12.0,Not held up,120.0,120.0,120.0,120.0,71.0,71.0,71.0,480.0,480.0,480.0
346998,1112,2011,1,3621855,E331,1,2011-01-22 01:32:00,2011-01-22 01:34:00,2011-01-22 01:38:00,119.0,225.0,344,2011-01-22 01:46:00,2011-01-22 01:47:00,E33,Southwark,Home Station,1,Initial,Initial Mobilisation,,,480.0,480.0,480.0,480.0,119.0,119.0,119.0,225.0,225.0,225.0
492154,1121,2012,0,3831037,G401,1,2012-01-01 00:01:00,2012-01-01 00:02:00,2012-01-01 00:05:00,65.0,192.0,257,2012-01-01 00:11:00,2012-01-01 00:18:00,G40,Hayes,Home Station,1,Initial,Initial Mobilisation,,,360.0,240.0,300.0,360.0,65.0,66.0,67.0,192.0,213.5,235.0
492155,1121,2012,0,3831038,G402,2,2012-01-01 00:01:00,2012-01-01 00:02:00,2012-01-01 00:06:00,67.0,235.0,302,2012-01-01 00:10:00,2012-01-01 00:17:00,G40,Hayes,Home Station,2,Initial,Initial Mobilisation,,,240.0,240.0,300.0,360.0,65.0,66.0,67.0,192.0,213.5,235.0
637588,1131,2013,0,4036113,A412,1,2013-01-01 00:02:00,2013-01-01 00:03:00,2013-01-01 00:05:00,89.0,78.0,167,2013-01-01 00:11:00,2013-01-01 00:12:00,A41,West Hampstead,Home Station,1,Initial,Initial Mobilisation,,,360.0,360.0,360.0,360.0,89.0,89.0,89.0,78.0,78.0,78.0
733568,1133,2013,12,4179294,G281,1,2013-09-10 11:06:00,2013-09-10 11:06:00,2013-09-10 11:11:00,1.0,267.0,268,2013-09-10 11:14:00,2013-09-10 11:14:00,G28,Willesden,Home Station,1,Initial,Initial Mobilisation,,,180.0,180.0,180.0,180.0,1.0,1.0,1.0,267.0,267.0,267.0
880681,1143,2014,11,4407663,H431,1,2014-10-28 11:57:00,2014-10-28 11:58:00,2014-10-28 12:04:00,42.0,377.0,419,2014-10-28 12:08:00,NaT,H43,Twickenham,Home Station,1,Initial,Initial Mobilisation,9.0,"Traffic, roadworks, etc",240.0,240.0,240.0,240.0,42.0,42.0,42.0,377.0,377.0,377.0
2,2091,2009,0,38429,A341,1,2009-01-01 00:04:00,2009-01-01 00:06:00,2009-01-01 00:09:00,151.0,157.0,308,2009-01-01 00:16:00,2009-01-01 00:17:00,A34,Edmonton,Home Station,1,Initial,Initial Mobilisation,,,420.0,420.0,420.0,420.0,151.0,151.0,151.0,157.0,157.0,157.0
25230,2092,2009,10,72978,E241,1,2009-02-23 10:03:00,2009-02-23 10:04:00,2009-02-23 10:06:00,85.0,85.0,170,NaT,2009-02-23 10:17:00,E24,Woolwich,Home Station,1,Initial,Initial Mobilisation,,,,300.0,300.0,300.0,78.0,81.5,85.0,85.0,208.0,331.0


(2480004, 32)


In [13]:
# ------------------------- Merge of Incidents and Mobilisations -----------------------------

def load_mobilisations_and_incidents_and_merge(data_dir="../data"):
    """Load the two intermediate files and left-join mobilisations onto incidents.

    ``PreIncidents.csv`` and ``PreMobilisations.csv`` (";"-separated) are read
    from ``data_dir`` and merged on ``IncidentNumber`` with a left join, so
    every incident is kept even when it has no mobilisation row. Previews and
    summaries of the inputs and of the result are shown as a side effect.

    Parameters
    ----------
    data_dir : str or path-like, default "../data"
        Directory containing the two intermediate CSV files.

    Returns
    -------
    pandas.DataFrame
        Incidents with the mobilisation columns added (missing where no
        mobilisation matches), indexed 0..n-1.
    """
    from pathlib import Path

    data_dir = Path(data_dir)
    # The join key is read as text in both files. Left to type inference, a
    # file whose incident numbers are all numeric would be read as int64 while
    # the other stays object, and the merge would then fail on the dtype clash.
    key_dtype = {"IncidentNumber": str}

    # Load the intermediate datasets
    df_incidents = pd.read_csv(
        data_dir / "PreIncidents.csv", sep=";", low_memory=False, dtype=key_dtype
    )
    df_mobilisations = pd.read_csv(
        data_dir / "PreMobilisations.csv", sep=";", low_memory=False, dtype=key_dtype
    )
    # DataFrame.info() prints its report itself and returns None, so it is
    # called directly rather than through display() (which rendered "None").
    display(df_incidents.head())
    df_incidents.info()
    display(df_mobilisations.head())
    df_mobilisations.info()

    # Merge
    df = pd.merge(df_incidents, df_mobilisations, on="IncidentNumber", how="left")
    df = df.reset_index(drop=True)
    display(df.head())
    df.info()
    return df

# Missing-value handling
def preprocess_nan(df):
    """Drop every row that lacks one of the required incident or mobilisation values.

    A row is removed as soon as any of the columns listed below is missing;
    missing values in other columns are tolerated. A new frame is returned and
    the original index labels of the surviving rows are preserved.
    """
    # Incident columns in which only very few values are missing
    incident_required = ["PropertyType", "NumPumpsAttending"]
    # An incident does not always have a mobilisation row, and the mobilisation
    # data is sometimes only partially filled in: such rows are discarded too
    mobilisation_required = [
        "PumpSecondsOnSite_min",
        "TurnoutTimeSeconds_min",
        "TravelTimeSeconds_min",
    ]
    required_columns = incident_required + mobilisation_required
    return df.dropna(axis=0, how="any", subset=required_columns)

In [14]:
# Merge the two intermediate files, then discard the incomplete rows
df = load_mobilisations_and_incidents_and_merge().pipe(preprocess_nan)

# Save an intermediate file
df.to_csv("../data/PreProcess.csv", sep=";", index=False)

Unnamed: 0,IncidentNumber,CalYear,HourOfCall,PropertyType,Postcode_district,NumPumpsAttending,StopCode,Month,DayOfWeek
0,235138081,2009,0,CAR,SW11,2.0,SST-RTC,1,4
1,1091,2009,0,LAKE/POND/RESERVOIR,SE1,,SST-ASSIST OTHER AGENCIES,1,4
2,2091,2009,0,ROAD SURFACE/PAVEMENT,N9,1.0,SECONDARY FIRE,1,4
3,3091,2009,0,DOMESTIC GARDEN (VEGETATION NOT EQUIPMENT),UB10,1.0,SECONDARY FIRE,1,4
4,5091,2009,0,CYCLE PATH/PUBLIC FOOTPATH/BRIDLEWAY,N7,2.0,SECONDARY FIRE,1,4


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1771413 entries, 0 to 1771412
Data columns (total 9 columns):
 #   Column             Dtype  
---  ------             -----  
 0   IncidentNumber     object 
 1   CalYear            int64  
 2   HourOfCall         int64  
 3   PropertyType       object 
 4   Postcode_district  object 
 5   NumPumpsAttending  float64
 6   StopCode           object 
 7   Month              int64  
 8   DayOfWeek          int64  
dtypes: float64(1), int64(4), object(4)
memory usage: 121.6+ MB


None

Unnamed: 0,IncidentNumber,PumpSecondsOnSite_min,PumpSecondsOnSite_mean,PumpSecondsOnSite_max,TurnoutTimeSeconds_min,TurnoutTimeSeconds_mean,TurnoutTimeSeconds_max,TravelTimeSeconds_min,TravelTimeSeconds_mean,TravelTimeSeconds_max
0,1092,,,,128.0,128.0,128.0,390.0,390.0,390.0
1,1111,120.0,120.0,120.0,71.0,71.0,71.0,480.0,480.0,480.0
2,1112,480.0,480.0,480.0,119.0,119.0,119.0,225.0,225.0,225.0
3,1121,240.0,300.0,360.0,65.0,66.0,67.0,192.0,213.5,235.0
4,1131,360.0,360.0,360.0,89.0,89.0,89.0,78.0,78.0,78.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1642042 entries, 0 to 1642041
Data columns (total 10 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   IncidentNumber           1642042 non-null  object 
 1   PumpSecondsOnSite_min    1612710 non-null  float64
 2   PumpSecondsOnSite_mean   1612710 non-null  float64
 3   PumpSecondsOnSite_max    1612710 non-null  float64
 4   TurnoutTimeSeconds_min   1624689 non-null  float64
 5   TurnoutTimeSeconds_mean  1624689 non-null  float64
 6   TurnoutTimeSeconds_max   1624689 non-null  float64
 7   TravelTimeSeconds_min    1624692 non-null  float64
 8   TravelTimeSeconds_mean   1624692 non-null  float64
 9   TravelTimeSeconds_max    1624692 non-null  float64
dtypes: float64(9), object(1)
memory usage: 125.3+ MB


None

Unnamed: 0,IncidentNumber,CalYear,HourOfCall,PropertyType,Postcode_district,NumPumpsAttending,StopCode,Month,DayOfWeek,PumpSecondsOnSite_min,PumpSecondsOnSite_mean,PumpSecondsOnSite_max,TurnoutTimeSeconds_min,TurnoutTimeSeconds_mean,TurnoutTimeSeconds_max,TravelTimeSeconds_min,TravelTimeSeconds_mean,TravelTimeSeconds_max
0,235138081,2009,0,CAR,SW11,2.0,SST-RTC,1,4,240.0,390.0,540.0,253.0,253.0,253.0,89.0,89.0,89.0
1,1091,2009,0,LAKE/POND/RESERVOIR,SE1,,SST-ASSIST OTHER AGENCIES,1,4,,,,,,,,,
2,2091,2009,0,ROAD SURFACE/PAVEMENT,N9,1.0,SECONDARY FIRE,1,4,420.0,420.0,420.0,151.0,151.0,151.0,157.0,157.0,157.0
3,3091,2009,0,DOMESTIC GARDEN (VEGETATION NOT EQUIPMENT),UB10,1.0,SECONDARY FIRE,1,4,720.0,720.0,720.0,108.0,108.0,108.0,102.0,102.0,102.0
4,5091,2009,0,CYCLE PATH/PUBLIC FOOTPATH/BRIDLEWAY,N7,2.0,SECONDARY FIRE,1,4,120.0,120.0,120.0,114.0,128.0,142.0,108.0,113.5,119.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1771413 entries, 0 to 1771412
Data columns (total 18 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   IncidentNumber           object 
 1   CalYear                  int64  
 2   HourOfCall               int64  
 3   PropertyType             object 
 4   Postcode_district        object 
 5   NumPumpsAttending        float64
 6   StopCode                 object 
 7   Month                    int64  
 8   DayOfWeek                int64  
 9   PumpSecondsOnSite_min    float64
 10  PumpSecondsOnSite_mean   float64
 11  PumpSecondsOnSite_max    float64
 12  TurnoutTimeSeconds_min   float64
 13  TurnoutTimeSeconds_mean  float64
 14  TurnoutTimeSeconds_max   float64
 15  TravelTimeSeconds_min    float64
 16  TravelTimeSeconds_mean   float64
 17  TravelTimeSeconds_max    float64
dtypes: float64(10), int64(4), object(4)
memory usage: 243.3+ MB


None

In [None]:
import pandas as pd

# Lift pandas' display limits so that the lookup tables are rendered in full.
for _display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(_display_option, None)

df = pd.read_csv("../data/PreProcess.csv", sep=";", low_memory=False)


def _export_distinct_values(frame, column, csv_path):
    """Display and save the sorted distinct values of ``column``.

    The values are written to ``csv_path`` as a one-column, ";"-separated CSV
    without the index; the one-column frame is returned.
    """
    distinct = pd.DataFrame(frame[column].unique()).sort_values(by=0)
    display(distinct)
    distinct.to_csv(csv_path, sep=";", index=False)
    return distinct


# One lookup table per categorical column of the preprocessed data
postcode_district = _export_distinct_values(
    df, "Postcode_district", "../data/postcode_district.csv"
)
property_type = _export_distinct_values(df, "PropertyType", "../data/property_type.csv")
stop_code = _export_distinct_values(df, "StopCode", "../data/stop_code.csv")