In [2]:
# -----------------------------Exploration, mise en forme et concaténation des fichiers incidents 

# Importation de la bibliothèque Pandas pour la manipulation des données
import pandas as pd

# Lift pandas' display limits so every row and every column is rendered
for option_name in ("display.max_rows", "display.max_columns"):
    pd.set_option(option_name, None)

# Load the London Fire Brigade (LFB) incident exports.
# Both files are read with the same parser options.
csv_options = {"sep": ",", "low_memory": False}
df1 = pd.read_csv("../data/LFB Incident data from 2009 - 2017.csv", **csv_options)
df2 = pd.read_csv("../data/LFB Incident data from 2018 onwards.csv", **csv_options)

In [3]:
# Summarise each DataFrame (column dtypes and non-null counts).
# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in display() only adds a stray "None" to the output.
df1.info()
df2.info()

# List the columns of each DataFrame so both schemas can be compared
# before the two files are concatenated.
print(df1.columns)
print(df2.columns)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 988279 entries, 0 to 988278
Data columns (total 39 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   IncidentNumber                          988279 non-null  object 
 1   DateOfCall                              988279 non-null  object 
 2   CalYear                                 988279 non-null  int64  
 3   TimeOfCall                              988279 non-null  object 
 4   HourOfCall                              988279 non-null  int64  
 5   IncidentGroup                           988279 non-null  object 
 6   StopCodeDescription                     988279 non-null  object 
 7   SpecialServiceType                      299101 non-null  object 
 8   PropertyCategory                        988279 non-null  object 
 9   PropertyType                            988279 non-null  object 
 10  AddressQualifier                        9882

None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 783134 entries, 0 to 783133
Data columns (total 39 columns):
 #   Column                                  Non-Null Count   Dtype  
---  ------                                  --------------   -----  
 0   IncidentNumber                          783134 non-null  object 
 1   DateOfCall                              783134 non-null  object 
 2   CalYear                                 783134 non-null  int64  
 3   TimeOfCall                              783134 non-null  object 
 4   HourOfCall                              783134 non-null  int64  
 5   IncidentGroup                           783128 non-null  object 
 6   StopCodeDescription                     783134 non-null  object 
 7   SpecialServiceType                      276450 non-null  object 
 8   PropertyCategory                        783128 non-null  object 
 9   PropertyType                            783128 non-null  object 
 10  AddressQualifier                        7831

None

Index(['IncidentNumber', 'DateOfCall', 'CalYear', 'TimeOfCall', 'HourOfCall',
       'IncidentGroup', 'StopCodeDescription', 'SpecialServiceType',
       'PropertyCategory', 'PropertyType', 'AddressQualifier', 'Postcode_full',
       'Postcode_district', 'UPRN', 'USRN', 'IncGeo_BoroughCode',
       'IncGeo_BoroughName', 'ProperCase', 'IncGeo_WardCode',
       'IncGeo_WardName', 'IncGeo_WardNameNew', 'Easting_m', 'Northing_m',
       'Easting_rounded', 'Northing_rounded', 'Latitude', 'Longitude', 'FRS',
       'IncidentStationGround', 'FirstPumpArriving_AttendanceTime',
       'FirstPumpArriving_DeployedFromStation',
       'SecondPumpArriving_AttendanceTime',
       'SecondPumpArriving_DeployedFromStation',
       'NumStationsWithPumpsAttending', 'NumPumpsAttending', 'PumpCount',
       'PumpMinutesRounded', 'Notional Cost (£)', 'NumCalls'],
      dtype='object')
Index(['IncidentNumber', 'DateOfCall', 'CalYear', 'TimeOfCall', 'HourOfCall',
       'IncidentGroup', 'StopCodeDescription',

In [4]:
# Stack the two incident DataFrames vertically; ignore_index=True rebuilds a
# continuous 0..n-1 index in the same step.
df = pd.concat([df1, df2], axis=0, ignore_index=True)

# Show the size of the combined DataFrame and a preview of the first rows
print(df.shape)
display(df.head(2))

# Clean "IncidentNumber": strip a trailing ".00" from string values.
# Non-string values are passed through untouched instead of raising a
# TypeError on the substring test.
df["IncidentNumber"] = df["IncidentNumber"].map(
    lambda x: x[:-3] if isinstance(x, str) and x.endswith(".00") else x
)

# Parse the date and time columns into datetime64 values.
# NOTE: the time format carries no date component, so the parsed
# "TimeOfCall" timestamps all fall on the placeholder date 1900-01-01.
df["DateOfCall"] = pd.to_datetime(df["DateOfCall"], format="%d-%b-%y")
df["TimeOfCall"] = pd.to_datetime(df["TimeOfCall"], format="%H:%M:%S")

# Derive extra calendar features from the call date
# (DayOfWeek is numeric with Monday = 0; DayName is the English day name).
df["Month"] = df["DateOfCall"].dt.month
df["DayOfWeek"] = df["DateOfCall"].dt.dayofweek
df["DayName"] = df["DateOfCall"].dt.day_name()

# Preview the data after the transformations
display(df.head())
# General information (dtypes, missing values, ...). info() prints its own
# report and returns None, so it is not wrapped in display().
df.info()
# Descriptive statistics for the numeric and datetime columns
display(df.describe())

(1771413, 39)


Unnamed: 0,IncidentNumber,DateOfCall,CalYear,TimeOfCall,HourOfCall,IncidentGroup,StopCodeDescription,SpecialServiceType,PropertyCategory,PropertyType,AddressQualifier,Postcode_full,Postcode_district,UPRN,USRN,IncGeo_BoroughCode,IncGeo_BoroughName,ProperCase,IncGeo_WardCode,IncGeo_WardName,IncGeo_WardNameNew,Easting_m,Northing_m,Easting_rounded,Northing_rounded,Latitude,Longitude,FRS,IncidentStationGround,FirstPumpArriving_AttendanceTime,FirstPumpArriving_DeployedFromStation,SecondPumpArriving_AttendanceTime,SecondPumpArriving_DeployedFromStation,NumStationsWithPumpsAttending,NumPumpsAttending,PumpCount,PumpMinutesRounded,Notional Cost (£),NumCalls
0,235138081.0,01-Jan-09,2009,00:00:37,0,Special Service,Special Service,RTC,Road Vehicle,Car,In street close to gazetteer location,SW11 4LB,SW11,,,E09000032,WANDSWORTH,Wandsworth,E05014010,Battersea Park,Battersea Park,528652.0,176830.0,528650,176850,51.475812,-0.148894,London,Battersea,319.0,Battersea,342.0,Clapham,2.0,2.0,2,60,255,1.0
1,1091.0,01-Jan-09,2009,00:00:46,0,Special Service,Special Service,Assist other agencies,Outdoor,Lake/pond/reservoir,Open land/water - nearest gazetteer location,SE1 7SG,SE1,,,E09000022,LAMBETH,Lambeth,E05014118,Waterloo & South Bank,Waterloo & South Bank,530485.0,179007.0,530450,179050,51.494957,-0.121712,London,Lambeth,,,,,,,1,60,255,1.0


Unnamed: 0,IncidentNumber,DateOfCall,CalYear,TimeOfCall,HourOfCall,IncidentGroup,StopCodeDescription,SpecialServiceType,PropertyCategory,PropertyType,AddressQualifier,Postcode_full,Postcode_district,UPRN,USRN,IncGeo_BoroughCode,IncGeo_BoroughName,ProperCase,IncGeo_WardCode,IncGeo_WardName,IncGeo_WardNameNew,Easting_m,Northing_m,Easting_rounded,Northing_rounded,Latitude,Longitude,FRS,IncidentStationGround,FirstPumpArriving_AttendanceTime,FirstPumpArriving_DeployedFromStation,SecondPumpArriving_AttendanceTime,SecondPumpArriving_DeployedFromStation,NumStationsWithPumpsAttending,NumPumpsAttending,PumpCount,PumpMinutesRounded,Notional Cost (£),NumCalls,Month,DayOfWeek,DayName
0,235138081,2009-01-01,2009,1900-01-01 00:00:37,0,Special Service,Special Service,RTC,Road Vehicle,Car,In street close to gazetteer location,SW11 4LB,SW11,,,E09000032,WANDSWORTH,Wandsworth,E05014010,Battersea Park,Battersea Park,528652.0,176830.0,528650,176850,51.475812,-0.148894,London,Battersea,319.0,Battersea,342.0,Clapham,2.0,2.0,2,60,255,1.0,1,3,Thursday
1,1091,2009-01-01,2009,1900-01-01 00:00:46,0,Special Service,Special Service,Assist other agencies,Outdoor,Lake/pond/reservoir,Open land/water - nearest gazetteer location,SE1 7SG,SE1,,,E09000022,LAMBETH,Lambeth,E05014118,Waterloo & South Bank,Waterloo & South Bank,530485.0,179007.0,530450,179050,51.494957,-0.121712,London,Lambeth,,,,,,,1,60,255,1.0,1,3,Thursday
2,2091,2009-01-01,2009,1900-01-01 00:03:00,0,Fire,Secondary Fire,,Outdoor,Road surface/pavement,In street outside gazetteer location,N9 9EL,N9,,,E09000010,ENFIELD,Enfield,E05013682,Haselbury,Haselbury,533773.0,194492.0,533750,194450,51.633342,-0.068488,London,Edmonton,308.0,Edmonton,,,1.0,1.0,1,60,255,2.0,1,3,Thursday
3,3091,2009-01-01,2009,1900-01-01 00:04:27,0,Fire,Secondary Fire,,Outdoor,Domestic garden (vegetation not equipment),On land associated with building,UB10 0DG,UB10,100021500000.0,21401491.0,E09000017,HILLINGDON,Hillingdon,E05013571,Hillingdon East,Hillingdon East,507738.0,182805.0,507750,182850,51.533882,-0.448089,London,Hillingdon,210.0,Hillingdon,,,1.0,1.0,1,60,255,2.0,1,3,Thursday
4,5091,2009-01-01,2009,1900-01-01 00:05:39,0,Fire,Secondary Fire,,Outdoor,Cycle path/public footpath/bridleway,In street outside gazetteer location,N7 8HG,N7,,,E09000019,ISLINGTON,Islington,E05013708,Laycock,Laycock,531058.0,185307.0,531050,185350,51.551441,-0.11112,London,Holloway,233.0,Holloway,250.0,Holloway,1.0,2.0,2,60,255,1.0,1,3,Thursday


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1771413 entries, 0 to 1771412
Data columns (total 42 columns):
 #   Column                                  Dtype         
---  ------                                  -----         
 0   IncidentNumber                          object        
 1   DateOfCall                              datetime64[ns]
 2   CalYear                                 int64         
 3   TimeOfCall                              datetime64[ns]
 4   HourOfCall                              int64         
 5   IncidentGroup                           object        
 6   StopCodeDescription                     object        
 7   SpecialServiceType                      object        
 8   PropertyCategory                        object        
 9   PropertyType                            object        
 10  AddressQualifier                        object        
 11  Postcode_full                           object        
 12  Postcode_district                       ob

None

Unnamed: 0,DateOfCall,CalYear,TimeOfCall,HourOfCall,UPRN,USRN,Easting_m,Northing_m,Easting_rounded,Northing_rounded,Latitude,Longitude,FirstPumpArriving_AttendanceTime,SecondPumpArriving_AttendanceTime,NumStationsWithPumpsAttending,NumPumpsAttending,PumpCount,PumpMinutesRounded,Notional Cost (£),NumCalls,Month,DayOfWeek
count,1771413,1771413.0,1771413,1771413.0,1630080.0,1608542.0,886544.0,886544.0,1771413.0,1771413.0,886544.0,886544.0,1635724.0,633568.0,1757310.0,1757310.0,1771413.0,1771413.0,1771413.0,1769564.0,1771413.0,1771413.0
mean,2016-12-16 06:27:20.863649024,2016.456,1900-01-01 13:58:01.595983360,13.46887,21928420000.0,20313550.0,530632.668589,180365.050822,530659.8,180488.3,51.445867,-0.118837,318.2711,395.38522,1.369216,1.554288,1.591798,76.29298,403.9678,1.329551,6.561727,3.020437
min,2009-01-01 00:00:00,2009.0,1900-01-01 00:00:00,0.0,0.0,0.0,503582.0,155901.0,503550.0,155950.0,0.0,-0.510155,1.0,1.0,1.0,1.0,1.0,60.0,255.0,1.0,1.0,0.0
25%,2012-08-17 00:00:00,2012.0,1900-01-01 09:51:06,9.0,0.0,20400960.0,525185.0,175989.0,525250.0,176150.0,51.468944,-0.197424,231.0,295.0,1.0,1.0,1.0,60.0,260.0,1.0,4.0,1.0
50%,2017-01-04 00:00:00,2017.0,1900-01-01 14:45:16,14.0,0.0,21201270.0,530797.0,180982.0,530950.0,181050.0,51.512881,-0.116981,296.0,369.0,1.0,1.0,1.0,60.0,328.0,1.0,7.0,3.0
75%,2021-04-30 00:00:00,2021.0,1900-01-01 19:02:12,19.0,10013190000.0,22101160.0,536834.0,184939.0,536250.0,185150.0,51.547757,-0.028701,378.0,464.0,2.0,2.0,2.0,60.0,364.0,1.0,9.0,5.0
max,2024-10-31 00:00:00,2024.0,1900-01-01 23:59:59,23.0,200004400000.0,99990420.0,561126.0,200906.0,611150.0,302450.0,51.69167,0.322219,1200.0,1200.0,46.0,106.0,951.0,525629.0,2277726.0,369.0,12.0,6.0
std,,4.776673,,6.252623,47532280000.0,4378135.0,10336.679483,7434.83114,9725.543,7357.29,1.77533,0.148994,138.6791,153.33425,0.6836639,0.8418938,1.586884,567.8902,2710.555,1.488664,3.361091,1.997067


In [5]:
# Missing-value analysis of the combined DataFrame:
# - count the missing values in each column
# - express each count as a percentage of the total number of rows
# - gather both in a small DataFrame for easier reading
#
# The summary is built directly from the per-column counts; no variable is
# named `list`, so the Python built-in stays usable in the rest of the notebook.

all_na = df.isna().sum()
lignes = df.shape[0]
a = pd.DataFrame(
    {
        "colonne": all_na.index.values,
        "na": all_na.values,
        # Percentage of missing values per column, rounded to 4 decimals
        "pourcentage": (all_na / lignes * 100).round(4).values,
    }
)
display(a)

Unnamed: 0,colonne,na,pourcentage
0,IncidentNumber,0,0.0
1,DateOfCall,0,0.0
2,CalYear,0,0.0
3,TimeOfCall,0,0.0
4,HourOfCall,0,0.0
5,IncidentGroup,6,0.0003
6,StopCodeDescription,0,0.0
7,SpecialServiceType,1195862,67.5089
8,PropertyCategory,6,0.0003
9,PropertyType,6,0.0003


In [6]:
# Show every column that holds fewer than 15 distinct values
# (a missing value counts as one distinct value).
for nom_colonne in df.columns:
    valeurs_distinctes = df[nom_colonne].unique()
    if len(valeurs_distinctes) >= 15:
        continue
    display(nom_colonne, valeurs_distinctes)

'IncidentGroup'

array(['Special Service', 'Fire', 'False Alarm', nan], dtype=object)

'StopCodeDescription'

array(['Special Service', 'Secondary Fire', 'AFA', 'Primary Fire',
       'False alarm - Good intent', 'False alarm - Malicious',
       'Chimney Fire', 'Flood call attended - Batch mobilised',
       'Late Call', 'Use of Special Operations Room', 'Standby'],
      dtype=object)

'PropertyCategory'

array(['Road Vehicle', 'Outdoor', 'Dwelling', 'Outdoor Structure',
       'Other Residential', 'Non Residential', 'Aircraft', 'Rail Vehicle',
       'Boat', nan], dtype=object)

'AddressQualifier'

array(['In street close to gazetteer location',
       'Open land/water - nearest gazetteer location',
       'In street outside gazetteer location',
       'On land associated with building', 'Correct incident location',
       'On motorway / elevated road',
       'In street remote from gazetteer location', 'Within same building',
       'Nearby address - street not listed in gazetteer',
       'Nearby address - no building in street',
       'Railway land or rolling stock', nan], dtype=object)

'FRS'

array(['London'], dtype=object)

'Month'

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12], dtype=int32)

'DayOfWeek'

array([3, 4, 5, 6, 0, 1, 2], dtype=int32)

'DayName'

array(['Thursday', 'Friday', 'Saturday', 'Sunday', 'Monday', 'Tuesday',
       'Wednesday'], dtype=object)

In [7]:
# Write the combined, cleaned incidents to a semicolon-separated CSV file.
# The DataFrame index is written too (pandas default), as the first column.
chemin_sortie = "../data/Incidents.csv"
df.to_csv(chemin_sortie, sep=";")