In [131]:
import os
import pandas as pd
import numpy as np
import re
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas' chained-assignment and date-parsing warnings.
warnings.filterwarnings('ignore')
folder_path = "./"

# os.listdir() returns names in arbitrary order; list the folder once and sort
# so the files are always read (and later concatenated) in the same order.
_csv_file_names = sorted(name for name in os.listdir(folder_path) if name.endswith('.csv'))
csv_attendant_files = [name for name in _csv_file_names if 'attendant' in name]
csv_casualty_files = [name for name in _csv_file_names if 'casualty' in name]

# Raw attendant-file headers -> column names used in the rest of the notebook.
# The three spellings of the collision reference all collapse to one name.
attendant_col_mapping = dict.fromkeys(
    ('AREFNO', 'Accident Ref.', 'Accident Ref'),
    'collision_index',
)
attendant_col_mapping.update({
    'Borough': 'Borough Name',
    'Boro': 'Borough Number',
    'Location': 'Collision Location',
    'Accident Severity': 'legacy_collision_severity',
    'No. of Casualties in Acc.': 'number_of_casualties',
    'No. of Vehicles in Acc.': 'number_of_vehicles',
    'Accident Date': 'date',
    'Day': 'day_of_week',
    'Weather': 'weather_conditions',
    'Road Surface': 'road_surface_conditions',
    'Light Conditions (Banded)': 'light_conditions',
    'Time': 'time',
})

# Raw casualty-file headers -> column names used in the rest of the notebook.
# The three spellings of the collision reference all collapse to one name.
casualty_col_mapping = dict.fromkeys(
    ('AREFNO', 'Accident Ref.', 'Accident Ref'),
    'collision_index',
)
casualty_col_mapping.update({
    'Borough': 'Borough Name',
    'Boro': 'Borough Number',
    'CREFNO': 'casualty_reference',
    'Casualty Class': 'casualty_class',
    'Casualty Sex': 'sex_of_casualty',
    'Casualty Age (Banded)': 'age_band_of_casualty',
    'Casualty Age': 'age_of_casualty',
    'No. of Casualties': 'number_of_casualties',
    'Casualty Severity': 'casualty_severity',
})

# Only files whose name contains one of these years are loaded.
pattern = re.compile(r'2013|2014|2015|2016|2017')


def _load_matching_csvs(file_names, col_mapping):
    """Read the CSVs whose name matches ``pattern`` and stack them row-wise.

    Parameters
    ----------
    file_names : iterable of str
        File names relative to ``folder_path``.
    col_mapping : dict
        Raw header -> notebook column name, applied to every file before
        stacking so that differently-labelled files line up.

    Returns
    -------
    pandas.DataFrame
        All matching files concatenated along the rows. Row labels are not
        reset, so each file keeps its own labels and they may repeat.

    Raises
    ------
    ValueError
        If no file name matches ``pattern``.
    """
    frames = [
        pd.read_csv(os.path.join(folder_path, file_name), encoding='ISO-8859-1')
          .rename(columns=col_mapping)
        for file_name in file_names
        if pattern.search(file_name)
    ]
    if not frames:
        raise ValueError(
            f"No CSV file matching {pattern.pattern!r} found in {folder_path!r}"
        )
    return pd.concat(frames, axis=0)


final_attendant_df = _load_matching_csvs(csv_attendant_files, attendant_col_mapping)

# The casualty files carry a column whose header is a single space; drop it
# when present instead of failing if a set of files does not have it.
final_casualty_df = (
    _load_matching_csvs(csv_casualty_files, casualty_col_mapping)
    .drop(columns=[' '], errors='ignore')
)


In [132]:
# Upper-case borough labels as they appear in the raw 'Borough Name' column,
# paired with the spelling used as values in the ONS-code table.
provided_places = dict([
    ("WESTMINSTER", "Westminster"),
    ("LAMBETH", "Lambeth"),
    ("TOWER HAMLETS", "Tower Hamlets"),
    ("BARNET", "Barnet"),
    ("EALING", "Ealing"),
    ("SOUTHWARK", "Southwark"),
    ("WANDSWORTH", "Wandsworth"),
    ("CROYDON", "Croydon"),
    ("BRENT", "Brent"),
    ("CAMDEN", "Camden"),
    ("LEWISHAM", "Lewisham"),
    ("HACKNEY", "Hackney"),
    ("HARINGEY", "Haringey"),
    ("HOUNSLOW", "Hounslow"),
    ("NEWHAM", "Newham"),
    ("ISLINGTON", "Islington"),
    ("ENFIELD", "Enfield"),
    ("BROMLEY", "Bromley"),
    ("REDBRIDGE", "Redbridge"),
    ("KENSINGTON & CHELSEA", "Kensington and Chelsea"),
    ("HILLINGDON", "Hillingdon"),
    ("WALTHAM FOREST", "Waltham Forest"),
    ("GREENWICH", "Greenwich"),
    ("HAMMERSMITH & FULHAM", "Hammersmith and Fulham"),
    ("HAVERING", "Havering"),
    ("BARKING & DAGENHAM", "Barking and Dagenham"),
    ("MERTON", "Merton"),
    ("RICHMOND-UPON-THAMES", "Richmond upon Thames"),
    ("BEXLEY", "Bexley"),
    ("HARROW", "Harrow"),
    ("SUTTON", "Sutton"),
    ("KINGSTON-UPON-THAMES", "Kingston upon Thames"),
    ("CITY OF LONDON", "City of London"),
])

# District code -> borough name, in the original (not strictly sorted) order.
# NOTE(review): E08000001 -> "Bolton" is the only entry whose name is not one
# of the boroughs in provided_places; kept because the original table has it.
old_mapping = dict(
    E09000001="City of London",
    E09000002="Barking and Dagenham",
    E09000003="Barnet",
    E09000004="Bexley",
    E09000006="Bromley",
    E09000005="Brent",
    E09000007="Camden",
    E09000008="Croydon",
    E09000009="Ealing",
    E09000010="Enfield",
    E09000011="Greenwich",
    E09000012="Hackney",
    E09000013="Hammersmith and Fulham",
    E09000014="Haringey",
    E09000016="Havering",
    E09000017="Hillingdon",
    E09000015="Harrow",
    E09000018="Hounslow",
    E09000020="Kensington and Chelsea",
    E09000019="Islington",
    E09000022="Lambeth",
    E09000021="Kingston upon Thames",
    E09000023="Lewisham",
    E09000024="Merton",
    E09000025="Newham",
    E09000026="Redbridge",
    E09000027="Richmond upon Thames",
    E09000028="Southwark",
    E09000029="Sutton",
    E09000030="Tower Hamlets",
    E09000031="Waltham Forest",
    E09000032="Wandsworth",
    E08000001="Bolton",
    E09000033="Westminster",
)
# Reverse lookup: borough name -> district code.
exchanged_mapping = dict(zip(old_mapping.values(), old_mapping.keys()))

# Normalise the borough label and attach its district code.
# 'Borough Name' is overwritten below, so on a re-run the column already holds
# the normalised spelling; .get(raw, raw) lets those values pass through
# instead of raising KeyError on the second execution.
_borough_names = final_attendant_df['Borough Name'].map(
    lambda raw: provided_places.get(raw, raw)
)
_ons_codes = _borough_names.map(exchanged_mapping)

# Still fail loudly (as the original dict lookups did) for any borough that
# has no district code, rather than leaving NaN codes in the frame.
_unmapped = _borough_names[_ons_codes.isna().to_numpy()].unique()
if len(_unmapped):
    raise KeyError(f"No ONS district code for borough(s): {sorted(map(str, _unmapped))}")

final_attendant_df['Borough Name'] = _borough_names
final_attendant_df['local_authority_ons_district'] = _ons_codes

In [133]:
# Parse the collision date once, derive the year, then store the date back as
# a dd/mm/YYYY string.
# dayfirst=True matters because this cell overwrites 'date' with dd/mm/YYYY
# text: without it, a re-run would re-parse e.g. '02/01/2013' month-first and
# silently swap day and month.
_collision_dates = pd.to_datetime(final_attendant_df['date'], dayfirst=True)
final_attendant_df['collision_year'] = _collision_dates.dt.year
final_attendant_df['date'] = _collision_dates.dt.strftime('%d/%m/%Y')

In [134]:
# Columns kept for the casualty-level analysis.
# .copy() gives an independent frame: later cells assign into casualty_df, and
# doing that on a bare column selection of final_casualty_df is chained
# assignment (SettingWithCopyWarning, currently hidden by the warning filter).
casualty_df = final_casualty_df[[
    'collision_index',
    'casualty_reference',
    'age_band_of_casualty',
    'casualty_class',
    'sex_of_casualty',
    'age_of_casualty',
    'casualty_severity',
]].copy()

In [135]:
# Show every column of the combined attendant frame, including the derived
# 'local_authority_ons_district' and 'collision_year' columns.
final_attendant_df.columns

Index(['collision_index', 'Borough Name', 'Borough Number', 'Easting',
       'Northing', 'Collision Location', 'legacy_collision_severity',
       'number_of_casualties', 'number_of_vehicles', 'date', 'day_of_week',
       'time', 'Highway', 'Road Class 1', 'Road No. 1', 'Road Type',
       'Speed Limit', 'Junction Detail', 'Junction Control', 'Road Class 2',
       'Road No. 2', 'Ped. Crossing Decoded', 'light_conditions',
       'weather_conditions', 'road_surface_conditions', 'Special Conditions',
       'C/W Hazard', 'local_authority_ons_district', 'collision_year'],
      dtype='object')

In [136]:
# Columns kept for the collision-level analysis.
# .copy() gives an independent frame: later cells assign into attendant_df, and
# doing that on a bare column selection of final_attendant_df is chained
# assignment (SettingWithCopyWarning, currently hidden by the warning filter).
attendant_df = final_attendant_df[[
    'collision_index',
    'legacy_collision_severity',
    'number_of_casualties',
    'number_of_vehicles',
    'date',
    'day_of_week',
    'time',
    'light_conditions',
    'weather_conditions',
    'road_surface_conditions',
    'collision_year',
    'local_authority_ons_district',
]].copy()

In [137]:
# Raise the column display cap so wide frames are shown in full, then preview.
pd.options.display.max_columns = 2000
attendant_df.head(n=5)

Unnamed: 0,collision_index,legacy_collision_severity,number_of_casualties,number_of_vehicles,date,day_of_week,time,light_conditions,weather_conditions,road_surface_conditions,collision_year,local_authority_ons_district
0,0113CP00001,3 Slight,1,2,02/01/2013,Wednesday,'0735,1 Daylight,1 Fine,1 Road-Dry,2013,E09000001
1,0113CP00002,3 Slight,1,1,04/01/2013,Friday,'0658,2 Dark,1 Fine,1 Road-Dry,2013,E09000001
2,0113CP00003,3 Slight,1,2,05/01/2013,Saturday,'1700,2 Dark,1 Fine,1 Road-Dry,2013,E09000001
3,0113CP00005,3 Slight,1,1,11/01/2013,Friday,'0955,1 Daylight,1 Fine,1 Road-Dry,2013,E09000001
4,0113CP00006,2 Serious,1,1,20/01/2013,Sunday,'1232,1 Daylight,3 Snowing,3 Road-Snow,2013,E09000001


In [138]:
# First five casualty rows, before the labelled codes are shortened.
casualty_df.head(n=5)

Unnamed: 0,collision_index,casualty_reference,age_band_of_casualty,casualty_class,sex_of_casualty,age_of_casualty,casualty_severity
0,0113CP00001,1,25-59,1 Driver/Rider,1 Male,52,3 Slight
1,0113CP00002,1,25-59,3 Pedestrian,1 Male,36,3 Slight
2,0113CP00003,1,25-59,1 Driver/Rider,1 Male,26,3 Slight
3,0113CP00005,1,16-24,3 Pedestrian,2 Female,24,3 Slight
4,0113CP00006,1,16-24,3 Pedestrian,2 Female,23,2 Serious


In [139]:
# (rows, columns) of the attendant frame, then of the casualty frame.
print(attendant_df.shape, casualty_df.shape, sep='\n')

(126466, 12)
(151003, 7)


In [140]:
# Missing-value count per casualty column.
casualty_df.isnull().sum()

collision_index         0
casualty_reference      0
age_band_of_casualty    0
casualty_class          0
sex_of_casualty         0
age_of_casualty         0
casualty_severity       0
dtype: int64

In [141]:
# Distribution of the raw sex labels.
casualty_df['sex_of_casualty'].value_counts()

1 Male      97577
2 Female    53426
Name: sex_of_casualty, dtype: int64

In [142]:
# Keep only the leading code character, e.g. '1 Male' -> '1' (still a string).
casualty_df['sex_of_casualty'] = casualty_df['sex_of_casualty'].map(lambda label: label[0])

In [143]:
# Distribution of the raw casualty-class labels.
casualty_df['casualty_class'].value_counts()

1 Driver/Rider    95878
3 Pedestrian      28378
2 Passenger       26747
Name: casualty_class, dtype: int64

In [144]:
# Keep only the leading code character, e.g. '3 Pedestrian' -> '3' (still a string).
casualty_df['casualty_class'] = casualty_df['casualty_class'].map(lambda label: label[0])

In [145]:
# Distribution of the raw casualty-severity labels.
casualty_df['casualty_severity'].value_counts()

3 Slight     138038
2 Serious     12323
1 Fatal         642
Name: casualty_severity, dtype: int64

In [146]:
# Keep only the leading code character, e.g. '2 Serious' -> '2' (still a string).
casualty_df['casualty_severity'] = casualty_df['casualty_severity'].map(lambda label: label[0])

In [147]:
# NOTE(review): the original marker here was just '#?' - the open question was
# not recorded; presumably it concerns how the age bands should be treated.
casualty_df['age_band_of_casualty'].value_counts()

25-59      95067
16-24      26789
60+        12980
0-15       10296
Unknown     5871
Name: age_band_of_casualty, dtype: int64

In [148]:
# First five casualty rows after class, sex and severity were reduced to codes.
casualty_df.head(n=5)

Unnamed: 0,collision_index,casualty_reference,age_band_of_casualty,casualty_class,sex_of_casualty,age_of_casualty,casualty_severity
0,0113CP00001,1,25-59,1,1,52,3
1,0113CP00002,1,25-59,3,1,36,3
2,0113CP00003,1,25-59,1,1,26,3
3,0113CP00005,1,16-24,3,2,24,3
4,0113CP00006,1,16-24,3,2,23,2


In [149]:
# First five attendant rows, still carrying the full text labels.
attendant_df.head(n=5)

Unnamed: 0,collision_index,legacy_collision_severity,number_of_casualties,number_of_vehicles,date,day_of_week,time,light_conditions,weather_conditions,road_surface_conditions,collision_year,local_authority_ons_district
0,0113CP00001,3 Slight,1,2,02/01/2013,Wednesday,'0735,1 Daylight,1 Fine,1 Road-Dry,2013,E09000001
1,0113CP00002,3 Slight,1,1,04/01/2013,Friday,'0658,2 Dark,1 Fine,1 Road-Dry,2013,E09000001
2,0113CP00003,3 Slight,1,2,05/01/2013,Saturday,'1700,2 Dark,1 Fine,1 Road-Dry,2013,E09000001
3,0113CP00005,3 Slight,1,1,11/01/2013,Friday,'0955,1 Daylight,1 Fine,1 Road-Dry,2013,E09000001
4,0113CP00006,2 Serious,1,1,20/01/2013,Sunday,'1232,1 Daylight,3 Snowing,3 Road-Snow,2013,E09000001


In [150]:
# Column names of the reduced attendant frame.
attendant_df.columns

Index(['collision_index', 'legacy_collision_severity', 'number_of_casualties',
       'number_of_vehicles', 'date', 'day_of_week', 'time', 'light_conditions',
       'weather_conditions', 'road_surface_conditions', 'collision_year',
       'local_authority_ons_district'],
      dtype='object')

In [151]:
# Distribution of the raw collision-severity labels.
attendant_df['legacy_collision_severity'].value_counts()

3 Slight     113957
2 Serious     11880
1 Fatal         629
Name: legacy_collision_severity, dtype: int64

In [152]:
# Keep only the leading code character, e.g. '3 Slight' -> '3' (still a string).
attendant_df['legacy_collision_severity'] = attendant_df['legacy_collision_severity'].map(lambda label: label[0])

In [153]:
# Number of collisions per weekday name.
attendant_df['day_of_week'].value_counts()

Friday       20501
Wednesday    19509
Thursday     19453
Tuesday      19294
Monday       17729
Saturday     16540
Sunday       13440
Name: day_of_week, dtype: int64

In [154]:
# Weekday name -> number, with Sunday = 1 ... Saturday = 7.
# The keys are padded with trailing spaces to a common width of nine characters.
days_mapping = {
    'Sunday   ' : 1,
    'Monday   ' : 2,
    'Tuesday  ' : 3,
    'Wednesday' : 4,
    'Thursday ' : 5,
    'Friday   ' : 6,
    'Saturday ' : 7
}

# Look names up with surrounding whitespace removed, so the conversion does not
# depend on the exact padding of the raw values. Unknown names still raise
# KeyError, as before.
_day_number_by_name = {name.strip(): number for name, number in days_mapping.items()}
attendant_df['day_of_week'] = attendant_df['day_of_week'].map(
    lambda day: _day_number_by_name[day.strip()]
)

In [155]:
# Distribution of the raw light-condition labels.
attendant_df['light_conditions'].value_counts()

1 Daylight    87931
2 Dark        38535
Name: light_conditions, dtype: int64

In [156]:
# Keep only the leading code character, e.g. '1 Daylight' -> '1' (still a string).
attendant_df['light_conditions'] = attendant_df['light_conditions'].map(lambda label: label[0])

In [157]:
# Distribution of the raw weather labels.
attendant_df['weather_conditions'].value_counts()

1 Fine                  106382
2 Raining                11834
9 Unknown                 4374
8 Other                   1762
4 Fine/High Winds          804
5 Raining/High Winds       774
3 Snowing                  305
7 Fog/Mist                 199
6 Snowing/High Winds        32
Name: weather_conditions, dtype: int64

In [158]:
# Keep only the leading code character, e.g. '2 Raining' -> '2' (still a string).
attendant_df['weather_conditions'] = attendant_df['weather_conditions'].map(lambda label: label[0])

In [159]:
# Distribution of the raw road-surface labels.
attendant_df['road_surface_conditions'].value_counts()

1 Road-Dry          101415
2 Road-Wet           22135
9 Unknown (S/R)       2083
4 Road-Frost/Ice       595
3 Road-Snow            190
5 Road-Flood            48
Name: road_surface_conditions, dtype: int64

In [160]:
# Keep only the leading code character, e.g. '1 Road-Dry' -> '1' (still a string).
attendant_df['road_surface_conditions'] = attendant_df['road_surface_conditions'].map(lambda label: label[0])

In [161]:
# Same distribution again, now keyed by the one-character code.
attendant_df['road_surface_conditions'].value_counts()

1    101415
2     22135
9      2083
4       595
3       190
5        48
Name: road_surface_conditions, dtype: int64

In [162]:
def _format_hhmm(raw):
    """Turn a raw time such as "'0735" into 'HH:MM' text ("07:35").

    Only the digits of ``raw`` are used, so a value that is already formatted
    ("07:35") maps to itself. Re-running the cell is therefore harmless; the
    previous positional slicing turned "07:35" into "7::35" on a second run.
    """
    digits = ''.join(ch for ch in raw if ch.isdigit())
    return f'{digits[:2]}:{digits[2:]}'


attendant_df['time'] = attendant_df['time'].map(_format_hhmm)

In [163]:
# Collisions per year (five most frequent years).
attendant_df['collision_year'].value_counts().head(n=5)

2017    27089
2014    25992
2015    25193
2016    25126
2013    23066
Name: collision_year, dtype: int64

In [164]:
# Final look at the coded casualty frame before merging.
casualty_df.head(n=5)

Unnamed: 0,collision_index,casualty_reference,age_band_of_casualty,casualty_class,sex_of_casualty,age_of_casualty,casualty_severity
0,0113CP00001,1,25-59,1,1,52,3
1,0113CP00002,1,25-59,3,1,36,3
2,0113CP00003,1,25-59,1,1,26,3
3,0113CP00005,1,16-24,3,2,24,3
4,0113CP00006,1,16-24,3,2,23,2


In [165]:
# Final look at the coded attendant frame before merging.
attendant_df.head(n=5)

Unnamed: 0,collision_index,legacy_collision_severity,number_of_casualties,number_of_vehicles,date,day_of_week,time,light_conditions,weather_conditions,road_surface_conditions,collision_year,local_authority_ons_district
0,0113CP00001,3,1,2,02/01/2013,4,07:35,1,1,1,2013,E09000001
1,0113CP00002,3,1,1,04/01/2013,6,06:58,2,1,1,2013,E09000001
2,0113CP00003,3,1,2,05/01/2013,7,17:00,2,1,1,2013,E09000001
3,0113CP00005,3,1,1,11/01/2013,6,09:55,1,1,1,2013,E09000001
4,0113CP00006,2,1,1,20/01/2013,1,12:32,1,3,3,2013,E09000001


In [166]:
# (rows, columns) of the casualty frame, for comparison with the merged result.
casualty_df.shape

(151003, 7)

In [167]:
# (rows, columns) of the attendant frame, for comparison with the merged result.
attendant_df.shape

(126466, 12)

In [169]:
# Attach each casualty to its collision record.
# The join key is named explicitly: without `on`, pandas joins on every column
# the two frames happen to share, so adding a common column to either frame
# would silently change the join.
# validate='one_to_many' raises MergeError if a collision_index repeats in
# attendant_df, which would otherwise duplicate casualty rows unnoticed.
df = pd.merge(
    attendant_df,
    casualty_df,
    how='inner',
    on='collision_index',
    validate='one_to_many',
)

In [171]:
# Last five rows of the merged collision/casualty frame.
df.tail(n=5)

Unnamed: 0,collision_index,legacy_collision_severity,number_of_casualties,number_of_vehicles,date,day_of_week,time,light_conditions,weather_conditions,road_surface_conditions,collision_year,local_authority_ons_district,casualty_reference,age_band_of_casualty,casualty_class,sex_of_casualty,age_of_casualty,casualty_severity
150998,1170092692,3,4,2,28/10/2017,7,10:20,1,1,1,2017,E09000010,1,25-59,1,1,49,3
150999,1170094487,3,1,1,11/11/2017,7,17:30,2,9,9,2017,E09000010,1,60+,1,2,60,3
151000,1170097147,3,1,1,22/09/2017,6,07:40,1,1,1,2017,E09000010,1,0-15,3,2,12,3
151001,1170105284,3,2,4,11/07/2017,3,23:30,2,9,9,2017,E09000010,1,60+,2,2,63,3
151002,1170105284,3,2,4,11/07/2017,3,23:30,2,9,9,2017,E09000010,2,25-59,2,1,40,3


In [None]:
# 1. What is the number of accidents across the different boroughs in London City (UK) from
# Jul’13 to Jun’23?
# Casualty data, borough name
# 2. Which age band has the highest number of accidents in the last 10 years across
# London City?
# Casualty data, Casualty Age (5 Year Bands) and Casualty Age
# 3. Which weather & light conditions influence the number of accidents in specific areas
# of London City from 2013 to 2023?
# Attendant data and casualty data, Light conditions and weather details
# 4. Which hours of the day & days of the week have a higher number of accidents across
# London City from 2013 to 2023?
# Casualty data, Collision Date; the day of the week needs to be derived
# 5. Provide a trend of the top 5 areas having a higher number of casualties across London
# City from 2013 to 2023
# Casualty data, borough name
# 6. What are the top 5 most accident-affected areas in London City in each year of the past
# decade Jul’13 to Jun’23?
# casualty data, borough name