In [1]:
import json
import os

import geopandas as gpd
import numpy as np
import pandas as pd
from shapely.geometry import Polygon, Point
from shapely.prepared import prep
# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Show what is available in the data directory.
os.listdir('./data/')

['.gitattributes', 'Accidents0515.csv', 'london_poly.json', 'preprocessed']

In [2]:
# Load the raw accident records from the CSV file into a DataFrame.
accidents = pd.read_csv('./data/Accidents0515.csv')

In [3]:
# Count the missing values in each column.
# One column has many NaN values; because we cannot yet tell whether that
# column is useful, its NaNs are replaced with the string 'unknown' rather
# than dropping the rows.
accidents.isnull().sum()

Accident_Index                                      0
Location_Easting_OSGR                             138
Location_Northing_OSGR                            138
Longitude                                         138
Latitude                                          138
Police_Force                                        0
Accident_Severity                                   0
Number_of_Vehicles                                  0
Number_of_Casualties                                0
Date                                                0
Day_of_Week                                         0
Time                                              151
Local_Authority_(District)                          0
Local_Authority_(Highway)                           0
1st_Road_Class                                      0
1st_Road_Number                                     0
Road_Type                                           0
Speed_limit                                         0
Junction_Detail             

In [4]:
# Keep rows with a missing 'LSOA_of_Accident_Location' by marking the value as
# 'unknown'. The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: that chained in-place call
# modifies a temporary object under pandas Copy-on-Write, leaving the frame
# unchanged, so the dropna below would discard every one of those rows.
accidents['LSOA_of_Accident_Location'] = (
    accidents['LSOA_of_Accident_Location'].fillna('unknown')
)

# Remove the rows that still contain a missing value in any column.
accidents.dropna(inplace=True)

# Drop features which we will not know in the new data.
columns_to_drop = [
    'Accident_Index',
    'Accident_Severity',
    'Did_Police_Officer_Attend_Scene_of_Accident',
    'Number_of_Casualties',
    'Number_of_Vehicles',
    'Police_Force',
]
accidents.drop(columns=columns_to_drop, inplace=True)

In [5]:
# Decode the London boundary file and load it into a GeoDataFrame.
# NOTE(review): read_file is handed the decoded JSON value rather than a file
# path - confirm this matches how london_poly.json is encoded.
with open('./data/london_poly.json') as poly_file:
    london_gdf = gpd.read_file(json.load(poly_file))

# The geometry stored under index label 0 is used as the London polygon.
london_poly = london_gdf.loc[0, 'geometry']

In [6]:
# Inspect the raw date, weekday and time columns before combining them.
accidents.loc[:, ['Date', 'Day_of_Week', 'Time']]

Unnamed: 0,Date,Day_of_Week,Time
0,04/01/2005,3,17:42
1,05/01/2005,4,17:36
2,06/01/2005,5,00:15
3,07/01/2005,6,10:35
4,10/01/2005,2,21:13
...,...,...,...
1780648,11/12/2015,6,13:24
1780649,02/12/2015,4,13:50
1780650,23/12/2015,4,00:01
1780651,26/12/2015,7,12:40


In [7]:
# Preview the first five rows of the cleaned frame.
accidents.head(n=5)

Unnamed: 0,Location_Easting_OSGR,Location_Northing_OSGR,Longitude,Latitude,Date,Day_of_Week,Time,Local_Authority_(District),Local_Authority_(Highway),1st_Road_Class,1st_Road_Number,Road_Type,Speed_limit,Junction_Detail,Junction_Control,2nd_Road_Class,2nd_Road_Number,Pedestrian_Crossing-Human_Control,Pedestrian_Crossing-Physical_Facilities,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Special_Conditions_at_Site,Carriageway_Hazards,Urban_or_Rural_Area,LSOA_of_Accident_Location
0,525680.0,178240.0,-0.19117,51.489096,04/01/2005,3,17:42,12,E09000020,3,3218,6,30,0,-1,-1,0,0,1,1,2,2,0,0,1,E01002849
1,524170.0,181650.0,-0.211708,51.520075,05/01/2005,4,17:36,12,E09000020,4,450,3,30,6,2,5,0,0,5,4,1,1,0,0,1,E01002909
2,524520.0,182240.0,-0.206458,51.525301,06/01/2005,5,00:15,12,E09000020,5,0,6,30,0,-1,-1,0,0,0,4,1,1,0,0,1,E01002857
3,526900.0,177530.0,-0.173862,51.482442,07/01/2005,6,10:35,12,E09000020,3,3220,6,30,0,-1,-1,0,0,0,1,1,1,0,0,1,E01002840
4,528060.0,179040.0,-0.156618,51.495752,10/01/2005,2,21:13,12,E09000020,6,0,6,30,0,-1,-1,0,0,0,7,1,2,0,0,1,E01002863


In [8]:
# Combine the 'Date' and 'Time' strings into a single timestamp column.
# Dates are day-first (the data contains e.g. 23/12/2015), so the format is
# given explicitly. Without it, ambiguous dates such as 04/01/2005 are read
# month-first (1 April instead of 4 January), or parsing fails on days > 12.
accidents['datetime'] = pd.to_datetime(
    accidents['Date'] + '/' + accidents['Time'],
    format='%d/%m/%Y/%H:%M',
)

In [9]:
# Print the frame's index range, column names and dtypes.
accidents.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1780364 entries, 0 to 1780652
Data columns (total 27 columns):
 #   Column                                   Dtype         
---  ------                                   -----         
 0   Location_Easting_OSGR                    float64       
 1   Location_Northing_OSGR                   float64       
 2   Longitude                                float64       
 3   Latitude                                 float64       
 4   Date                                     object        
 5   Day_of_Week                              int64         
 6   Time                                     object        
 7   Local_Authority_(District)               int64         
 8   Local_Authority_(Highway)                object        
 9   1st_Road_Class                           int64         
 10  1st_Road_Number                          int64         
 11  Road_Type                                int64         
 12  Speed_limit                 

In [10]:
# Display the GeoDataFrame that holds the London boundary geometry.
london_gdf

Unnamed: 0,id,geometry
0,0,"POLYGON ((-0.51038 51.46809, -0.51036 51.46795..."


In [11]:
# Build one shapely Point per accident from its (Longitude, Latitude) pair.
# Only the two coordinate columns are passed to the row-wise apply.
accidents['geom'] = accidents[['Longitude', 'Latitude']].apply(
    lambda rec: Point(rec['Longitude'], rec['Latitude']),
    axis=1,
)

In [12]:
# Flag every accident whose point intersects the London polygon.
# The polygon is prepared once so that the repeated point tests reuse its
# internal index; the intersects predicate returns the same booleans as on
# the unprepared geometry.
london_prepared = prep(london_poly)
accidents['in_london'] = accidents['geom'].apply(london_prepared.intersects)

In [13]:
# Remove the raw 'Date' and 'Time' string columns.
# 'Longitude' and 'Latitude' are deliberately kept.
columns_to_drop = ['Date', 'Time']
accidents.drop(columns_to_drop, axis=1, inplace=True)

In [14]:
# Keep only the accidents flagged as inside London, then discard the flag.
accidents_filtered = accidents.loc[accidents['in_london']].drop('in_london', axis=1)

In [21]:
# Smallest latitude among the London accidents.
accidents_filtered.Latitude.min()

51.28906

In [None]:
def get_lat_lon_mult(max_rel_shift=0.001):
    """Return a random multiplicative jitter factor for a coordinate.

    The factor is drawn uniformly from
    [1 - max_rel_shift, 1 + max_rel_shift) using NumPy's global random state;
    call ``np.random.seed(...)`` beforehand for reproducible results.

    Because the factor is relative, the size of the shift depends on the
    magnitude of the coordinate it multiplies. With the default of 0.001 a
    latitude around 51.3 degrees moves by up to about 0.05 degrees, which is
    several kilometres rather than roughly 100 metres.

    Parameters
    ----------
    max_rel_shift : float, default 0.001
        Half-width of the interval around 1.0 from which the factor is drawn.

    Returns
    -------
    float
        The jitter factor.
    """
    low = 1.0 - max_rel_shift
    high = 1.0 + max_rel_shift
    return (high - low) * np.random.random_sample() + low

In [16]:
# Draw three jittered latitudes for every London accident.
# iterrows() yields (index, row) pairs, so the pair is unpacked before the row
# is indexed by column name; indexing the pair itself raises a TypeError.
# TODO: new_lat is computed but not stored anywhere yet, and the longitude
# (column 'Longitude') is not jittered.
for _idx, row in accidents_filtered.iterrows():
    for _ in range(3):
        new_lat = row['Latitude'] * get_lat_lon_mult()

(268435, 26)

In [None]:
# Persist the London-only accidents as a pickle in the preprocessed folder.
accidents_filtered.to_pickle('./data/preprocessed/accidents_in_london.pkl')