In [43]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime

In [73]:
# Load the accident table without exact duplicate rows, then keep only rows
# whose speed limit is below 300 ("too fast" otherwise) and whose latitude is
# above 40 (anything else was marked "Not in UK" by the original author).
acc = pd.read_csv('dataset/accidents.csv').drop_duplicates()
acc = acc[(acc['speed_limit'] < 300) & (acc['latitude'] > 40)]

# The vehicle table is only de-duplicated at this point.
veh = pd.read_csv('dataset/vehicles.csv').drop_duplicates()

In [75]:
# First processing step: restrict the accident table to the columns that are
# worked on below (identifier, features and the 'target' column).
acc = acc[[
    'accident_id',
    'number_of_vehicles',
    'number_of_casualties',
    'date',
    'time',
    '1st_road_class',
    'road_type',
    'speed_limit',
    'junction_detail',
    'junction_control',
    '2nd_road_class',
    'pedestrian_crossing-human_control',
    'pedestrian_crossing-physical_facilities',
    'light_conditions',
    'weather_conditions',
    'road_surface_conditions',
    'urban_or_rural_area',
    'target',
]]

In [76]:
# Every column starts out flagged as categorical; later cells switch the
# flag to False for the columns they treat as numeric.
categorical = {name: True for name in acc.columns}

In [77]:
# number_of_vehicles is kept unchanged; it is only flagged as a
# non-categorical (numeric) feature.
categorical['number_of_vehicles'] = False

In [78]:
# number_of_casualties is kept unchanged; it is only flagged as a
# non-categorical (numeric) feature.
categorical['number_of_casualties'] = False

In [79]:
# Replace each date string by its day of the week (0 = Monday ... 6 = Sunday).
# The year is read from characters 0-3, the month from 5-6 and the day from 8-9.
acc['date'] = acc['date'].apply(
    lambda stamp: datetime.date(
        year=int(stamp[:4]),
        month=int(stamp[5:7]),
        day=int(stamp[8:10]),
    ).weekday()
)

In [80]:
# Cyclical encoding of the time of day
def _time_of_day_angle(hhmm):
    """Return the position of a clock time on a 24-hour circle, in radians.

    `hhmm` is a string with the hour at [0:2] and the minutes at [3:5]
    (e.g. 'HH:MM'). Midnight maps to 0 and the angle grows linearly with the
    minutes elapsed since midnight, reaching 2*pi after a full day, so that
    sin/cos of the angle place 23:59 right next to 00:00.
    """
    minutes_since_midnight = 60 * int(hhmm[0:2]) + int(hhmm[3:5])
    # The whole minute count must be scaled by 2*pi / 1440; scaling only the
    # minute part leaves the hours unscaled and breaks the 24-hour periodicity.
    return 2 * np.pi * minutes_since_midnight / (24 * 60)

acc['time_sin'] = acc.time.apply(lambda x: np.sin(_time_of_day_angle(x)))
acc['time_cos'] = acc.time.apply(lambda x: np.cos(_time_of_day_angle(x)))
# The raw string is no longer needed once both components exist.
acc = acc.drop(columns='time')
categorical['time_sin'] = False
categorical['time_cos'] = False

In [82]:
# Numerical transformation and value replacing
# Road-class labels are mapped to integer codes; 'Motorway' and 'A(M)' share
# code 0, and the string '-1' becomes the integer -1.
translate_road = {
    '-1': -1,
    'Motorway': 0,
    'A(M)': 0,
    'A': 1,
    'B': 2,
    'C': 3,
    'Unclassified': 4
}

# Translate both columns through the dict. A plain lookup is used on purpose:
# an unexpected label raises KeyError instead of silently turning into NaN.
first_class = acc['1st_road_class'].apply(lambda x: translate_road[x])
second_class = acc['2nd_road_class'].apply(lambda x: translate_road[x])

# Where the second road class is -1, fall back to the first road's class.
# NOTE(review): -1 presumably marks "no second road" - confirm against the data dictionary.
second_class = second_class.where(second_class != -1, first_class)

# Store the pair in sorted order: the smaller code goes into '1st_road_class',
# the larger one into '2nd_road_class'. Elementwise min/max replaces the
# former row-by-row DataFrame.apply(axis=1) passes and yields the same values.
acc['1st_road_class'] = np.minimum(first_class, second_class)
acc['2nd_road_class'] = np.maximum(first_class, second_class)

In [83]:
# Speed limit is a numeric feature: store it as integers and flag it as
# non-categorical. The dictionary key must match the column name exactly,
# otherwise the column keeps its default categorical=True flag.
acc['speed_limit'] = acc['speed_limit'].apply(int)
categorical['speed_limit'] = False

In [84]:
# junction_detail is kept unchanged (no transformation is applied to it).

In [85]:
# junction_control is kept unchanged (no transformation is applied to it).

In [86]:
# pedestrian_crossing-human_control is kept unchanged (no transformation is applied to it).

In [87]:
# pedestrian_crossing-physical_facilities is kept unchanged (no transformation is applied to it).

In [88]:
# light_conditions is kept unchanged (no transformation is applied to it).

In [89]:
# weather_conditions is kept unchanged (no transformation is applied to it).

In [90]:
# road_surface_conditions is kept unchanged (no transformation is applied to it).

In [91]:
# Persist the pre-processed accident table. The row index is written as the
# first (unnamed) column, which is pandas' default behaviour for to_csv.
acc.to_csv(path_or_buf='acc_first_preprocessing.csv', index=True)