In [677]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import scipy
import sklearn
from category_encoders.target_encoder import TargetEncoder
from sklearn import preprocessing

## Train preprocessing

In [679]:
# Load the accident and vehicle training tables, discarding exact duplicate rows.
acc = pd.read_csv('acc_first_preprocessing.csv')
acc = acc.drop_duplicates()
veh = pd.read_csv('dataset/vehicles_train.csv')
veh = veh.drop_duplicates()

In [680]:
# Attach each accident's target to its vehicle rows (inner join on accident_id).
accident_targets = acc[['accident_id', 'target']]
veh = accident_targets.merge(veh, on='accident_id')

In [681]:
# Keep only the join key, the vehicle-level features and the target.
train_vehicle_columns = [
    'accident_id',
    'Vehicle_Type',
    'Towing_and_Articulation',
    'Vehicle_Manoeuvre',
    'Vehicle_Location-Restricted_Lane',
    'Junction_Location',
    'Skidding_and_Overturning',
    'Hit_Object_in_Carriageway',
    'Vehicle_Leaving_Carriageway',
    'Hit_Object_off_Carriageway',
    '1st_Point_of_Impact',
    'Was_Vehicle_Left_Hand_Drive?',
    'Journey_Purpose_of_Driver',
    'Sex_of_Driver',
    'Age_of_Driver',
    'Age_of_Vehicle',
    'Driver_IMD_Decile',
    'Driver_Home_Area_Type',
    'Vehicle_IMD_Decile',
    'target',
]
veh = veh[train_vehicle_columns]

In [683]:
# Set the (target, accident_id) pairs aside for re-attachment after aggregation,
# then build the encoder input: every column of veh except 'target'.
targett = veh.loc[:, ['target', 'accident_id']].copy()
feature_names = veh.columns.difference(['target'])
veh2 = veh.copy()[feature_names]

In [684]:
# Target encoding of the categorical vehicle variables.
# The fitted encoder (veh_enc) is kept so it can be applied to other data later.
target_encoder_cols = [
    'Vehicle_Type',
    'Towing_and_Articulation',
    'Vehicle_Manoeuvre',
    'Vehicle_Location-Restricted_Lane',
    'Junction_Location',
    'Skidding_and_Overturning',
    'Hit_Object_in_Carriageway',
    'Vehicle_Leaving_Carriageway',
    'Hit_Object_off_Carriageway',
    '1st_Point_of_Impact',
    'Was_Vehicle_Left_Hand_Drive?',
    'Journey_Purpose_of_Driver',
    'Sex_of_Driver',
    'Driver_IMD_Decile',
    'Driver_Home_Area_Type',
    'Vehicle_IMD_Decile',
]
veh_enc = TargetEncoder(cols=target_encoder_cols)
vehicle_target = veh['target']
veh = veh_enc.fit_transform(veh2, vehicle_target)

In [685]:
# Function to distinguish column names when merging datasets
def renaming(l, word):
    """Return the names in ``l`` with ``'_' + word`` appended.

    The join key ``'accident_id'`` is passed through unchanged so the renamed
    tables can still be merged on it.

    Parameters
    ----------
    l : iterable of str
        Column names to rename.
    word : str
        Suffix to append (without the leading underscore).

    Returns
    -------
    list of str
        The renamed names, in the same order as ``l``.
    """
    return [name if name == 'accident_id' else name + '_' + word for name in l]

In [686]:
# Max, mean and min transformation (dealing with variable number of cars):
# every accident collapses to a single row holding the per-accident maximum,
# mean and minimum of each encoded vehicle column, suffixed _max/_mean/_min.
vehicles_by_accident = veh.groupby(['accident_id'])
maximum = vehicles_by_accident.max().reset_index()
maximum.columns = renaming(maximum.columns, 'max')
average = vehicles_by_accident.mean().reset_index()
average.columns = renaming(average.columns, 'mean')
minimum = vehicles_by_accident.min().reset_index()
minimum.columns = renaming(minimum.columns, 'min')
a = pd.merge(maximum, average, on=['accident_id'])
final_veh = pd.merge(a, minimum, on=['accident_id'])
print(len(final_veh))
# targett was taken from the per-vehicle table, so it repeats each
# (target, accident_id) pair once per vehicle. Drop the repeats before merging;
# otherwise every aggregated accident row is duplicated once per vehicle.
final_veh = pd.merge(final_veh, targett.drop_duplicates(), on=['accident_id'], how='left')

136575


In [1077]:
# Reload the accidents table, this time using the first CSV column as the index.
acc = pd.read_csv(
    'acc_first_preprocessing.csv',
    index_col=0,
)

In [689]:
# Target encoding of accidents categorical variables.
# The encoder is fitted on every column except 'target'; afterwards each column
# except accident_id is suffixed with '_target'.
acc_copy = acc.copy()
acc_copy = acc_copy[acc_copy.columns.difference(['target'])]
target_encoder_cols = [
    'number_of_vehicles',
    'number_of_casualties',
    '1st_road_class',
    'road_type',
    'speed_limit',
    'junction_detail',
    'junction_control',
    '2nd_road_class',
    'pedestrian_crossing-human_control',
    'pedestrian_crossing-physical_facilities',
    'light_conditions',
    'weather_conditions',
    'road_surface_conditions',
    'urban_or_rural_area',
]
acc_enc = TargetEncoder(cols=target_encoder_cols)
accident_target = acc['target']
acc_target = acc_enc.fit_transform(acc_copy, accident_target)
acc_target.columns = renaming(acc_copy.columns, 'target')

In [692]:
# Accident columns that receive an integer label encoding.
label_cols = [
    'number_of_casualties',
    '1st_road_class',
    'road_type',
    'speed_limit',
    'junction_detail',
    'junction_control',
    '2nd_road_class',
    'pedestrian_crossing-human_control',
    'pedestrian_crossing-physical_facilities',
    'light_conditions',
    'weather_conditions',
    'road_surface_conditions',
    'urban_or_rural_area',
]

In [693]:
# Label encoding of categorical variables.
# Each column in label_cols gets its own freshly fitted LabelEncoder; the other
# columns are carried over unchanged, and every column except accident_id is
# then suffixed with '_label'.
acc_label = acc.copy()
acc_label = acc_label[acc_label.columns.difference(['target'])]
for i in label_cols:
    acc_enc2 = preprocessing.LabelEncoder()
    # Assign the encoded array directly so values are placed positionally.
    # Wrapping it in a DataFrame gives it a fresh 0..n-1 index, and pandas then
    # aligns on acc's index (read from the CSV's first column), which produces
    # NaN or misplaced codes whenever that index is not exactly 0..n-1.
    acc_label[i] = acc_enc2.fit_transform(acc[i].to_numpy())
acc_label.columns = renaming(acc_label.columns, 'label')

In [694]:
# Merge of the three datasets previously created: label-encoded accidents,
# target-encoded accidents and aggregated vehicles, all joined on accident_id.
acc_total = acc_label.merge(acc_target, on='accident_id')
final = acc_total.merge(final_veh, on='accident_id')

In [696]:
# Columns kept for the training output, in output order: label-encoded accident
# features, target-encoded accident features, the max/mean/min vehicle
# aggregates, and finally the target. accident_id is not kept.
final_train_columns = [
    # label-encoded accident features
    '1st_road_class_label', '2nd_road_class_label',
    'date_label', 'junction_control_label', 'junction_detail_label',
    'light_conditions_label', 'number_of_casualties_label',
    'number_of_vehicles_label', 'pedestrian_crossing-human_control_label',
    'pedestrian_crossing-physical_facilities_label',
    'road_surface_conditions_label', 'road_type_label', 'speed_limit_label',
    'time_cos_label', 'time_sin_label', 'urban_or_rural_area_label',
    'weather_conditions_label',
    # target-encoded accident features
    '1st_road_class_target',
    '2nd_road_class_target', 'date_target', 'junction_control_target',
    'junction_detail_target', 'light_conditions_target',
    'number_of_casualties_target', 'number_of_vehicles_target',
    'pedestrian_crossing-human_control_target',
    'pedestrian_crossing-physical_facilities_target',
    'road_surface_conditions_target', 'road_type_target',
    'speed_limit_target',
    'urban_or_rural_area_target', 'weather_conditions_target',
    # per-accident maximum of the vehicle features
    '1st_Point_of_Impact_max', 'Age_of_Driver_max', 'Age_of_Vehicle_max',
    'Driver_Home_Area_Type_max', 'Driver_IMD_Decile_max',
    'Hit_Object_in_Carriageway_max', 'Hit_Object_off_Carriageway_max',
    'Journey_Purpose_of_Driver_max', 'Junction_Location_max',
    'Sex_of_Driver_max', 'Skidding_and_Overturning_max',
    'Towing_and_Articulation_max', 'Vehicle_IMD_Decile_max',
    'Vehicle_Leaving_Carriageway_max',
    'Vehicle_Location-Restricted_Lane_max', 'Vehicle_Manoeuvre_max',
    'Vehicle_Type_max', 'Was_Vehicle_Left_Hand_Drive?_max',
    # per-accident mean of the vehicle features
    '1st_Point_of_Impact_mean', 'Age_of_Driver_mean', 'Age_of_Vehicle_mean',
    'Driver_Home_Area_Type_mean', 'Driver_IMD_Decile_mean',
    'Hit_Object_in_Carriageway_mean', 'Hit_Object_off_Carriageway_mean',
    'Journey_Purpose_of_Driver_mean', 'Junction_Location_mean',
    'Sex_of_Driver_mean', 'Skidding_and_Overturning_mean',
    'Towing_and_Articulation_mean', 'Vehicle_IMD_Decile_mean',
    'Vehicle_Leaving_Carriageway_mean',
    'Vehicle_Location-Restricted_Lane_mean', 'Vehicle_Manoeuvre_mean',
    'Vehicle_Type_mean', 'Was_Vehicle_Left_Hand_Drive?_mean',
    # per-accident minimum of the vehicle features
    '1st_Point_of_Impact_min', 'Age_of_Driver_min', 'Age_of_Vehicle_min',
    'Driver_Home_Area_Type_min', 'Driver_IMD_Decile_min',
    'Hit_Object_in_Carriageway_min', 'Hit_Object_off_Carriageway_min',
    'Journey_Purpose_of_Driver_min', 'Junction_Location_min',
    'Sex_of_Driver_min', 'Skidding_and_Overturning_min',
    'Towing_and_Articulation_min', 'Vehicle_IMD_Decile_min',
    'Vehicle_Leaving_Carriageway_min',
    'Vehicle_Location-Restricted_Lane_min', 'Vehicle_Manoeuvre_min',
    'Vehicle_Type_min', 'Was_Vehicle_Left_Hand_Drive?_min',
    # label
    'target',
]
final = final[final_train_columns]

In [697]:
# Remove fully identical rows, keeping the first occurrence of each.
final = final.drop_duplicates(keep='first')

In [698]:
# Persist the training table as CSV (the row index is written as well).
final.to_csv('preprocessed.csv', index=True)

In [700]:
# Persist the same table as a NumPy array; column names are not stored in it.
train_matrix = final.to_numpy()
np.save('preprocessed.npy', train_matrix)

## Test preprocessing

In [1044]:
# We repeat the same preprocessing, now with the test dataset (which has no target variable).
acc = pd.read_csv('test_acc_first_preprocessing.csv')
acc = acc.drop_duplicates()
veh = pd.read_csv('dataset/vehicles.csv')
veh = veh.drop_duplicates()

In [1045]:
# Left join: every test accident is kept even when it has no vehicle rows,
# in which case its vehicle columns are NaN.
veh = acc.merge(veh, on=['accident_id'], how='left')

In [1046]:
# Keep only the join key and the vehicle-level features (the test data has no target).
test_vehicle_columns = [
    'accident_id',
    'Vehicle_Type',
    'Towing_and_Articulation',
    'Vehicle_Manoeuvre',
    'Vehicle_Location-Restricted_Lane',
    'Junction_Location',
    'Skidding_and_Overturning',
    'Hit_Object_in_Carriageway',
    'Vehicle_Leaving_Carriageway',
    'Hit_Object_off_Carriageway',
    '1st_Point_of_Impact',
    'Was_Vehicle_Left_Hand_Drive?',
    'Journey_Purpose_of_Driver',
    'Sex_of_Driver',
    'Age_of_Driver',
    'Age_of_Vehicle',
    'Driver_IMD_Decile',
    'Driver_Home_Area_Type',
    'Vehicle_IMD_Decile',
]
veh = veh[test_vehicle_columns]

In [1047]:
# Apply the vehicle target encoder fitted on the training data; it is not refit here.
encoded_test_vehicles = veh_enc.transform(veh)
veh = encoded_test_vehicles

In [1050]:
# Collapse the encoded test vehicles to one row per accident: per-accident
# maximum, mean and minimum of every column, suffixed _max/_mean/_min.
vehicles_by_accident = veh.groupby(['accident_id'])
maximum = vehicles_by_accident.max().reset_index()
maximum.columns = renaming(maximum.columns, 'max')
average = vehicles_by_accident.mean().reset_index()
average.columns = renaming(average.columns, 'mean')
minimum = vehicles_by_accident.min().reset_index()
minimum.columns = renaming(minimum.columns, 'min')
a = maximum.merge(average, on=['accident_id'])
final_veh = a.merge(minimum, on=['accident_id'])

In [1052]:
# Reload the test accidents table, using the first CSV column as the index.
acc = pd.read_csv(
    'test_acc_first_preprocessing.csv',
    index_col=0,
)

In [1054]:
# Restrict the test accidents to the join key plus the feature columns.
test_accident_columns = [
    'accident_id',
    'number_of_vehicles',
    'number_of_casualties',
    'date',
    '1st_road_class',
    'road_type',
    'speed_limit',
    'junction_detail',
    'junction_control',
    '2nd_road_class',
    'pedestrian_crossing-human_control',
    'pedestrian_crossing-physical_facilities',
    'light_conditions',
    'weather_conditions',
    'road_surface_conditions',
    'urban_or_rural_area',
    'time_sin',
    'time_cos',
]
acc = acc[test_accident_columns]

In [1055]:
# Apply the accident target encoder fitted on the training data, then suffix
# every column except accident_id with '_target'.
acc_target = acc_enc.transform(acc)
target_suffixed_names = renaming(acc_target.columns, 'target')
acc_target.columns = target_suffixed_names

In [1056]:
# Accident columns that receive an integer label encoding.
label_cols = [
    'number_of_casualties',
    '1st_road_class',
    'road_type',
    'speed_limit',
    'junction_detail',
    'junction_control',
    '2nd_road_class',
    'pedestrian_crossing-human_control',
    'pedestrian_crossing-physical_facilities',
    'light_conditions',
    'weather_conditions',
    'road_surface_conditions',
    'urban_or_rural_area',
]

In [1057]:
# Label-encode the categorical accident columns of the test data, then suffix
# every column except accident_id with '_label'.
# NOTE(review): a new LabelEncoder is fitted on the test values here, so the
# integer codes match the training codes only if each column contains the same
# set of categories in both datasets -- confirm, or reuse the training encoders.
acc_label = acc.copy()
for i in label_cols:
    acc_enc2 = preprocessing.LabelEncoder()
    # Assign the encoded array directly so values are placed positionally.
    # Wrapping it in a DataFrame gives it a fresh 0..n-1 index, and pandas then
    # aligns on acc's index (read from the CSV's first column), which produces
    # NaN or misplaced codes whenever that index is not exactly 0..n-1.
    acc_label[i] = acc_enc2.fit_transform(acc[i].to_numpy())
acc_label.columns = renaming(acc_label.columns, 'label')

In [1059]:
# Join the label-encoded and target-encoded accident tables on accident_id.
acc_total = acc_label.merge(acc_target, on='accident_id')

In [1061]:
# Add the per-accident vehicle aggregates (inner join on accident_id).
final = acc_total.merge(final_veh, on='accident_id')

In [1062]:
# Discard the '_target' copies of the two time columns.
redundant_time_columns = ['time_sin_target', 'time_cos_target']
final = final.drop(redundant_time_columns, axis=1)

In [1064]:
# Columns kept for the test output, in output order: accident_id, label-encoded
# accident features, target-encoded accident features, then the max/mean/min
# vehicle aggregates.
# NOTE(review): this order (and the leading accident_id) differs from the column
# order selected for the training output earlier in the notebook; anything that
# consumes the saved .npy arrays by position should confirm the features line up.
final_test_columns = [
    'accident_id',
    # label-encoded accident features
    'number_of_vehicles_label', 'number_of_casualties_label',
    'date_label', '1st_road_class_label', 'road_type_label',
    'speed_limit_label', 'junction_detail_label', 'junction_control_label',
    '2nd_road_class_label', 'pedestrian_crossing-human_control_label',
    'pedestrian_crossing-physical_facilities_label',
    'light_conditions_label', 'weather_conditions_label',
    'road_surface_conditions_label', 'urban_or_rural_area_label',
    'time_sin_label', 'time_cos_label',
    # target-encoded accident features
    'number_of_vehicles_target',
    'number_of_casualties_target', 'date_target', '1st_road_class_target',
    'road_type_target', 'speed_limit_target', 'junction_detail_target',
    'junction_control_target', '2nd_road_class_target',
    'pedestrian_crossing-human_control_target',
    'pedestrian_crossing-physical_facilities_target',
    'light_conditions_target', 'weather_conditions_target',
    'road_surface_conditions_target', 'urban_or_rural_area_target',
    # per-accident maximum of the vehicle features
    'Vehicle_Type_max', 'Towing_and_Articulation_max',
    'Vehicle_Manoeuvre_max', 'Vehicle_Location-Restricted_Lane_max',
    'Junction_Location_max', 'Skidding_and_Overturning_max',
    'Hit_Object_in_Carriageway_max', 'Vehicle_Leaving_Carriageway_max',
    'Hit_Object_off_Carriageway_max', '1st_Point_of_Impact_max',
    'Was_Vehicle_Left_Hand_Drive?_max', 'Journey_Purpose_of_Driver_max',
    'Sex_of_Driver_max', 'Age_of_Driver_max', 'Age_of_Vehicle_max',
    'Driver_IMD_Decile_max', 'Driver_Home_Area_Type_max',
    'Vehicle_IMD_Decile_max',
    # per-accident mean of the vehicle features
    'Vehicle_Type_mean',
    'Towing_and_Articulation_mean', 'Vehicle_Manoeuvre_mean',
    'Vehicle_Location-Restricted_Lane_mean', 'Junction_Location_mean',
    'Skidding_and_Overturning_mean', 'Hit_Object_in_Carriageway_mean',
    'Vehicle_Leaving_Carriageway_mean', 'Hit_Object_off_Carriageway_mean',
    '1st_Point_of_Impact_mean', 'Was_Vehicle_Left_Hand_Drive?_mean',
    'Journey_Purpose_of_Driver_mean', 'Sex_of_Driver_mean',
    'Age_of_Driver_mean', 'Age_of_Vehicle_mean', 'Driver_IMD_Decile_mean',
    'Driver_Home_Area_Type_mean', 'Vehicle_IMD_Decile_mean',
    # per-accident minimum of the vehicle features
    'Vehicle_Type_min', 'Towing_and_Articulation_min',
    'Vehicle_Manoeuvre_min', 'Vehicle_Location-Restricted_Lane_min',
    'Junction_Location_min', 'Skidding_and_Overturning_min',
    'Hit_Object_in_Carriageway_min', 'Vehicle_Leaving_Carriageway_min',
    'Hit_Object_off_Carriageway_min', '1st_Point_of_Impact_min',
    'Was_Vehicle_Left_Hand_Drive?_min', 'Journey_Purpose_of_Driver_min',
    'Sex_of_Driver_min', 'Age_of_Driver_min', 'Age_of_Vehicle_min',
    'Driver_IMD_Decile_min', 'Driver_Home_Area_Type_min', 'Vehicle_IMD_Decile_min',
]
final = final[final_test_columns]

In [1066]:
# Function to fill the accident rows without vehicles associated
# (the earlier `from random import random` was dropped: `import random` rebinds
# the same name to the module, so it never had any effect).
import random


def randi(x, data):
    """Return ``x``, or a randomly drawn observed value of ``data`` if ``x`` is missing.

    Parameters
    ----------
    x : scalar
        The value to check.
    data : pandas.Series
        The column from which a replacement is sampled.

    Returns
    -------
    scalar
        ``x`` unchanged when it is not missing. Otherwise a float drawn uniformly
        from the non-null entries of ``data``. If ``data`` has no non-null
        entries, ``x`` is returned unchanged because nothing can be sampled.

    Raises
    ------
    ValueError
        If the sampled replacement cannot be converted to ``float``.
    """
    if not pd.isna(x):
        return x
    # Sample from the observed values themselves. ``data.notnull()`` is only the
    # boolean mask, so drawing from it would yield True/False (1.0 or 0.0).
    observed = data.dropna().tolist()
    if not observed:
        return x
    return float(random.choice(observed))

In [1078]:
# Fill missing entries column by column: each value is passed through randi
# together with a snapshot of its column taken before any filling started.
final2 = final.copy()
for column in final.columns.tolist():
    reference = final2[column]
    final[column] = final[column].apply(lambda value: randi(value, reference))

In [989]:
# Persist the test table as CSV (the row index is written as well).
final.to_csv('test.csv', index=True)

In [991]:
# Persist the same table as a NumPy array; column names are not stored in it.
test_matrix = final.to_numpy()
np.save('test.npy', test_matrix)