In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

import dataset
import utils

%matplotlib inline

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

sns.set_style('whitegrid')

plt.rcParams['figure.figsize'] = (18.0, 12.0)

In [2]:
carac = pd.read_csv("dataset/caracteristics.csv", encoding="latin-1", low_memory=False)
places = pd.read_csv("dataset/places.csv", encoding="utf-8", low_memory=False)
users = pd.read_csv("dataset/users.csv", encoding="utf-8", low_memory=False)
vehicles = pd.read_csv("dataset/vehicles.csv", encoding="utf-8", low_memory=False)
holi = pd.read_csv("dataset/holidays.csv", encoding="utf-8", low_memory=False)

In [3]:
carac = carac.rename(columns=dataset.column_names)
places = places.rename(columns=dataset.column_names)
users = users.rename(columns=dataset.column_names)
vehicles = vehicles.rename(columns=dataset.column_names)
holi = holi.rename(columns=dataset.column_names)

In [None]:
carac.head()

In [4]:
def to_datetime(r):
    hhmm = "%04d" % (r['Hour'])
    hh = hhmm[:2]
    mm = hhmm[-2:]
    date = "2%03d/%02d/%02d %s:%s" % (r['Year'] , r['Month'], r['Day'], hh, mm)
    r = pd.to_datetime(date, format='%Y/%m/%d %H:%M')
    return r

def par(data):
    return data.apply(to_datetime, axis=1)
 
carac['DateTime'] = utils.parall_df(carac, par)

In [5]:
carac['Date'] = carac['DateTime'].dt.date
carac['Time'] = carac['DateTime'].dt.time
carac['Hour'] = carac['DateTime'].dt.hour
carac['Minute'] = carac['DateTime'].dt.minute
carac['Weekdays'] = carac['DateTime'].dt.dayofweek # Monday=0, Sunday=6

In [None]:
carac.head()

In [None]:
holi.head()

In [6]:
from datetime import timedelta
period = 7
delta = timedelta(days=period)

holi['ds'] = pd.to_datetime(holi['ds'])
holi_range = pd.DataFrame()

for index, row in holi.iterrows():
    date = pd.to_datetime(row['ds'])
    dates = pd.date_range(date - delta, periods=1+(period*2), freq='d')
    days = dates.day
    months = dates.month
    holiday_ranges = dates - date
    df = pd.DataFrame({'Month': months, 'Day': days, 'HolidayRange': holiday_ranges.days})
    df['Holiday'] = row['holiday']
    holi_range = pd.concat([holi_range, df], ignore_index=True)

In [None]:
holi_range.head()

In [None]:
places.head()

In [None]:
users.head()

In [7]:
users['Users'] = 1
users = utils.dummies(users, 'UserCategory')
users = utils.dummies(users, 'Severity')
users = utils.dummies(users, 'Sex')


# users = utils.dummies(users, 'SafetyEquipment')
# users['DriverBirthday'] = users['an_nais'][users['Driver']]
# users.head()

In [None]:
#users.columns.values

In [8]:
columns = [
    'AccidentID', 'Users',
    'UserCategory_1', 'UserCategory_2', 'UserCategory_3', 'UserCategory_4', 
    'Severity_1', 'Severity_2', 'Severity_3', 'Severity_4', 
    'Sex_1', 'Sex_2'
]
group_users = users[columns].groupby(['AccidentID'])

In [27]:
agg_users = group_users.agg('sum')
agg_users['AccidentID'] = agg_users.index
agg_users.head()

Unnamed: 0_level_0,Users,UserCategory_1,UserCategory_2,UserCategory_3,UserCategory_4,Severity_1,Severity_2,Severity_3,Severity_4,Sex_1,Sex_2,AccidentID
AccidentID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
200500000001,6,2,4,0,0,4,0,1,1,4,2,200500000001
200500000002,2,2,0,0,0,1,0,1,0,2,0,200500000002
200500000003,2,2,0,0,0,1,0,1,0,2,0,200500000003
200500000004,4,3,1,0,0,0,0,2,2,2,2,200500000004
200500000005,2,1,1,0,0,0,0,1,1,2,0,200500000005


In [None]:
users[users['AccidentID'] == 200500000004]

In [None]:
vehicles[vehicles['AccidentID'] == 200500000004]

In [None]:
vehicles.head()

In [10]:
vehicles['Vehicles'] = 1

In [11]:
column = [
    'AccidentID', 'Vehicles'
]
group_vehicles = vehicles[column].groupby(['AccidentID'])
group_vehicles.head()

Unnamed: 0,AccidentID,Vehicles
0,201600000001,1
1,201600000001,1
2,201600000002,1
3,201600000003,1
4,201600000004,1
5,201600000004,1
6,201600000005,1
7,201600000005,1
8,201600000006,1
9,201600000007,1


In [26]:
agg_vehicles = group_vehicles.agg('sum')
agg_vehicles['AccidentID'] = agg_vehicles.index
agg_vehicles.head()

Unnamed: 0_level_0,Vehicles,AccidentID
AccidentID,Unnamed: 1_level_1,Unnamed: 2_level_1
200500000001,2,200500000001
200500000002,2,200500000002
200500000003,2,200500000003
200500000004,3,200500000004
200500000005,1,200500000005


In [14]:
c_h = pd.merge(carac, holi_range, on=['Month', 'Day'], how='left')

In [15]:
c_h_p = pd.merge(c_h, places, on='AccidentID')

In [28]:
c_h_p_u = pd.merge(c_h_p, agg_users, on='AccidentID')

In [31]:
c_h_p_u_v = pd.merge(c_h_p_u, agg_vehicles, on='AccidentID')

In [34]:
c_h_p_u_v.columns.values

array(['AccidentID', 'Year', 'Month', 'Day', 'Hour', 'LightingCondition',
       'Localisation', 'Intersection', 'AtmosphericCondition',
       'CollisionType', 'Municipality', 'Address', 'GpsCoding',
       'Latitude', 'Longitude', 'Department', 'DateTime', 'Date', 'Time',
       'Minute', 'Weekdays', 'HolidayRange', 'Holiday', 'RoadCategory',
       'RoadNumber', 'RouteNumber', 'RouteName', 'TrafficType',
       'NumberofLanes', 'HomePRNumber', 'PRDistance', 'OuterLane', 'prof',
       'LaneStructure', 'CentralLaneWidth', 'OuterLaneWidth',
       'SurfaceCondition', 'Infrastructure', 'SituationofAccident',
       'SchoolPoint', 'Users', 'UserCategory_1', 'UserCategory_2',
       'UserCategory_3', 'UserCategory_4', 'Severity_1', 'Severity_2',
       'Severity_3', 'Severity_4', 'Sex_1', 'Sex_2', 'Vehicles'],
      dtype=object)

In [32]:
column = []
merged = c_h_p_u_v[column]
merged.head()

Unnamed: 0,AccidentID,Year,Month,Day,Hour,LightingCondition,Localisation,Intersection,AtmosphericCondition,CollisionType,Municipality,Address,GpsCoding,Latitude,Longitude,Department,DateTime,Date,Time,Minute,Weekdays,HolidayRange,Holiday,RoadCategory,RoadNumber,RouteNumber,RouteName,TrafficType,NumberofLanes,HomePRNumber,PRDistance,OuterLane,prof,LaneStructure,CentralLaneWidth,OuterLaneWidth,SurfaceCondition,Infrastructure,SituationofAccident,SchoolPoint,Users,UserCategory_1,UserCategory_2,UserCategory_3,UserCategory_4,Severity_1,Severity_2,Severity_3,Severity_4,Sex_1,Sex_2,Vehicles
0,201600000001,16,2,1,14,1,2,1,8.0,3.0,5.0,"46, rue Sonneville",M,0.0,0,590,2016-02-01 14:45:00,2016-02-01,14:45:00,45,0,,,3.0,39,,,2.0,0.0,,,0.0,1.0,3.0,0.0,0.0,1.0,0.0,1.0,0.0,2,2,0,0,0,1,0,1,0,1,1,2
1,201600000002,16,3,16,18,1,2,6,1.0,6.0,5.0,1a rue du cimetière,M,0.0,0,590,2016-03-16 18:00:00,2016-03-16,18:00:00,0,2,,,3.0,39,,,1.0,0.0,,,0.0,1.0,2.0,0.0,58.0,1.0,0.0,1.0,0.0,3,1,2,0,0,0,0,3,0,2,1,1
2,201600000003,16,7,13,19,1,1,1,1.0,6.0,11.0,,M,0.0,0,590,2016-07-13 19:00:00,2016-07-13,19:00:00,0,2,-1.0,Bastille Day,3.0,1,,,2.0,2.0,,,0.0,1.0,3.0,0.0,68.0,2.0,0.0,3.0,99.0,2,1,0,1,0,1,0,1,0,2,0,1
3,201600000003,16,7,13,19,1,1,1,1.0,6.0,11.0,,M,0.0,0,590,2016-07-13 19:00:00,2016-07-13,19:00:00,0,2,-1.0,Bastille Day,3.0,1,,,2.0,2.0,,,0.0,1.0,3.0,0.0,68.0,2.0,0.0,3.0,99.0,2,1,0,1,0,1,0,1,0,2,0,1
4,201600000003,16,7,13,19,1,1,1,1.0,6.0,11.0,,M,0.0,0,590,2016-07-13 19:00:00,2016-07-13,19:00:00,0,2,-1.0,Bastille Day,3.0,1,,,2.0,2.0,,,0.0,1.0,3.0,0.0,68.0,2.0,0.0,3.0,99.0,2,1,0,1,0,1,0,1,0,2,0,1


In [33]:
merged.to_csv('dataset/agg.csv.gz', compression='gzip', encoding='utf-8', index=False)