# Car accidents in the US

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Show matplotlib's plots in Jupyter notebook
%matplotlib inline

In [2]:
# Read the raw accident records from the CSV file.
# The timestamp columns are not parsed while loading; the option for doing so would be
# parse_dates=['Start_Time', 'End_Time', 'Weather_Timestamp']
csv_path = 'US_Accidents_June20.csv'
accidents = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows of the dataframe
accidents.head(n=5)

Unnamed: 0,ID,Source,TMC,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,MapQuest,201.0,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,,,...,False,False,False,False,False,False,Night,Night,Night,Night
1,A-2,MapQuest,201.0,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,,,...,False,False,False,False,False,False,Night,Night,Night,Day
2,A-3,MapQuest,201.0,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,,,...,False,False,False,False,True,False,Night,Night,Day,Day
3,A-4,MapQuest,201.0,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,,,...,False,False,False,False,False,False,Night,Day,Day,Day
4,A-5,MapQuest,201.0,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,,,...,False,False,False,False,True,False,Day,Day,Day,Day


In [4]:
# Print a structural summary of the dataframe (index range, columns and their dtypes)
accidents.info()
# size: 1008.6+ MB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3513617 entries, 0 to 3513616
Data columns (total 49 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   ID                     object 
 1   Source                 object 
 2   TMC                    float64
 3   Severity               int64  
 4   Start_Time             object 
 5   End_Time               object 
 6   Start_Lat              float64
 7   Start_Lng              float64
 8   End_Lat                float64
 9   End_Lng                float64
 10  Distance(mi)           float64
 11  Description            object 
 12  Number                 float64
 13  Street                 object 
 14  Side                   object 
 15  City                   object 
 16  County                 object 
 17  State                  object 
 18  Zipcode                object 
 19  Country                object 
 20  Timezone               object 
 21  Airport_Code           object 
 22  Weather_Timestamp 

# Duplicate values

In [5]:
# Rows sharing the same Zipcode, Start_Time and End_Time are treated as duplicates;
# only the first occurrence of each combination is kept.
dedup_keys = ['Zipcode', 'Start_Time', 'End_Time']
accidents = accidents.drop_duplicates(subset=dedup_keys, keep='first')

# original dataframe: 3,513,617 rows
# after dropping these duplicate rows: 3,454,567 rows

# Data types
check:
- [X] unieke values of ook rare waarden
- [X] missing values
 
33 - Bump                  - bool  - Schoon, alleen True en False\
34 - Crossing              - bool  - Schoon, alleen True en False\
35 - Give_Way              - bool  - Schoon, alleen True en False\
36 - Junction              - bool  - Schoon, alleen True en False\
37 - No_Exit               - bool  - Schoon, alleen True en False\
38 - Railway               - bool  - Schoon, alleen True en False\
39 - Roundabout            - bool  - Schoon, alleen True en False\
40 - Station               - bool  - Schoon, alleen True en False\
41 - Stop                  - bool  - Schoon, alleen True en False\
42 - Traffic_Calming       - bool  - Schoon, alleen True en False\
43 - Traffic_Signal        - bool  - Schoon, alleen True en False\
44 - Turning_Loop          - bool  - Alleen False --> verwijder variabele\
45 - Sunrise_Sunset        - object  - Night, Day, nan\
46 - Civil_Twilight        - object  - Night, Day, nan\
47 - Nautical_Twilight     - object  - Night, Day, nan\
48 - Astronomical_Twilight - object  - Night, Day, nan

In [6]:
#accidents_df['Severity'].unique()

In [7]:
# Change Severity to a category datatype.
#accidents['Severity'] = accidents['Severity'].astype('category')

#### Difference start time and end time

In [8]:
# Convert Start_Time, End_Time and Weather_Timestamp to datetime values
for time_col in ('Start_Time', 'End_Time', 'Weather_Timestamp'):
    accidents[time_col] = pd.to_datetime(accidents[time_col])

In [9]:
# Duration of each accident: End_Time minus Start_Time (one timedelta per row)
total_time = accidents['End_Time'].sub(accidents['Start_Time'])

# Add the duration to the dataframe as column 'Total_Time' at column position 6
accidents.insert(6, 'Total_Time', total_time)

In [10]:
#min(total_time)
#   '-1 days +23:25:55'
#max(total_time)
#   '987 days 11:15:29'


#accidents[accidents['Total_Time']>'30 days'].head()
#   22 rows where the total time is less than 0. 
#   Severity 2 and 3. 

#   326 rows where the total time is greater than 30 days.
#   Severity 2-3-4 (1 not visible, but possibly present as well).

In [11]:
# Keep only the accidents with a strictly positive duration.
# This removes the 22 rows whose total time is negative; a row with a duration of exactly
# zero or a missing duration would not pass this filter either.
has_positive_duration = accidents['Total_Time'].gt(pd.Timedelta(0))
accidents = accidents.loc[has_positive_duration]

In [12]:
# Duration of each accident in whole hours, stored as column 'Total_Time_H'.
# Dividing the timedelta column by a one-hour Timedelta yields fractional hours for every
# row at once (no Python-level loop over millions of rows). The cast to an integer dtype
# then truncates toward zero, so e.g. a duration of 1h59m becomes 1.
total_time_hours = accidents['Total_Time'] / pd.Timedelta(hours=1)
accidents['Total_Time_H'] = total_time_hours.astype('int64')

In [13]:
# Summary statistics of the accident duration in whole hours
accidents.loc[:, 'Total_Time_H'].describe()

count    3.454545e+06
mean     1.304805e+00
std      4.694197e+01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      1.000000e+00
max      2.369900e+04
Name: Total_Time_H, dtype: float64

In [14]:
# Number of missing values in the hourly duration column
accidents['Total_Time_H'].isna().sum()

0

#### Year of the accident

In [15]:
# Year in which the accident started (taken from Start_Time).
# The .dt accessor extracts the year for all rows at once and keeps the dataframe's index,
# so the values stay attached to the right rows when inserted.
accidents.insert(loc=8, column='Year', value=accidents['Start_Time'].dt.year)

#### Month of the accident

In [16]:
# Month (1-12) in which the accident started (taken from Start_Time).
# The .dt accessor extracts the month for all rows at once and keeps the dataframe's index,
# so the values stay attached to the right rows when inserted.
accidents.insert(loc=8, column='Month', value=accidents['Start_Time'].dt.month)

In [17]:
# Treat the month as a categorical variable instead of a number
accidents['Month'] = accidents['Month'].astype(pd.CategoricalDtype())

#### Hour of the day of the accident (based on Start_Time)

In [18]:
# Hour of the day (0-23) at which the accident started (taken from Start_Time).
# The .dt accessor extracts the hour for all rows at once and keeps the dataframe's index,
# so the values stay attached to the right rows when inserted.
accidents.insert(loc=8, column='Hour', value=accidents['Start_Time'].dt.hour)

In [19]:
# Create a new variable for the year of the accidents (based off start year)
#my_list=[]
#for obj in accidents['Start_Time']:
#    new = obj.strftime(format='%H:%M')
#    my_list.append(new)
#my_list

#accidents.insert(loc=8, column='Time', value=my_list)

#### Day of the week

In [20]:
# Name of the weekday on which the accident started (based on Start_Time).
# Series.dt.weekday encodes the day as an integer with Monday=0 ... Sunday=6;
# those codes are translated to lowercase day names.
weekday_names = {
    0: 'monday',
    1: 'tuesday',
    2: 'wednesday',
    3: 'thursday',
    4: 'friday',
    5: 'saturday',
    6: 'sunday',
}

# .map() returns a Series that carries the same index as `accidents`. This matters because
# DataFrame.insert aligns a Series on index labels: a Series built from a plain list gets a
# fresh 0..n-1 index, which no longer matches the dataframe once rows have been dropped,
# so names would end up on the wrong rows or as NaN.
weekday = accidents['Start_Time'].dt.weekday.map(weekday_names)

accidents.insert(loc=9, column='Weekday', value=weekday)

In [21]:
# Store the weekday names as a categorical variable
accidents['Weekday'] = accidents['Weekday'].astype(pd.CategoricalDtype())

#### Season of the accident

In [22]:
# Season of the accident, derived from the month of Start_Time:
# months 1-3 are winter, 4-6 spring, 7-9 summer and 10-12 fall.
season_by_quarter = {0: 'winter', 1: 'spring', 2: 'summer', 3: 'fall'}

# (month - 1) // 3 numbers the three-month blocks 0..3
quarter_index = (accidents['Start_Time'].dt.month - 1) // 3

# Rows whose month cannot be determined get the placeholder 'nothing???'.
# The result keeps the index of `accidents`. DataFrame.insert aligns a Series on index
# labels, so a Series with a fresh 0..n-1 index would put seasons on the wrong rows
# once rows have been dropped from the dataframe.
season = quarter_index.map(season_by_quarter).fillna('nothing???')
#season.value_counts()

accidents.insert(loc=10, column='Season', value=season)

#### Create a new light value based on Sunrise_Sunset: dusk/twilight
Create a new value for dusk/twilight using the other light columns. During twilight, visibility is worse than during the day but better than during the night.

In [23]:
# Classify the light conditions of every accident as 'Day', 'Twilight' or 'Night'.
sunrise_sunset = accidents['Sunrise_Sunset']
astro_twilight = accidents['Astronomical_Twilight']
is_night = sunrise_sunset == 'Night'

# Start with every row missing and fill in the rows that can be classified. Working on
# whole columns avoids a per-row lookup, and a row that cannot be classified simply stays
# NaN instead of leaving the result shorter than the dataframe.
light = pd.Series(np.nan, index=accidents.index, dtype=object)
light[sunrise_sunset == 'Day'] = 'Day'
# Sunrise_Sunset says 'Night' while Astronomical_Twilight still says 'Day' -> twilight
light[is_night & (astro_twilight == 'Day')] = 'Twilight'
light[is_night & (astro_twilight == 'Night')] = 'Night'

# Number of rows whose Sunrise_Sunset is neither 'Day' nor 'Night' (e.g. missing values)
error = int((~sunrise_sunset.isin(['Day', 'Night'])).sum())

# Create a new variable in 'accidents' dataframe
accidents['Light'] = light

In [114]:
# `error` holds the number of rows whose Sunrise_Sunset was neither 'Day' nor 'Night'.
# Only the last expression of a cell is echoed, so the value displayed is the number of
# missing values in the Light column.
error
accidents['Light'].isnull().sum()

109

In [24]:
# Drop the other light columns
#accidents = accidents.drop(['Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight'],
#                          axis=1)

#### Create a new column: total number of obstructions reported. 

In [25]:
# Names of the boolean columns that each flag one kind of reported obstruction.
# Note: working from this list may be simpler than handling every column separately.
obst = [
    'Bump',
    'Crossing',
    'Give_Way',
    'Junction',
    'No_Exit',
    'Railway',
    'Roundabout',
    'Station',
    'Stop',
    'Traffic_Calming',
    'Traffic_Signal',
]

def equals_true(series):
    """Convert a sequence of flags into a Series of 0/1 integers.

    Parameters
    ----------
    series : pandas.Series or other iterable
        Values to test. An element counts as 1 when it compares equal to ``True``
        and as 0 otherwise.

    Returns
    -------
    pandas.Series
        Unnamed integer Series with one 0/1 entry per input element. When the input is a
        Series its index is preserved, so the result stays aligned with the dataframe the
        column came from (also after rows were removed from that dataframe). Other
        iterables get the default 0..n-1 index.
    """
    # pd.Series() keeps the index of an existing Series and wraps any other iterable
    values = pd.Series(series)
    flags = values.eq(True).astype(int)
    # Drop the column name so that flags of different columns add up to an unnamed Series
    flags.name = None
    return flags
        
# One 0/1 flag Series per obstruction column, in the order of the names in `obst`
obst_flags = [equals_true(accidents[column]) for column in obst]
(obst_bump, obst_cros, obst_give, obst_junc, obst_noex, obst_rail,
 obst_roun, obst_stat, obst_stop, obst_tr_c, obst_tr_s) = obst_flags

# Total number of obstruction flags set per accident: add the flag Series up one by one
n_obst = obst_flags[0]
for flags in obst_flags[1:]:
    n_obst = n_obst + flags

In [26]:
# Store the number of obstructions reported per accident.
# n_obst is computed row by row from `accidents`, so its order matches the dataframe.
# The raw values are assigned positionally because assigning a Series aligns on index
# labels, and a Series with a fresh 0..n-1 index does not line up with `accidents` once
# rows have been dropped from it.
accidents['Total_Obstr'] = n_obst.to_numpy()

#### Create a new column: no obstructions reported

In [27]:
# One boolean per accident: True when no obstruction at all was counted for it
zero_obstr = n_obst.eq(0).tolist()

In [28]:
# Flag the accidents for which no obstruction was reported.
# The column must be filled from zero_obstr (the per-row "count == 0" flags); the scratch
# variable my_list holds values from an unrelated earlier step.
accidents['No_Obstr'] = zero_obstr

#### Drop rows where the weather information was recorded >1 hour before or after the accident

In [29]:
# Ilse
# Signed gap in hours between the start of the accident and the weather observation
# (positive when the weather was recorded before the accident started).
weather_gap = accidents["Start_Time"] - accidents["Weather_Timestamp"]
accidents["Diff_h_wt"] = weather_gap / pd.Timedelta(hours=1)

# Remove the rows whose weather observation lies more than one hour before or after the
# start of the accident. Rows with a missing gap match neither condition and are kept.
too_far_apart = (accidents["Diff_h_wt"] < -1) | (accidents["Diff_h_wt"] > 1)
accidents = accidents.drop(index=accidents.loc[too_far_apart].index)

In [31]:
# Store the season names as a categorical variable
accidents['Season'] = accidents['Season'].astype(pd.CategoricalDtype())