In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
def show_missing_rate(df: pd.DataFrame) -> None:
    """Print the percentage of missing values for each column that has any.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    None
        The report is printed rather than returned. It has one row per column
        whose missing percentage is greater than zero, with the columns
        'Feature' and 'Missing_Percent(%)'.
    """
    # The mean of the boolean null mask is the missing fraction per column.
    # On a frame with zero rows this yields NaN instead of dividing by zero;
    # NaN fails the `> 0` filter below, so an empty report is printed.
    missing_percent = df.isnull().mean() * 100
    missing = pd.DataFrame({
        'Feature': missing_percent.index,
        'Missing_Percent(%)': missing_percent.to_numpy(),
    })
    print(missing.loc[missing['Missing_Percent(%)'] > 0, :])

In [3]:
# Load the source CSV into `ori_data`; every later cell transforms this frame.
ori_data = pd.read_csv(filepath_or_buffer='./data/US_Accidents_March23.csv')

In [4]:
# Columns removed up front; none of them is referenced by the later cells.
drop_list = [
    'Source',
    'Description',
    'End_Lat',
    'End_Lng',
    'Zipcode',
    'Timezone',
    'Airport_Code',
    'ID',
    'Turning_Loop',
    'Country',
    'Precipitation(in)',
    'Wind_Chill(F)',
]
ori_data.drop(columns=drop_list, inplace=True)


In [5]:
# Discard rows that have no 'Nautical_Twilight' value.
ori_data.dropna(subset=['Nautical_Twilight'], inplace=True)
# Placeholder column at position 1; it is overwritten once the indicators
# below have been combined.
ori_data.insert(loc=1, column='Twilight', value=1)
twilight_list = ['Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight']
# Encode every twilight indicator as 1 for 'Day' and 0 for anything else
# (including missing values), so the indicators can be summed per row.
for twilight_col in twilight_list:
    ori_data[twilight_col] = (ori_data[twilight_col] == 'Day').astype('int64')
def set_day_or_night(x):
    """Collapse a count of 'Day' indicators into a single 0/1 flag.

    Returns 1 when `x` is greater than 2 and 0 when it is less than 2.
    Exactly 2 is a tie, broken by one draw from the global `random` module
    (no seed is set in the visible cells, so ties differ between runs).
    """
    if x == 2:
        # Tie: a draw above 0.5 gives 0, otherwise 1.
        return 0 if random.random() > 0.5 else 1
    return 1 if x > 2 else 0

    
# Count the 'Day' indicators per row, reduce that count to one 0/1 flag,
# then remove the four source columns.
day_votes = ori_data[twilight_list].sum(axis=1)
ori_data['Twilight'] = day_votes.map(set_day_or_night)
ori_data.drop(columns=twilight_list, inplace=True)

In [6]:
# Parse both timestamps from their ISO 8601 strings.
ori_data['Start_Time'] = pd.to_datetime(ori_data['Start_Time'], format='ISO8601')
ori_data['End_Time'] = pd.to_datetime(ori_data['End_Time'], format='ISO8601')

# Duration between start and end, expressed in minutes.
ori_data['elapsed_time'] = (ori_data['End_Time'] - ori_data['Start_Time']) / np.timedelta64(1, 'm')

# 'Hour' has to be created before 'Minute', which is derived from it.
# Reading ori_data['Hour'] first raises KeyError on a fresh kernel.
ori_data['Hour'] = ori_data['Start_Time'].dt.hour
# Minute of the day: hour * 60 plus the minute within the hour (float).
ori_data['Minute'] = ori_data['Hour'] * 60.0 + ori_data['Start_Time'].dt.minute
ori_data['Month'] = ori_data['Start_Time'].dt.month

# Day of the year: days elapsed before the month plus the day of the month.
# NOTE: the offsets assume a 28-day February, so Feb 29 and Mar 1 both map
# to 60 in leap years.
days_each_month = np.cumsum(np.array([0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]))
month_offset = days_each_month[ori_data['Month'].to_numpy() - 1]
ori_data['Day'] = month_offset + ori_data['Start_Time'].dt.day.to_numpy()

ori_data['Weekday'] = ori_data['Start_Time'].dt.weekday
# ori_data['Year'] = ori_data['Start_Time'].dt.year
# The raw timestamps are no longer needed once the features are derived.
ori_data.drop(['Start_Time', 'End_Time'], axis=1, inplace=True)


In [7]:
# Remove '(' and ')' and every case-insensitive occurrence of 'city' from the
# county names, then trim surrounding whitespace.
county_clean = ori_data['County'].str.replace(r'\(|\)|city', '', case=False, regex=True)
ori_data['County'] = county_clean.str.strip()


In [8]:
# Weather measurements that each county must have at least one reading for.
required_weather_columns = [
    'Temperature(F)',
    'Humidity(%)',
    'Visibility(mi)',
    'Wind_Speed(mph)',
    'Pressure(in)',
]


def _county_has_all_readings(county_rows):
    """Return True when every required weather column has a non-null value."""
    return all(county_rows[col].notna().any() for col in required_weather_columns)


# Keep only the counties that pass the check above.
ori_data = ori_data.groupby('County').filter(_county_has_all_readings)

# Drop rows without a weather timestamp, parse the rest and order the frame
# by that timestamp.
ori_data.dropna(subset=['Weather_Timestamp'], inplace=True)
ori_data['Weather_Timestamp'] = pd.to_datetime(ori_data['Weather_Timestamp'])
ori_data.sort_values(by='Weather_Timestamp', inplace=True)

def fill_in_missing_value(missing_column: str, ori_data: pd.DataFrame):
    """Fill the gaps of one column in place, county by county.

    Within each 'County' group the column is interpolated with
    method='nearest'; whatever is still missing afterwards (usually the start
    or the end of the group) is back-filled and then forward-filled.

    Parameters
    ----------
    missing_column : str
        Name of the column to fill.
    ori_data : pd.DataFrame
        Frame holding 'County' and `missing_column`; the column is overwritten.
    """
    # NOTE(review): pandas documents 'nearest' as using the numerical values of
    # the index, not the row position - confirm that this is the intended
    # notion of "nearest" when the frame has been re-sorted beforehand.
    def _fill_group(values):
        nearest = values.interpolate(method='nearest')
        return nearest.bfill().ffill()

    per_county = ori_data.groupby('County')[missing_column]
    ori_data[missing_column] = per_county.transform(_fill_group)


# Fill the gaps of each numeric weather measurement in turn.
for weather_column in (
    'Temperature(F)',
    'Humidity(%)',
    'Visibility(mi)',
    'Wind_Speed(mph)',
    'Pressure(in)',
):
    fill_in_missing_value(weather_column, ori_data)


In [9]:
# Drop rows without a wind direction.
ori_data.dropna(subset=['Wind_Direction'], inplace=True)

# Normalise the spelling variants and fold the three-letter directions into
# their leading cardinal direction.
wind_direction_map = {
    'Calm': 'CALM',
    'SSW': 'S', 'SSE': 'S', 'South': 'S',
    'NNW': 'N', 'NNE': 'N', 'North': 'N',
    'ESE': 'E', 'ENE': 'E', 'East': 'E',
    'WSW': 'W', 'WNW': 'W', 'West': 'W',
    'Variable': 'VAR',
}
# Assign the result back instead of calling replace(..., inplace=True) on
# ori_data['Wind_Direction']: that chained in-place form works on a temporary
# Series under pandas Copy-on-Write and leaves the frame unchanged.
ori_data['Wind_Direction'] = ori_data['Wind_Direction'].replace(wind_direction_map)
print(ori_data['Wind_Direction'].unique())


['N' 'SW' 'CALM' 'W' 'S' 'NW' 'VAR' 'SE' 'E' 'NE']


In [10]:
# Remove rows whose weather condition is missing or contains 'N/A' (any case),
# then rows without a street.
ori_data.dropna(subset=['Weather_Condition'], inplace=True)
unknown_weather = ori_data['Weather_Condition'].str.contains('N/A', case=False)
ori_data.drop(index=ori_data.loc[unknown_weather].index, inplace=True)
ori_data.dropna(subset=['Street'], inplace=True)


In [11]:
# Boolean flag column -> case-insensitive pattern searched for in
# 'Weather_Condition'. A row may match several patterns.
weather_flag_patterns = {
    'Clear': 'Clear|Fair',
    'Cloud': 'Cloud|Overcast',
    'Rain': 'Rain|Storm',
    'Heavy_Rain': 'Heavy Rain|Thunderstorms|Heavy T-Storm',
    'Snow': 'Snow|Hail|Sleet|Ice',
    'Heavy_Snow': 'Heavy Snow|Heavy Ice|Heavy Sleet',
    'Fog': 'Fog|Haze|Dust|Volcanic Ash|Smoke',
    'Windy': 'Wind|Tornado',
}
for flag_name, pattern in weather_flag_patterns.items():
    matches = ori_data['Weather_Condition'].str.contains(pattern, case=False, na=False)
    ori_data[flag_name] = matches.to_numpy(dtype=bool)

# The free-text column is dropped once the flags have been derived from it.
ori_data.drop(columns='Weather_Condition', inplace=True)

In [12]:

# Sanity check after cleaning: remaining missing rates, frame shape, columns.
show_missing_rate(ori_data)
print(ori_data.shape, ori_data.columns, sep='\n')

Empty DataFrame
Index: []
(7481003, 42)
Index(['Severity', 'Twilight', 'Start_Lat', 'Start_Lng', 'Distance(mi)',
       'Street', 'City', 'County', 'State', 'Weather_Timestamp',
       'Temperature(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)',
       'Wind_Direction', 'Wind_Speed(mph)', 'Amenity', 'Bump', 'Crossing',
       'Give_Way', 'Junction', 'No_Exit', 'Railway', 'Roundabout', 'Station',
       'Stop', 'Traffic_Calming', 'Traffic_Signal', 'elapsed_time', 'Hour',
       'Minute', 'Day', 'Month', 'Weekday', 'Clear', 'Cloud', 'Rain',
       'Heavy_Rain', 'Snow', 'Heavy_Snow', 'Fog', 'Windy'],
      dtype='object')


In [13]:
# Print the number of distinct values held by each column.
for column_name, column_values in ori_data.items():
    print(column_name, len(column_values.unique()))

Severity 4
Twilight 2
Start_Lat 2359683
Start_Lng 2409346
Distance(mi) 21881
Street 328636
City 12266
County 1746
State 49
Weather_Timestamp 933678
Temperature(F) 859
Humidity(%) 100
Pressure(in) 1137
Visibility(mi) 91
Wind_Direction 10
Wind_Speed(mph) 184
Amenity 2
Bump 2
Crossing 2
Give_Way 2
Junction 2
No_Exit 2
Railway 2
Roundabout 2
Station 2
Stop 2
Traffic_Calming 2
Traffic_Signal 2
elapsed_time 73249
Hour 24
Minute 60
Day 31
Month 12
Weekday 7
Clear 2
Cloud 2
Rain 2
Heavy_Rain 2
Snow 2
Heavy_Snow 2
Fog 2
Windy 2


In [14]:
# Write the cleaned frame to disk without the index column.
ori_data.to_csv(path_or_buf='./data/v2-1.csv', index=False)
