In [54]:
from sklearn import metrics
from sklearn.metrics import log_loss
from sklearn.metrics import f1_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.externals.joblib import parallel_backend
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.ticker import PercentFormatter
import seaborn as sns
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import warnings
from sklearn.preprocessing import LabelEncoder
import numpy as np
from imblearn.over_sampling import SMOTE
from collections import Counter
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
import pickle
warnings.filterwarnings('ignore')

In [55]:
import pandas as pd
import json

In [56]:
data = pd.read_csv("US_Accidents_Dec19.csv")
data.head()

Unnamed: 0,ID,Source,TMC,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,MapQuest,201.0,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,,,...,False,False,False,False,False,False,Night,Night,Night,Night
1,A-2,MapQuest,201.0,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,,,...,False,False,False,False,False,False,Night,Night,Night,Day
2,A-3,MapQuest,201.0,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,,,...,False,False,False,False,True,False,Night,Night,Day,Day
3,A-4,MapQuest,201.0,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,,,...,False,False,False,False,False,False,Night,Day,Day,Day
4,A-5,MapQuest,201.0,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,,,...,False,False,False,False,True,False,Day,Day,Day,Day


In [57]:
test_labels = data['Severity']
test_data = data.drop('Severity',axis=1)

In [58]:
from sklearn.cluster import MiniBatchKMeans
# this lib is used while we calculate the stight line distance between two (lat,lon) pairs in miles
import gpxpy.geo

def find_clusters(increment,data):
    kmeans = pickle.load(open('../input/kmeans-model/kmeans_model.sav', 'rb'))
    data['Cluster'] = kmeans.predict(data[['Start_Lng', 'Start_Lat']])
    cluster_centers = kmeans.cluster_centers_
    cluster_len = len(cluster_centers)
    return cluster_centers, cluster_len

def encode_weather(x,weather):
    """Encoding the weather by getting the count from the count dictionary created"""
    t = 10
    if weather.get(x,10)<=t:
        return 10
    else:
        return weather.get(x,10)
    
def encode_cyclic(data, col, max_val):
    """Encoding cyclic features using some formulas that preserve the order of variables"""
    data[col + '_sin'] = np.sin(2 * np.pi * data[col]/max_val)
    data[col + '_cos'] = np.cos(2 * np.pi * data[col]/max_val)
    return data.drop(col,axis=1)

In [59]:
def save_obj(obj, name):
    with open('objects/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    with open('../input/acc-data/objects_le/objects/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [60]:
model = pickle.load(open('../input/acc-data/finalized_model(1).sav', 'rb'))

In [61]:
# mean values of some numerical columns these will help when most values in the test dataset are null
means = {'Start_Lat': 35.63038181495091,
 'Start_Lng': -119.32277163452122,
 'Distance(mi)': 0.01135759999999605,
 'Temperature(F)': 66.63984696521987,
 'Humidity(%)': 59.79727746984023,
 'Pressure(in)': 29.96102322000202,
 'Visibility(mi)': 9.338445809646085}

In [62]:
# Final function 1
def final_fun_1(data):
    feature_lst=['Start_Lng','Start_Lat',\
         'Distance(mi)','Side','City','County','State','Timezone',\
         'Temperature(F)','Humidity(%)','Pressure(in)', 'Visibility(mi)',\
         'Wind_Direction','Weather_Condition','Amenity','Bump','Crossing',\
         'Give_Way','Junction','No_Exit','Railway','Roundabout','Station',\
         'Stop','Traffic_Calming','Traffic_Signal','Turning_Loop','Sunrise_Sunset',\
         'Start_Time','End_Time']
    #filling some nan values with most means
    for i in means.keys():
        if str(np.mean(data[i])).lower()=='nan':
            data[i].fillna(means[i],inplace=True)
        else:
            data[i].fillna(data[i].mean(),inplace=True)
    data = data[feature_lst].copy()
    test = data
    #removing features where number of null values>20% of entire values
    cols_to_keep = test.columns[test.isnull().mean() < 0.2]
    test = test[cols_to_keep]
    test['Start_Time'] = pd.to_datetime(test['Start_Time'], errors='coerce')
    test['End_Time'] = pd.to_datetime(test['End_Time'], errors='coerce')
    test['Sunrise_Sunset'].fillna('Day',inplace=True)
    test['Weather_Condition'].fillna('Clear',inplace=True)
    #filling nan categorical features with most common values
    test['Wind_Direction'].fillna('Calm',inplace=True)
    test['Timezone'].fillna('US/Eastern',inplace=True)
    test['City'].fillna('Houston',inplace=True)
    # Extract year, month, day, hour and weekday
    test['Year']=test['Start_Time'].dt.year
    test['Month']=test['Start_Time'].dt.month
    test['Day']=test['Start_Time'].dt.day
    test['Hour']=test['Start_Time'].dt.hour
    test['Weekday']=test['Start_Time'].dt.strftime('%a')
    # Extract the amount of time in the unit of minutes for each accident, round to the nearest integer
    test['Duration']=round((test['End_Time']-test['Start_Time'])/np.timedelta64(1,'m'))
    # removing duration values which are negative
    neg_outliers=test['Duration']<=0
    # Set outliers to NAN
    test['Duration'][neg_outliers] = np.nan
    # Fill rows with negative duration with median
    test['Duration'].fillna(np.round(test['Duration'].mean()),inplace=True)
    test['Duration'].median()
    # hour of day feature using start time
    test['Hour'] = test['Start_Time'].apply(lambda x: pd.to_datetime(x).hour)
    ## dropping some more irrelevant columns
    test = test.drop(['Start_Time','End_Time'],axis=1)
    #encoding weekday
    day_dict = {'Sun':0,'Mon':1,'Tue':2,'Wed':3,'Thu':4,'Fri':5,'Sat':6}
    test['Weekday'] = test['Weekday'].apply(lambda x: day_dict.get(x,1))
    # encoding cyclic features    
    test = encode_cyclic(test, 'Month', 12)
    test = encode_cyclic(test, 'Hour', 24)
    #getting a new feature using cluster centers
    cluster_centers, cluster_len = find_clusters(1100,test)
    # extracting x, y and z from lat and long
    test['loc_x'] = np.cos(test['Start_Lat']) * np.cos(test['Start_Lng'])
    test['loc_y'] = np.cos(test['Start_Lat']) * np.sin(test['Start_Lng'])
    test['loc_z'] = np.sin(test['Start_Lat'])
    # label encoding the features
    le = load_obj('wind_le')
    test['Wind_Direction'] = test['Wind_Direction'].apply(lambda x: le.get(x,1))
    le = load_obj('sunrise_le')
    test['Sunrise_Sunset'] = test['Sunrise_Sunset'].apply(lambda x: le.get(x,0))
    le = load_obj('side_le')
    test['Side'] = test['Side'].apply(lambda x: le.get(x,0))
    le = load_obj('city_le')
    test['City'] = test['City'].apply(lambda x: le.get(x,00))
    le = load_obj('county_le')
    test['County'] = test['County'].apply(lambda x: le.get(x,00))
    le = load_obj('timezone_le')
    test['Timezone'] = test['Timezone'].apply(lambda x: le.get(x,00))
    le = load_obj('state_le')
    test['State'] = test['State'].apply(lambda x: le.get(x,00))
    le = load_obj('weather_le')
    # encoding this with value counts
    test['Weather_Condition'] = test['Weather_Condition'].apply(lambda x: le.get(x,10))
    #encoding boolean features with 0 or 1
    bool_features = ["Amenity", "Bump", "Crossing", "Give_Way", "Junction", "No_Exit",\
              "Railway", "Roundabout", "Station", "Stop", "Traffic_Calming", "Traffic_Signal", "Turning_Loop"]
    for i in bool_features:
        test[i]=test[i].apply(lambda x: 1 if str(x)=='True' else 0)
    test_predictions = model.predict(test)
    return test_predictions

In [63]:
predictions = final_fun_1(test_data[:10000])
predictions

array([3, 2, 2, ..., 3, 3, 3])

In [71]:
#predictions for a single point

a = ['A-3', 'MapQuest', 201.0, '2016-02-08 06:49:27',
        '2016-02-08 07:19:27', 39.063148, -84.032608, np.nan, np.nan, 0.01,
        'Accident on OH-32 State Route 32 Westbound at Dela Palma Rd. Expect delays.',
        np.nan, 'State Route 32', 'R', 'Williamsburg', 'Clermont', 'OH',
        '45176', 'US', 'US/Eastern', 'KI69', '2016-02-08 06:56:00', 36.0,
        33.3, 100.0, 29.67, 10.0, 'SW', 3.5, np.nan, 'Overcast', False,
        False, False, False, False, False, False, False, False, False,
        False, True, False, 'Night', 'Night', 'Day', 'Day']

df = pd.DataFrame(columns=test_data.columns)
df.loc[0] = a

In [72]:
%%time
final_fun_1(df)

CPU times: user 121 ms, sys: 8.07 ms, total: 129 ms
Wall time: 131 ms


array([2])

In [66]:
# Final function 2
def final_fun_2(data,labels):
    feature_lst=['Start_Lng','Start_Lat',\
         'Distance(mi)','Side','City','County','State','Timezone',\
         'Temperature(F)','Humidity(%)','Pressure(in)', 'Visibility(mi)',\
         'Wind_Direction','Weather_Condition','Amenity','Bump','Crossing',\
         'Give_Way','Junction','No_Exit','Railway','Roundabout','Station',\
         'Stop','Traffic_Calming','Traffic_Signal','Turning_Loop','Sunrise_Sunset',\
         'Start_Time','End_Time']
    #filling some nan values with most means
    for i in means.keys():
        if str(np.mean(data[i])).lower()=='nan':
            data[i].fillna(means[i],inplace=True)
        else:
            data[i].fillna(data[i].mean(),inplace=True)
    data = data[feature_lst].copy()
    test = data
    #to convert boolean columns to 0 or 1
    test = test*1
    #removing features where number of null values>20% of entire values
    cols_to_keep = test.columns[test.isnull().mean() < 0.2]
    test = test[cols_to_keep]
    test['Start_Time'] = pd.to_datetime(test['Start_Time'], errors='coerce')
    test['End_Time'] = pd.to_datetime(test['End_Time'], errors='coerce')
    test['Sunrise_Sunset'].fillna('Day',inplace=True)
    test['Weather_Condition'].fillna('Clear',inplace=True)
    #filling nan categorical features with most common values
    test['Wind_Direction'].fillna('Calm',inplace=True)
    test['Timezone'].fillna('US/Eastern',inplace=True)
    test['City'].fillna('Houston',inplace=True)
    # Extract year, month, day, hour and weekday
    test['Year']=test['Start_Time'].dt.year
    test['Month']=test['Start_Time'].dt.month
    test['Day']=test['Start_Time'].dt.day
    test['Hour']=test['Start_Time'].dt.hour
    test['Weekday']=test['Start_Time'].dt.strftime('%a')
    # Extract the amount of time in the unit of minutes for each accident, round to the nearest integer
    test['Duration']=round((test['End_Time']-test['Start_Time'])/np.timedelta64(1,'m'))
    # removing duration values which are negative
    neg_outliers=test['Duration']<=0
    # Set outliers to NAN
    test['Duration'][neg_outliers] = np.nan
    # Fill rows with negative duration with median
    test['Duration'].fillna(np.round(test['Duration'].mean()),inplace=True)
    test['Duration'].median()
    # hour of day feature using start time
    test['Hour'] = test['Start_Time'].apply(lambda x: pd.to_datetime(x).hour)
    ## dropping some more irrelevant columns
    test = test.drop(['Start_Time','End_Time'],axis=1)
    #encoding weekday
    day_dict = {'Sun':0,'Mon':1,'Tue':2,'Wed':3,'Thu':4,'Fri':5,'Sat':6}
    test['Weekday'] = test['Weekday'].apply(lambda x: day_dict.get(x,1))
    # encoding cyclic features    
    test = encode_cyclic(test, 'Month', 12)
    test = encode_cyclic(test, 'Hour', 24)
    #getting a new feature using cluster centers
    cluster_centers, cluster_len = find_clusters(1100,test)
    # extracting x, y and z from lat and long
    test['loc_x'] = np.cos(test['Start_Lat']) * np.cos(test['Start_Lng'])
    test['loc_y'] = np.cos(test['Start_Lat']) * np.sin(test['Start_Lng'])
    test['loc_z'] = np.sin(test['Start_Lat'])
    # label encoding the features
    le = load_obj('wind_le')
    test['Wind_Direction'] = test['Wind_Direction'].apply(lambda x: le.get(x,1))
    le = load_obj('sunrise_le')
    test['Sunrise_Sunset'] = test['Sunrise_Sunset'].apply(lambda x: le.get(x,0))
    le = load_obj('side_le')
    test['Side'] = test['Side'].apply(lambda x: le.get(x,0))
    le = load_obj('city_le')
    test['City'] = test['City'].apply(lambda x: le.get(x,00))
    le = load_obj('county_le')
    test['County'] = test['County'].apply(lambda x: le.get(x,00))
    le = load_obj('timezone_le')
    test['Timezone'] = test['Timezone'].apply(lambda x: le.get(x,00))
    le = load_obj('state_le')
    test['State'] = test['State'].apply(lambda x: le.get(x,00))
    le = load_obj('weather_le')
    # encoding this with value counts
    test['Weather_Condition'] = test['Weather_Condition'].apply(lambda x: le.get(x,10))
    #encoding boolean features with 0 or 1
    bool_features = ["Amenity", "Bump", "Crossing", "Give_Way", "Junction", "No_Exit",\
              "Railway", "Roundabout", "Station", "Stop", "Traffic_Calming", "Traffic_Signal", "Turning_Loop"]
    for i in bool_features:
        test[i]=test[i].apply(lambda x: 1 if str(x)=='True' else 0)
    test_predictions = model.predict(test)
    score = f1_score(labels, test_predictions,average='weighted')
    return score

In [67]:
final_fun_2(test_data[100000:200000],test_labels[100000:200000])

0.8777753179276896