In [None]:
import pandas as pd
import numpy as np

import datetime as dt
from datetime import timedelta
from pandas.tseries.offsets import MonthBegin

# USED FOR WIDGETS, USER INTERFACE, AND PLOTTING
from IPython.display import display, clear_output
import ipywidgets as widgets
from ipywidgets import Layout, VBox, HBox
from matplotlib import pyplot as mplt
from bqplot import *

#import warnings
#warnings.filterwarnings("ignore")
import statsmodels.api as sm

# USED FOR ACCIDENT HOTSPOT & HEAT-MAP
import folium
from sklearn.cluster import DBSCAN as dbscan
from folium.plugins import HeatMap

In [None]:
# Load the raw crash records from crash.csv (path is relative to the notebook's
# working directory). `df` and `df_hourly` are derived from this frame below.
df_all = pd.read_csv('crash.csv')

def to_date(datetime_string):
    """Parse a 'MM/DD/YYYY HH:MM:SS AM/PM' timestamp string and return its calendar date.

    Args:
        datetime_string: text in the 12-hour format '%m/%d/%Y %I:%M:%S %p'.

    Returns:
        A ``datetime.date`` (the time of day is discarded).

    Raises:
        ValueError: if the text does not match the expected format.
    """
    parsed = dt.datetime.strptime(datetime_string, '%m/%d/%Y %I:%M:%S %p')
    return parsed.date()

# Calendar date of every crash as python `date` objects (time of day dropped).
df_all['Date'] = df_all['Crash Date/Time'].apply(to_date)

# Take an explicit copy of the columns the analysis needs, so that later cells can
# add columns to `df` without triggering pandas' SettingWithCopyWarning.
df = df_all[['Date', 'Road Name', 'Injury Severity', 'Longitude', 'Latitude']].copy()

In [None]:
# Layout shared by the section containers: a full-width flex box whose children
# are centred.
box_layout = Layout(display='flex', align_items='center', width='100%')

In [10]:
# PIE CHART

# Plot Pie Chart
# Accident counts for the 20 busiest roads, plus a single 'OTHER' slice that
# aggregates every remaining road.
all_road_counts = df['Road Name'].value_counts()
top_20_roads = all_road_counts.head(20)
road_name_counts = pd.DataFrame({'Road Name':top_20_roads.index, 'Accident Count':top_20_roads.values})

total_accident_count = all_road_counts.sum()
top_accident_count = top_20_roads.values.sum()
# Append at the next free row label. The frame is labelled 0..19, so assigning to
# .loc[11] replaced the 12th-ranked road instead of adding a new row.
road_name_counts.loc[len(road_name_counts)] = ['OTHER', total_accident_count-top_accident_count]

# Five-colour palette; matplotlib cycles through it for the slices.
colors=['#606c38', '#283618', '#fefae0', '#dda15e', '#bc6c25']
road_name_counts.plot(figsize=(20,10), kind='pie', y='Accident Count', labels=road_name_counts['Road Name'], 
                      colors=colors)


# Design User Interface
pie_label = widgets.HTML(value = f"<b><font color='black'><font size=10><text-align='center'>{'Top 20 Roads with the Highest Accident Counts'}</b>")
pie_output = widgets.Output()

pie_ui = VBox([pie_label, pie_output], layout=box_layout)
# Render the pending matplotlib figure (with its legend) inside the Output widget.
with pie_output:
    mplt.legend(loc=3, prop={'size': 15})
    mplt.show()
pie_ui

VBox(children=(HTML(value="<b><font color='black'><font size=10><text-align='center'>Top 20 Roads with the Hig…

In [4]:

# Study window: from the first crash date to the latest month start that lies
# strictly before the last crash date, expanded to one entry per day (used to
# zero-fill days without accidents).
min_date, max_date = df['Date'].min(), df['Date'].max() - MonthBegin(1)
idx = pd.date_range(start=min_date, end=max_date)

def forecast_time_series(road_name, start, end):
    """Build a bqplot figure of monthly accident counts for one road.

    The monthly series is always built (and the SARIMAX model always fitted) on
    the whole study period given by the notebook-level `idx`; `start`/`end` only
    select which observed months are drawn. When `end` reaches `max_date`, a
    3-month forecast line is added after the last observed month.

    Args:
        road_name: value of the 'Road Name' column to analyse.
        start: first month to display (anything `pd.to_datetime` accepts).
        end: last month to display (anything `pd.to_datetime` accepts).

    Returns:
        A bqplot `Figure` with the observed line and, optionally, the forecast line.
    """
    start_ts, end_ts = pd.to_datetime(start), pd.to_datetime(end)

    # Accidents per day on this road, zero-filled over the full study period.
    daily = df[df['Road Name'] == road_name].groupby('Date').size()
    daily.index = pd.to_datetime(daily.index)  # python dates -> DatetimeIndex
    daily = daily.reindex(idx, fill_value=0)

    # Monthly totals. `idx` ends on a month start, so the final bucket holds a
    # single day and is discarded as incomplete.
    monthly = daily.resample('MS').sum().iloc[:-1]

    # Observed window: x and y come from the same slice so they always line up.
    observed = monthly.loc[start_ts:end_ts]
    series_x = observed.index.values
    series_y = observed.values

    mod = sm.tsa.statespace.SARIMAX(monthly, order=(0, 1, 0), seasonal_order=(0, 1, 1, 12), enforce_stationarity=False, enforce_invertibility=False)

    # disp=False silences the optimiser's convergence report on every redraw.
    results = mod.fit(disp=False)

    x_sc = DateScale()
    y_sc = LinearScale()

    ax_x = Axis(label='Date', scale=x_sc, grid_lines='none')
    ax_y = Axis(label='Number of Accidents', scale=y_sc, grid_lines='none', orientation='vertical')

    observed_line = Lines(x=series_x, y=series_y, scales={'x': x_sc, 'y': y_sc}, colors=['#fefae0'], display_legend=True,  labels=['Observed'], stroke_width=4)

    marks = [observed_line]
    if end_ts >= pd.to_datetime(max_date):
        pred_y = results.get_forecast(steps=3).predicted_mean
        # The forecast starts the month after the last fitted observation. The
        # x-values are generated from that point with exactly one date per
        # predicted value; previously 5 dates (starting a month before `end`)
        # were paired with 3 predictions.
        pred_x = pd.date_range(monthly.index[-1] + MonthBegin(1), periods=len(pred_y), freq='MS')
        predicted_line = Lines(x=pred_x.values, y=np.asarray(pred_y), scales={'x': x_sc, 'y': y_sc}, colors=['#dda15e'], display_legend=True,  labels=['3-Month Forecast'], stroke_width=4)
        marks.append(predicted_line)

    title='Monthly Accidents on '+ road_name
    fig = Figure(axes=[ax_x, ax_y], title=title, marks=marks, background_style={'fill':'#283618'}, figsize=(30,21))
    return fig

In [7]:
plot_label = widgets.HTML(value = f"<b><font color='black'><font size=7>{'Monthly Accidents By Road'}</b>")

# The 50 roads with the most recorded accidents, sorted alphabetically.
road_names = df['Road Name'].value_counts().head(50)
road_names = np.array(road_names.index)
road_names.sort()
# Dropdown rejects a `value` that is not among its options, so fall back to the
# first road when the preferred default is missing from this data set.
default_road = 'GEORGIA AVE' if 'GEORGIA AVE' in list(road_names) else road_names[0]
road_selector = widgets.Dropdown(options=road_names,
                                 value=default_road,
                                 description='Road Name',
                                 disabled=False)

# One slider stop per month start between the first and last crash date; the
# initial selection spans the whole range.
dates = pd.date_range(df['Date'].min(), df['Date'].max(), freq='MS')
options = [(date.strftime(' %d %b %Y '), date) for date in dates]
date_selector = widgets.SelectionRangeSlider(options=options,
                                             index=(0, len(options)-1),
                                             description='Dates',
                                             orientation='horizontal',
                                             layout={'width': '800px'})

button = widgets.Button(description="Plot")
output = widgets.Output(layout=Layout(width='100%', height='100%'))


def showOutput(btn):
    """Button callback: redraw the time-series figure for the current selections.

    Args:
        btn: the clicked button (supplied by `on_click`; not used).
    """
    road_name = road_selector.value
    start, end = date_selector.value

    # Build the figure inside the Output context, as the initial render does, so
    # that anything printed or raised while fitting appears in the widget and is
    # replaced together with the previous figure.
    with output:
        figure = forecast_time_series(road_name, start, end)
        clear_output(wait=True)
        display(figure)

        
button.on_click(showOutput)

# Controls (title, road picker, date slider, button) stacked above the chart area.
graph = HBox([output])
time_series_ui = VBox([
    VBox(children=[plot_label, road_selector, date_selector, button], layout=box_layout),
    graph,
])

# Initial render using the widgets' default selections.
with output:
    time_series_figure = forecast_time_series(road_selector.value,
                                              date_selector.value[0],
                                              date_selector.value[1])
    clear_output(wait=True)
    display(time_series_figure)
time_series_ui

VBox(children=(VBox(children=(HTML(value="<b><font color='black'><font size=7>Monthly Accidents By Road</b>"),…

In [26]:
def to_hour(crash_time):
    """Return the hour of day (0-23) from a 'MM/DD/YYYY HH:MM:SS AM/PM' string.

    Args:
        crash_time: text in the 12-hour format '%m/%d/%Y %I:%M:%S %p'.

    Returns:
        The hour as an int in 24-hour form.

    Raises:
        ValueError: if the text does not match the expected format.
    """
    return dt.datetime.strptime(crash_time, '%m/%d/%Y %I:%M:%S %p').hour

# Road, severity and hour of day for every crash. `assign` returns a new frame,
# which avoids writing into (and dropping from) a slice of df_all and the
# SettingWithCopyWarning that comes with it.
df_hourly = df_all[['Road Name', 'Injury Severity']].assign(
    Hour=df_all['Crash Date/Time'].apply(to_hour)
)


def plot_by_hour(road_selector):
    """Build a stacked bar chart of accidents per hour of day for one road.

    Args:
        road_selector: the road name to plot (a 'Road Name' value; despite the
            parameter name this is the selected string, not the widget).

    Returns:
        A bqplot `Figure` with one stacked bar per hour 0-23, split by injury
        severity.
    """
    # Stacking order, most severe first; also used for the legend labels.
    legend = ['FATAL INJURY',
              'SUSPECTED SERIOUS INJURY',
              'SUSPECTED MINOR INJURY',
              'POSSIBLE INJURY',
              'NO APPARENT INJURY']

    df_hr = df_hourly[df_hourly['Road Name'] == road_selector]
    df_hr = df_hr.groupby('Hour')['Injury Severity'].value_counts().unstack()
    # Force a complete 24 x 5 grid: hours without accidents and severities that
    # never occur on this road become zero rows/columns instead of disappearing
    # from the chart. This replaces the bare try/except column patching.
    df_hr = (df_hr
             .reindex(index=pd.RangeIndex(24, name='Hour'), columns=legend)
             .fillna(0))

    # One list of 24 hourly counts per severity level, in legend order.
    values = [df_hr[label].tolist() for label in legend]

    x_ord = OrdinalScale()
    y_sc = LinearScale()

    colors=['#dda15e', '#bc6c25','#fefae0', '#606c38', '#283618']
    bar = Bars(x=df_hr.index.values, y=values,
               scales={'x': x_ord, 'y':y_sc},
               colors=colors, display_legend=True, type='stacked',
               labels=legend)

    ax_x = Axis(scale=x_ord, grid_lines='none', label='Hour (24-Hour Format)')
    ax_y = Axis(scale=y_sc, grid_lines='dashed', orientation='vertical', tick_format='0.2f', label='Number of Accidents')

    title = "Hourly Accidents on " + road_selector
    fig = Figure(marks=[bar], title=title, legend_location='top-left',
                 axes=[ax_x, ax_y], legend_style={'fill':'black', 'opacity':0.5},
                 legend_text={'font-size':10})
    return fig


bar_label = widgets.HTML(value = f"<b><font color='black'><font size=7>{'Hourly Accidents By Road & Injury Severity'}</b>")
# Dropdown rejects a `value` that is not among its options, so fall back to the
# first road when the preferred default is missing from this data set.
bar_default_road = 'GEORGIA AVE' if 'GEORGIA AVE' in list(road_names) else road_names[0]
bar_road_selector = widgets.Dropdown(
    options=road_names,
    value=bar_default_road,
    description='Road Name',
    disabled=False
)
bar_output = widgets.Output(layout=Layout(width='100%', height='100%'))

def on_change(change):
    """Observer for the bar-chart dropdown: redraw when its value changes.

    Args:
        change: the change dict passed by `observe`; only notifications with
            type 'change' and name 'value' are acted on, and `change['new']`
            is used as the road name.
    """
    is_value_change = change['type'] == 'change' and change['name'] == 'value'
    if not is_value_change:
        return

    new_figure = plot_by_hour(change['new'])
    with bar_output:
        clear_output(wait=True)
        display(new_figure)


# Subscribe to the 'value' trait only; without `names` the handler is invoked for
# every trait of the dropdown.
bar_road_selector.observe(on_change, names='value')

bar_ui = VBox([bar_label, bar_road_selector, bar_output], layout=box_layout)
# Initial render. The clear is requested before displaying (as in the other
# cells) rather than being left queued behind the figure.
with bar_output:
    fig = plot_by_hour(bar_road_selector.value)
    clear_output(wait=True)
    display(fig)
bar_ui

VBox(children=(HTML(value="<b><font color='black'><font size=7>Hourly Accidents By Road & Injury Severity</b>"…

HTML(value="<b><font color='black'><font size=10>Accident Hotspots</b>")

In [24]:
# Work on a private copy so the cluster labels are not written into the shared
# `df`, and drop rows without coordinates, which DBSCAN cannot process.
df_dbc = df.dropna(subset=['Latitude', 'Longitude']).copy()
loc = df_dbc[['Latitude','Longitude']]

# 0.1, 20 returned 512 clusters  # colorful but very crowded
# 0.1, 50 returned 304 clusters  # better but still crowded
# The haversine metric expects [lat, lon] in radians, so eps is an angle too
# (presumably 0.04 km divided by Earth's radius of 6371 km, i.e. about 40 m).
dbc = dbscan(eps=0.04/6371, min_samples=100, algorithm='ball_tree', metric='haversine').fit(np.radians(loc))

labels = dbc.labels_
unique_labels = np.unique(dbc.labels_)
df_dbc['Cluster'] = labels

start_location = df_dbc['Latitude'].mean(), df_dbc['Longitude'].mean()
m = folium.Map(location=start_location,zoom_start=12, min_zoom=12, max_zoom=17, tiles="Stamen Toner")

clust_colours = ['#a6cee3','#1f78b4','#b2df8a','#33a02c','#fb9a99','#e31a1c','#fdbf6f','#ff7f00','#cab2d6','#6a3d9a','#ffff99','#b15928']
# One marker per clustered accident; label -1 is DBSCAN noise and is skipped.
# Colours repeat once there are more clusters than palette entries.
clustered = df_dbc[df_dbc['Cluster'] != -1]
for lat, lon, cluster in zip(clustered['Latitude'], clustered['Longitude'], clustered['Cluster']):
    col = clust_colours[cluster % len(clust_colours)]
    folium.CircleMarker([lat, lon], radius = 10, color = col, fill = True, fill_color = col).add_to(m)

        
# Designing User Interface
# Title above an Output widget that holds the rendered folium map.
hotspot_output = widgets.Output(layout=Layout(width='100%', height='100%'))
hotspot_label = widgets.HTML(value = f"<b><font color='black'><font size=10>{'Accident Hotspots'}</b>")

with hotspot_output:
    display(m)

hotspot_ui = VBox(children=[hotspot_label, hotspot_output], layout=box_layout)
hotspot_ui

[-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22
 23 24 25 26 27 28 29 30 31 32 33]


VBox(children=(HTML(value="<b><font color='black'><font size=10>Accident Hotspots</b>"), Output(layout=Layout(…

In [7]:
# Heat-map Widgets

# Section title, shown immediately.
heatmap_label = widgets.HTML(value = f"<b><font color='black'><font size=10>{'Heatmap of Accidents Over Time'}</b>")
heatmap_ui = VBox([heatmap_label], layout=box_layout)
display(heatmap_ui)

# One slider stop per calendar day between the first and last crash date.
hm_dates = pd.date_range(start=df['Date'].min(), end=df['Date'].max(), freq='D')
options = [(day.strftime(' %d %b %Y '), day) for day in hm_dates]

# Range slider initially spanning every day; continuous_update=False means the
# value is only updated when the handle is released.
hm_date_selector = widgets.SelectionRangeSlider(
    options=options,
    index=(0, len(options) - 1),
    description='Dates',
    orientation='horizontal',
    continuous_update=False,
    layout={'width': '1500px'},
)

VBox(children=(HTML(value="<b><font color='black'><font size=10>Heatmap of Accidents Over Time</b>"),), layout…

In [12]:
# Only the crash date and coordinates are needed for the heat map.
df_heatmap = df[['Date', 'Longitude', 'Latitude']]

def plot_heatmap(start, end):
    """Display a folium heat map of the accidents between two days, inclusive.

    Args:
        start: first day to include (anything `pd.to_datetime` accepts).
        end: last day to include (anything `pd.to_datetime` accepts).

    The map is centred on the median coordinates of the selected accidents. If
    the selection is empty, an empty map centred on the median of all accidents
    is shown instead.
    """
    # 'Date' holds python `date` objects, so compare against dates: recent pandas
    # versions refuse to order a Timestamp against a plain date.
    first_day = pd.to_datetime(start).date()
    last_day = pd.to_datetime(end).date()

    # Inclusive bounds: the days chosen on the slider are part of the selection
    # (strict bounds dropped both end days and made a one-day range empty).
    crash_days = df_heatmap['Date']
    df_hm = df_heatmap[(crash_days >= first_day) & (crash_days <= last_day)]
    # Rows without coordinates cannot be placed on the map.
    df_hm = df_hm.dropna(subset=['Latitude', 'Longitude'])

    # An empty selection has a NaN median, which is not a valid map centre.
    centre_source = df_heatmap if df_hm.empty else df_hm
    lat = float(centre_source['Latitude'].median())
    long = float(centre_source['Longitude'].median())
    m = folium.Map(location=[lat, long], zoom_start=12, min_zoom=8, max_zoom=17, tiles="Stamen Toner")

    if not df_hm.empty:
        hm = HeatMap(list(zip(df_hm.Latitude.values, df_hm.Longitude.values)),
                     name='Accidents',
                     min_opacity=0.8,
                     radius=5, blur=6)
        m.add_child(hm)

    # Replace the previous map only once the new one is ready.
    clear_output(wait=True)
    display(m)
    
def plot_interactive_heatmap(date_selector):
    """Adapter for `interact`: unpack the slider's (start, end) pair and plot it.

    Args:
        date_selector: a 2-item sequence of datetime-like values; each is
            reduced to its calendar date via `.date()` before plotting.
    """
    first, last = date_selector[0], date_selector[1]
    plot_heatmap(first.date(), last.date())

# Bind the date-range slider to the heat map: `interact` calls
# plot_interactive_heatmap with the slider's value as `date_selector`.
widgets.interact(plot_interactive_heatmap, date_selector=hm_date_selector)


interactive(children=(SelectionRangeSlider(continuous_update=False, description='Dates', index=(0, 1728), layo…

<function __main__.plot_interactive_heatmap(date_selector)>