In [1]:
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.graph_objects as go
import pyarrow.parquet as pq
import awswrangler as wr
import calendar

data_URI = 's3://nypdcollisions/collisions.parquet'


In [2]:
# Retrieving the data directly from Amazon S3
all_collisions_df = wr.s3.read_parquet(data_URI)
# Modifying column names
all_collisions_df.columns = all_collisions_df.columns.str.lower()
all_collisions_df.columns = all_collisions_df.columns.str.replace(' ', '_')
all_collisions_df.head()

Unnamed: 0,crash_date,crash_time,borough,zip_code,latitude,longitude,location,on_street_name,cross_street_name,off_street_name,...,contributing_factor_vehicle_2,contributing_factor_vehicle_3,contributing_factor_vehicle_4,contributing_factor_vehicle_5,collision_id,vehicle_type_code_1,vehicle_type_code_2,vehicle_type_code_3,vehicle_type_code_4,vehicle_type_code_5
0,04/14/2021,5:32,,,,,,BRONX WHITESTONE BRIDGE,,,...,Unspecified,,,,4407480,Sedan,Sedan,,,
1,04/13/2021,21:35,BROOKLYN,11217.0,40.68358,-73.97617,"(40.68358, -73.97617)",,,620 ATLANTIC AVENUE,...,,,,,4407147,Sedan,,,,
2,04/15/2021,16:15,,,,,,HUTCHINSON RIVER PARKWAY,,,...,,,,,4407665,Station Wagon/Sport Utility Vehicle,,,,
3,04/13/2021,16:00,BROOKLYN,11222.0,,,,VANDERVORT AVENUE,ANTHONY STREET,,...,Unspecified,,,,4407811,Sedan,,,,
4,04/12/2021,8:25,,,0.0,0.0,"(0.0, 0.0)",EDSON AVENUE,,,...,Unspecified,,,,4406885,Station Wagon/Sport Utility Vehicle,Sedan,,,


In [11]:
all_collisions_df['crash_date'] = pd.to_datetime(all_collisions_df['crash_date'])
all_collisions_df['crash_year'] = pd.DatetimeIndex(all_collisions_df['crash_date']).year
all_collisions_df['crash_month'] = pd.DatetimeIndex(all_collisions_df['crash_date']).month
# all_collisions_df['crash_month'] = all_collisions_df['crash_date'].dt.month_name()
all_collisions_df['crash_month_year'] = pd.to_datetime(all_collisions_df['crash_date']).dt.to_period('M')
historical_df = all_collisions_df[all_collisions_df.crash_year < 2022] # Removing 2022 data
historical_df = all_collisions_df[all_collisions_df.crash_year > 2012]
historical_df.head()

Unnamed: 0,crash_date,crash_time,borough,zip_code,latitude,longitude,location,on_street_name,cross_street_name,off_street_name,...,contributing_factor_vehicle_5,collision_id,vehicle_type_code_1,vehicle_type_code_2,vehicle_type_code_3,vehicle_type_code_4,vehicle_type_code_5,crash_year,crash_month,crash_month_year
0,2021-04-14,5:32,,,,,,BRONX WHITESTONE BRIDGE,,,...,,4407480,Sedan,Sedan,,,,2021,4,2021-04
1,2021-04-13,21:35,BROOKLYN,11217.0,40.68358,-73.97617,"(40.68358, -73.97617)",,,620 ATLANTIC AVENUE,...,,4407147,Sedan,,,,,2021,4,2021-04
2,2021-04-15,16:15,,,,,,HUTCHINSON RIVER PARKWAY,,,...,,4407665,Station Wagon/Sport Utility Vehicle,,,,,2021,4,2021-04
3,2021-04-13,16:00,BROOKLYN,11222.0,,,,VANDERVORT AVENUE,ANTHONY STREET,,...,,4407811,Sedan,,,,,2021,4,2021-04
4,2021-04-12,8:25,,,0.0,0.0,"(0.0, 0.0)",EDSON AVENUE,,,...,,4406885,Station Wagon/Sport Utility Vehicle,Sedan,,,,2021,4,2021-04


In [80]:
crash_causes = historical_df['contributing_factor_vehicle_1'].value_counts().rename_axis('unique_values').reset_index(name='counts')
top_10_crashes_causes = crash_causes[crash_causes.unique_values == 'Unspecified'].head(10)
top_10_crashes_causes

Unnamed: 0,unique_values,counts
0,Unspecified,594612


In [79]:
# fig = px.pie(top_10_crashes_causes, values='counts', names='unique_values', title='Top 10 Collision Causes')
# fig.show()

labels = top_10_crashes_causes.unique_values
values = top_10_crashes_causes.counts

# pull is given as a fraction of the pie radius
fig = go.Figure(data=[go.Pie(labels=labels, values=values, pull=[0.2, 0, 0, 0,0,0,0,0,0,0,0])])
fig.show()

In [56]:
text = historical_df[historical_df['contributing_factor_vehicle_1'] != 'Unspecified'][['contributing_factor_vehicle_1']]

In [57]:

crash_causes = pd.DataFrame(' '.join(text['contributing_factor_vehicle_1'].tolist()), columns=['Keyword'], index=[0])
crash_causes_string = crash_causes.Keyword.values

In [63]:
from wordcloud import WordCloud, STOPWORDS
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go

def plotly_wordcloud(text):
    wc = WordCloud(stopwords = set(STOPWORDS),
                   max_words = 150,
                   max_font_size = 100)
    wc.generate(text)
    
    word_list=[]
    freq_list=[]
    fontsize_list=[]
    position_list=[]
    orientation_list=[]
    color_list=[]

    for (word, freq), fontsize, position, orientation, color in wc.layout_:
        word_list.append(word)
        freq_list.append(freq)
        fontsize_list.append(fontsize)
        position_list.append(position)
        orientation_list.append(orientation)
        color_list.append(color)
        
    # get the positions
    x=[]
    y=[]
    for i in position_list:
        x.append(i[0])
        y.append(i[1])
            
    # get the relative occurence frequencies
    new_freq_list = []
    for i in freq_list:
        new_freq_list.append(i*100)
    new_freq_list
    
    trace = go.Scatter(x=x, 
                       y=y, 
                       textfont = dict(size=new_freq_list,
                                       color=color_list),
                       hoverinfo='text',
                       hovertext=['{0}{1}'.format(w, f) for w, f in zip(word_list, freq_list)],
                       mode='text',  
                       text=word_list
                      )
    
    layout = go.Layout({'xaxis': {'showgrid': False, 'showticklabels': False, 'zeroline': False},
                        'yaxis': {'showgrid': False, 'showticklabels': False, 'zeroline': False}})
    
    fig = go.Figure(data=[trace], layout=layout)
    
    return fig

In [64]:
iplot(plotly_wordcloud(crash_causes_string[0]))

In [36]:
fig = px.bar(top_10_crashes_causes, x="unique_values", y="counts")
fig

In [12]:
historical_df.crash_date.max()

Timestamp('2022-01-04 00:00:00')

In [5]:
by_year_and_boro = historical_df.groupby(['crash_year', 'crash_month']).agg({
    'collision_id': 'count',
    'number_of_persons_injured': 'sum',
    'number_of_persons_killed': 'sum',
    'number_of_pedestrians_injured': 'sum',
    'number_of_pedestrians_killed': 'sum',
    'number_of_cyclist_injured': 'sum',
    'number_of_cyclist_killed': 'sum',
    'number_of_motorist_injured': 'sum',
    'number_of_motorist_killed': 'sum'
}).reset_index()
by_year_and_boro

Unnamed: 0,crash_year,crash_month,collision_id,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed
0,2013,1,15643,4043,29,1113,21,192,0,2738,8
1,2013,2,14399,3562,20,984,13,131,0,2447,7
2,2013,3,16509,4187,23,988,13,195,1,3004,9
3,2013,4,16439,4437,16,901,12,290,0,3246,4
4,2013,5,18486,5025,16,982,7,400,2,3643,7
...,...,...,...,...,...,...,...,...,...,...,...
104,2021,9,9882,4967,23,713,7,497,3,3480,13
105,2021,10,10192,5026,33,805,15,543,3,3416,12
106,2021,11,9344,4497,23,764,11,392,1,3153,11
107,2021,12,8394,4039,17,787,6,319,1,2771,9


In [9]:
fig = px.line(by_year_and_boro, x="crash_month", y="collision_id", color='crash_year')

fig.for_each_trace(lambda trace: fig.add_annotation(
    x=trace.x[-1], y=trace.y[-1], text='  '+trace.name, 
    font_color=trace.line.color,
    ax=10, ay=10, xanchor="left", showarrow=False))

# fig.update_yaxes(title='y', visible=False, showticklabels=True)    
fig.update_layout(
        xaxis=dict(
            showline=True,
            showgrid=False,
            showticklabels=True,
            linecolor='rgb(204, 204, 204)',
            linewidth=2,
            ticks='outside',
            tickfont=dict(
                family='Arial',
                size=12,
                color='rgb(82, 82, 82)',
            ),
        ),
        yaxis=dict(
            # showgrid=True,
            zeroline=False,
            showline=True,
            gridcolor = 'rgb(235, 236, 240)',
            showticklabels=True,
            title='',
            autorange=True
        ),
        autosize=True,
        hovermode="x unified",
        # margin=dict(
        #     autoexpand=True,
        #     l=100,
        #     r=20,
        #     t=110,
        # ),
        showlegend=False,
#         legend=dict(
#         # orientation="h",
#         yanchor="bottom",
#         y=0.9,
#         xanchor="left",
#         x=0.7
# ),
        plot_bgcolor='rgba(0,0,0,0)'
    )
fig.show()

In [6]:
aggregated_df = historical_df.groupby(['crash_year', 'crash_month_year', 'crash_month', 'crash_date', 'borough', 'on_street_name', 'contributing_factor_vehicle_1']).agg({
    'collision_id': 'count',
    'number_of_persons_injured': 'sum',
    'number_of_persons_killed': 'sum',
    'number_of_pedestrians_injured': 'sum',
    'number_of_pedestrians_killed': 'sum',
    'number_of_cyclist_injured': 'sum',
    'number_of_cyclist_killed': 'sum',
    'number_of_motorist_injured': 'sum',
    'number_of_motorist_killed': 'sum'
})
aggregated_df.rename(columns={
    'collision_id':'total_collisions'
}, inplace=True)
aggregated_df.reset_index(inplace=True)
aggregated_df.head()

Unnamed: 0,crash_year,crash_month_year,crash_month,crash_date,borough,on_street_name,contributing_factor_vehicle_1,total_collisions,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed
0,2012,2012-07,July,2012-07-01,,,Alcohol Involvement,1,0,0,0,0,0,0,0,0
1,2012,2012-07,July,2012-07-01,,,Backing Unsafely,3,0,0,0,0,0,0,0,0
2,2012,2012-07,July,2012-07-01,,,Driver Inattention/Distraction,8,2,0,0,0,0,0,2,0
3,2012,2012-07,July,2012-07-01,,,Failure to Keep Right,1,1,0,0,0,0,0,1,0
4,2012,2012-07,July,2012-07-01,,,Failure to Yield Right-of-Way,2,0,0,0,0,0,0,0,0


In [6]:
# fig = go.Figure()
# for contestant, year in aggregated_df.groupby("crash_year"):
#   fig.add_trace(go.Bar(x=year["crash_month"], y=year["total_collisions"]))
# fig.update_layout(legend_title_text = "Contestant")
# fig.update_xaxes(title_text="Fruit")
# fig.update_yaxes(title_text="Number Eaten")
# fig.show()

In [9]:
metrics = ['total_collisions', 'number_of_persons_injured', 'number_of_persons_killed', 'number_of_pedestrians_injured', 'number_of_pedestrians_killed', 'number_of_cyclist_injured', 'number_of_cyclist_killed', 'number_of_motorist_injured', 'number_of_motorist_killed']

years = list(range(2012, 2022))

aggregated_df[aggregated_df['crash_year'].isin([2012, 2013])]['total_collisions'].sum()

304279

In [10]:
years = list(range(2012, 2022))
metrics = aggregated_df.columns


def get_total(year, metric):
    
    total = aggregated_df[aggregated_df['crash_year'].isin([year])][metric].sum()
    results = {}
    results[metric] = total
    return results
    # df = pd.DataFrame({f'{metric}': total}, index=[year])
    # return df



for y in years:
    results = get_total(2012, 'total_collisions')

results

{'total_collisions': 100545}

In [141]:
# aggregated_df.groupby(['crash_year', 'crash_month', 'borough']).sum()

In [62]:
years = list(range(2012, 2022))
metrics = aggregated_df.columns
months = list(range(1,13))

total_collisions =[]
for year in years:
    total_collisions.append(aggregated_df.query(str(year))['number_of_persons_killed'].sum().tolist())
print(total_collisions)

[137, 297, 262, 243, 246, 256, 231, 244, 269, 286]


In [1]:
# fig = go.Figure()
# months = list(range(1,13))
# for year in years:
#     total_collisions =[]
#     total_collisions = total_collisions.append(aggregated_df.query(str(year))['number_of_persons_killed'].sum().tolist())
#     fig.add_trace(
#         go.Scatter(
#             x=months,
#             y=total_collisions
#         ))
# fig

In [132]:
aggregated_df.query('2022')['number_of_motorist_injured'].sum()

0