# Project: In Montgomery County, MD, what factors have the biggest influence in car accidents?
# Factors Analyzed: Hour of Accident, Vehicle Year, Speed Limit, Weather, Direction, Driver Fault 

### Team - Zamunda Warriors: Amari, Isaac, Nicolas, Terrence
### Data provided by [Montgomery County MD](https://data.montgomerycountymd.gov/)

In [1]:
import pip

def import_or_install(package):
    """Import ``package`` and return it, installing it with pip if missing.

    Args:
        package: Importable module name. Dotted names such as
            ``"matplotlib.pyplot"`` are accepted and the *submodule* itself
            is returned (``__import__`` would return the top-level package).

    Returns:
        The imported module object.

    Raises:
        subprocess.CalledProcessError: If the pip installation fails.
        ImportError: If the module still cannot be imported after installing.
    """
    # Imported locally so the helper is self-contained.
    import importlib
    import subprocess
    import sys

    try:
        return importlib.import_module(package)
    except ImportError:
        # pip installs distributions, not submodules, so only the top-level
        # name is passed on. NOTE(review): this assumes the distribution is
        # named like the import package, which is not true for every project.
        distribution = package.partition(".")[0]
        # ``pip.main`` is not a supported API; run pip as a subprocess of the
        # current interpreter so the package lands in the active environment.
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", distribution]
        )
        # Make the freshly installed files visible to the import system.
        importlib.invalidate_caches()
        return importlib.import_module(package)

# Modules used throughout the notebook, installed on demand by the helper.
os = import_or_install("os")
pd = import_or_install("pandas")
np = import_or_install("numpy")

# Submodule aliases are bound through attribute access on the parent package:
# importing the dotted name guarantees the submodule is loaded, and the alias
# is then the submodule itself regardless of what the helper returns for a
# dotted name.
matplotlib = import_or_install("matplotlib")
import_or_install("matplotlib.pyplot")
plt = matplotlib.pyplot

dash = import_or_install("dash")
# NOTE(review): Dash 2.0+ exposes these as ``dash.dcc`` / ``dash.html`` and
# deprecates the standalone packages - consider switching once the minimum
# supported Dash version is confirmed.
dcc = import_or_install("dash_core_components")
html = import_or_install("dash_html_components")

plotly = import_or_install("plotly")
import_or_install("plotly.graph_objects")
go = plotly.graph_objects
import_or_install("plotly.express")
px = plotly.express

# Read the driver-level dataset. Every column is loaded as text; numeric
# columns are converted explicitly where they are needed.
drivers = pd.read_csv(
    "https://raw.githubusercontent.com/terrydiko/INST447Project/main/Drivers.csv",
    encoding='utf-8',
    dtype="str",
)
drivers


# Read the incident-level dataset, also as text.
incidents = pd.read_csv(
    "https://raw.githubusercontent.com/terrydiko/INST447Project/main/Incidents.csv",
    encoding='utf-8',
    dtype="str",
)
incidents


# Merge the two datasets on the shared report number, keeping only reports
# present in both, and discard rows with any missing value.
# Rows are dropped BEFORE the year is cast: with dtype="str" a missing year is
# a float NaN, and converting NaN to an integer raises ValueError.
merge_data = incidents.merge(drivers, on='report_number', how='inner').dropna()
merge_data = merge_data.assign(
    vehicle_year=merge_data["vehicle_year"].astype("int64")
)

## At this point, we know what factors we want to analyze to answer the main question. Now, we begin the data cleaning process. That starts with identifying the unique values of each factor, and then getting rid of any null, blank, unknown, or erroneous values.

In [2]:
# Show the distinct values of every factor so bad entries can be spotted.
# Each entry pairs the printed label with the column it describes.
factor_labels = (
    ('Year Values: ', "vehicle_year"),
    ('Weather  Values: ', "weather"),
    ('Direction Values: ', "direction"),
    ('Speed Limit Values: ', "speed_limit"),
    ('Driver Fault Values: ', "driver_at_fault"),
)
for label, column in factor_labels:
    distinct_values = sorted(merge_data[column].unique())
    print(label + str(distinct_values))
    print("\n")

# Keep only rows whose factors hold usable values, based on the lists above:
# plausible model years and no unknown/other placeholders.
year_in_range = merge_data["vehicle_year"].between(1960, 2021)
known_direction = merge_data["direction"] != 'Unknown'
known_fault = merge_data["driver_at_fault"] != 'Unknown'
usable_weather = ~merge_data["weather"].isin(['OTHER', 'UNKNOWN'])
merge_data = merge_data[
    year_in_range & known_direction & known_fault & usable_weather
]
merge_data.head()

Year Values: [0, 1, 2, 3, 4, 8, 13, 14, 15, 97, 99, 198, 200, 201, 202, 215, 1005, 1014, 1015, 1025, 1111, 1140, 1234, 1900, 1901, 1947, 1949, 1955, 1959, 1960, 1963, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2033, 2040, 2041, 2055, 2099, 2100, 2103, 2104, 2200, 2204, 2911, 2912, 2914, 2917, 2991, 2996, 2998, 3003, 3013, 3863, 5005, 8008, 8888, 9999]


Weather  Values: ['BLOWING SAND, SOIL, DIRT', 'BLOWING SNOW', 'CLEAR', 'CLOUDY', 'FOGGY', 'OTHER', 'RAINING', 'SEVERE WINDS', 'SLEET', 'SNOW', 'UNKNOWN', 'WINTRY MIX']


Direction Values: ['East', 'North', 'South', 'Unknown', 'West']


Speed Limit Values: ['0', '10', '15', '20', '25', '30', '35', '40', '45', '5', '50', '55', '60', '65', '70']


Unnamed: 0,report_number,agency_name,acrs_report_type,crash_date_time,direction,weather,latitude,longitude,driver_at_fault,speed_limit,vehicle_year
1,MCP3290000G,Montgomery County Police,Property Damage Crash,2021-04-28T21:01:00.000,South,CLEAR,39.059336,-76.944363,Yes,40,2006
2,MCP3283000M,Montgomery County Police,Property Damage Crash,2021-04-28T20:37:00.000,East,CLEAR,39.0471,-77.225175,Yes,30,2018
3,MCP2892009B,Montgomery County Police,Property Damage Crash,2021-04-28T17:58:00.000,South,CLEAR,39.01532,-77.042505,No,35,2013
5,MCP2899004S,Montgomery County Police,Property Damage Crash,2021-04-28T17:34:00.000,South,CLEAR,39.139095,-77.26846667,Yes,55,2015
6,MCP2864003Y,Montgomery County Police,Property Damage Crash,2021-04-28T16:26:00.000,West,CLEAR,39.10029167,-77.18490833,Yes,40,2006


## The merged dataset is now cleaned and ready for further analysis. We now move into creating sub dataframes for each factor, getting the count for each of the factor values.

In [3]:
# Break the timestamp at its 'T' separator into a date part and a time part,
# then retire the combined column.
date_and_time = merge_data['crash_date_time'].str.split('T', expand=True)
merge_data[['Date', 'Time']] = date_and_time
merge_data = merge_data.drop(columns=['crash_date_time'])

# Hour of day (0-23) taken from the parsed time part.
parsed_time = pd.to_datetime(merge_data['Time'])
merge_data['Hour'] = parsed_time.dt.hour
merge_data.head()

Unnamed: 0,report_number,agency_name,acrs_report_type,direction,weather,latitude,longitude,driver_at_fault,speed_limit,vehicle_year,Date,Time,Hour
1,MCP3290000G,Montgomery County Police,Property Damage Crash,South,CLEAR,39.059336,-76.944363,Yes,40,2006,2021-04-28,21:01:00.000,21
2,MCP3283000M,Montgomery County Police,Property Damage Crash,East,CLEAR,39.0471,-77.225175,Yes,30,2018,2021-04-28,20:37:00.000,20
3,MCP2892009B,Montgomery County Police,Property Damage Crash,South,CLEAR,39.01532,-77.042505,No,35,2013,2021-04-28,17:58:00.000,17
5,MCP2899004S,Montgomery County Police,Property Damage Crash,South,CLEAR,39.139095,-77.26846667,Yes,55,2015,2021-04-28,17:34:00.000,17
6,MCP2864003Y,Montgomery County Police,Property Damage Crash,West,CLEAR,39.10029167,-77.18490833,Yes,40,2006,2021-04-28,16:26:00.000,16


In [4]:
# Count reports for each hour of the day, busiest hours first.
hour = (
    merge_data.groupby('Hour')['report_number']
    .count()
    .reset_index(name='Number of Accidents')
    .sort_values(by='Number of Accidents', ascending=False)
)

# Bar chart coloured by hour; the lightest shade of the palette is skipped.
sunset_palette = px.colors.sequential.Sunsetdark[1:]
fig_hour = px.bar(
    hour,
    x='Hour',
    y='Number of Accidents',
    color='Hour',
    template="presentation",
    title="Number of Accidents by Hour",
    color_continuous_scale=sunset_palette,
)

hour.head()

Unnamed: 0,Hour,Number of Accidents
17,17,8164
16,16,7710
15,15,7498
18,18,7137
8,8,6530


In [5]:
# Count reports for each vehicle model year.
year = (
    merge_data.groupby("vehicle_year")["report_number"]
    .count()
    .reset_index(name="Number of Accidents")
    .rename(columns={"vehicle_year": "Vehicle Year"})
)
# Restrict the chart to the 20 most recent model years.
year = (
    year.sort_values(by="Vehicle Year", ascending=False)
    .head(20)
    .sort_values(by="Vehicle Year")
)
year["Vehicle Year"] = year["Vehicle Year"].map(int)
# Present the years with the most accidents first.
year = year.sort_values(by='Number of Accidents', ascending=False)
year.head()

# Bar chart coloured by year, using the palette without its first two and
# last shades.
tempo_palette = px.colors.sequential.tempo[2:-1]
fig_year = px.bar(
    year,
    x='Vehicle Year',
    y='Number of Accidents',
    color='Vehicle Year',
    template="presentation",
    title="Number of Accidents by Vehicle Year",
    color_continuous_scale=tempo_palette,
)

In [6]:
# Count reports for each posted speed limit.
speed = (
    merge_data.groupby("speed_limit")["report_number"]
    .count()
    .reset_index(name="Number of Accidents")
)
# The limits were read as text; make them numeric so the chart axis and the
# colour scale treat them as numbers.
speed["speed_limit"] = speed["speed_limit"].astype(int)
speed = speed.rename(columns={"speed_limit": "Speed Limit"})
speed = speed.sort_values('Number of Accidents', ascending=False)
speed.head()

# Bar chart coloured by speed limit, using the palette without its first two
# and last shades.
speed_palette = px.colors.sequential.speed[2:-1]
fig_speed = px.bar(
    speed,
    x='Speed Limit',
    y='Number of Accidents',
    color='Speed Limit',
    template="presentation",
    title="Number of Accidents by Speed Limit",
    color_continuous_scale=speed_palette,
)

In [7]:
# Count reports for each weather condition, most frequent first.
weather = (
    merge_data.groupby("weather")["report_number"]
    .count()
    .reset_index(name="num_accidents")
    .sort_values(by="num_accidents", ascending=False)
)
weather.head()

# Pie chart with one slice per weather condition.
turbo_palette = px.colors.sequential.Turbo[2:]
fig_weather = px.pie(
    weather,
    values='num_accidents',
    names='weather',
    title="Number of Accidents by Weather",
    template='presentation',
    color='weather',
    color_discrete_sequence=turbo_palette,
    height=500,
)

In [8]:
# Count reports for each travel direction, most frequent first.
direction = (
    merge_data.groupby("direction")["report_number"]
    .count()
    .reset_index(name="num_accidents")
    .sort_values(by="num_accidents", ascending=False)
)
direction.head()

# Pie chart with a fixed colour per compass direction.
direction_colors = {
    "North": "cornflowerblue",
    "East": "gold",
    "South": "limegreen",
    "West": "orangered",
}
fig_direction = px.pie(
    direction,
    values='num_accidents',
    names='direction',
    title="Number of Accidents by Direction",
    template='presentation',
    color='direction',
    color_discrete_map=direction_colors,
)

In [9]:
# Count reports by whether the driver was recorded as at fault, most
# frequent first.
fault = (
    merge_data.groupby("driver_at_fault")["report_number"]
    .count()
    .reset_index(name="num_accidents")
    .rename(columns={"driver_at_fault": "fault"})
    .sort_values(by="num_accidents", ascending=False)
)
fault.head()

# Pie chart with a fixed colour per fault value.
fault_colors = {"No": "red", "Yes": "green"}
fig_fault = px.pie(
    fault,
    values='num_accidents',
    names='fault',
    title="Number of Accidents by Driver Fault",
    template='presentation',
    color='fault',
    color_discrete_map=fault_colors,
)

In [10]:
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

# Shared inline styles. Dash ``style`` dictionaries use camelCased CSS
# property names (``textAlign`` rather than ``text-align``).
_TITLE_STYLE = {"textAlign": "center", "fontWeight": "bold", "fontSize": "2em"}
_SUBTITLE_STYLE = {"textAlign": "center", "fontWeight": "bold", "fontSize": "1.17em"}
_SECTION_HEADING_STYLE = {"fontWeight": "bold", "fontSize": "1.5em"}
_BOLD_STYLE = {"fontWeight": "bold"}
_JUSTIFIED_STYLE = {"textAlign": "justify"}


def _analysis_section(figure, heading, commentary):
    """Return the chart for one factor followed by its written commentary.

    Args:
        figure: Plotly figure to render.
        heading: Bold lead-in shown at the start of the paragraph.
        commentary: Analysis text displayed after the heading.

    Returns:
        A two-item list: the graph component and the justified paragraph.
    """
    return [
        dcc.Graph(figure=figure),
        html.P(
            [html.Span(heading, style=_BOLD_STYLE), commentary],
            style=_JUSTIFIED_STYLE,
        ),
    ]


# Single-page dashboard: project header, one chart plus commentary per
# analysed factor, then the conclusions.
app = dash.Dash(__name__, external_stylesheets=external_stylesheets)
app.title = 'INST 447 Project - Team Zamunda Warriors'
app.layout = html.Div(
    [
        html.Label(
            "PROJECT: In Montgomery County, MD, what factors have the biggest influence in car accidents?",
            style=_TITLE_STYLE,
        ),
        html.Label(
            "FACTORS: Hour of Accident, Vehicle Year, Speed Limit, Weather, Direction, Driver Fault",
            style=_TITLE_STYLE,
        ),
        html.Div([
            html.Label(
                "Team - Zamunda Warriors: Amari, Isaac, Nicolas, Terrence",
                style=_SUBTITLE_STYLE,
            ),
            html.Label(
                [
                    "Data Provided by ",
                    # The data portal is an external site, so a plain anchor
                    # is used; dcc.Link is meant for navigation inside the app.
                    html.A("Montgomery County MD", href="https://data.montgomerycountymd.gov/"),
                ],
                style=_SUBTITLE_STYLE,
            ),
        ]),
        html.Div(
            [
                html.Label("Data Analysis", style=_SECTION_HEADING_STYLE),
                *_analysis_section(
                    fig_hour,
                    "Accidents by Hour: ",
                    "The number of accidents by hour provides an interesting dynamic. As shown by the graph, the top 3 hours are 5pm, 4pm, and 3pm. Then, 6pm and 8am come in as the 4th and 5th highest counts of all the hours. The count then declines with a mix of morning and early evening hours. We assume that this is due to more traffic on the road. The morning rush of people going to work and evening rush hour of people heading home are why there is a higher count of accidents during the morning and late afternoon.",
                ),
                *_analysis_section(
                    fig_year,
                    "Accidents by Vehicle Year: ",
                    "This graph focuses on vehicle years between 2002-2021. Our initial thought was that older vehicles are more prone to accidents due to newer vehicles having more safety features. However, that was not the case. The graph shows that vehicles of the 2015 year lead the amount of vehicles involved in accidents. In fact, the top 5 years were all in the 2010s. The count per year has a steady decline for the most part. We also can consider the fact that not as many older vehicles may be on the road as newer vehicles.",
                ),
                *_analysis_section(
                    fig_speed,
                    "Accidents by Speed Limit: ",
                    "This was one of the most intriguing graphs because speed is often a factor in accidents. This graph shows that roads with a speed limit of 35mph lead the count of accidents. We hypothesize that this is due to people speeding on roads with this limit and not following the rules. If there are drivers doing the limit, and there is a driver driving recklessly, an accident can be caused due to the increase in speed. Our hypothesis can be confirmed by looking at the end of the graph. Roads with 70mph have the lowest amount of accident counts. This supports our hypothesis because in the 70mph roads, everyone is going at a higher rate of speed consistently, decreasing the chance of accident. However, for the roads with a lower limit, some drivers may be impatient and decide to go above posted speed, which can cause a crash with drivers following the posted speed.",
                ),
                *_analysis_section(
                    fig_weather,
                    "Accidents by Weather: ",
                    "This graph was surprising because we assumed that messier weather such as rain and snow would have the highest counts of accidents. However, clear weather, meaning when it's sunny, leads all weather in the count of accidents. This shows that weather is not much of a big influence on crashes as it may seem to be. Even cloudy comes in before snow, so weather may be on the low end of influence.",
                ),
                *_analysis_section(
                    fig_direction,
                    "Accidents by Direction: ",
                    "We considered analyzing this factor to see if a driver's direction could be an influence on accidents. North led the count of accidents by a decent margin, being followed by East, then South and West. We are not sure what factor is leading to drivers traveling North having more accidents, but the analysis is intriguing nonetheless.",
                ),
                *_analysis_section(
                    fig_fault,
                    "Accidents by Driver Fault: ",
                    "Lastly, we analyzed the count of accidents based on whether the driver was deemed at fault or not. Surprisingly, the number of drivers that were not at fault is only about 4000 less accidents than the drivers that were at fault. Now, this analysis is a bit tricky because you have to take into consideration a driver being faulted for hitting another driver, who is not faulted. Therefore, the analysis is not that surprising because there could be another driver on the end of a driver at fault accident, which causes an increase in the number 'No'.",
                ),
                html.Br(),
                html.Label("Conclusions", style=_SECTION_HEADING_STYLE),
                html.Ul([
                    html.Li(
                        "Our final analysis is that accidents can be caused by various factors. We believe that the strongest factors are the hour of day, speed limit of the road, and direction of travel. The hour of day can depict the level of traffic. An increase in traffic can lead to more accidents. Lower speed limits can be ignored by drivers at times. If an area has a low rate of speed, the driver may drive faster to get through the area quicker. However, their speed now becomes an outlier to the drivers that are doing the posted speed, which can cause an increase in accidents. Lastly, direction can play a part in the number of accidents if many people are traveling in the same direction at once. Also, there are even cases of people crossing the opposite side of the road, causing head on collisions.",
                        style=_JUSTIFIED_STYLE,
                    ),
                    html.Li(
                        "Nonetheless, we must remember that this data is only being sourced from Montgomery County, MD, a small subset of crash data worldwide. Further analysis and more data is needed to effectively conclude on what factors, if any, are big influences on accidents, or are certain factors combined a big influence on crashes.",
                        style=_JUSTIFIED_STYLE,
                    ),
                ]),
                html.Br(),
            ],
            style={"paddingLeft": "90px", "paddingRight": "90px"},
        ),
    ],
    style={"paddingLeft": "10px", "paddingRight": "10px"},
)

if __name__ == '__main__':
    # Newer Dash releases replace ``app.run_server`` with ``app.run``; use
    # ``run`` when this installation offers it and fall back otherwise.
    # The reloader stays off so the server does not restart itself when
    # launched from a notebook.
    start_server = getattr(app, "run", None) or app.run_server
    start_server(debug=False, use_reloader=False)

Dash is running on http://127.0.0.1:8050/

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off
 * Running on http://127.0.0.1:8050/ (Press CTRL+C to quit)
