# California Traffic Collision Data from SWITRS

The `parties` table contains information about the parties (the people or groups of people) involved in each collision, including their age, sex, and sobriety.

In [1]:
import pandas as pd
import numpy as np
import sqlite3
import plotly.express as px
import folium
from folium import plugins
from folium.plugins import HeatMap
import pygal 
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.graph_objects as go
from pygal.style import Style
from IPython.display import display, HTML
plotly.offline.init_notebook_mode (connected = True)
import warnings
warnings.filterwarnings('ignore')

## Import and View data

In [2]:
# Create a SQL connection to the SWITRS SQLite database.
# NOTE: machine-specific absolute path; edit DB_PATH to point at your own copy.
DB_PATH = '/Users/sunhe/Desktop/NUS_Semester1/DSA5104/project/data/switrs.sqlite'
# A plain sqlite3.connect(path) silently creates a new, empty database when the
# file does not exist, and the mistake only shows up later as
# "no such table: parties". Opening through a URI with mode=rw refuses to
# create the file, so a wrong path fails right here instead.
# The path is pasted into the URI verbatim, so it must not contain '?', '#' or '%'.
con = sqlite3.connect(f'file:{DB_PATH}?mode=rw', uri=True)

In [3]:
# Load the complete `parties` table (all rows, all columns) into a DataFrame
# so its attributes can be inspected in the following cells.
parties_query = """
    SELECT * 
    FROM parties;
    """
parties = pd.read_sql_query(sql=parties_query, con=con)

In [4]:
# Size of the loaded table as a (rows, columns) tuple.
parties.shape

(18669166, 32)

In [5]:
# Preview the first rows of the table.
parties.head()

Unnamed: 0,id,case_id,party_number,party_type,at_fault,party_sex,party_age,party_sobriety,party_drug_physical,direction_of_travel,...,other_associate_factor_2,party_number_killed,party_number_injured,movement_preceding_collision,vehicle_year,vehicle_make,statewide_vehicle_type,chp_vehicle_type_towing,chp_vehicle_type_towed,party_race
0,1,81715,1,driver,1,female,35.0,not applicable,not applicable,north,...,,0,0,proceeding straight,2007.0,ford,,,,other
1,2,81715,2,driver,0,female,43.0,not applicable,not applicable,north,...,,0,0,proceeding straight,2019.0,,,,,hispanic
2,3,726202,1,driver,1,,,impairment unknown,G,north,...,,0,0,proceeding straight,2005.0,,passenger car,"passenger car, station",,
3,4,3858022,1,driver,1,male,47.0,impairment unknown,G,east,...,,0,0,,2007.0,dodge,emergency vehicle,police car,0.0,black
4,5,3858022,2,driver,0,male,58.0,had not been drinking,,east,...,,0,0,,2006.0,nissan,,,0.0,hispanic


## Analyze and visualize data

In [6]:
# HTML page skeleton used to show pygal charts inline. It loads the pygal
# tooltip scripts and exposes a `{rendered_chart}` placeholder that every
# plotting cell fills via `base_html.format(rendered_chart=...)`.
# Fix: the src attribute of the second <script> tag ended with a doubled quote
# (`...pygal-tooltips.min.js""`), which is malformed HTML.
base_html = """ 
<!DOCTYPE html> 
<html> 
  <head> 
  <script type="text/javascript" src="http://kozea.github.com/pygal.js/javascripts/svg.jquery.js"></script> 
  <script type="text/javascript" src="https://kozea.github.io/pygal.js/2.0.x/pygal-tooltips.min.js"></script> 
  </head> 
  <body> 
    <figure> 
      {rendered_chart} 
    </figure> 
  </body> 
</html> 
""" 

### party_sex vs at_fault

In [7]:
# Working sample for the charts below: at most 4,000,000 rows of `parties`.
# There is no ORDER BY, so which rows are returned is left to SQLite.
# NOTE(review): `parse_dates` names "collision_date", which does not appear in
# the `parties` column listings printed in this notebook - confirm it is needed.
sample_query = """
    SELECT *
    FROM parties
    LIMIT 4000000
    """
data = pd.read_sql_query(sample_query, con, parse_dates=["collision_date"])

In [8]:
def at_fault_share(df, sex):
    """Return the fraction of parties with the given `party_sex` that are at fault.

    Parameters
    ----------
    df : DataFrame with `party_sex` and `at_fault` columns.
    sex : value of `party_sex` to select (e.g. 'female').

    Returns
    -------
    float
        (rows of this sex with at_fault == 1) / (rows of this sex).

    Raises
    ------
    ValueError
        If `df` has no rows for `sex`; the share would be a division by zero.
    """
    # Select only the one column that is needed instead of copying the whole
    # filtered frame just to take its length.
    at_fault_flags = df.loc[df['party_sex'] == sex, 'at_fault']
    if at_fault_flags.empty:
        raise ValueError(f"no parties with party_sex == {sex!r}")
    # The mean of a boolean mask is the share of True values.
    return float((at_fault_flags == 1).mean())


# Share (0-1) and percentage (0-100) of at-fault parties, per sex.
female_at_fault = at_fault_share(data, 'female')
female_at_fault_perc = female_at_fault * 100

male_at_fault = at_fault_share(data, 'male')
male_at_fault_perc = male_at_fault * 100

# [at-fault %, not-at-fault %] for each sex; only the first entry is plotted.
female = [female_at_fault_perc, 100 - female_at_fault_perc]
male = [male_at_fault_perc, 100 - male_at_fault_perc]

# One bar per sex: each series supplies a value for its own x label and None
# for the other one.
line_chart = pygal.StackedBar()
line_chart.title = 'Male Vs Female Faults in Accidents (%)'
line_chart.x_labels = ['Female', 'Male']
line_chart.add('Percentage of Female at Fault', [female[0], None])
line_chart.add('Percentage Of male at Fault', [None, male[0]])
display(HTML(base_html.format(rendered_chart=line_chart.render(is_unicode=True))))

### Age Distribution of People at fault

In [9]:
# Box plot of party age, one box per at-fault flag; rows with no recorded age
# are left out of each series.
series_by_flag = {
    'At Fault': 1,
    'Not At Fault': 0,
}
box_plot = pygal.Box()
box_plot.title = 'Age Vs At Fault Distribution'
for series_name, fault_flag in series_by_flag.items():
    known_ages = data.loc[data['at_fault'] == fault_flag, 'party_age'].dropna()
    box_plot.add(series_name, known_ages.values)
rendered_box = box_plot.render(is_unicode=True)
display(HTML(base_html.format(rendered_chart=rendered_box)))

### Does the Direction of Travel Affect the Number of Accidents?

In [10]:
# Number of parties per direction of travel. Only `case_id` is counted, rather
# than every column of the frame, because it is the only count used below.
# `direction` stays a DataFrame indexed by direction.
direction = data.groupby('direction_of_travel')[['case_id']].count()
x = direction['case_id'].values
# Percentage of all sampled rows (rows without a direction stay in the
# denominator) that travel in each direction; vectorised instead of an index loop.
t = (x * 100 / len(data)).tolist()

In [11]:
# Half-pie solid gauges: one gauge per direction of travel, each showing that
# direction's percentage out of a maximum of 100.
gauge = pygal.SolidGauge(
    half_pie=True, inner_radius=0.70,
    style=pygal.style.styles['default'](value_font_size=10))


def percent_formatter(x):
    """Format a gauge value with up to 10 significant digits and a '%' suffix."""
    return '{:.10g}%'.format(x)


gauge.value_formatter = percent_formatter
# Walk over the directions that are actually present instead of assuming there
# are exactly four: a hard-coded range(4) raises IndexError with fewer groups
# and silently drops any beyond the fourth.
for label, share in zip(direction.index, t):
    gauge.add(label, [{'value': share, 'max_value': 100}])
display(HTML(base_html.format(rendered_chart=gauge.render(is_unicode=True))))

### How sober were the at-fault parties, and what share of injuries does each sobriety level account for?

In [12]:
# Injuries recorded for at-fault parties, totalled per sobriety level.
# Only `party_number_injured` is summed: a bare groupby(...).sum() aggregates
# every other column as well, which is wasted work for the numeric ones and,
# depending on the pandas version, drops, concatenates or fails on the text ones.
sbr = data[data['at_fault'] == 1].groupby('party_sobriety')[['party_number_injured']].sum()
sbrr = sbr['party_number_injured'].values

# The percentage list that used to be stored in `t` here was never plotted and
# overwrote the `t` consumed by the direction gauge cell, so it is no longer
# computed. The pie is drawn from the raw totals, as before.
pie_chart = pygal.Pie(inner_radius=.4)
pie_chart.title = 'Percentage of accidents leading to injuries caused by level of sobreity of people in fault'
# One slice per sobriety level present in the data, instead of a hard-coded
# range(6) that breaks with fewer levels and hides any beyond the sixth.
for level, injured in zip(sbr.index, sbrr):
    pie_chart.add(level, injured)
display(HTML(base_html.format(rendered_chart=pie_chart.render(is_unicode=True))))

## Clean data

In [13]:
# Drop duplicate rows; the result is rebound to `parties`, replacing the
# frame loaded earlier.
parties = parties.drop_duplicates()

In [14]:
# Check missing values: number of null entries in each column.
parties.isnull().sum()

id                                     0
case_id                                0
party_number                           0
party_type                         43867
at_fault                               0
party_sex                        2499043
party_age                        2914711
party_sobriety                    556035
party_drug_physical             15645799
direction_of_travel               525943
party_safety_equipment_1         3151014
party_safety_equipment_2         7002624
financial_responsibility         1528462
hazardous_materials             18634366
cellphone_in_use                 5685173
cellphone_use_type               4322417
school_bus_related              18616718
oaf_violation_code              18589014
oaf_violation_category          17385334
oaf_violation_section           16418407
oaf_violation_suffix            18254828
other_associate_factor_1         1124874
other_associate_factor_2        18299563
party_number_killed                    0
party_number_inj

In [15]:
# Percentage of missing values per column, rounded to 3 decimals.
missing = (parties.isnull().sum() * 100 / parties.shape[0]).round(3)
# Text form of each percentage with a ' %' suffix. The values are taken by
# iterating the Series rather than through `missing[i]`: `missing` is indexed by
# column name, so integer keys only work through a deprecated positional fallback.
missing_list_parties = [f'{pct} %' for pct in missing]
missing_parties = pd.DataFrame(data=missing_list_parties, index=missing.index,
                               columns=['missing percentage of parties'])
missing_parties

Unnamed: 0,missing percentage of parties
id,0.0 %
case_id,0.0 %
party_number,0.0 %
party_type,0.235 %
at_fault,0.0 %
party_sex,13.386 %
party_age,15.612 %
party_sobriety,2.978 %
party_drug_physical,83.806 %
direction_of_travel,2.817 %


In [16]:
# Columns in which fewer than MAX_MISSING_PCT percent of the values are missing
# (columns with nothing missing are included). These names are later passed to
# `dropna(subset=...)`, so rows with a gap in any of them get removed.
MAX_MISSING_PCT = 10
# Boolean selection on the label index instead of `missing[i]`, which relies on
# deprecated positional lookup with integer keys on a label-indexed Series.
drop_index = missing.index[missing < MAX_MISSING_PCT].tolist()
drop_index

['id',
 'case_id',
 'party_number',
 'party_type',
 'at_fault',
 'party_sobriety',
 'direction_of_travel',
 'financial_responsibility',
 'other_associate_factor_1',
 'party_number_killed',
 'party_number_injured',
 'movement_preceding_collision',
 'vehicle_year']

In [17]:
# Remove every row that has a missing value in any of the `drop_index` columns;
# columns outside that list may still contain nulls.
new_parties = parties.dropna(subset=drop_index)

In [18]:
# (rows, columns) remaining after the row filter.
new_parties.shape

(14871828, 32)

In [19]:
# Re-count nulls per column to confirm the `drop_index` columns are now complete.
new_parties.isnull().sum()

id                                     0
case_id                                0
party_number                           0
party_type                             0
at_fault                               0
party_sex                         881081
party_age                         952343
party_sobriety                         0
party_drug_physical             13590255
direction_of_travel                    0
party_safety_equipment_1         1133838
party_safety_equipment_2         4263015
financial_responsibility               0
hazardous_materials             14847154
cellphone_in_use                 3551637
cellphone_use_type               2468878
school_bus_related              14840427
oaf_violation_code              14802886
oaf_violation_category          13876310
oaf_violation_section           13114663
oaf_violation_suffix            14579440
other_associate_factor_1               0
other_associate_factor_2        14574908
party_number_killed                    0
party_number_inj

In [20]:
# Save cleaned data
# The file is opened with 'w' (truncate), not 'a': in append mode every re-run
# of this cell added the whole table to the existing CSV again, together with a
# second header row in the middle of the file.
with open('/Users/sunhe/Desktop/NUS_Semester1/DSA5104/project/data/clean_data/clean_parties.csv',
          'w', encoding='utf8', newline="") as f:
    # index=False: do not write the DataFrame's row index as a column.
    new_parties.to_csv(f, header=True, index=False)