# California Traffic Collision Data from SWITRS

The `victims` table records the injuries of each individual person involved in a collision.

In [1]:
import pandas as pd
import numpy as np
import sqlite3
import plotly.express as px
import folium
from folium import plugins
from folium.plugins import HeatMap
import pygal 
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.graph_objects as go
from pygal.style import Style
from IPython.display import display, HTML
plotly.offline.init_notebook_mode (connected = True)

## Import and View data

In [2]:
# Open a connection to the SWITRS SQLite database file
db_path = '/Users/sunhe/Desktop/NUS_Semester1/DSA5104/project/data/switrs.sqlite'
con = sqlite3.connect(db_path)

In [3]:
# Load every row and column of the `victims` table to inspect its attributes
victims_sql = """
    SELECT * 
    FROM victims;
    """
victims = pd.read_sql_query(victims_sql, con)

In [4]:
# Dimensions of the loaded table as (rows, columns)
victims.shape

(9639334, 11)

In [5]:
# Preview the first rows to see the available columns and typical values
victims.head()

Unnamed: 0,id,case_id,party_number,victim_role,victim_sex,victim_age,victim_degree_of_injury,victim_seating_position,victim_safety_equipment_1,victim_safety_equipment_2,victim_ejected
0,1,3858022,1,passenger,female,24.0,no injury,passenger seat 4,air bag not deployed,,not ejected
1,2,3858022,1,passenger,male,31.0,no injury,passenger seat 6,air bag not deployed,,not ejected
2,3,3899450,1,driver,female,72.0,complaint of pain,driver,air bag not deployed,lap/shoulder harness used,not ejected
3,4,3899453,2,driver,male,66.0,complaint of pain,driver,air bag deployed,lap/shoulder harness used,not ejected
4,5,3899453,3,passenger,male,65.0,no injury,passenger seat 3,air bag not deployed,lap/shoulder harness used,not ejected


## Analyze and visualize data

### How many victims were recorded?

In [17]:
# Count the victim records (rows with a non-null case_id) on the database side
count_sql = """
    SELECT COUNT(case_id) AS Number_of_victims
    FROM victims;
    """
df_total = pd.read_sql_query(count_sql, con)
df_total

Unnamed: 0,Number_of_victims
0,9639334


### Is a victim's role related to their degree of injury?

In [26]:
# victim_role vs victim_degree_of_injury
# Fetch only the two columns being compared; the LIMIT clause caps the result at 100000 rows
query = """
        SELECT victim_role, 
               victim_degree_of_injury  
        FROM victims
        LIMIT 100000;
        """
table = pd.read_sql_query(sql=query, con=con)

In [1]:
# sns.swarmplot(x='victim_role',y='victim_degree_of_injury',data=table)

## Clean data

In [6]:
# drop duplicate values
# Called without arguments, so a row is removed only when it matches an earlier
# row in every column; the result is rebound to `victims`.
victims = victims.drop_duplicates() 

In [7]:
# check missing values: number of null entries in each column
victims.isnull().sum()

id                                 0
case_id                            0
party_number                       0
victim_role                       13
victim_sex                    239319
victim_age                    322754
victim_degree_of_injury            0
victim_seating_position        19477
victim_safety_equipment_1     550554
victim_safety_equipment_2    3104718
victim_ejected                 42865
dtype: int64

In [8]:
# Percentage of missing values in each column, relative to the total row count
missing = (victims.isnull().sum() * 100 / victims.shape[0]).round(3)
# Render each percentage as text such as '2.483 %'. Iterating over the Series
# values avoids `missing[i]`, an integer lookup on a label-indexed Series that
# newer pandas releases deprecate.
missing_list_victims = [str(pct) + ' %' for pct in missing]
missing_victims = pd.DataFrame(data=missing_list_victims, index=missing.index, columns=['missing percentage of victims'])
missing_victims

Unnamed: 0,missing percentage of victims
id,0.0 %
case_id,0.0 %
party_number,0.0 %
victim_role,0.0 %
victim_sex,2.483 %
victim_age,3.348 %
victim_degree_of_injury,0.0 %
victim_seating_position,0.202 %
victim_safety_equipment_1,5.712 %
victim_safety_equipment_2,32.209 %


In [9]:
# Collect the names of the columns whose missing percentage is below 10 %.
# Nothing is dropped here; the list is meant to be used as a `subset` for dropna.
# A boolean mask on the index replaces the `missing[i]` loop, which relied on an
# integer lookup on a label-indexed Series that newer pandas releases deprecate.
drop_index = missing.index[missing < 10].tolist()
drop_index

['id',
 'case_id',
 'party_number',
 'victim_role',
 'victim_sex',
 'victim_age',
 'victim_degree_of_injury',
 'victim_seating_position',
 'victim_safety_equipment_1',
 'victim_ejected']

In [14]:
# Keep only the rows that have a value in every column listed in `drop_index`;
# columns outside that list may still contain nulls afterwards.
new_victims = victims.dropna(subset=drop_index)

In [13]:
# Dimensions of the cleaned table as (rows, columns)
new_victims.shape

(8663495, 11)

In [15]:
# Null counts per column after the rows with missing values were dropped
new_victims.isnull().sum()

id                                 0
case_id                            0
party_number                       0
victim_role                        0
victim_sex                         0
victim_age                         0
victim_degree_of_injury            0
victim_seating_position            0
victim_safety_equipment_1          0
victim_safety_equipment_2    2521933
victim_ejected                     0
dtype: int64

In [16]:
# Save cleaned data
# Open in write mode so re-running this cell overwrites the file; append mode
# would add another header row and a duplicate copy of the rows on every run.
with open('/Users/sunhe/Desktop/NUS_Semester1/DSA5104/project/data/clean_data/clean_victims.csv',
          'w', encoding='utf8', newline="") as f:
    # index=False leaves the DataFrame's row index out of the CSV
    new_victims.to_csv(f, header=True, index=False)

## Generate data