In [195]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, SplineTransformer
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectFromModel

In [196]:
import warnings
warnings.filterwarnings('ignore')

#### Read in CSV

In [197]:
df = pd.read_csv("Traffic_Crashes_Resulting_in_Injury_20240518.csv")
display(df.iloc[:, 3])
display(df.iloc[:, 15])

0        180085641
1        220672946
2        130379165
3        220527222
4          4395868
           ...    
59004    230392695
59005    230713386
59006    230347242
59007    230886886
59008    230728169
Name: case_id_pkey, Length: 59009, dtype: object

0        3801
1        3801
2        3801
3        3801
4        3801
         ... 
59004    3801
59005    3801
59006    3801
59007    3801
59008    3801
Name: juris, Length: 59009, dtype: object

Drop columns that are too numerous in category, unusable formats, or consisting of similar values.

In [198]:
df = df.drop(columns=['cnn_intrsctn_fkey', 'cnn_sgmt_fkey', 'case_id_pkey', 
                      'tb_latitude', 'tb_longitude', 'geocode_source', 
                      'officer_id', 'beat_number', 'direction', 
                      'control_device', 'vz_pcf_group', 'vz_pcf_description', 
                      'vz_pcf_link', 'number_killed', 'number_injured', 
                      'street_view', 'dph_col_grp', 'party_at_fault',
                      'party1_dir_of_travel', 'party1_move_pre_acc', 
                      'party2_dir_of_travel', 'party2_move_pre_acc', 
                      'point', 'data_as_of', 'data_updated_at', 
                      'data_loaded_at', 'Neighborhoods', 'SF Find Neighborhoods', 
                      'Current Police Districts', 'Current Supervisor Districts', 'Analysis Neighborhoods',
                      'reporting_district', "primary_rd", "secondary_rd", "unique_id", "juris",
                      'geocode_location', 'collision_date'])
df = df.dropna(axis="index", subset=["collision_time"])
df = df.dropna(axis="index", subset=["intersection"])
df = df.dropna(axis="index", subset=["distance"])

Reformat data to be consistent.

In [199]:

# display(df["road_cond_1"].unique())
# display(df["road_cond_2"].unique())
df["road_cond_1"].where(df["road_cond_1"] != "Holes, Deep Rut", other="Holes, Deep Ruts", inplace=True)
df["road_cond_1"].where(df["road_cond_1"] != "No Unusual Condition", other="None", inplace=True)
df["road_cond_1"].where(df["road_cond_1"] != "Not Stated", other="None", inplace=True)
df["road_cond_2"].where(df["road_cond_2"] != "No Unusual Condition", other="None", inplace=True)
df["road_cond_2"].where(df["road_cond_2"] != "Not Stated", other="None", inplace=True)
df["vz_pcf_code"].where(~df["vz_pcf_code"].isnull(), other="Unknown", inplace=True)
df['weather_1'].where(~df['weather_1'].str.contains("Fog"), other="Fog", inplace=True)
df['weather_1'].where(~df['weather_1'].isin([
    'Not Stated', 'Other: Unknown', "Other: NOT ON SCENE", 'Other: NOT AT SCENE'
]), other="Other", inplace=True)
df['weather_1'].where(df['weather_1'] != 'Other: MISTING', other="Mist", inplace=True)
df['weather_2'].where(~df['weather_2'].isin([
    np.nan, 
    'Not Stated', 
    'Other: NOT ON SCENE'])
    , other="Other", inplace=True)
df['weather_2'].where(~df['weather_2'].str.contains("Fog"), other="Fog", inplace=True)
df['weather_2'].where(~df['weather_2'].str.lower().str.contains("mist"), other="Mist", inplace=True)
df['weather_2'].where(~df['weather_2'].str.lower().str.contains("sun"), other="Sun", inplace=True)
df['weather_2'].where(~df['weather_2'].str.lower().str.contains("drizzl"), other="Drizzle", inplace=True)
df['weather_2'].where(~df['weather_2'].str.lower().str.contains("smoke"), other="Smokey", inplace=True)
df['weather_2'].where(df['weather_2'] != 'Other: FALLING ASH', other="Ash", inplace=True)
df['weather_2'].where(df['weather_2'] != 'Other: Overcast', other="Cloudy", inplace=True)
df['weather_2'].where(df['weather_2'] != 'Other: WET PAVEMENT', other="Wet", inplace=True)
df['weather_2'].where(df['weather_2'] != 'Other: Night', other="Night", inplace=True)
df["weather_comb"] = df["weather_1"] + ";" + df["weather_2"]
df["weather_comb"] = df["weather_comb"].str.split(";")
df = df.drop(columns=['weather_1', 'weather_2'])

# df["party2_type"].where(df["party2_type"])
# display(df["road_cond_1"].unique())
# display(df["road_cond_2"].unique())
df["road_cond_comb"] = df["road_cond_1"] + ";" + df["road_cond_2"]
df["road_cond_comb"] = df["road_cond_comb"].str.split(";")
df = df.drop(columns=['road_cond_1', 'road_cond_2'])
print(df.columns)
df['party1_type'].where((df['party1_type'].notna()), other="Other", inplace=True)
df['party1_type'].where((df['party1_type'] != 'Not Stated'), other="Other", inplace=True)
df['party1_type'].where((df['party1_type'] != np.nan), other="Other", inplace=True)
df['party2_type'].where((df['party2_type'].notna()), other="Other", inplace=True)
df['party2_type'].where((df['party2_type'] != 'Not Stated'), other="Other", inplace=True)
df['party2_type'].where((df['party2_type'] != np.nan), other="Other", inplace=True)

# df.loc[:, "juris"] = df.loc[:, "juris"].astype(str)
df["collision_datetime"] = pd.to_datetime(df["collision_datetime"], format= "%m/%d/%Y %H:%M:%S %p")
df.set_index("collision_datetime", inplace=True)
df.drop(columns=["collision_time"], inplace=True)
df.dropna(axis="rows", subset=['party1_type'])
display(df)

Index(['collision_datetime', 'collision_time', 'accident_year', 'month',
       'day_of_week', 'time_cat', 'distance', 'collision_severity',
       'type_of_collision', 'mviw', 'ped_action', 'road_surface', 'lighting',
       'intersection', 'vz_pcf_code', 'dph_col_grp_description', 'party1_type',
       'party2_type', 'weather_comb', 'road_cond_comb'],
      dtype='object')


Unnamed: 0_level_0,accident_year,month,day_of_week,time_cat,distance,collision_severity,type_of_collision,mviw,ped_action,road_surface,lighting,intersection,vz_pcf_code,dph_col_grp_description,party1_type,party2_type,weather_comb,road_cond_comb
collision_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2018-01-26 05:45:00,2018,January,Friday,2:01 pm to 6:00 pm,0.0,Injury (Complaint of Pain),Rear End,Other Motor Vehicle,Not Stated,Dry,Dark - Street Lights,Intersection <= 20ft,Unknown,Vehicle(s) Only Involved,Driver,Driver,"[Other, Other]","[None, None]"
2022-10-01 02:27:00,2022,October,Saturday,2:01 pm to 6:00 pm,121.0,Injury (Other Visible),Sideswipe,Bicycle,No Pedestrian Involved,Dry,Daylight,Midblock > 20ft,22350,Vehicle-Bicycle,Bicyclist,Driver,"[Clear, Other]","[None, None]"
2013-05-08 10:51:00,2013,May,Wednesday,10:01 pm to 2:00 am,0.0,Injury (Complaint of Pain),Vehicle/Pedestrian,Pedestrian,Crossing in Crosswalk at Intersection,Dry,Dark - Street Lights,Intersection <= 20ft,Unknown,Vehicle-Pedestrian,Driver,Pedestrian,"[Clear, Other]","[None, None]"
2022-08-07 10:18:00,2022,August,Sunday,10:01 pm to 2:00 am,228.0,Injury (Severe),Hit Object,Fixed Object,No Pedestrian Involved,Dry,Dark - Street Lights,Midblock > 20ft,Unknown,Bicycle Only,Bicyclist,Other,"[Clear, Other]","[Obstruction on Roadway, Other]"
2009-09-03 01:35:00,2009,September,Thursday,10:01 am to 2:00 pm,33.0,Injury (Other Visible),Rear End,Parked Motor Vehicle,No Pedestrian Involved,Dry,Daylight,Intersection Rear End <= 150ft,Unknown,Vehicle(s) Only Involved,Driver,Parked Vehicle,"[Clear, Other]","[None, None]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-06-07 02:10:00,2023,June,Wednesday,2:01 pm to 6:00 pm,0.0,Injury (Complaint of Pain),Head-On,Fixed Object,No Pedestrian Involved,Dry,Daylight,Intersection <= 20ft,22350,Vehicle(s) Only Involved,Driver,Other,"[Clear, Other]","[None, None]"
2023-10-04 02:00:00,2023,October,Wednesday,10:01 am to 2:00 pm,0.0,Injury (Complaint of Pain),Broadside,Other Motor Vehicle,No Pedestrian Involved,Dry,Daylight,Intersection <= 20ft,21453(a),Vehicle(s) Only Involved,Driver,Driver,"[Clear, Other]","[None, None]"
2023-05-19 09:40:00,2023,May,Friday,6:01 pm to 10:00 pm,0.0,Injury (Complaint of Pain),Broadside,Other Motor Vehicle,No Pedestrian Involved,Dry,Dark - Street Lights,Intersection <= 20ft,21801(a),Vehicle(s) Only Involved,Driver,Driver,"[Other, Other]","[None, None]"
2023-12-19 08:11:00,2023,December,Tuesday,6:01 pm to 10:00 pm,0.0,Injury (Complaint of Pain),Broadside,Motor Vehicle on Other Roadway,No Pedestrian Involved,Dry,Daylight,Intersection <= 20ft,21801(a),Vehicle(s) Only Involved,Driver,Other,"[Clear, Other]","[None, None]"


In [200]:
df["month"] = pd.to_datetime(df["month"], format="%B").dt.month
df["month"].unique()

array([ 1, 10,  5,  8,  9, 11,  7, 12,  6,  4,  3,  2])

In [201]:
pd.to_datetime(pd.Series(df.index.format())).dt.weekday

0        4
1        5
2        2
3        6
4        3
        ..
58867    2
58868    2
58869    4
58870    1
58871    1
Length: 58872, dtype: int32

In [202]:
df['day_of_week'] = pd.to_datetime(pd.Series(df.index.format())).dt.weekday.to_numpy()
df['day_of_week']

collision_datetime
2018-01-26 05:45:00    4
2022-10-01 02:27:00    5
2013-05-08 10:51:00    2
2022-08-07 10:18:00    6
2009-09-03 01:35:00    3
                      ..
2023-06-07 02:10:00    2
2023-10-04 02:00:00    2
2023-05-19 09:40:00    4
2023-12-19 08:11:00    1
2023-10-10 03:25:00    1
Name: day_of_week, Length: 58872, dtype: int32

In [203]:
df['day_of_week'].unique()

array([4, 5, 2, 6, 3, 1, 0])

In [204]:
df.columns

Index(['accident_year', 'month', 'day_of_week', 'time_cat', 'distance',
       'collision_severity', 'type_of_collision', 'mviw', 'ped_action',
       'road_surface', 'lighting', 'intersection', 'vz_pcf_code',
       'dph_col_grp_description', 'party1_type', 'party2_type', 'weather_comb',
       'road_cond_comb'],
      dtype='object')

In [205]:
df["collision_severity"].unique()

array(['Injury (Complaint of Pain)', 'Injury (Other Visible)',
       'Injury (Severe)', 'Fatal', 'Medical'], dtype=object)

In [206]:
df["road_cond_comb"]

collision_datetime
2018-01-26 05:45:00                           [None, None]
2022-10-01 02:27:00                           [None, None]
2013-05-08 10:51:00                           [None, None]
2022-08-07 10:18:00        [Obstruction on Roadway, Other]
2009-09-03 01:35:00                           [None, None]
                                      ...                 
2023-06-07 02:10:00                           [None, None]
2023-10-04 02:00:00                           [None, None]
2023-05-19 09:40:00                           [None, None]
2023-12-19 08:11:00                           [None, None]
2023-10-10 03:25:00    [Construction or Repair Zone, None]
Name: road_cond_comb, Length: 58872, dtype: object

In [207]:
df.isnull().sum()

accident_year              0
month                      0
day_of_week                0
time_cat                   0
distance                   0
collision_severity         0
type_of_collision          0
mviw                       0
ped_action                 0
road_surface               0
lighting                   0
intersection               0
vz_pcf_code                0
dph_col_grp_description    0
party1_type                0
party2_type                0
weather_comb               0
road_cond_comb             0
dtype: int64

In [208]:
df["party1_type"].unique()

array(['Driver', 'Bicyclist', 'Pedestrian', 'Parked Vehicle', 'Other',
       'Bicycle'], dtype=object)

In [209]:
df

Unnamed: 0_level_0,accident_year,month,day_of_week,time_cat,distance,collision_severity,type_of_collision,mviw,ped_action,road_surface,lighting,intersection,vz_pcf_code,dph_col_grp_description,party1_type,party2_type,weather_comb,road_cond_comb
collision_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2018-01-26 05:45:00,2018,1,4,2:01 pm to 6:00 pm,0.0,Injury (Complaint of Pain),Rear End,Other Motor Vehicle,Not Stated,Dry,Dark - Street Lights,Intersection <= 20ft,Unknown,Vehicle(s) Only Involved,Driver,Driver,"[Other, Other]","[None, None]"
2022-10-01 02:27:00,2022,10,5,2:01 pm to 6:00 pm,121.0,Injury (Other Visible),Sideswipe,Bicycle,No Pedestrian Involved,Dry,Daylight,Midblock > 20ft,22350,Vehicle-Bicycle,Bicyclist,Driver,"[Clear, Other]","[None, None]"
2013-05-08 10:51:00,2013,5,2,10:01 pm to 2:00 am,0.0,Injury (Complaint of Pain),Vehicle/Pedestrian,Pedestrian,Crossing in Crosswalk at Intersection,Dry,Dark - Street Lights,Intersection <= 20ft,Unknown,Vehicle-Pedestrian,Driver,Pedestrian,"[Clear, Other]","[None, None]"
2022-08-07 10:18:00,2022,8,6,10:01 pm to 2:00 am,228.0,Injury (Severe),Hit Object,Fixed Object,No Pedestrian Involved,Dry,Dark - Street Lights,Midblock > 20ft,Unknown,Bicycle Only,Bicyclist,Other,"[Clear, Other]","[Obstruction on Roadway, Other]"
2009-09-03 01:35:00,2009,9,3,10:01 am to 2:00 pm,33.0,Injury (Other Visible),Rear End,Parked Motor Vehicle,No Pedestrian Involved,Dry,Daylight,Intersection Rear End <= 150ft,Unknown,Vehicle(s) Only Involved,Driver,Parked Vehicle,"[Clear, Other]","[None, None]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-06-07 02:10:00,2023,6,2,2:01 pm to 6:00 pm,0.0,Injury (Complaint of Pain),Head-On,Fixed Object,No Pedestrian Involved,Dry,Daylight,Intersection <= 20ft,22350,Vehicle(s) Only Involved,Driver,Other,"[Clear, Other]","[None, None]"
2023-10-04 02:00:00,2023,10,2,10:01 am to 2:00 pm,0.0,Injury (Complaint of Pain),Broadside,Other Motor Vehicle,No Pedestrian Involved,Dry,Daylight,Intersection <= 20ft,21453(a),Vehicle(s) Only Involved,Driver,Driver,"[Clear, Other]","[None, None]"
2023-05-19 09:40:00,2023,5,4,6:01 pm to 10:00 pm,0.0,Injury (Complaint of Pain),Broadside,Other Motor Vehicle,No Pedestrian Involved,Dry,Dark - Street Lights,Intersection <= 20ft,21801(a),Vehicle(s) Only Involved,Driver,Driver,"[Other, Other]","[None, None]"
2023-12-19 08:11:00,2023,12,1,6:01 pm to 10:00 pm,0.0,Injury (Complaint of Pain),Broadside,Motor Vehicle on Other Roadway,No Pedestrian Involved,Dry,Daylight,Intersection <= 20ft,21801(a),Vehicle(s) Only Involved,Driver,Other,"[Clear, Other]","[None, None]"


In [210]:
df.dtypes

accident_year                int64
month                        int32
day_of_week                  int32
time_cat                    object
distance                   float64
collision_severity          object
type_of_collision           object
mviw                        object
ped_action                  object
road_surface                object
lighting                    object
intersection                object
vz_pcf_code                 object
dph_col_grp_description     object
party1_type                 object
party2_type                 object
weather_comb                object
road_cond_comb              object
dtype: object

In [211]:
string_cols = list(df.columns[df.dtypes=="object"])
string_cols

['time_cat',
 'collision_severity',
 'type_of_collision',
 'mviw',
 'ped_action',
 'road_surface',
 'lighting',
 'intersection',
 'vz_pcf_code',
 'dph_col_grp_description',
 'party1_type',
 'party2_type',
 'weather_comb',
 'road_cond_comb']

In [212]:
for col in string_cols:
    print(col)
    df[col] = df[col].astype("string")

time_cat
collision_severity
type_of_collision
mviw
ped_action
road_surface
lighting
intersection
vz_pcf_code
dph_col_grp_description
party1_type
party2_type
weather_comb
road_cond_comb


In [213]:
df.dtypes

accident_year                       int64
month                               int32
day_of_week                         int32
time_cat                   string[python]
distance                          float64
collision_severity         string[python]
type_of_collision          string[python]
mviw                       string[python]
ped_action                 string[python]
road_surface               string[python]
lighting                   string[python]
intersection               string[python]
vz_pcf_code                string[python]
dph_col_grp_description    string[python]
party1_type                string[python]
party2_type                string[python]
weather_comb               string[python]
road_cond_comb             string[python]
dtype: object

The vast majority of our data is categorical. We will utilize OneHotEncoding for our normal features and periodic spline transformation for our cyclical time based features.

In [214]:
def periodic_spline_transformer(period, n_splines=None, degree=3):
    if n_splines is None:
        n_splines = period
    n_knots = n_splines + 1  # periodic and include_bias is True
    return SplineTransformer(
        degree=degree,
        n_knots=n_knots,
        knots=np.linspace(0, period, n_knots).reshape(n_knots, 1),
        extrapolation="periodic",
        include_bias=True,
    )

In [215]:
column_trans = ColumnTransformer(transformers=[
    ('scaler', StandardScaler(), ["distance"]),
    ('encoder', OneHotEncoder(handle_unknown='ignore'), make_column_selector(dtype_include="string")),
    ('cycle_month', periodic_spline_transformer(12, 6), ["month"]),
    ('cycle_weekday', periodic_spline_transformer(7, 3), ["day_of_week"])],
    remainder="passthrough")
pipe = Pipeline(steps=[
    ('transformer', column_trans),
    ('classifier', KNeighborsClassifier())
])

In [216]:
y = df["collision_severity"]
y

collision_datetime
2018-01-26 05:45:00    Injury (Complaint of Pain)
2022-10-01 02:27:00        Injury (Other Visible)
2013-05-08 10:51:00    Injury (Complaint of Pain)
2022-08-07 10:18:00               Injury (Severe)
2009-09-03 01:35:00        Injury (Other Visible)
                                  ...            
2023-06-07 02:10:00    Injury (Complaint of Pain)
2023-10-04 02:00:00    Injury (Complaint of Pain)
2023-05-19 09:40:00    Injury (Complaint of Pain)
2023-12-19 08:11:00    Injury (Complaint of Pain)
2023-10-10 03:25:00    Injury (Complaint of Pain)
Name: collision_severity, Length: 58872, dtype: string

In [217]:
X = df.drop(columns=["collision_severity"])

In [218]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)
pipe.fit(X_train, y_train)

In [219]:
scores = cross_val_score(pipe, X_train, y_train)
print("CV Scores: ", scores)
print("CV Average: ", scores.mean())

CV Scores:  [0.60869565 0.61123083 0.60920269 0.61541387 0.61460446]
CV Average:  0.6118294994159583


In [None]:
pipe = Pipeline(steps=[
    ('transformer', column_trans),
    ('classifier', SVC(kernel='linear'))
])
pipe.fit(X_train, y_train)
scores = cross_val_score(pipe, X_train, y_train)


print("CV Scores: ", scores)
print("CV Average: ", scores.mean())

In [None]:
pipe = Pipeline(steps=[
    ('transformer', column_trans),
    ('classifier', SVC(kernel='rbf'))
])
pipe.fit(X_train, y_train)
scores = cross_val_score(pipe, X_train, y_train)
print("CV Scores: ", scores)
print("CV Average: ", scores.mean())

In [None]:
pipe = Pipeline(steps=[
    ('transformer', column_trans),
    ('classifier', RandomForestClassifier())
])
pipe.fit(X_train, y_train)
scores = cross_val_score(pipe, X_train, y_train)
print("CV Scores: ", scores)
print("CV Average: ", scores.mean())

Perform some feature selection to see if it improves our results

In [None]:
pipe = Pipeline(steps=[
    ('transformer', column_trans),
    ('feature_select', SelectFromModel(LinearSVC(penalty="l1", dual=False))),
    ('classifier', RandomForestClassifier())
])

pipe.fit(X_train, y_train)
scores = cross_val_score(pipe, X_train, y_train)
print("CV Scores: ", scores)
print("CV Average: ", scores.mean())

In [None]:
pipe = Pipeline(steps=[
    ('transformer', column_trans),
    ('feature_select', SelectFromModel(LinearSVC(penalty="l1", dual=False))),
    ('classifier', KNeighborsClassifier())
])

pipe.fit(X_train, y_train)
scores = cross_val_score(pipe, X_train, y_train)
print("CV Scores: ", scores)
print("CV Average: ", scores.mean())

In [None]:
tuning_params = [
    {'C': [i for i in range (0.5, 5.5, 0.5)]}
]