In [118]:
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import geopandas as gpd
from shapely.geometry import Point
import numpy as np
from sklearn.neighbors import NearestNeighbors

## Weather

In [119]:
weather = pd.read_csv('/Users/annaywj/Desktop/SDSU/BDA600/Capstone/sd_weather_2018_2024_combined.csv')
crash = pd.read_csv('/Users/annaywj/Desktop/SDSU/BDA600/Capstone/TIMS_SD_Crashes2013-2024.csv')

In [120]:
weather['datetime'] = pd.to_datetime(weather['datetime'], errors='coerce')
crash['COLLISION_DATE'] = pd.to_datetime(crash['COLLISION_DATE'], errors='coerce')

# Filter crash data for 2018–2024
crash_filtered = crash[(crash['COLLISION_DATE'].dt.year >= 2018) &
                          (crash['COLLISION_DATE'].dt.year <= 2024)].copy()

# Aggregate crash data by date
daily_crashes = crash_filtered.groupby('COLLISION_DATE').agg(
    TOTAL_CRASHES=('CASE_ID', 'count'),
    AVG_SEVERITY=('COLLISION_SEVERITY', 'mean')
).reset_index().rename(columns={'COLLISION_DATE': 'datetime'})

# Merge with weather data
merged_weather = pd.merge(weather, daily_crashes, on='datetime', how='inner')

# Select weather predictors and target
weather_features = ['humidity', 'cloudcover', 'windspeed', 'precip']
target_column = 'AVG_SEVERITY'

# Drop missing values
regression_ready = merged_weather[weather_features + [target_column]].dropna()

In [121]:
X = regression_ready[weather_features]
y = regression_ready[target_column]

# Add constant for OLS regression
X_ols = sm.add_constant(X)
ols_model = sm.OLS(y, X_ols).fit()

# Random Forest Regression
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
rf_r2 = r2_score(y_test, y_pred)
rf_importance = pd.Series(rf_model.feature_importances_, index=weather_features).sort_values(ascending=False)

# Output results
ols_summary = ols_model.summary()
rf_r2, rf_importance, ols_summary.tables[1]

(-0.10411692103061565,
 cloudcover    0.359832
 humidity      0.321860
 windspeed     0.269387
 precip        0.048921
 dtype: float64,
 <class 'statsmodels.iolib.table.SimpleTable'>)

## SOC

In [122]:
road = pd.read_csv('/Users/annaywj/Downloads/SOC_-_Local_Roads__Speed_and_Volume_20250423.csv')

In [123]:

crash['COLLISION_DATE'] = pd.to_datetime(crash['COLLISION_DATE'], errors='coerce')
crash_filtered = crash[(crash['COLLISION_DATE'].dt.year >= 2018) & 
                       (crash['COLLISION_DATE'].dt.year <= 2024)].copy()

# Drop rows without location
crash_filtered = crash_filtered.dropna(subset=['POINT_X', 'POINT_Y'])

# Convert to GeoDataFrame
crash_gdf = gpd.GeoDataFrame(
    crash_filtered,
    geometry=gpd.points_from_xy(crash_filtered['POINT_X'], crash_filtered['POINT_Y']),
    crs="EPSG:4326"
)

# Step 2: Convert road data to GeoDataFrame
road = road.dropna(subset=['geometry'])
road_gdf = gpd.GeoDataFrame(road, geometry=gpd.GeoSeries.from_wkt(road['geometry']), crs="EPSG:4326")

# Step 3: Project both to meters
crash_gdf = crash_gdf.to_crs(epsg=3857)
road_gdf = road_gdf.to_crs(epsg=3857)

# Step 4: Use midpoint of road segment
road_gdf['rep_point'] = road_gdf.geometry.representative_point()
road_gdf.set_geometry('rep_point', inplace=True)

# Step 5: Nearest neighbor match
crash_coords = np.array(list(zip(crash_gdf.geometry.x, crash_gdf.geometry.y)))
road_coords = np.array(list(zip(road_gdf.geometry.x, road_gdf.geometry.y)))

nn = NearestNeighbors(n_neighbors=1, radius=50)
nn.fit(road_coords)
distances, indices = nn.kneighbors(crash_coords)

within_50m = distances[:, 0] <= 50
crash_gdf = crash_gdf[within_50m]
matched_indices = indices[within_50m].flatten()

# Step 6: Merge
matched_roads_clean = road_gdf.reset_index().iloc[matched_indices].reset_index(drop=True).drop(columns=['geometry'])
joined_df = pd.concat([crash_gdf.reset_index(drop=True), matched_roads_clean], axis=1)

# Step 7: Aggregate
aggregated = joined_df.groupby('osm_id').agg(
    TOTAL_CRASHES=('CASE_ID', 'count'),
    AVG_SEVERITY=('COLLISION_SEVERITY', 'mean'),
    Lanes=('Lanes', 'first'),
    Speed_Limit_MPH=('Speed Limit MPH', 'first'),
    Speed_2022_MPH=('Speed 2022 MPH', 'first'),
    Speed_Change=('1 year Speed % change', 'first'),
    AADT_Change=('1 year AADT % change', 'first')
).dropna()

# Step 8: Regression
X = aggregated[['Lanes', 'Speed_Limit_MPH', 'Speed_2022_MPH', 'Speed_Change', 'AADT_Change']]
y = aggregated['AVG_SEVERITY']

X_ols = sm.add_constant(X)
ols_model = sm.OLS(y, X_ols).fit()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
rf_r2 = r2_score(y_test, y_pred)
rf_importance = pd.Series(rf_model.feature_importances_, index=X.columns).sort_values(ascending=False)

# Output
ols_summary = ols_model.summary()
rf_r2, rf_importance, ols_summary.tables[1]

(-0.10429588222092101,
 AADT_Change        0.309248
 Speed_2022_MPH     0.270044
 Speed_Change       0.259344
 Lanes              0.089369
 Speed_Limit_MPH    0.071995
 dtype: float64,
 <class 'statsmodels.iolib.table.SimpleTable'>)

## Ped_party & Ped_Victim

In [124]:
ped_parties = pd.read_csv('/Users/annaywj/Downloads/Ped_Parties.csv')
ped_victims = pd.read_csv('/Users/annaywj/Downloads/Ped_Victims.csv')

In [125]:
socio_party_merged = ped_parties.merge(
    crash[['CASE_ID', 'COLLISION_SEVERITY']],
    on='CASE_ID',
    how='left'
)

# Map AT_FAULT to binary
socio_party_merged['AT_FAULT_BINARY'] = socio_party_merged['AT_FAULT'].map({'Y': 1, 'N': 0})

# Step 2: Merge in victim data (victim-level table with PARTY + CRASH info)
socio_party_victim_merged = socio_party_merged.merge(
    ped_victims[['CASE_ID', 'PARTY_NUMBER', 'VICTIM_AGE', 'VICTIM_DEGREE_OF_INJURY']],
    on=['CASE_ID', 'PARTY_NUMBER'],
    how='left'
)

# Step 3: One-hot encode victim injury severity
socio_party_victim_merged = pd.get_dummies(
    socio_party_victim_merged,
    columns=['VICTIM_DEGREE_OF_INJURY'],
    prefix='INJURY',
    drop_first=True
)

# Step 4: Prepare feature set
model_features_victims = ['PARTY_AGE', 'VICTIM_AGE', 'AT_FAULT_BINARY'] + \
    [col for col in socio_party_victim_merged.columns if col.startswith('INJURY_')]

model_data_victims = socio_party_victim_merged[model_features_victims + ['COLLISION_SEVERITY']].dropna()

X = model_data_victims[model_features_victims]
y = model_data_victims['COLLISION_SEVERITY']

# Step 5: Run OLS
X_ols = sm.add_constant(X)
ols_model = sm.OLS(y, X_ols).fit()

# Step 6: Random Forest Regression
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
rf_r2 = r2_score(y_test, y_pred)
rf_importance = pd.Series(rf_model.feature_importances_, index=X.columns).sort_values(ascending=False)

# Output model results
ols_summary = ols_model.summary()
rf_r2, rf_importance, ols_summary.tables[1]

(0.6957053396275799,
 INJURY_4.0         0.436046
 INJURY_1.0         0.228706
 INJURY_2.0         0.094561
 PARTY_AGE          0.090444
 VICTIM_AGE         0.089447
 INJURY_5.0         0.030886
 INJURY_7.0         0.014858
 AT_FAULT_BINARY    0.006772
 INJURY_3.0         0.004173
 INJURY_6.0         0.004107
 dtype: float64,
 <class 'statsmodels.iolib.table.SimpleTable'>)

## Final Model

In [126]:
weather_vars = ['cloudcover', 'humidity', 'windspeed']
weather_final = merged_weather[['datetime', 'AVG_SEVERITY'] + weather_vars].dropna()

# From road: aggregated per road segment with osm_id
road_vars = ['AADT_Change', 'Speed_2022_MPH', 'Speed_Change']
road_final = aggregated[road_vars + ['AVG_SEVERITY']].dropna()

# From party/victim: victim-level merged with CRASH info
party_victim_vars = ['PARTY_AGE', 'VICTIM_AGE', 'INJURY_1.0', 'INJURY_2.0', 'INJURY_4.0']
victim_final = socio_party_victim_merged[party_victim_vars + ['COLLISION_SEVERITY']].dropna()

# Rename severity columns to align
weather_final = weather_final.rename(columns={'AVG_SEVERITY': 'severity'})
road_final = road_final.rename(columns={'AVG_SEVERITY': 'severity'})
victim_final = victim_final.rename(columns={'COLLISION_SEVERITY': 'severity'})

# Step 2: Create separate models for each (different levels — can't merge directly)
X_weather = weather_final[weather_vars]
X_road = road_final[road_vars]
X_victim = victim_final[party_victim_vars]
y_weather = weather_final['severity']
y_road = road_final['severity']
y_victim = victim_final['severity']

# Step 3: Train Random Forest models on each
def train_rf_model(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    rf = RandomForestRegressor(random_state=42)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    importance = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
    return r2, importance

r2_weather, imp_weather = train_rf_model(X_weather, y_weather)
r2_road, imp_road = train_rf_model(X_road, y_road)
r2_victim, imp_victim = train_rf_model(X_victim, y_victim)

r2_weather, imp_weather, r2_road, imp_road, r2_victim, imp_victim

(-0.11099561591030516,
 cloudcover    0.376267
 humidity      0.338181
 windspeed     0.285551
 dtype: float64,
 -0.1329219627770124,
 AADT_Change       0.382801
 Speed_Change      0.312471
 Speed_2022_MPH    0.304728
 dtype: float64,
 0.648014207470236,
 INJURY_4.0    0.480172
 INJURY_1.0    0.251844
 INJURY_2.0    0.104129
 PARTY_AGE     0.084259
 VICTIM_AGE    0.079596
 dtype: float64)

In [127]:
print(joined_df.columns.tolist())

['CASE_ID', 'ACCIDENT_YEAR', 'PROC_DATE', 'JURIS', 'COLLISION_DATE', 'COLLISION_TIME', 'OFFICER_ID', 'REPORTING_DISTRICT', 'DAY_OF_WEEK', 'CHP_SHIFT', 'POPULATION', 'CNTY_CITY_LOC', 'SPECIAL_COND', 'BEAT_TYPE', 'CHP_BEAT_TYPE', 'CITY_DIVISION_LAPD', 'CHP_BEAT_CLASS', 'BEAT_NUMBER', 'PRIMARY_RD', 'SECONDARY_RD', 'DISTANCE', 'DIRECTION', 'INTERSECTION', 'WEATHER_1', 'WEATHER_2', 'STATE_HWY_IND', 'CALTRANS_COUNTY', 'CALTRANS_DISTRICT', 'STATE_ROUTE', 'ROUTE_SUFFIX', 'POSTMILE_PREFIX', 'POSTMILE', 'LOCATION_TYPE', 'RAMP_INTERSECTION', 'SIDE_OF_HWY', 'TOW_AWAY', 'COLLISION_SEVERITY', 'NUMBER_KILLED', 'NUMBER_INJURED', 'PARTY_COUNT', 'PRIMARY_COLL_FACTOR', 'PCF_CODE_OF_VIOL', 'PCF_VIOL_CATEGORY', 'PCF_VIOLATION', 'PCF_VIOL_SUBSECTION', 'HIT_AND_RUN', 'TYPE_OF_COLLISION', 'MVIW', 'PED_ACTION', 'ROAD_SURFACE', 'ROAD_COND_1', 'ROAD_COND_2', 'LIGHTING', 'CONTROL_DEVICE', 'CHP_ROAD_TYPE', 'PEDESTRIAN_ACCIDENT', 'BICYCLE_ACCIDENT', 'MOTORCYCLE_ACCIDENT', 'TRUCK_ACCIDENT', 'NOT_PRIVATE_PROPERTY', '

In [128]:
joined_df = joined_df.rename(columns={
    'Speed 2022 MPH': 'Speed_2022_MPH',
    '1 year Speed % change': 'Speed_Change',
    '1 year AADT % change': 'AADT_Change'
})

# Extract road features by CASE_ID
road_features = ['AADT_Change', 'Speed_Change', 'Speed_2022_MPH', 'Lanes']
crash_road = joined_df[['CASE_ID'] + road_features]

# --- 2. Extract weather features ---
weather_vars = ['cloudcover', 'humidity', 'windspeed', 'precip']
weather_ready = merged_weather[['datetime'] + weather_vars].dropna()
crash_weather = crash_filtered.merge(weather_ready, left_on='COLLISION_DATE', right_on='datetime', how='left')

# --- 3. Merge crash with road and weather features ---
crash_enriched = crash_weather.merge(crash_road, on='CASE_ID', how='left')

# --- 4. Merge with party & victim data ---
final_data = socio_party_victim_merged.merge(
    crash_enriched[['CASE_ID'] + weather_vars + road_features],
    on='CASE_ID',
    how='left'
)

# --- 5. Select top features ---
selected_features = [
    'PARTY_AGE', 'VICTIM_AGE', 'INJURY_1.0', 'INJURY_2.0', 'INJURY_4.0',
    'cloudcover', 'humidity', 'windspeed', 'Lanes', 'precip',
    'AADT_Change', 'Speed_Change', 'Speed_2022_MPH', 'COLLISION_SEVERITY'
]

final_data_model = final_data[selected_features].dropna()

In [129]:
X_final = final_data_model.drop(columns='COLLISION_SEVERITY')
y_final = final_data_model['COLLISION_SEVERITY']

X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.3, random_state=42)
rf_final = RandomForestRegressor(random_state=42)
rf_final.fit(X_train, y_train)
y_pred = rf_final.predict(X_test)

# --- 7. Evaluate ---
r2_final = r2_score(y_test, y_pred)
feature_importance = pd.Series(rf_final.feature_importances_, index=X_final.columns).sort_values(ascending=False)

# --- 8. Output results ---
print(f"Final Model R²: {r2_final:.4f}")
print("\nFeature Importances:")
print(feature_importance)

Final Model R²: 0.7582

Feature Importances:
INJURY_4.0        0.394914
INJURY_1.0        0.201598
INJURY_2.0        0.093922
cloudcover        0.044772
Speed_2022_MPH    0.042501
humidity          0.041892
windspeed         0.037340
Speed_Change      0.035669
AADT_Change       0.034559
VICTIM_AGE        0.026779
PARTY_AGE         0.024392
Lanes             0.014395
precip            0.007266
dtype: float64


In [133]:
final_data_model = socio_party_victim_merged.merge(
    crash_enriched[['CASE_ID', 'cloudcover', 'humidity', 'windspeed', 'AADT_Change', 'Speed_Change', 'Speed_2022_MPH', 'Lanes', 'precip']],
    on='CASE_ID',
    how='left'
)

# Filter only relevant columns + keep CASE_ID!
final_data_model = final_data_model[[
    'CASE_ID',  # <== add this!
    'PARTY_AGE', 'VICTIM_AGE', 'INJURY_1.0', 'INJURY_2.0', 'INJURY_4.0',
    'cloudcover', 'humidity', 'windspeed', 'Lanes', 'precip',
    'AADT_Change', 'Speed_Change', 'Speed_2022_MPH',
    'COLLISION_SEVERITY'
]].dropna()


In [134]:
features_updated = [col for col in final_data_model.columns if col not in ['COLLISION_SEVERITY', 'severity_log']]
X = final_data_model[features_updated].dropna()
y = final_data_model.loc[X.index, 'COLLISION_SEVERITY']

In [136]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

rf_enriched = RandomForestRegressor(random_state=42)
rf_enriched.fit(X_train, y_train)
y_pred = rf_enriched.predict(X_test)

r2_enriched = r2_score(y_test, y_pred)
importances_enriched = pd.Series(rf_enriched.feature_importances_, index=X.columns).sort_values(ascending=False)

print(f"Enriched RF Model R²: {r2_enriched:.4f}")
print("\nTop Feature Importances:")
importances_enriched

Enriched RF Model R²: 0.7720

Top Feature Importances:


INJURY_4.0        0.394914
INJURY_1.0        0.201598
INJURY_2.0        0.093922
CASE_ID           0.053137
cloudcover        0.036665
Speed_2022_MPH    0.034500
humidity          0.034384
windspeed         0.032328
Speed_Change      0.029764
AADT_Change       0.029034
VICTIM_AGE        0.023205
PARTY_AGE         0.019263
Lanes             0.011645
precip            0.005641
dtype: float64

In [137]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Remove 'CASE_ID' before calculating VIF
X_vif = X.drop(columns=['CASE_ID'])

# Add constant for intercept
X_vif_const = sm.add_constant(X_vif)

# Calculate VIF for each feature
vif_data = pd.DataFrame()
vif_data['Feature'] = X_vif_const.columns
vif_data['VIF'] = [variance_inflation_factor(X_vif_const.values, i) for i in range(X_vif_const.shape[1])]

vif_data.sort_values(by='VIF', ascending=False)


Unnamed: 0,Feature,VIF
0,const,73.936267
6,cloudcover,1.366702
7,humidity,1.359378
2,VICTIM_AGE,1.275914
1,PARTY_AGE,1.271574
10,precip,1.238299
8,windspeed,1.204352
13,Speed_2022_MPH,1.102314
5,INJURY_4.0,1.091316
11,AADT_Change,1.08175


In [140]:
# Re-run just the Random Forest Classifier on the existing final_data_model
# Convert COLLISION_SEVERITY into ordered classes
final_data_model['severity_class'] = pd.cut(
    final_data_model['COLLISION_SEVERITY'],
    bins=[0, 1.5, 2.5, 3.5, 4.5, 5.5],
    labels=[1, 2, 3, 4, 5]
).astype(int)

# Prepare features and target (exclude target columns)
X_rf = final_data_model.drop(columns=['COLLISION_SEVERITY', 'severity_class'], errors='ignore')
y_rf = final_data_model['severity_class']

# Train-test split
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(X_rf, y_rf, test_size=0.3, random_state=42)

# Train Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_rf, y_train_rf)
rf_preds = rf_model.predict(X_test_rf)

# Evaluation
rf_accuracy = accuracy_score(y_test_rf, rf_preds)
rf_report = classification_report(y_test_rf, rf_preds)

from sklearn.metrics import classification_report

# Pretty print the formatted classification report using Random Forest results
print("✅ Random Forest Classification Report (Severity Prediction)\n")
print(f"Accuracy: {rf_accuracy:.4f}\n")
print(classification_report(y_test_rf, rf_preds, digits=2))



✅ Random Forest Classification Report (Severity Prediction)

Accuracy: 0.8574

              precision    recall  f1-score   support

           1       1.00      0.82      0.90        88
           2       0.91      0.67      0.77       194
           3       0.79      0.96      0.86       504
           4       0.93      0.83      0.88       399

    accuracy                           0.86      1185
   macro avg       0.91      0.82      0.85      1185
weighted avg       0.87      0.86      0.86      1185

