In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
df = pd.read_csv('US_Accidents_March23.csv')

In [3]:
df = df[['Severity', 'Start_Time', 'State', 'Temperature(F)', 'Wind_Chill(F)',
       'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Direction',
       'Wind_Speed(mph)', 'Precipitation(in)', 'Amenity',
       'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway',
       'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal',
       'Turning_Loop']]

road_features = ['Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit', 'Railway',
                 'Roundabout', 'Station', 'Stop', 'Traffic_Calming', 'Traffic_Signal',
                 'Turning_Loop']

def get_road_type(row):
    return ', '.join([feature for feature in road_features if row[feature]])

df['Type_of_Road'] = df[road_features].apply(get_road_type, axis=1).astype('category')
df = df[df['Type_of_Road'].notna() & (df['Type_of_Road'].str.strip() != '')]

In [18]:
df['Hour'] = pd.to_datetime(df['Start_Time'], errors='coerce').dt.hour
bool_cols = df.select_dtypes(include='bool').columns
df[bool_cols] = df[bool_cols].astype(int)
df_model = df.drop(columns=['Start_Time'])
df_model = df_model.drop(columns=road_features)
df_model = df_model.dropna()
df_model.head()

Unnamed: 0,Severity,State,Temperature(F),Wind_Chill(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,Wind_Speed(mph),Precipitation(in),Amenity,Type_of_Road,Hour
14,2,OH,37.4,33.8,100.0,29.62,3.0,SSW,4.6,0.02,0,Traffic_Signal,8.0
22,2,OH,35.1,28.6,89.0,29.65,6.0,WSW,8.1,0.02,0,Traffic_Signal,11.0
26,2,OH,36.0,29.0,86.0,29.63,7.0,West,9.2,0.0,0,Traffic_Signal,12.0
27,2,OH,39.9,32.9,70.0,29.61,10.0,WNW,11.5,0.0,0,Traffic_Signal,14.0
30,2,OH,37.0,31.6,89.0,29.61,10.0,West,6.9,0.0,0,Traffic_Signal,16.0


In [None]:
len(df)

array([[2, Timestamp('2016-02-08 06:49:27'), 'OH', ..., False,
        'Traffic_Signal', 6.0],
       [2, Timestamp('2016-02-08 07:39:07'), 'OH', ..., False,
        'Traffic_Signal', 7.0],
       [3, Timestamp('2016-02-08 08:14:42'), 'OH', ..., False,
        'Crossing, Junction', 8.0],
       ...,
       [2, Timestamp('2019-08-23 17:10:58'), 'CA', ..., False,
        'Junction', 17.0],
       [2, Timestamp('2019-08-23 17:40:12'), 'CA', ..., False,
        'Junction', 17.0],
       [2, Timestamp('2019-08-23 19:00:21'), 'CA', ..., False,
        'Junction', 19.0]], dtype=object)

## Linear Regression

In [24]:
X = df_model.drop(columns = ['Severity'])
Y = df_model['Severity']

categorical_cols = ['State', 'Wind_Direction', 'Type_of_Road', 'Amenity']
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
X = X.astype(float)
X = sm.add_constant(X)

model = sm.OLS(Y, X).fit()
model.summary()

0,1,2,3
Dep. Variable:,Severity,R-squared:,0.069
Model:,OLS,Adj. R-squared:,0.068
Method:,Least Squares,F-statistic:,327.2
Date:,"Fri, 09 May 2025",Prob (F-statistic):,0.0
Time:,12:40:12,Log-Likelihood:,-673720.0
No. Observations:,1274811,AIC:,1348000.0
Df Residuals:,1274523,BIC:,1351000.0
Df Model:,287,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.9128,0.026,74.436,0.000,1.862,1.963
Temperature(F),-0.0012,0.000,-5.761,0.000,-0.002,-0.001
Wind_Chill(F),0.0010,0.000,5.188,0.000,0.001,0.001
Humidity(%),2.865e-05,2.16e-05,1.329,0.184,-1.36e-05,7.09e-05
Pressure(in),0.0049,0.001,6.719,0.000,0.003,0.006
Visibility(mi),-0.0007,0.000,-4.265,0.000,-0.001,-0.000
Wind_Speed(mph),0.0014,9.51e-05,15.080,0.000,0.001,0.002
Precipitation(in),0.0279,0.005,6.045,0.000,0.019,0.037
Hour,0.0029,7.09e-05,40.483,0.000,0.003,0.003

0,1,2,3
Omnibus:,585679.852,Durbin-Watson:,1.539
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4379467.554
Skew:,2.072,Prob(JB):,0.0
Kurtosis:,11.079,Cond. No.,7.66e+19


In [37]:
#without some perhaps collinear factors 
X = df_model.drop(columns = ['Severity', 'Wind_Chill(F)'])
Y = df_model['Severity']

categorical_cols = ['State', 'Wind_Direction', 'Type_of_Road', 'Amenity']
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
X = X.astype(float)
X = sm.add_constant(X)

model = sm.OLS(Y, X).fit()

In [27]:
df_model.head()

Unnamed: 0,Severity,State,Temperature(F),Wind_Chill(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Direction,Wind_Speed(mph),Precipitation(in),Amenity,Type_of_Road,Hour
14,2,OH,37.4,33.8,100.0,29.62,3.0,SSW,4.6,0.02,0,Traffic_Signal,8.0
22,2,OH,35.1,28.6,89.0,29.65,6.0,WSW,8.1,0.02,0,Traffic_Signal,11.0
26,2,OH,36.0,29.0,86.0,29.63,7.0,West,9.2,0.0,0,Traffic_Signal,12.0
27,2,OH,39.9,32.9,70.0,29.61,10.0,WNW,11.5,0.0,0,Traffic_Signal,14.0
30,2,OH,37.0,31.6,89.0,29.61,10.0,West,6.9,0.0,0,Traffic_Signal,16.0


In [34]:
df_model_test = df_model.drop(columns = ['State', 'Wind_Direction', 'Amenity', 'Type_of_Road'])

X = df_model_test.drop(columns = ['Severity'])
Y = df_model_test['Severity']

X = X.astype(float)
X = sm.add_constant(X)

model = sm.OLS(Y, X).fit()
model.summary()

0,1,2,3
Dep. Variable:,Severity,R-squared:,0.007
Model:,OLS,Adj. R-squared:,0.007
Method:,Least Squares,F-statistic:,1116.0
Date:,"Fri, 09 May 2025",Prob (F-statistic):,0.0
Time:,13:10:05,Log-Likelihood:,-714590.0
No. Observations:,1274811,AIC:,1429000.0
Df Residuals:,1274802,BIC:,1429000.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.0706,0.011,181.782,0.000,2.048,2.093
Temperature(F),0.0009,0.000,4.419,0.000,0.001,0.001
Wind_Chill(F),-0.0022,0.000,-11.521,0.000,-0.003,-0.002
Humidity(%),0.0004,2.01e-05,18.781,0.000,0.000,0.000
Pressure(in),0.0013,0.000,3.265,0.001,0.001,0.002
Visibility(mi),-0.0012,0.000,-7.112,0.000,-0.002,-0.001
Wind_Speed(mph),0.0021,7.9e-05,26.217,0.000,0.002,0.002
Precipitation(in),0.0344,0.005,7.246,0.000,0.025,0.044
Hour,0.0030,7.25e-05,41.385,0.000,0.003,0.003

0,1,2,3
Omnibus:,620321.131,Durbin-Watson:,1.498
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4622091.094
Skew:,2.23,Prob(JB):,0.0
Kurtosis:,11.193,Cond. No.,3540.0


In [35]:
#without wind direction
X = df_model.drop(columns = ['Severity', 'Wind_Direction'])
Y = df_model['Severity']

categorical_cols = ['State', 'Type_of_Road', 'Amenity']
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
X = X.astype(float)
X = sm.add_constant(X)

model = sm.OLS(Y, X).fit()
model.summary()

0,1,2,3
Dep. Variable:,Severity,R-squared:,0.068
Model:,OLS,Adj. R-squared:,0.068
Method:,Least Squares,F-statistic:,351.4
Date:,"Fri, 09 May 2025",Prob (F-statistic):,0.0
Time:,13:14:43,Log-Likelihood:,-674100.0
No. Observations:,1274811,AIC:,1349000.0
Df Residuals:,1274545,BIC:,1352000.0
Df Model:,265,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.8490,0.026,72.352,0.000,1.799,1.899
Temperature(F),-0.0009,0.000,-4.392,0.000,-0.001,-0.001
Wind_Chill(F),0.0007,0.000,3.649,0.000,0.000,0.001
Humidity(%),7.339e-05,2.12e-05,3.462,0.001,3.18e-05,0.000
Pressure(in),0.0069,0.001,9.561,0.000,0.005,0.008
Visibility(mi),-0.0011,0.000,-6.681,0.000,-0.001,-0.001
Wind_Speed(mph),0.0011,7.8e-05,13.472,0.000,0.001,0.001
Precipitation(in),0.0296,0.005,6.432,0.000,0.021,0.039
Hour,0.0029,7.09e-05,40.742,0.000,0.003,0.003

0,1,2,3
Omnibus:,585474.973,Durbin-Watson:,1.538
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4375331.779
Skew:,2.072,Prob(JB):,0.0
Kurtosis:,11.075,Cond. No.,1.7e+20


In [None]:
results = []

for col in df_model.columns:
    if col == 'Severity':
        continue

    X = df_model[[col]].copy()
    categorical_cols = ['State', 'Wind_Direction', 'Type_of_Road', 'Amenity']
    if col in categorical_cols:
        X = pd.get_dummies(X, columns=[col], drop_first=True)
    X = X.astype(float)
    X = sm.add_constant(X)
    y = df_model['Severity']

    model = sm.OLS(y, X).fit()
    summary = model.summary2().tables[1]
    summary['feature'] = col
    results.append(summary)

results

[             Coef.  Std.Err.           t          P>|t|    [0.025    0.975]  \
const     2.066901  0.003064  674.641447   0.000000e+00  2.060896  2.072906   
State_AR  0.072560  0.008843    8.205178   2.304597e-16  0.055228  0.089893   
State_AZ -0.140800  0.003620  -38.889792   0.000000e+00 -0.147896 -0.133704   
State_CA  0.008567  0.003186    2.688988   7.166999e-03  0.002323  0.014812   
State_CO  0.140680  0.004612   30.503380  2.786083e-204  0.131641  0.149719   
State_CT  0.257402  0.005697   45.183920   0.000000e+00  0.246236  0.268567   
State_DC  0.044670  0.006000    7.444398   9.744822e-14  0.032909  0.056431   
State_DE  0.108585  0.008219   13.211096   7.617268e-40  0.092475  0.124694   
State_FL -0.023697  0.003195   -7.417437   1.194816e-13 -0.029959 -0.017435   
State_GA  0.226118  0.004523   49.989773   0.000000e+00  0.217252  0.234983   
State_IA  0.212485  0.009250   22.970705  9.666909e-117  0.194355  0.230615   
State_ID -0.022569  0.008515   -2.650620   8.034523

## Cross Validation