In [30]:
import pandas as pd
import numpy as np 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import r2_score, root_mean_squared_error

In [19]:
# Load the processed per-neighborhood table and index it by neighborhood,
# which keeps the name column out of the numeric feature matrix.
df = pd.read_csv('../../data/processed/neighborhood_crime_counts.csv').set_index("neighborhood")

# Predictor columns, in the order they appear in the design matrix.
feature_cols = [
    'program_count',
    'avgsocioeconscore',
    'avgAccessToCareScore',
    'avgMobidityScore',
    'mortalityscore',
    'avgBuiltEnvScore',
    'avgOverallEquityScore',
    'Total Population All',
    'Percent White',
    'Median Household Income',
    'Per Capita Income',
    'program_count_per1000',
]

# Response variable; it is left on its original scale.
target_col = 'crime_per1000'
y = df[target_col].values

# Standardize every predictor with StandardScaler. The scaler is fit on the
# full table, so the scaling statistics come from all rows.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[feature_cols])

# Wrap the scaled array back into a DataFrame so that the neighborhood index
# and the column names carry through to the models below.
X = pd.DataFrame(scaled_features, columns=feature_cols, index=df.index)

In [28]:
# Display the scaled feature matrix (rows are indexed by neighborhood).
X

Unnamed: 0_level_0,program_count,avgsocioeconscore,avgAccessToCareScore,avgMobidityScore,mortalityscore,avgBuiltEnvScore,avgOverallEquityScore,Total Population All,Percent White,Median Household Income,Per Capita Income,program_count_per1000
neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Athmar Park,0.550280,-1.557425,-0.828804,-0.033408,-0.018134,-0.509720,-0.880229,-0.030248,-1.126164,-0.636021,-0.900917,0.749643
Auraria,-0.950504,1.517997,0.383322,-0.033408,1.396312,1.429703,1.424539,-1.204668,0.421023,1.524777,-1.548893,-0.703167
Baker,0.060291,-0.019714,0.787364,-0.684857,-0.725357,0.944847,0.090200,-0.378748,0.490537,0.192797,0.336427,0.616396
Barnum,0.356133,-1.557425,-0.828804,-0.033408,-0.725357,-0.994576,-1.244139,-0.455551,-1.570261,-0.962173,-1.281695,1.412845
Barnum West,-0.272532,-1.172997,-1.232846,-0.684857,-0.018134,-1.964287,-1.365443,-0.545636,-1.459651,-0.894328,-0.942041,0.280983
...,...,...,...,...,...,...,...,...,...,...,...,...
West Colfax,0.547198,-1.172997,-0.828804,-1.987755,-1.432580,-0.024864,-1.486747,0.191209,-0.091291,-0.123215,0.236879,0.429978
West Highland,-0.180081,0.749141,1.191406,0.618042,0.689089,-0.024864,0.939325,0.093185,1.121222,0.427337,0.708688,-0.363661
Westwood,1.126556,-1.557425,-1.232846,-0.033408,-0.725357,-0.509720,-1.244139,1.257354,-1.967887,-1.403705,-1.400905,0.074403
Whittier,-0.784093,-0.019714,-0.424762,1.269491,-0.725357,-0.024864,-0.152407,-0.616231,-0.064206,0.720695,0.156881,-0.860458


In [29]:
# Display the unscaled target values (the crime_per1000 column as an array).
y

array([ 387.37008586, 3926.08089261, 1007.92171482,  459.19403996,
        304.61946232,  292.80821918,  300.06161429,  327.3822563 ,
       2629.88555388,  835.91811414,  605.86525936,  448.91552511,
        484.3540456 ,  633.94523958,  548.59370649,  854.2204202 ,
       2924.25320057,  545.61838252,  502.44553135,  461.64133739,
        324.07164701,  542.58600237,  255.08945096, 7270.09507347,
        800.46970409,  706.34684148, 1097.42347065,  185.4721006 ,
        223.31027871,  993.50811486,  520.85796318,  382.8125    ,
        391.4915428 ,  348.96518722,  255.57857826,  297.57514806,
        520.44807966,  236.43934279,  172.75      ,  679.31488801,
        524.89125181, 1229.6679866 ,  438.02019754,  281.86311218,
        186.28260152,  286.5186112 ,  432.37672427, 1016.44962303,
        174.13882638,  771.89477189,  893.08176101,  351.27778618,
        324.05063291,  337.63361592,  342.34868289,  308.68167203,
        339.90516596,  256.41025641,  340.33057851,  500.91804

## Basic Linear Regression without Regularization

In [27]:
# Ordinary least squares on the scaled features, with no penalty term.
lr_model = LinearRegression().fit(X, y)

# Coefficient table ordered by absolute magnitude, largest first.
# NOTE(review): the printed coefficients for the six score features are on the
# order of 1e8-1e9 while the remaining ones are in the hundreds. That pattern
# usually points to (near-)perfect collinearity among the score columns;
# confirm whether avgOverallEquityScore is derived from the other scores.
lr_coef_df = (
    pd.DataFrame({"Feature": X.columns, "Coefficient": lr_model.coef_})
    .sort_values(by="Coefficient", key=abs, ascending=False)
)
print(lr_coef_df)

# In-sample fit quality: the model is scored on the same X it was fit to.
lr_pred = lr_model.predict(X)
lr_r2 = r2_score(y, lr_pred)
lr_rmse = root_mean_squared_error(y, lr_pred)

                    Feature   Coefficient
6     avgOverallEquityScore -1.035262e+09
4            mortalityscore  3.551386e+08
1         avgsocioeconscore  3.266705e+08
2      avgAccessToCareScore  3.108111e+08
5          avgBuiltEnvScore  2.590072e+08
3          avgMobidityScore  1.927717e+08
0             program_count -4.032010e+02
11    program_count_per1000  3.011419e+02
10        Per Capita Income -2.663602e+02
9   Median Household Income  1.359492e+02
8             Percent White -8.423316e+01
7      Total Population All  3.257139e+01


In [24]:
# R^2 of the unregularized model, scored on the data it was fit to.
print(lr_r2)

0.29099332767040464


In [25]:
# RMSE of the unregularized model, scored on the data it was fit to.
print(lr_rmse)

810.6715212738515


## Linear Regression with L1 Regularization

In [31]:
# L1-penalized linear regression (Lasso) on the scaled features, alpha = 0.1.
lasso = Lasso(alpha=0.1).fit(X, y)

# Coefficient table ordered by absolute magnitude, largest first.
lasso_coef_df = (
    pd.DataFrame({"Feature": X.columns, "Coefficient": lasso.coef_})
    .sort_values(by="Coefficient", key=abs, ascending=False)
)
print(lasso_coef_df)

# In-sample fit quality: the model is scored on the same X it was fit to.
lasso_pred = lasso.predict(X)
lasso_r2 = r2_score(y, lasso_pred)
lasso_rmse = root_mean_squared_error(y, lasso_pred)

                    Feature  Coefficient
2      avgAccessToCareScore  -554.862882
1         avgsocioeconscore   473.850401
0             program_count  -457.988719
11    program_count_per1000   306.695860
10        Per Capita Income  -304.368058
4            mortalityscore   257.466800
9   Median Household Income   180.997578
5          avgBuiltEnvScore   176.408100
8             Percent White  -125.827790
7      Total Population All    66.580590
3          avgMobidityScore   -36.655212
6     avgOverallEquityScore     0.000000


In [32]:
# R^2 of the Lasso model, scored on the data it was fit to.
print(lasso_r2)

0.28106571565899396


In [33]:
# RMSE of the Lasso model, scored on the data it was fit to.
print(lasso_rmse)

816.3273604570854


## Linear Regression with L2 Regularization

In [34]:
# L2-penalized linear regression (Ridge) on the scaled features, alpha = 0.1.
ridge = Ridge(alpha=0.1).fit(X, y)

# Coefficient table ordered by absolute magnitude, largest first.
ridge_coef_df = (
    pd.DataFrame({"Feature": X.columns, "Coefficient": ridge.coef_})
    .sort_values(by="Coefficient", key=abs, ascending=False)
)
print(ridge_coef_df)

# In-sample fit quality: the model is scored on the same X it was fit to.
ridge_pred = ridge.predict(X)
ridge_r2 = r2_score(y, ridge_pred)
ridge_rmse = root_mean_squared_error(y, ridge_pred)

                    Feature  Coefficient
2      avgAccessToCareScore  -572.462730
0             program_count  -450.376056
1         avgsocioeconscore   445.265157
10        Per Capita Income  -302.965748
11    program_count_per1000   301.919922
4            mortalityscore   230.612885
9   Median Household Income   178.163940
5          avgBuiltEnvScore   156.621642
8             Percent White  -126.647240
6     avgOverallEquityScore    77.606224
7      Total Population All    60.717421
3          avgMobidityScore   -50.018858


In [35]:
# R^2 of the Ridge model, scored on the data it was fit to.
print(ridge_r2)

0.281046555110147


In [36]:
# RMSE of the Ridge model, scored on the data it was fit to.
print(ridge_rmse)

816.3382384861293


## Linear Regression with Elastic Net Regularization

In [37]:
# Elastic Net (combined L1/L2 penalty) on the scaled features,
# with alpha = 0.1 and l1_ratio = 0.1.
elastic = ElasticNet(alpha=0.1, l1_ratio=0.1).fit(X, y)

# Coefficient table ordered by absolute magnitude, largest first.
elastic_coef_df = (
    pd.DataFrame({"Feature": X.columns, "Coefficient": elastic.coef_})
    .sort_values(by="Coefficient", key=abs, ascending=False)
)
print(elastic_coef_df)

# In-sample fit quality: the model is scored on the same X it was fit to.
elastic_pred = elastic.predict(X)
elastic_r2 = r2_score(y, elastic_pred)
elastic_rmse = root_mean_squared_error(y, elastic_pred)

                    Feature  Coefficient
2      avgAccessToCareScore  -336.900360
1         avgsocioeconscore   246.079291
10        Per Capita Income  -229.012409
0             program_count  -208.570197
4            mortalityscore   207.994776
11    program_count_per1000   147.060476
8             Percent White  -131.017465
5          avgBuiltEnvScore   128.445008
7      Total Population All  -108.679594
6     avgOverallEquityScore    79.805320
9   Median Household Income    64.506922
3          avgMobidityScore    -0.641860


In [38]:
# R^2 of the Elastic Net model, scored on the data it was fit to.
print(elastic_r2)

0.2595425033948594


In [39]:
# RMSE of the Elastic Net model, scored on the data it was fit to.
print(elastic_rmse)

828.4567154551079
