# Basic Regression Analysis

Import the clenaed `.csv` file

In [1]:
import pandas as pd

clean_neighborhood_file = "cleaned_CombinedSummaryByNeighborhoodQuarter2022"
df = pd.read_csv("../data/%s.csv" %clean_neighborhood_file)
df.head()

Unnamed: 0,county,er_visits,electrical,garbage,rodent,safety,water_plumbing,violations
0,manhattan,0.136293,0.128527,0.255951,0.169087,0.365729,0.149849,0.418877
1,manhattan,0.133178,0.101881,0.204466,0.241701,0.214834,0.07116,0.523401
2,manhattan,0.106698,0.159875,0.245617,0.280083,0.250639,0.065328,0.49376
3,manhattan,0.121495,0.106583,0.246171,0.156639,0.26087,0.175537,0.297192
4,manhattan,0.16433,0.128527,0.243956,0.160788,0.286445,0.23149,0.302652


## No County Variable

Here, we will not include `county` variable for training the models

In [2]:
from helper_functions import split_train_test
import numpy as np

df_copy = df.drop('county', axis=1)

X_train, X_test, y_train, y_test = split_train_test(df_copy, target='er_visits', test_split_size=0.2, stratify_col_name=None)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
df_copy.head()

(134, 6) (134,)
(34, 6) (34,)


Unnamed: 0,er_visits,electrical,garbage,rodent,safety,water_plumbing,violations
0,0.136293,0.128527,0.255951,0.169087,0.365729,0.149849,0.418877
1,0.133178,0.101881,0.204466,0.241701,0.214834,0.07116,0.523401
2,0.106698,0.159875,0.245617,0.280083,0.250639,0.065328,0.49376
3,0.121495,0.106583,0.246171,0.156639,0.26087,0.175537,0.297192
4,0.16433,0.128527,0.243956,0.160788,0.286445,0.23149,0.302652


### LinearRegression & GridSearchCV

In [3]:
best_models = []

In [4]:
from sklearn.linear_model import LinearRegression
from helper_functions import model_metrics

no_county_lr = LinearRegression()
no_county_lr.fit(X_train, y_train)

print("Metrics for Test dataset:")
model_metrics(no_county_lr, X_test, y_test)

Metrics for Test dataset:
LinearRegression() Mean Absolute Error:  0.07809402896923792
LinearRegression() Mean Squared Error:  0.015396093645162279
LinearRegression() R Squared:  0.7093662694087213
LinearRegression() Adjusted R Squared:  0.6447809959439927


In [5]:
from helper_functions import gridsearch_crossval

params = {'fit_intercept':[True,False],
          'positive':[True,False]}
best_no_county_lr = gridsearch_crossval(LinearRegression(), params, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
print(f'Best params for {best_no_county_lr}',best_no_county_lr.get_params())

best_models += [best_no_county_lr]

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END ..................fit_intercept=True, positive=True; total time=   0.0s
[CV] END ..................fit_intercept=True, positive=True; total time=   0.0s
[CV] END ..................fit_intercept=True, positive=True; total time=   0.0s
[CV] END .................fit_intercept=True, positive=False; total time=   0.0s
[CV] END .................fit_intercept=True, positive=False; total time=   0.0s
[CV] END .................fit_intercept=True, positive=False; total time=   0.0s
[CV] END .................fit_intercept=False, positive=True; total time=   0.0s
[CV] END .................fit_intercept=False, positive=True; total time=   0.0s
[CV] END .................fit_intercept=False, positive=True; total time=   0.0s
[CV] END .................fit_intercept=False, positive=True; total time=   0.0s
[CV] END .................fit_intercept=False, positive=True; total time=   0.0s
[CV] END ................fit_intercept=False, pos

### Lasso

In [6]:
from sklearn.linear_model import Lasso

no_county_ls = Lasso(alpha=1)
no_county_ls.fit(X_train, y_train)

print("Metrics for Test dataset:")
model_metrics(no_county_ls, X_test, y_test)

Metrics for Test dataset:
Lasso(alpha=1) Mean Absolute Error:  0.20383137911322993
Lasso(alpha=1) Mean Squared Error:  0.05403500431432126
Lasso(alpha=1) R Squared:  -0.02002464055690001
Lasso(alpha=1) Adjusted R Squared:  -0.24669678290287766


In [7]:
params = {'alpha': [0.25, 0.5, 1, 1.5, 2],
          'fit_intercept': [True, False],
          'max_iter': [1000, 1500],
          'positive': [True, False],
          'random_state': [42],
          'selection': ['cyclic', 'random'],
          'tol': [0.0001, 0.001]}

best_no_county_ls = gridsearch_crossval(Lasso(), params, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
print(f'Best params for {best_no_county_ls}',best_no_county_ls.get_params())

best_models += [best_no_county_ls]

Fitting 5 folds for each of 160 candidates, totalling 800 fits
[CV] END alpha=0.25, fit_intercept=True, max_iter=1000, positive=True, random_state=42, selection=cyclic, tol=0.0001; total time=   0.0s
[CV] END alpha=0.25, fit_intercept=True, max_iter=1000, positive=True, random_state=42, selection=cyclic, tol=0.0001; total time=   0.0s
[CV] END alpha=0.25, fit_intercept=True, max_iter=1000, positive=True, random_state=42, selection=cyclic, tol=0.0001; total time=   0.0s
[CV] END alpha=0.25, fit_intercept=True, max_iter=1000, positive=True, random_state=42, selection=cyclic, tol=0.0001; total time=   0.0s
[CV] END alpha=0.25, fit_intercept=True, max_iter=1000, positive=True, random_state=42, selection=cyclic, tol=0.0001; total time=   0.0s
[CV] END alpha=0.25, fit_intercept=True, max_iter=1000, positive=True, random_state=42, selection=cyclic, tol=0.001; total time=   0.0s
[CV] END alpha=0.25, fit_intercept=True, max_iter=1000, positive=True, random_state=42, selection=cyclic, tol=0.001;

### Ridge

In [8]:
from sklearn.linear_model import Ridge

no_county_rd = Ridge(alpha=1)
no_county_rd.fit(X_train, y_train)

print("Metrics for Test dataset:")
model_metrics(no_county_rd, X_test, y_test)

Metrics for Test dataset:
Ridge(alpha=1) Mean Absolute Error:  0.08327229321732027
Ridge(alpha=1) Mean Squared Error:  0.015955947164559198
Ridge(alpha=1) R Squared:  0.6987978537653085
Ridge(alpha=1) Adjusted R Squared:  0.6318640434909326


In [9]:
params = {'alpha': [0.25, 0.5, 1, 1.5, 2],
          'fit_intercept': [True, False],
          'max_iter': [1000, 1500],
          'positive': [True, False],
          'random_state': [42],
          'solver': ['auto'],
          'tol': [0.0001, 0.001]}

best_no_county_rd = gridsearch_crossval(Ridge(), params, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
print(f'Best params for {best_no_county_rd}',best_no_county_rd.get_params())

best_models += [best_no_county_rd]

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[CV] END alpha=0.25, fit_intercept=True, max_iter=1000, positive=True, random_state=42, solver=auto, tol=0.0001; total time=   0.0s
[CV] END alpha=0.25, fit_intercept=True, max_iter=1000, positive=True, random_state=42, solver=auto, tol=0.0001; total time=   0.0s
[CV] END alpha=0.25, fit_intercept=True, max_iter=1000, positive=True, random_state=42, solver=auto, tol=0.0001; total time=   0.0s
[CV] END alpha=0.25, fit_intercept=True, max_iter=1000, positive=True, random_state=42, solver=auto, tol=0.0001; total time=   0.0s
[CV] END alpha=0.25, fit_intercept=True, max_iter=1000, positive=True, random_state=42, solver=auto, tol=0.001; total time=   0.0s
[CV] END alpha=0.25, fit_intercept=True, max_iter=1000, positive=True, random_state=42, solver=auto, tol=0.0001; total time=   0.0s
[CV] END alpha=0.25, fit_intercept=True, max_iter=1000, positive=True, random_state=42, solver=auto, tol=0.001; total time=   0.0s
[CV] END alpha=0

### ElasticNet

In [10]:
from sklearn.linear_model import ElasticNet

no_county_en = ElasticNet(alpha=1)
no_county_en.fit(X_train, y_train)

print("Metrics for Test dataset:")
model_metrics(no_county_en, X_test, y_test)

Metrics for Test dataset:
ElasticNet(alpha=1) Mean Absolute Error:  0.20383137911322993
ElasticNet(alpha=1) Mean Squared Error:  0.05403500431432126
ElasticNet(alpha=1) R Squared:  -0.02002464055690001
ElasticNet(alpha=1) Adjusted R Squared:  -0.24669678290287766


In [11]:
params = {'alpha': [0.25, 0.5, 1, 1.5, 2],
          'fit_intercept': [True, False],
          'l1_ratio': [0.25, 0.5, 0.75],
          'max_iter': [1000, 1500],
          'positive': [True, False],
          'random_state': [42],
          'selection': ['cyclic', 'random'],
          'tol': [0.0001, 0.001]}

best_no_county_en = gridsearch_crossval(ElasticNet(), params, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
print(f'Best params for {best_no_county_en}',best_no_county_en.get_params())

best_models += [best_no_county_en]

Fitting 5 folds for each of 480 candidates, totalling 2400 fits
[CV] END alpha=0.25, fit_intercept=True, l1_ratio=0.25, max_iter=1000, positive=True, random_state=42, selection=cyclic, tol=0.0001; total time=   0.0s
[CV] END alpha=0.25, fit_intercept=True, l1_ratio=0.25, max_iter=1000, positive=True, random_state=42, selection=cyclic, tol=0.0001; total time=   0.0s
[CV] END alpha=0.25, fit_intercept=True, l1_ratio=0.25, max_iter=1000, positive=True, random_state=42, selection=cyclic, tol=0.0001; total time=   0.0s
[CV] END alpha=0.25, fit_intercept=True, l1_ratio=0.25, max_iter=1000, positive=True, random_state=42, selection=cyclic, tol=0.0001; total time=   0.0s
[CV] END alpha=0.25, fit_intercept=True, l1_ratio=0.25, max_iter=1000, positive=True, random_state=42, selection=cyclic, tol=0.0001; total time=   0.0s
[CV] END alpha=0.25, fit_intercept=True, l1_ratio=0.25, max_iter=1000, positive=True, random_state=42, selection=cyclic, tol=0.001; total time=   0.0s
[CV] END alpha=0.25, fit_

### GridSeaarchCV best Models

In [13]:
print("Best GridSearchCV model's metrics with test dataset:\n")
for model in best_models:
    print(f'{model}:')
    model_metrics(model, X_test, y_test)
    print('')

Best GridSearchCV model's metrics with test dataset:

LinearRegression(fit_intercept=False):
LinearRegression(fit_intercept=False) Mean Absolute Error:  0.07809399086483812
LinearRegression(fit_intercept=False) Mean Squared Error:  0.015396097937236956
LinearRegression(fit_intercept=False) R Squared:  0.7093661883867619
LinearRegression(fit_intercept=False) Adjusted R Squared:  0.6447808969171533

Lasso(alpha=0.25, positive=True, random_state=42):
Lasso(alpha=0.25, positive=True, random_state=42) Mean Absolute Error:  0.20383137911322993
Lasso(alpha=0.25, positive=True, random_state=42) Mean Squared Error:  0.05403500431432126
Lasso(alpha=0.25, positive=True, random_state=42) R Squared:  -0.02002464055690001
Lasso(alpha=0.25, positive=True, random_state=42) Adjusted R Squared:  -0.24669678290287766

Ridge(alpha=0.25, fit_intercept=False, max_iter=1000, random_state=42):
Ridge(alpha=0.25, fit_intercept=False, max_iter=1000, random_state=42) Mean Absolute Error:  0.07885928000594743
Ridg