# Basic Regression Analysis

Import the cleaned `.csv` file.

In [9]:
import pandas as pd

# Base name (without extension) of the cleaned per-neighborhood summary file.
clean_neighborhood_file = "cleaned_CombinedSummaryByNeighborhoodQuarter2022"

# Resolve the relative CSV path first, then load it into the working frame.
neighborhood_csv_path = "../data/%s.csv" % clean_neighborhood_file
df = pd.read_csv(neighborhood_csv_path)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,county,er_visits,electrical,garbage,rodent,safety,water_plumbing,violations
0,manhattan,0.136293,0.128527,0.255951,0.169087,0.365729,0.149849,0.418877
1,manhattan,0.133178,0.101881,0.204466,0.241701,0.214834,0.07116,0.523401
2,manhattan,0.106698,0.159875,0.245617,0.280083,0.250639,0.065328,0.49376
3,manhattan,0.121495,0.106583,0.246171,0.156639,0.26087,0.175537,0.297192
4,manhattan,0.16433,0.128527,0.243956,0.160788,0.286445,0.23149,0.302652


## Encode County Variable

Here, we treat `county` as a categorical variable and encode it using one-hot encoding.

In [10]:
# Expand `county` into one integer 0/1 indicator column per distinct county value.
one_hot_enc = pd.get_dummies(df['county'], dtype=int)

# Attach the indicators to the original frame and discard the text column,
# producing a new frame instead of mutating one in place.
df_copy = df.join(one_hot_enc).drop(columns='county')

# Summarise the resulting columns, non-null counts and dtypes.
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 168 entries, 0 to 167
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   er_visits       168 non-null    float64
 1   electrical      168 non-null    float64
 2   garbage         168 non-null    float64
 3   rodent          168 non-null    float64
 4   safety          168 non-null    float64
 5   water_plumbing  168 non-null    float64
 6   violations      168 non-null    float64
 7   bronx           168 non-null    int64  
 8   brooklyn        168 non-null    int64  
 9   manhattan       168 non-null    int64  
 10  queens          168 non-null    int64  
 11  staten_island   168 non-null    int64  
dtypes: float64(7), int64(5)
memory usage: 15.9 KB


In [11]:
from helper_functions import split_train_test
import numpy as np

# Build one text label per row by concatenating the five 0/1 county indicators
# in a fixed order (a manhattan row becomes "00100"). The label column is then
# handed to the splitter by name for stratification.
county_flag_cols = ['bronx', 'brooklyn', 'manhattan', 'queens', 'staten_island']
stratify_labels = df_copy[county_flag_cols[0]].astype(str)
for flag_col in county_flag_cols[1:]:
    stratify_labels = stratify_labels + df_copy[flag_col].astype(str)
df_copy['stratify_col'] = stratify_labels

# Split with `er_visits` as the target and a 0.2 test fraction, stratified on
# the label column built above.
X_train, X_test, y_train, y_test = split_train_test(
    df_copy,
    target='er_visits',
    test_split_size=0.2,
    stratify_col_name='stratify_col',
)

# Sanity checks: print the shapes of each split, then the number of non-zero
# entries in feature column 10 for train and test.
# NOTE(review): column 10 is presumably the staten_island indicator -- confirm
# against the column order produced by split_train_test.
for features, targets in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, targets.shape)
for features in (X_train, X_test):
    print(np.count_nonzero(features[:, 10]))
df_copy.head()

(134, 11) (134,)
(34, 11) (34,)
13
3


Unnamed: 0,er_visits,electrical,garbage,rodent,safety,water_plumbing,violations,bronx,brooklyn,manhattan,queens,staten_island
0,0.136293,0.128527,0.255951,0.169087,0.365729,0.149849,0.418877,0,0,1,0,0
1,0.133178,0.101881,0.204466,0.241701,0.214834,0.07116,0.523401,0,0,1,0,0
2,0.106698,0.159875,0.245617,0.280083,0.250639,0.065328,0.49376,0,0,1,0,0
3,0.121495,0.106583,0.246171,0.156639,0.26087,0.175537,0.297192,0,0,1,0,0
4,0.16433,0.128527,0.243956,0.160788,0.286445,0.23149,0.302652,0,0,1,0,0


### LinearRegression

In [12]:
from sklearn.linear_model import LinearRegression
from helper_functions import model_metrics

# Fit a linear regression on the current training split; fit() hands back the
# fitted estimator, so construction and training are chained.
county_lr = LinearRegression().fit(X_train, y_train)

# Report metrics on the held-out split through the project helper.
print("Metrics for Test dataset:")
model_metrics(county_lr, X_test, y_test)

Metrics for Test dataset:
LinearRegression() Mean Absolute Error:  0.0770381669197566
LinearRegression() Mean Squared Error:  0.009613902317669726
LinearRegression() R Squared:  0.6434821730235402
LinearRegression() Adjusted R Squared:  0.4652232595353103


### Ridge

In [13]:
from sklearn.linear_model import Ridge

# Fit a ridge regression (alpha=1) on the current training split; fit() hands
# back the fitted estimator, so construction and training are chained.
county_rd = Ridge(alpha=1).fit(X_train, y_train)

# Report metrics on the held-out split through the project helper.
print("Metrics for Test dataset:")
model_metrics(county_rd, X_test, y_test)

Metrics for Test dataset:
Ridge(alpha=1) Mean Absolute Error:  0.07714754503077333
Ridge(alpha=1) Mean Squared Error:  0.009455373035301
Ridge(alpha=1) R Squared:  0.6493610048854324
Ridge(alpha=1) Adjusted R Squared:  0.47404150732814854


### RandomForest

In [14]:
from sklearn.ensemble import RandomForestRegressor

# Hyperparameters for the forest: 100 trees capped at depth 5, a fixed seed for
# repeatable fits, and out-of-bag scoring switched on.
county_rf_params = dict(n_estimators=100, max_depth=5, random_state=42, oob_score=True)

# Construct and train in one step; fit() hands back the fitted estimator.
county_rf = RandomForestRegressor(**county_rf_params).fit(X_train, y_train)

# Report metrics on the held-out split through the project helper.
print("Metrics for Test dataset:")
model_metrics(county_rf, X_test, y_test)

Metrics for Test dataset:
RandomForestRegressor(max_depth=5, oob_score=True, random_state=42) Mean Absolute Error:  0.0721027779743047
RandomForestRegressor(max_depth=5, oob_score=True, random_state=42) Mean Squared Error:  0.0095825947613646
RandomForestRegressor(max_depth=5, oob_score=True, random_state=42) R Squared:  0.6446431690034278
RandomForestRegressor(max_depth=5, oob_score=True, random_state=42) Adjusted R Squared:  0.46696475350514166
RandomForestRegressor(max_depth=5, oob_score=True, random_state=42) Out-Of-Bag Score:  0.7766385778432116


## No County Variable

Here, we do not include the `county` variable when training the models.

In [15]:
# Discard the `county` text column so it is not used as a feature.
# This rebinds `df_copy`; any frame previously held under that name is replaced.
df_copy = df.drop(columns='county')

# Split with `er_visits` as the target and a 0.2 test fraction, with no
# stratification column. This rebinds X_train/X_test/y_train/y_test, so models
# fitted in later cells use this split.
# NOTE(review): because no stratification is requested here, the held-out rows
# may differ from a stratified split -- confirm before comparing metrics across
# experiments.
X_train, X_test, y_train, y_test = split_train_test(
    df_copy,
    target='er_visits',
    test_split_size=0.2,
    stratify_col_name=None,
)

# Sanity check: print the shapes of each split.
for features, targets in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, targets.shape)
df_copy.head()

(134, 6) (134,)
(34, 6) (34,)


Unnamed: 0,er_visits,electrical,garbage,rodent,safety,water_plumbing,violations
0,0.136293,0.128527,0.255951,0.169087,0.365729,0.149849,0.418877
1,0.133178,0.101881,0.204466,0.241701,0.214834,0.07116,0.523401
2,0.106698,0.159875,0.245617,0.280083,0.250639,0.065328,0.49376
3,0.121495,0.106583,0.246171,0.156639,0.26087,0.175537,0.297192
4,0.16433,0.128527,0.243956,0.160788,0.286445,0.23149,0.302652


### LinearRegression

In [16]:
# Fit a linear regression on the current training split; fit() hands back the
# fitted estimator, so construction and training are chained.
no_county_lr = LinearRegression().fit(X_train, y_train)

# Report metrics on the held-out split through the project helper.
print("Metrics for Test dataset:")
model_metrics(no_county_lr, X_test, y_test)

Metrics for Test dataset:
LinearRegression() Mean Absolute Error:  0.07809402896923792
LinearRegression() Mean Squared Error:  0.015396093645162279
LinearRegression() R Squared:  0.7093662694087213
LinearRegression() Adjusted R Squared:  0.6447809959439927


### Ridge

In [17]:
# Fit a ridge regression (alpha=1) on the current training split; fit() hands
# back the fitted estimator, so construction and training are chained.
no_county_rd = Ridge(alpha=1).fit(X_train, y_train)

# Report metrics on the held-out split through the project helper.
print("Metrics for Test dataset:")
model_metrics(no_county_rd, X_test, y_test)

Metrics for Test dataset:
Ridge(alpha=1) Mean Absolute Error:  0.08327229321732027
Ridge(alpha=1) Mean Squared Error:  0.015955947164559198
Ridge(alpha=1) R Squared:  0.6987978537653085
Ridge(alpha=1) Adjusted R Squared:  0.6318640434909326


### RandomForest

In [18]:
# Hyperparameters for the forest: 100 trees capped at depth 5, a fixed seed for
# repeatable fits, and out-of-bag scoring switched on.
no_county_rf_params = dict(n_estimators=100, max_depth=5, random_state=42, oob_score=True)

# Construct and train in one step; fit() hands back the fitted estimator.
no_county_rf = RandomForestRegressor(**no_county_rf_params).fit(X_train, y_train)

# Report metrics on the held-out split through the project helper.
print("Metrics for Test dataset:")
model_metrics(no_county_rf, X_test, y_test)

Metrics for Test dataset:
RandomForestRegressor(max_depth=5, oob_score=True, random_state=42) Mean Absolute Error:  0.07365905915580828
RandomForestRegressor(max_depth=5, oob_score=True, random_state=42) Mean Squared Error:  0.012168291359175746
RandomForestRegressor(max_depth=5, oob_score=True, random_state=42) R Squared:  0.770297843456538
RandomForestRegressor(max_depth=5, oob_score=True, random_state=42) Adjusted R Squared:  0.7192529197802131
RandomForestRegressor(max_depth=5, oob_score=True, random_state=42) Out-Of-Bag Score:  0.7799892686083519
