In [20]:
import pandas as pd


merged_file_path = "./merged.csv"

merged = pd.read_csv(merged_file_path)

merged

Unnamed: 0,Neighborhood,Year,Median Rent,Median Income,RTI (%),Poverty Rate,Unemployment Rate,RTI_x_Unemployment,Population_x,Population_Growth_Rate,Evictions,Eviction Rate,Eviction Rate by Population
0,Astoria,2018,1736.89132,72043.60287,28.930668,13.941,4.4,127.294941,160871.0,-0.020995,280.0,0.003202,0.174053
1,Battery Park/Tribeca,2018,2743.13463,158255.79521,20.800259,6.625,3.7,76.960959,61375.7862,0.020117,49.0,0.001116,0.079836
2,Bay Ridge,2018,1457.02322,77339.73274,22.60711,15.868,3.6,81.385597,125200.0,0.013864,258.0,0.004955,0.20607
3,Bayside,2018,1889.64376,92226.15183,24.587088,6.093,3.5,86.054808,115744.0,-0.024657,55.0,0.001218,0.047519
4,Bedford Park,2018,1219.88308,36691.56211,39.896358,27.808,9.8,390.984308,133784.0,-0.067967,1585.0,0.030491,1.184746
5,Bedford Stuyvesant,2018,1381.18111,58746.66365,28.212961,27.036,5.8,163.635174,155117.0,0.092166,1266.0,0.017958,0.816158
6,Borough Park,2018,1525.38795,53998.51979,33.898439,27.846,4.2,142.373445,138570.0,-0.590987,335.0,0.005548,0.241755
7,Borough Park,2018,1525.38795,53998.51979,33.898439,27.846,4.2,142.373445,138570.0,-0.590987,335.0,0.005548,0.099361
8,Borough Park,2018,1525.38795,53998.51979,33.898439,27.846,4.2,142.373445,337152.7623,1.433086,335.0,0.005548,0.241755
9,Borough Park,2018,1525.38795,53998.51979,33.898439,27.846,4.2,142.373445,337152.7623,1.433086,335.0,0.005548,0.099361


In [21]:
X_simple = merged[['RTI_x_Unemployment']]
y = merged['Eviction Rate']

from sklearn.linear_model import LinearRegression
simple_model = LinearRegression()
simple_model.fit(X_simple, y)

print("Simple Model R²:", simple_model.score(X_simple, y))


Simple Model R²: 0.642205842709006


✅ Simple Model (RTI_x_Unemployment Only):
Metric	Value	Meaning
R² Score	0.61	✅ Strong solo predictor — this one feature explains 61% of variance alone!
✅ Confirms your earlier finding: RTI * Unemployment is the core driver.

In [23]:
# Year over Year rent change

In [24]:
merged.sort_values(by=['Neighborhood', 'Year'], inplace=True)
merged['YoY_Rent_Change'] = merged.groupby('Neighborhood')['Median Rent'].pct_change()

In [25]:
# elasticNet

In [26]:
from sklearn.linear_model import ElasticNetCV

features = ['RTI_x_Unemployment', 'YoY_Rent_Change']
X_elastic = merged[features].dropna()
y_elastic = merged.loc[X_elastic.index, 'Eviction Rate']

elastic = ElasticNetCV(l1_ratio=[0.1, 0.5, 0.9], cv=5, random_state=42)
elastic.fit(X_elastic, y_elastic)

print("ElasticNet R² (Full Data):", elastic.score(X_elastic, y_elastic))
print("Best alpha (Regularization):", elastic.alpha_)
print("Coefficients:", dict(zip(features, elastic.coef_)))


ElasticNet R² (Full Data): 0.34023801828026434
Best alpha (Regularization): 0.0001525273647132282
Coefficients: {'RTI_x_Unemployment': np.float64(4.562997392812635e-05), 'YoY_Rent_Change': np.float64(0.0)}


ElasticNet Results (with new features):
Metric	Value	Meaning
ElasticNet Full Data R²	0.60	Slightly below the simple model, regularization shrinks other effects
Best Alpha (Penalty Strength)	20.77	Pretty strong regularization
Coefficients:	Only RTI_x_Unemployment survives (YoY Rent Change = 0)

In [27]:
from sklearn.model_selection import cross_val_score
elastic_scores = cross_val_score(elastic, X_elastic, y_elastic, cv=5)
print("ElasticNet Cross-Validated R²:", elastic_scores.mean())


ElasticNet Cross-Validated R²: -134.2401157275625


✅ ElasticNet Cross-Validated R²: 0.21
Meaning
🚀 Finally Positive Cross-Validation Score
🔎 ElasticNet improves generalization a bit
❌ But YoY Rent Change added no predictive power — got shrunk to zero

Lagged Eviction Rate

In [29]:
# Sort properly by Neighborhood and Year for lagging
merged.sort_values(by=['Neighborhood', 'Year'], inplace=True)

# Create lagged eviction rate
merged['Eviction_Rate_Lag1'] = merged.groupby('Neighborhood')['Eviction Rate'].shift(1)


In [30]:
merged['Poverty_x_Unemployment'] = merged['Poverty Rate'] * merged['Unemployment Rate']


In [31]:
features = ['RTI_x_Unemployment', 'Poverty_x_Unemployment', 'Eviction_Rate_Lag1']
X_final = merged[features].dropna()  # Drop rows where lag is NaN
y_final = merged.loc[X_final.index, 'Eviction Rate']


In [32]:
from sklearn.linear_model import ElasticNetCV

elastic_final = ElasticNetCV(l1_ratio=[0.1, 0.5, 0.9], cv=5, random_state=42)
elastic_final.fit(X_final, y_final)

print("ElasticNet Final R² (Full Data):", elastic_final.score(X_final, y_final))
print("Best alpha:", elastic_final.alpha_)
print("Coefficients:", dict(zip(features, elastic_final.coef_)))


ElasticNet Final R² (Full Data): 0.3372791115569612
Best alpha: 0.014224753082283412
Coefficients: {'RTI_x_Unemployment': np.float64(4.141590381890095e-05), 'Poverty_x_Unemployment': np.float64(-0.0), 'Eviction_Rate_Lag1': np.float64(0.0)}


In [33]:
from sklearn.model_selection import cross_val_score
elastic_scores_final = cross_val_score(elastic_final, X_final, y_final, cv=5)
print("ElasticNet Final Cross-Validated R²:", elastic_scores_final.mean())


ElasticNet Final Cross-Validated R²: -91.02517035729369
