<a href="https://colab.research.google.com/github/tomheston/Cost-of-Living-Index-as-a-Primary-Driver-of-Homelessness/blob/main/randomforest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
#################################################################
# The Cost of Living Index as a Primary Driver of Homelessness: #
# A Cross-State Regression Analysis                             #
# Thomas F. Heston, MD                                          #
# Data is publicly available on the Zenodo repository           #
#################################################################

# Import libraries
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Load the dataset.
# Note: on Google Colab the spreadsheet must be uploaded manually first;
# the file can be obtained from the Zenodo repository.
df = pd.read_excel('20230919-DATA-ONLY.xlsx')

# Feature selection.
# Retained in the model (i.e. NOT dropped); the number is the variable's
# index as used in the original comments:
#   UNEMPLOYMENT (1), COLI (2), POVERTY (8), TAXES (15), ETOHBINGE (17)
#
# The 5 baskets of the cost of living index are dropped and COLI is kept.
# Adding back in any or all of the variables below degraded the model.
df = df.drop(
    columns=[
        'GROCERY',          # 3
        'HOUSING',          # 4
        'UTILITIES',        # 5
        'TRANSPORTATION',   # 6
        'HEALTH',           # 7
        'GDP',              # 9
        'DRUGODMORTALITY',  # 10
        'INCOME',           # 11
        'PRISONERS',        # 12
        'GAS',              # 13
        'GINI',             # 14
        'HOUSINGBURDEN',    # 16
        'OPIOIDRX',         # 18
        'SMOKERS',          # 19
        'HSGRAD',           # 20
        'CIGEXCISE',        # 21
        'ALCOHOL',          # 22
        'PARTY',            # 23
        'POP',              # 24
        'SANCTUARY',        # 25
    ]
)

# HOMELESSNESS is the target; every other remaining column is a feature
y = df['HOMELESSNESS']
X = df.drop(columns=['HOMELESSNESS'])

# Hold out 30% of the rows as a test set (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Build a 100-tree random forest (seeded) and train it on the training split;
# fit() returns the estimator itself, so construction and fitting are chained
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Evaluate on the held-out test split: raw predictions plus the model's score
# (reported further down as R-squared)
test_pred = rf_model.predict(X_test)
test_r2 = rf_model.score(X_test, y_test)

# Extra evaluations: rank the features by the fitted forest's importances
feature_names = X.columns
importances = rf_model.feature_importances_

# Pair each feature name with its importance and order the pairs from the
# highest importance to the lowest
sorted_features = sorted(
    zip(feature_names, importances),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print one "NAME: importance" line per feature, most important first
for name, importance in sorted_features:
    print(f"{name}: {importance}")

# Compute the mean squared error on the held-out test set.
# mean_squared_error is imported with the other libraries at the top of the
# cell rather than mid-script. The predictions already produced during
# evaluation (test_pred) are reused instead of running the model on X_test a
# second time; y_pred is kept as a name for those same predictions.
y_pred = test_pred
mse = mean_squared_error(y_test, y_pred)

# Print metrics
print("MSE:", mse)
print('R-squared:', test_r2)



COLI: 0.6416878748492407
ETOHBINGE: 0.12226980912877938
UNEMPLOYMENT: 0.09106716374670316
TAXES: 0.08510614426317752
POVERTY: 0.059869008012099226
MSE: 102.092237154375
R-squared: 0.6549316049674789
