In [29]:
import h5py
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.linear_model import ElasticNet, LinearRegression, Ridge, ElasticNetCV, LassoCV, Lasso
from sklearn.model_selection import GridSearchCV, KFold, train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler, RobustScaler, scale
from distfit import distfit
from sklearn.decomposition import PCA
from scipy.stats import boxcox 
import seaborn as sns 
from haversine import haversine

In [2]:
zcta_health = pd.read_csv("PLACES__Local_Data_for_Better_Health__ZCTA_Data_2021_release.csv", dtype={'LocationName':'str'})
zcta_health

Unnamed: 0,Year,LocationName,DataSource,Category,Measure,Data_Value_Unit,Data_Value_Type,Data_Value,Data_Value_Footnote_Symbol,Data_Value_Footnote,Low_Confidence_Limit,High_Confidence_Limit,TotalPopulation,Geolocation,LocationID,CategoryID,MeasureId,DataValueTypeID,Short_Question_Text
0,2019,01775,BRFSS,Health Outcomes,Stroke among adults aged >=18 years,%,Crude prevalence,2.2,,,2.0,2.6,6590,POINT (-71.51145308 42.42983302),1775,HLTHOUT,STROKE,CrdPrv,Stroke
1,2019,02163,BRFSS,Health Outcomes,Stroke among adults aged >=18 years,%,Crude prevalence,0.6,,,0.5,0.7,2582,POINT (-71.12091089 42.36632855),2163,HLTHOUT,STROKE,CrdPrv,Stroke
2,2019,02568,BRFSS,Health Outcomes,Stroke among adults aged >=18 years,%,Crude prevalence,3.2,,,2.8,3.7,4000,POINT (-70.638878 41.4524563),2568,HLTHOUT,STROKE,CrdPrv,Stroke
3,2019,04071,BRFSS,Health Outcomes,Stroke among adults aged >=18 years,%,Crude prevalence,2.7,,,2.3,3.0,4801,POINT (-70.44768205 43.93214994),4071,HLTHOUT,STROKE,CrdPrv,Stroke
4,2019,05640,BRFSS,Health Outcomes,Stroke among adults aged >=18 years,%,Crude prevalence,3.6,,,3.0,4.3,161,POINT (-72.49322605 44.34525233),5640,HLTHOUT,STROKE,CrdPrv,Stroke
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
954320,2019,99208,BRFSS,Health Status,Fair or poor self-rated health status among ad...,%,Crude prevalence,17.2,,,16.4,17.9,49193,POINT (-117.4536785 47.73870779),99208,HLTHSTAT,GHLTH,CrdPrv,General Health
954321,2019,99762,BRFSS,Health Outcomes,Arthritis among adults aged >=18 years,%,Crude prevalence,21.0,,,20.1,21.7,4038,POINT (-165.364592 64.657639),99762,HLTHOUT,ARTHRITIS,CrdPrv,Arthritis
954322,2019,99759,BRFSS,Health Outcomes,Chronic obstructive pulmonary disease among ad...,%,Crude prevalence,7.1,,,6.2,8.1,189,POINT (-162.8271995 69.73094168),99759,HLTHOUT,COPD,CrdPrv,COPD
954323,2019,99546,BRFSS,Health Status,Fair or poor self-rated health status among ad...,%,Crude prevalence,21.1,,,19.1,23.4,326,POINT (-176.6311696 51.84245743),99546,HLTHSTAT,GHLTH,CrdPrv,General Health


In [3]:
zcta_health = zcta_health.astype({'Year':'str', 'LocationName':'str'}).set_index('Year').filter(like="2019", axis=0)

In [4]:
zcta_health = zcta_health.pivot(columns='MeasureId', values='Data_Value', index=['LocationName']).reset_index()
zcta_health.rename(columns={'LocationName':'ZCTA5'}, inplace=True)

In [5]:
zip_to_tract = pd.read_csv("zcta_tract_rel_10.csv", dtype=str)
zip_to_tract.rename(columns={'GEOID':'TractFIPS'}, inplace=True)
zip_to_tract = zip_to_tract[['TractFIPS', 'ZCTA5']]

In [35]:
health_cols_raw = ['access2', 'arthritis', 'binge', 'bphigh', 'bpmed', 'cancer', 'casthma', 'cervical', 'chd', 'checkup', 'cholscreen', 'colon_screen', 'copd', 'corem', 'corew', 'csmoking', 'dental', 'depression', 'diabetes', 'ghlth', 'highchol', 'kidney', 'lpa', 'mammouse', 'mhlth', 'obesity', 'phlth', 'sleep', 'stroke', 'teethlost']
health_cols = [x.upper() +'_CrudePrev' for x in health_cols_raw]
desert_measures = ['closest_education_nodes_travel_time','closest_food_nodes_travel_time',
                   'closest_worship_nodes_travel_time', 'physical_dist', 'transport_dist']

In [6]:
d = {}
l = []
cols = []
with h5py.File('nearest_EM.hdf5', 'r') as h5:
    for k in h5.keys():
        d[k] = h5[k][:]
        l.append(h5[k][:])
        cols.append(k)

In [50]:
desert_df = pd.DataFrame(np.asarray(l).T, columns = cols)
desert_df = desert_df.astype({'TractFIPS':'int'}).astype({'TractFIPS':'str'})
desert_df = pd.merge(desert_df, zip_to_tract, on='TractFIPS')

In [51]:
with h5py.File(r'nearest_EM.hdf5', 'r') as f:
    nodes = f['nodes'][:]
    nodes_x = f['nodes_x'][:]
    nodes_y = f['nodes_y'][:]
    physical_proximity_x = f['closest_physical_nodes_x'][:]
    physical_proximity_y = f['closest_physical_nodes_y'][:]
    transport_proximity_x = f['closest_transport_nodes_x'][:]
    transport_proximity_y = f['closest_transport_nodes_y'][:]
smaller_bounds = (nodes_x > -74.4) & (nodes_x < -73.6) & (nodes_y < 41)
nodes = nodes[smaller_bounds]
nodes_x = nodes_x[smaller_bounds]
nodes_y = nodes_y[smaller_bounds]
physical_proximity_x = physical_proximity_x[smaller_bounds]
physical_proximity_y = physical_proximity_y[smaller_bounds]
transport_proximity_x = transport_proximity_x[smaller_bounds]
transport_proximity_y = transport_proximity_y[smaller_bounds]

In [52]:
def haversine_wrapper(lat1, lon1, lat2, lon2):
    return haversine((lat1, lon1), (lat2, lon2))
physical_dist = np.zeros_like(nodes_x)
transport_dist = np.zeros_like(nodes_x)
for i in tqdm(range(len(nodes_x))):
    physical_dist[i] = haversine_wrapper(nodes_y[i], nodes_x[i], physical_proximity_y[i], physical_proximity_x[i])
    transport_dist[i] = haversine_wrapper(nodes_y[i], nodes_x[i], transport_proximity_y[i], transport_proximity_x[i])
desert_df['physical_dist'] = np.nan
desert_df['transport_dist'] = np.nan
num_missed = 0
for i in tqdm(desert_df.index, total=desert_df.shape[0]):
    try:
        idx = np.where(nodes == desert_df.at[i, 'nodes'])[0][0]
        desert_df.at[i, 'physical_dist'] = physical_dist[idx]
        desert_df.at[i, 'transport_dist'] = transport_dist[idx]
    except IndexError:
        num_missed += 1
print('Missed:', num_missed)

100%|████████████████████████████████████| 2117/2117 [00:00<00:00, 85945.19it/s]
100%|████████████████████████████████████| 2810/2810 [00:00<00:00, 38381.73it/s]

Missed: 0





In [53]:
desert_df = desert_df[desert_measures+['ZCTA5']]

Unnamed: 0,closest_education_nodes_travel_time,closest_food_nodes_travel_time,closest_worship_nodes_travel_time,physical_dist,transport_dist,ZCTA5
0,4.0,3.0,4.0,0.158174,0.498002,11220
1,7.0,3.0,7.0,0.377913,0.535944,11219
2,7.0,3.0,7.0,0.377913,0.535944,11220
3,9.0,7.0,9.0,0.797871,0.617204,11230
4,4.0,11.0,4.0,0.679142,0.333808,10469
...,...,...,...,...,...,...
2805,0.0,2.0,0.0,0.286232,0.000000,11229
2806,3.0,4.0,3.0,0.556792,0.324271,10019
2807,5.0,9.0,5.0,0.307807,0.339795,11433
2808,2.0,1.0,2.0,0.461220,0.160318,10017


In [54]:
desert_df = desert_df.groupby('ZCTA5', as_index=False).median()

Unnamed: 0,ZCTA5,closest_education_nodes_travel_time,closest_food_nodes_travel_time,closest_worship_nodes_travel_time,physical_dist,transport_dist
0,10001,2.0,1.0,2.0,0.416515,0.156657
1,10002,3.0,2.0,3.0,0.129459,0.155874
2,10003,3.0,3.0,3.0,0.439944,0.142336
3,10004,4.0,4.0,4.0,0.394331,0.148273
4,10005,2.0,4.0,2.0,0.641055,0.148273
...,...,...,...,...,...,...
206,11691,3.0,9.0,3.0,0.328309,0.287184
207,11692,2.0,6.0,2.0,0.223313,0.155681
208,11693,2.0,8.5,2.0,0.305888,0.117108
209,11694,1.0,13.0,1.0,0.486986,0.076193


In [55]:
nyc_health = pd.merge(zcta_health, desert_df, on='ZCTA5')

In [56]:
nyc_health

Unnamed: 0,ZCTA5,ACCESS2,ARTHRITIS,BINGE,BPHIGH,BPMED,CANCER,CASTHMA,CHD,CHECKUP,...,LPA,MHLTH,OBESITY,PHLTH,STROKE,closest_education_nodes_travel_time,closest_food_nodes_travel_time,closest_worship_nodes_travel_time,physical_dist,transport_dist
0,10001,9.5,16.1,24.2,21.1,68.2,5.2,9.4,3.9,78.5,...,18.4,12.1,18.3,8.8,2.1,2.0,1.0,2.0,0.416515,0.156657
1,10002,21.0,21.7,15.8,31.3,76.8,5.6,9.7,7.1,79.3,...,35.0,15.0,20.3,15.8,4.1,3.0,2.0,3.0,0.129459,0.155874
2,10003,8.0,13.8,27.5,17.1,64.8,4.8,9.4,2.9,77.3,...,15.2,12.4,16.4,7.2,1.5,3.0,3.0,3.0,0.439944,0.142336
3,10004,6.2,10.2,29.7,13.4,55.5,3.5,8.2,1.7,75.9,...,12.0,10.0,15.2,5.3,1.0,4.0,4.0,4.0,0.394331,0.148273
4,10005,6.9,7.1,31.7,10.0,41.5,2.3,8.7,1.1,73.9,...,12.0,11.6,14.5,4.8,0.6,2.0,4.0,2.0,0.641055,0.148273
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179,11691,20.4,21.4,14.8,33.6,76.4,5.8,10.7,6.5,82.1,...,35.8,15.4,31.5,15.1,4.2,3.0,9.0,3.0,0.328309,0.287184
180,11692,20.3,20.7,14.4,34.0,74.8,5.2,11.3,5.8,82.7,...,37.0,16.2,33.4,15.2,4.2,2.0,6.0,2.0,0.223313,0.155681
181,11693,15.3,21.0,16.5,29.7,75.3,6.2,9.8,5.5,81.2,...,30.7,14.2,28.2,13.3,3.3,2.0,8.5,2.0,0.305888,0.117108
182,11694,11.8,23.4,16.5,30.3,78.4,8.0,8.8,6.7,82.0,...,26.9,12.2,24.8,12.7,3.4,1.0,13.0,1.0,0.486986,0.076193


In [66]:
name_mapping = {'access2': 'Health insurance access', 'arthritis': 'Arthritis prevalence', 'binge': 'Binge drinking prevalence',
               'bphigh': 'High blood pressure prevalence', 'bpmed': 'Medium blood pressure prevalence', 'cancer': 'Cancer prevalence',
               'casthma': 'Asthma prevalence', 'cervical': 'Cervical cancer screenings', 'chd': 'Coronary heart disease prevalence',
               'checkup': 'Routine checkups', 'cholscreen': 'Cholesterol screenings', 'colon_screen': 'Colon cancer screenings',
               'copd': 'COPD prevalence', 'corem': 'Core men\'s health', 'corew': 'Core women\'s health', 'csmoking': 'Smoking prevalence',
               'dental': 'Dental checkups', 'depression': 'Depression prevalence', 'diabetes': 'Diabetes prevalence', 'ghlth': 'General poor health prevalence',
               'highchol': 'High cholesterol prevalence', 'kidney': 'Chronic kidney disease', 'lpa': 'No physical activity', 'mammouse': 'Mammograms',
               'mhlth': 'Poor mental health prevalence', 'obesity': 'Obesity prevalence', 'phlth': 'Poor physical health', 'sleep': 'Poor sleep prevalence',
               'stroke': 'Stroke prevalence', 'teethlost': 'Teeth loss prevalence'}

results_nan = pd.DataFrame({'Health condition': [np.nan for c in nyc_health.columns if c.lower() in list(name_mapping.keys())], 'Food': [np.nan for c in nyc_health.columns if c.lower() in list(name_mapping.keys())],
                       'Physical health': [np.nan for c in nyc_health.columns if c.lower() in list(name_mapping.keys())], 'Public transport': [np.nan for c in nyc_health.columns if c.lower() in list(name_mapping.keys())],
                        'Education': [np.nan for c in nyc_health.columns if c.lower() in list(name_mapping.keys())], 'Houses of worship': [np.nan for c in nyc_health.columns if c.lower() in list(name_mapping.keys())],
                       'RSquared': [np.nan for c in nyc_health.columns if c.lower() in list(name_mapping.keys())], 'MSE': [np.nan for c in nyc_health.columns if c.lower() in list(name_mapping.keys())]})

In [67]:
def update_results(results, model, i, X_test, y_test, name):
    results.iat[i, 0] = name
    results.iat[i, 1] = model.coef_[1]
    results.iat[i, 2] = model.coef_[3]
    results.iat[i, 3] = model.coef_[4]
    results.iat[i, 4] = model.coef_[0]
    results.iat[i, 5] = model.coef_[2]
    results.iat[i, 6] = model.score(X_test, y_test)

In [68]:
alphas = []
l1_ratios = []
i=0
results = results_nan.copy()

plt.figure(figsize=(20, 15))
plt.subplots_adjust(hspace=0.5)

for c in nyc_health.columns:
    if c.lower() in list(name_mapping.keys()):
        name = name_mapping[c.lower()]
        
        x = nyc_health[desert_measures].to_numpy(copy=True)
        scaler = StandardScaler()
        xscale = scaler.fit_transform(x)
        y = nyc_health[c].to_numpy(copy=True)
        xscale = xscale[~np.isnan(y)]
        y = y[~np.isnan(y)]
        #create function to center data
        center_function = lambda x: x - x.mean()

        #apply function to original NumPy array
        y = center_function(y)
        
        X_train, X_test, y_train, y_test = train_test_split(xscale, 
                                                    y, 
                                                    test_size=0.25, 
                                                    random_state=42)
        
        #l1 ratio is from suggested values in ElasticNetCV documentation
        lasso_cv = LassoCV(cv = 5, normalize=True).fit(X_train,y_train)
        
        alpha = lasso_cv.alpha_
        alphas.append(alpha)
        
        regr = Lasso(alpha=alpha, normalize=True)  # Could try others, or other parameters?
        regr.fit(X_train, y_train.reshape(-1, 1))
        
        update_results(results, regr, i, X_test, y_test, name)
        
        predictions = regr.predict(X_test)
        y_train_pred = regr.predict(X_train)
        mse_train = mean_squared_error(y_train, y_train_pred)
        results.iat[i,6] = r2_score(y_train, y_train_pred)
        results.iat[i, 7] = mse_train
        #make_resid_plot(predictions, X_test, y_test, name, i)
        
        #residuals = y_test - predictions
        #make_qq_plot(residuals, X_test, y_test, name, i)
        #make_htsk_plot(residuals, predictions, name, i)
            
        i += 1

#layout_resid_plot()
#plt.savefig('plots/all_deserts/elasticnet_resid.png', transparent=False, dpi=300)
results

Health insurance access
Arthritis prevalence
Binge drinking prevalence
High blood pressure prevalence
Medium blood pressure prevalence
Cancer prevalence
Asthma prevalence
Coronary heart disease prevalence
Routine checkups
Cholesterol screenings
COPD prevalence
Smoking prevalence
Depression prevalence
Diabetes prevalence
General poor health prevalence
High cholesterol prevalence
Chronic kidney disease
No physical activity
Poor mental health prevalence
Obesity prevalence
Poor physical health
Stroke prevalence


Unnamed: 0,Health condition,Food,Physical health,Public transport,Education,Houses of worship,RSquared,MSE
0,Health insurance access,0.922736,-0.534739,2.558502,-3.306512,-0.313381,0.05226,41.579563
1,Arthritis prevalence,0.814356,0.0,0.82151,0.0,0.0,0.211394,12.101733
2,Binge drinking prevalence,-1.043238,-0.505545,-1.244424,0.362685,0.000806,0.226415,13.931301
3,High blood pressure prevalence,1.193773,0.0,0.37919,0.0,0.0,0.110192,31.887436
4,Medium blood pressure prevalence,0.571968,0.0,1.05237,0.0,0.0,0.105879,34.362126
5,Cancer prevalence,0.017544,0.0,0.386489,0.0,0.0,0.155055,2.727036
6,Asthma prevalence,0.351237,-0.286569,0.288095,-0.422107,-0.119288,0.081612,2.135525
7,Coronary heart disease prevalence,0.218359,0.0,0.196804,0.0,0.0,0.130925,1.94237
8,Routine checkups,0.585981,0.307957,0.577132,0.0,0.0,0.182243,6.792277
9,Cholesterol screenings,0.0,0.218418,0.0,0.482751,0.0,0.082432,5.189869


<Figure size 1440x1080 with 0 Axes>