In [71]:
import h5py
import pandas as pd
import numpy as np
from sklearn.linear_model import ElasticNet, LinearRegression
from haversine import haversine
from tqdm import tqdm

In [65]:
### Change input CSV path ###
nyc_health = pd.read_csv(r'C:\Users\willd\Documents\Georgia Tech\CSE6424\Project\nyc_health.csv')

In [66]:
nyc_health

Unnamed: 0,StateAbbr,StateDesc,CountyName,CountyFIPS,TractFIPS,TotalPopulation,ACCESS2_CrudePrev,ACCESS2_Crude95CI,ARTHRITIS_CrudePrev,ARTHRITIS_Crude95CI,...,closest_transport_nodes_travel_time,closest_transport_nodes_x,closest_transport_nodes_y,closest_worship_nodes,closest_worship_nodes_travel_time,closest_worship_nodes_x,closest_worship_nodes_y,nodes,nodes_x,nodes_y
0,NY,New York,New York,36061,36061008700,6338,6.3,"( 5.4, 7.7)",13.2,"(12.7, 13.8)",...,6.700000,-73.997437,40.741642,8.310246e+09,62.408497,-73.999496,40.738602,4.242737e+07,-73.997009,40.742229
1,NY,New York,New York,36061,36061008900,5708,9.7,"( 7.8, 12.1)",16.5,"(15.7, 17.4)",...,58.497746,-73.998482,40.745293,4.243427e+07,47.888412,-73.998970,40.744629,4.242739e+07,-74.002731,40.744640
2,NY,New York,New York,36061,36061027900,10292,25.5,"(20.6, 30.8)",18.8,"(17.9, 19.8)",...,21.145309,-73.928757,40.856129,8.740551e+09,36.409119,-73.929337,40.855564,4.242777e+07,-73.929726,40.856609
3,NY,New York,New York,36061,36061010900,183,8.4,"( 6.3, 11.8)",10.3,"( 9.8, 11.0)",...,7.300000,-73.987526,40.752186,6.593520e+09,64.018593,-73.984238,40.754047,4.242827e+07,-73.987335,40.752907
4,NY,New York,New York,36061,36061011300,117,14.9,"(11.8, 18.9)",12.8,"(12.0, 13.6)",...,0.000000,-73.986664,40.755051,6.593520e+09,77.035332,-73.984238,40.754047,4.242828e+07,-73.986664,40.755051
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3560,NY,New York,New York,36061,36061026700,2173,14.7,"(10.6, 22.8)",10.8,"(10.2, 11.5)",...,67.000000,-73.932907,40.850441,8.740551e+09,108.735710,-73.929337,40.855564,9.566870e+09,-73.929283,40.850563
3561,NY,New York,New York,36061,36061025100,2526,30.6,"(25.1, 36.0)",19.9,"(18.9, 20.9)",...,3.800000,-73.940125,40.840496,7.604384e+09,29.636023,-73.939384,40.842396,9.567003e+09,-73.940063,40.840336
3562,NY,New York,Kings,36047,36047097400,2724,20.0,"(16.6, 23.7)",21.9,"(21.0, 22.8)",...,101.698853,-73.899338,40.647060,4.249107e+07,172.659973,-73.908173,40.638813,9.572929e+09,-73.899162,40.647091
3563,NY,New York,New York,36061,36061017300,8939,7.9,"( 6.2, 10.1)",19.8,"(19.0, 20.9)",...,66.060715,-73.968391,40.786602,4.244248e+07,85.071449,-73.975014,40.787685,9.576549e+09,-73.970909,40.788406


In [67]:
### Change input HDF5 path ###
with h5py.File(r'C:\Users\willd\Documents\Georgia Tech\CSE6424\Project\nearest_nyc.hdf5', 'r') as f:
    nodes = f['nodes'][:]
    nodes_x = f['nodes_x'][:]
    nodes_y = f['nodes_y'][:]
    physical_proximity_x = f['closest_physical_nodes_x'][:]
    physical_proximity_y = f['closest_physical_nodes_y'][:]
    transport_proximity_x = f['closest_transport_nodes_x'][:]
    transport_proximity_y = f['closest_transport_nodes_y'][:]
smaller_bounds = (nodes_x > -74.4) & (nodes_x < -73.6) & (nodes_y < 41)
nodes = nodes[smaller_bounds]
nodes_x = nodes_x[smaller_bounds]
nodes_y = nodes_y[smaller_bounds]
physical_proximity_x = physical_proximity_x[smaller_bounds]
physical_proximity_y = physical_proximity_y[smaller_bounds]
transport_proximity_x = transport_proximity_x[smaller_bounds]
transport_proximity_y = transport_proximity_y[smaller_bounds]

In [68]:
# Calculate distances (this iteration didn't have them coming in)
def haversine_wrapper(lat1, lon1, lat2, lon2):
    return haversine((lat1, lon1), (lat2, lon2))
physical_dist = np.zeros_like(nodes_x)
transport_dist = np.zeros_like(nodes_x)
for i in tqdm(range(len(nodes_x))):
    physical_dist[i] = haversine_wrapper(nodes_y[i], nodes_x[i], physical_proximity_y[i], physical_proximity_x[i])
    transport_dist[i] = haversine_wrapper(nodes_y[i], nodes_x[i], transport_proximity_y[i], transport_proximity_x[i])
nyc_health['physical_dist'] = np.nan
nyc_health['transport_dist'] = np.nan
num_missed = 0
for i in tqdm(nyc_health.index, total=nyc_health.shape[0]):
    try:
        idx = np.where(nodes == nyc_health.at[i, 'nodes'])[0][0]
        nyc_health.at[i, 'physical_dist'] = physical_dist[idx]
        nyc_health.at[i, 'transport_dist'] = transport_dist[idx]
    except IndexError:
        num_missed += 1
print('Missed:', num_missed)

100%|████████████████████████████████████████████████████████████████████████| 32109/32109 [00:00<00:00, 118568.24it/s]
100%|███████████████████████████████████████████████████████████████████████████| 3565/3565 [00:00<00:00, 14715.42it/s]

Missed: 509





In [95]:
# For ease of readability
name_mapping = {'access2': 'Health insurance access', 'arthritis': 'Arthritis prevalence', 'binge': 'Binge drinking prevalence',
               'bphigh': 'High blood pressure prevalence', 'bpmed': 'Medium blood pressure prevalence', 'cancer': 'Cancer prevalence',
               'casthma': 'Asthma prevalence', 'cervical': 'Cervical cancer screenings', 'chd': 'Coronary heart disease prevalence',
               'checkup': 'Routine checkups', 'cholscreen': 'Cholesterol screenings', 'colon_screen': 'Colon cancer screenings',
               'copd': 'COPD prevalence', 'corem': 'Core men\'s health', 'corew': 'Core women\'s health', 'csmoking': 'Smoking prevalence',
               'dental': 'Dental checkups', 'depression': 'Depression prevalence', 'diabetes': 'Diabetes prevalence', 'ghlth': 'General poor health prevalence',
               'highchol': 'High cholesterol prevalence', 'kidney': 'Chronic kidney disease', 'lpa': 'No physical activity', 'mammouse': 'Mammograms',
               'mhlth': 'Poor mental health prevalence', 'obesity': 'Obesity prevalence', 'phlth': 'Poor physical health', 'sleep': 'Poor sleep prevalence',
               'stroke': 'Stroke prevalence', 'teethlost': 'Teeth loss prevalence'}
# Get all the desired data
X = np.array([nyc_health['closest_education_nodes_travel_time'].to_numpy(), nyc_health['closest_food_nodes_travel_time'],
             nyc_health['closest_worship_nodes_travel_time'], nyc_health['physical_dist'], nyc_health['transport_dist']]).T
results = pd.DataFrame({'Health condition': [np.nan for c in nyc_health.columns if c.endswith('CrudePrev')], 'Food': [np.nan for c in nyc_health.columns if c.endswith('CrudePrev')],
                       'Physical health': [np.nan for c in nyc_health.columns if c.endswith('CrudePrev')], 'Public transport': [np.nan for c in nyc_health.columns if c.endswith('CrudePrev')],
                        'Education': [np.nan for c in nyc_health.columns if c.endswith('CrudePrev')], 'Houses of worship': [np.nan for c in nyc_health.columns if c.endswith('CrudePrev')]})
# Remove bad rows
nyc_health = nyc_health[(~np.isnan(X)).all(axis=1)]
X = X[(~np.isnan(X)).all(axis=1)]
i = 0
# Regress
for c in nyc_health.columns:
    if c.endswith('CrudePrev'):
        x = X.copy()
        y = nyc_health[c].to_numpy(copy=True)
        x = x[~np.isnan(y)]
        y = y[~np.isnan(y)]
        regr = ElasticNet()  # Could try others, or other parameters?
        regr.fit(x, y.reshape(-1, 1))
        results.iat[i, 0] = name_mapping[c[:-10].lower()]
        results.iat[i, 1] = regr.coef_[1]
        results.iat[i, 2] = regr.coef_[3]
        results.iat[i, 3] = regr.coef_[4]
        results.iat[i, 4] = regr.coef_[0]
        results.iat[i, 5] = regr.coef_[2]
        #print(name_mapping[c[:-10].lower()], ':', regr.coef_)
        i += 1
print('NOTE: Education and public transport are distance based, and so I\'d expect them to be inversely related to the other (time-based) parameters')
results

NOTE: Education and public transport are distance based, and so I'd expect them to be inversely related to the other (time-based) parameters


Unnamed: 0,Health condition,Food,Physical health,Public transport,Education,Houses of worship
0,Health insurance access,0.007311,-0.0,-0.0,-0.015571,3.815162e-06
1,Arthritis prevalence,0.009375,-0.0,0.0,0.006279,5.619086e-06
2,Binge drinking prevalence,-0.007588,-0.0,-0.0,-0.001392,-4.548146e-06
3,High blood pressure prevalence,0.015582,-0.0,0.0,-0.000475,9.353371e-06
4,Medium blood pressure prevalence,0.007729,0.0,0.0,0.007926,4.632336e-06
5,Cancer prevalence,0.001497,-0.0,0.0,0.004172,7.810641e-07
6,Asthma prevalence,0.003885,-0.0,-0.0,-0.002034,2.328652e-06
7,Cervical cancer screenings,0.005945,0.0,0.0,0.001295,2.753247e-06
8,Coronary heart disease prevalence,0.002983,-0.0,0.0,0.000244,1.787635e-06
9,Routine checkups,0.007024,0.0,0.0,0.002518,4.209762e-06


In [96]:
### Change output path ###
results.to_csv(r'C:\Users\willd\Documents\Georgia Tech\CSE6424\Project\elastic_net_regression.csv', index=False)