In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.linear_model import ElasticNet, ElasticNetCV, LassoCV, Lasso
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler, RobustScaler, scale
import seaborn as sns 

In [2]:
place_abbr = 'nyc'

tracts_deserts_path = 'data/tract_desert_measures/nyc_desert_tracts.csv'

save_data_dir = 'data/tract_health/'
save_data_name =  place_abbr+'_tract_health.csv'
save_transf_data_name =  place_abbr+'_tract_health_logt.csv'

save_regression_path = 'data/tract_health/results/'
save_lasso_name = place_abbr+'_tract_lasso_results.csv'
save_enet_name = place_abbr+'_tract_enet_results.csv'

# Aggregation/Cleaning

In [3]:
tracts_deserts = pd.read_csv(tracts_deserts_path, dtype={'GEOID':'str'})
tracts_deserts = tracts_deserts.rename(columns={'GEOID':'TractFIPS'})

In [4]:
tracts_health = pd.read_csv("data/PLACES__Census_Tract_Data__GIS_Friendly_Format___2021_release.csv", dtype={'TractFIPS':'str'})
tracts_health = pd.merge(tracts_health, tracts_deserts, on='TractFIPS')

income_vars = pd.read_csv('data/tract_incomevars.csv', dtype=str).astype({'PovertyRate':'float'})
tracts_health = pd.merge(tracts_health, income_vars, on='TractFIPS')

tracts_health.to_csv(save_data_dir+save_data_name, index=False)

In [5]:
desert_measures = ['food_closest_travel_times', 'physical_closest_dist', 'transport_closest_dist', 'education_closest_travel_times', 'worship_closest_travel_times']

tracts_health_logt = tracts_health.copy()
tracts_health_logt[desert_measures] = tracts_health_logt[desert_measures].apply(lambda x: np.log(x+1))
tracts_health_logt.to_csv(save_data_dir + save_transf_data_name, index=False)

# Analysis

In [6]:
name_mapping = {'access2': 'Health insurance access', 'arthritis': 'Arthritis prevalence', 'binge': 'Binge drinking prevalence',
               'bphigh': 'High blood pressure prevalence', 'bpmed': 'Medium blood pressure prevalence', 'cancer': 'Cancer prevalence',
               'casthma': 'Asthma prevalence', 'cervical': 'Cervical cancer screenings', 'chd': 'Coronary heart disease prevalence',
               'checkup': 'Routine checkups', 'cholscreen': 'Cholesterol screenings', 'colon_screen': 'Colon cancer screenings',
               'copd': 'COPD prevalence', 'corem': 'Core men\'s health', 'corew': 'Core women\'s health', 'csmoking': 'Smoking prevalence',
               'dental': 'Dental checkups', 'depression': 'Depression prevalence', 'diabetes': 'Diabetes prevalence', 'ghlth': 'General poor health prevalence',
               'highchol': 'High cholesterol prevalence', 'kidney': 'Chronic kidney disease', 'lpa': 'No physical activity', 'mammouse': 'Mammograms',
               'mhlth': 'Poor mental health prevalence', 'obesity': 'Obesity prevalence', 'phlth': 'Poor physical health', 'sleep': 'Poor sleep prevalence',
               'stroke': 'Stroke prevalence', 'teethlost': 'Teeth loss prevalence'}

results_all_nan = pd.DataFrame({'Health condition': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')], 'Food': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')],
                       'Physical health': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')], 'Public transport': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')],
                        'Education': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')], 'Houses of worship': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')],
                       'Poverty Rate': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')], 'RSquared': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')], 
                            'MSE': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')]})
results_nan = pd.DataFrame({'Health condition': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')], 'Food': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')],
                       'Physical health': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')], 'Poverty Rate': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')], 
                            'RSquared': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')], 
                            'MSE': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')]})
add_vars = ['PovertyRate']

In [7]:
def update_all_vars():
        results.iat[i, 0] = name
        results.iat[i, 1] = regr.coef_[0]
        results.iat[i, 2] = regr.coef_[1]
        results.iat[i, 3] = regr.coef_[2]
        results.iat[i, 4] = regr.coef_[3]
        results.iat[i, 5] = regr.coef_[4]
        results.iat[i, 6] = regr.coef_[5]
        results.iat[i, 7] = regr.score(X_test, y_test)
        results.iat[i, 8] = mse_test
def update_vars():
        results.iat[i, 0] = name
        results.iat[i, 1] = regr.coef_[0]
        results.iat[i, 2] = regr.coef_[1]
        results.iat[i, 3] = regr.coef_[2]
        results.iat[i, 4] = regr.score(X_test, y_test)
        results.iat[i, 5] = mse_test

## Run Elastic Net Regression:
- For all deserts types; *all_deserts = True*
- Food and physical activity deserts only; *all_deserts = False*
- With log transformed desert measures; *log_transf = True*
- Without log transformed desert measures; *log_transf = False*

In [8]:
#set desert and log transformation params
all_deserts=False
log_transf = True

In [9]:
if all_deserts == True:
    desert_measures = ['food_closest_travel_times', 'physical_closest_dist', 'transport_closest_dist', 'education_closest_travel_times', 'worship_closest_travel_times']
    results = results_all_nan.copy()
    update_method = update_all_vars
else:
    desert_measures = ['food_closest_travel_times', 'physical_closest_dist']
    results = results_nan.copy()
    update_method = update_vars
if log_transf == True:
    df = tracts_health_logt
else:
    df = tracts_health
alphas = []
l1_ratios = []
i=0


for c in df.columns:
    if c.endswith('CrudePrev'):
        name = name_mapping[c[:-10].lower()]
        x = df[desert_measures+add_vars].to_numpy(copy=True)
        scaler = StandardScaler()
        xscale = scaler.fit_transform(x)
        y = df[c].to_numpy(copy=True)
        xscale = xscale[~np.isnan(y)]
        y = y[~np.isnan(y)]
        #create function to center data
        center_function = lambda x: x - x.mean()

        #apply function to original NumPy array
        data_centered = center_function(y)

        X_train, X_test, y_train, y_test = train_test_split(xscale, 
                                                    y, 
                                                    test_size=0.25, 
                                                    random_state=42)

        #l1 ratio is from suggested values in ElasticNetCV documentation
        enet_cv = ElasticNetCV(l1_ratio = [.1, .5, .7, .9, .95, .99, 1], 
                                     cv = 10, normalize=True).fit(X_train,y_train)

        alpha = enet_cv.alpha_
        l1 = enet_cv.l1_ratio_
        alphas.append(alpha)
        l1_ratios.append(l1)

        regr = ElasticNet(alpha=alpha, l1_ratio = l1, normalize=True)  # Could try others, or other parameters?
        regr.fit(X_train, y_train.reshape(-1, 1))

        predictions = regr.predict(X_test)
        y_train_pred = regr.predict(X_train)
        mse_test = mean_squared_error(y_test, predictions)

        if all_deserts==True:
            update_all_vars()
        else:
            update_vars()

        i += 1

results_round = results.round({'Food': 4, 'Physical health':4, 'Public transport':4, 'Education':4,
             'House of worship':4, 'RSquared':4})
results_round.sort_values(by='RSquared', ascending=False).reset_index(drop=True)

Unnamed: 0,Health condition,Food,Physical health,Poverty Rate,RSquared,MSE
0,Poor mental health prevalence,0.2483,0.1969,2.431243,0.7236,2.470583
1,Teeth loss prevalence,0.5871,0.8718,6.197635,0.708,14.616757
2,Poor physical health,0.4179,0.2769,2.887272,0.6932,3.863551
3,General poor health prevalence,0.804,0.5639,6.220211,0.6633,20.808676
4,Smoking prevalence,0.4886,0.482,3.160344,0.6502,6.204117
5,Dental checkups,-0.5666,-1.3496,-8.039199,0.6333,40.874105
6,No physical activity,1.1559,1.1033,6.262747,0.5624,31.709987
7,Colon cancer screenings,-0.1597,-0.825,-5.419017,0.5421,26.690753
8,Health insurance access,0.2797,0.532,5.564608,0.5375,27.967114
9,Chronic kidney disease,0.1168,0.0461,0.594865,0.478,0.355956


## Run Elastic Net for Whole Dataset

In [10]:
if all_deserts == True:
    desert_measures = ['food_closest_travel_times', 'physical_closest_dist', 'transport_closest_dist', 'education_closest_travel_times', 'worship_closest_travel_times']
    results = results_all_nan.copy()
    update_method = update_all_vars
else:
    desert_measures = ['food_closest_travel_times', 'physical_closest_dist']
    results = results_nan.copy()
    update_method = update_vars
if log_transf == True:
    df = tracts_health_logt
else:
    df = tracts_health
alphas = []
l1_ratios = []
i=0


for c in df.columns:
    if c.endswith('CrudePrev'):
        name = name_mapping[c[:-10].lower()]
        x = df[desert_measures+add_vars].to_numpy(copy=True)
        scaler = StandardScaler()
        xscale = scaler.fit_transform(x)
        y = df[c].to_numpy(copy=True)
        xscale = xscale[~np.isnan(y)]
        y = y[~np.isnan(y)]
        #create function to center data
        center_function = lambda x: x - x.mean()

        #apply function to original NumPy array
        data_centered = center_function(y)

        X_train, X_test, y_train, y_test = train_test_split(xscale, 
                                                    y, 
                                                    test_size=0.25, 
                                                    random_state=42)

        #l1 ratio is from suggested values in ElasticNetCV documentation
        enet_cv = ElasticNetCV(l1_ratio = [.1, .5, .7, .9, .95, .99, 1], 
                                     cv = 10, normalize=True).fit(X_train,y_train)

        alpha = enet_cv.alpha_
        l1 = enet_cv.l1_ratio_
        alphas.append(alpha)
        l1_ratios.append(l1)

        regr = ElasticNet(alpha=alpha, l1_ratio = l1, normalize=True)  # Could try others, or other parameters?
        regr.fit(xscale, y.reshape(-1, 1))

        predictions = regr.predict(xscale)
        mse_test = mean_squared_error(y, predictions)

        if all_deserts==True:
            update_all_vars()
        else:
            update_vars()

        i += 1

results_round = results.round({'Food': 4, 'Physical health':4, 'Public transport':4, 'Education':4,
             'House of worship':4, 'RSquared':4})
results_round.sort_values(by='RSquared', ascending=False).reset_index(drop=True)

Unnamed: 0,Health condition,Food,Physical health,Poverty Rate,RSquared,MSE
0,Poor mental health prevalence,0.2441,0.1897,2.438508,0.7238,2.406894
1,Teeth loss prevalence,0.5863,0.8703,6.107925,0.709,16.061905
2,Poor physical health,0.3917,0.2613,2.87614,0.694,4.217976
3,General poor health prevalence,0.7291,0.5264,6.215885,0.6643,20.918434
4,Smoking prevalence,0.4887,0.4574,3.192353,0.6509,5.77889
5,Dental checkups,-0.4568,-1.2735,-8.100043,0.6354,41.897013
6,No physical activity,1.0455,1.049,6.251513,0.5642,31.139052
7,Colon cancer screenings,-0.1092,-0.8345,-5.442141,0.5427,25.976
8,Health insurance access,0.2341,0.518,5.556806,0.538,25.658655
9,Chronic kidney disease,0.1055,0.0386,0.581608,0.4822,0.415208


### Save Output:

In [11]:
results_round.to_csv(save_regression_path+save_enet_name, index=False)

## Run LASSO regression:
- For all deserts types; *all_deserts = True*
- Food and physical activity deserts only; *all_deserts = False*
- With log transformed desert measures; *log_transf = True*
- Without log transformed desert measures; *log_transf = False*

In [12]:
#set desert and log transformation params
all_deserts=True
log_transf = True

In [14]:
if all_deserts == True:
    desert_measures = ['food_closest_travel_times', 'physical_closest_dist', 'transport_closest_dist', 'education_closest_travel_times', 'worship_closest_travel_times']
    results = results_all_nan.copy()
    update_method = update_all_vars
else:
    desert_measures = ['food_closest_travel_times', 'physical_closest_dist']
    results = results_nan.copy()
    update_method = update_vars
if log_transf == True:
    df = tracts_health_logt
else:
    df = tracts_health
alphas = []
l1_ratios = []
i=0


for c in df.columns:
    if c.endswith('CrudePrev'):
        name = name_mapping[c[:-10].lower()]
        
        x = df[desert_measures+add_vars].to_numpy(copy=True)
        scaler = StandardScaler()
        xscale = scaler.fit_transform(x)
        y = df[c].to_numpy(copy=True)
        xscale = xscale[~np.isnan(y)]
        y = y[~np.isnan(y)]
        #create function to center data
        center_function = lambda x: x - x.mean()

        #apply function to original NumPy array
        y = center_function(y)
        
        X_train, X_test, y_train, y_test = train_test_split(xscale, 
                                                    y, 
                                                    test_size=0.3, 
                                                    random_state=42)
        
        #l1 ratio is from suggested values in ElasticNetCV documentation
        lasso_cv = LassoCV(cv = 5, normalize=True).fit(X_train,y_train)
        
        alpha = lasso_cv.alpha_
        alphas.append(alpha)
        
        regr = Lasso(alpha=alpha, normalize=True)  # Could try others, or other parameters?
        regr.fit(X_train, y_train.reshape(-1, 1))
        
        predictions = regr.predict(X_test)
        y_train_pred = regr.predict(X_train)
        mse_test = mean_squared_error(y_test, predictions)
        
        if all_deserts==True:
            update_all_vars()
        else:
            update_vars()

        i += 1

results_round = results.round({'Food': 4, 'Physical health':4, 'Public transport':4, 'Education':4,
             'House of worship':4, 'RSquared':4})
results_round.sort_values(by='RSquared', ascending=False).reset_index(drop=True)

Unnamed: 0,Health condition,Food,Physical health,Public transport,Education,Houses of worship,Poverty Rate,RSquared,MSE
0,Teeth loss prevalence,0.4512,0.808,0.3822,0.0058,-0.001471,6.166906,0.7239,13.86758
1,Poor mental health prevalence,0.206,0.1589,0.1134,0.0,0.001487,2.471077,0.693,2.653855
2,Poor physical health,0.3115,0.2795,0.2901,0.0663,0.089727,2.93462,0.6921,4.063333
3,General poor health prevalence,0.6495,0.5276,0.3336,0.1381,-0.0,6.253223,0.6633,21.062212
4,Smoking prevalence,0.389,0.439,0.3274,0.0433,0.054701,3.246949,0.6447,6.087566
5,Dental checkups,-0.4178,-1.3326,-0.3997,-0.4218,0.17894,-8.202774,0.621,41.720174
6,No physical activity,0.8758,1.0153,0.6513,0.4021,-0.0,6.386305,0.5697,31.255079
7,Colon cancer screenings,-0.0,-0.7061,-0.1865,-0.2183,-0.0,-5.449471,0.5318,26.594468
8,Health insurance access,0.2444,0.5094,-0.0102,0.2234,-0.232405,5.638137,0.531,27.714459
9,COPD prevalence,0.1589,0.2043,0.2595,0.023,0.140222,1.292272,0.4852,2.139315


## LASSO on Whole Dataset

In [15]:
#set desert and log transformation params
all_deserts=True
log_transf = True

In [17]:
if all_deserts == True:
    desert_measures = ['food_closest_travel_times', 'physical_closest_dist', 'transport_closest_dist', 'education_closest_travel_times', 'worship_closest_travel_times']
    results = results_all_nan.copy()
    update_method = update_all_vars
else:
    desert_measures = ['food_closest_travel_times', 'physical_closest_dist']
    results = results_nan.copy()
    update_method = update_vars
if log_transf == True:
    df = tracts_health_logt
else:
    df = tracts_health
alphas = []
l1_ratios = []
i=0


for c in df.columns:
    if c.endswith('CrudePrev'):
        name = name_mapping[c[:-10].lower()]
        
        x = df[desert_measures+add_vars].to_numpy(copy=True)
        scaler = StandardScaler()
        xscale = scaler.fit_transform(x)
        y = df[c].to_numpy(copy=True)
        xscale = xscale[~np.isnan(y)]
        y = y[~np.isnan(y)]
        #create function to center data
        center_function = lambda x: x - x.mean()

        #apply function to original NumPy array
        y = center_function(y)
        
        X_train, X_test, y_train, y_test = train_test_split(xscale, 
                                                    y, 
                                                    test_size=0.3, 
                                                    random_state=42)
        
        #l1 ratio is from suggested values in ElasticNetCV documentation
        lasso_cv = LassoCV(cv = 5, normalize=True).fit(X_train,y_train)
        
        alpha = lasso_cv.alpha_
        alphas.append(alpha)
        
        regr = Lasso(alpha=alpha, normalize=True)  # Could try others, or other parameters?
        regr.fit(xscale, y.reshape(-1, 1))
        
        predictions = regr.predict(xscale)
        mse_test = mean_squared_error(y, predictions)
        
        if all_deserts==True:
            update_all_vars()
        else:
            update_vars()

        i += 1

results_round = results.round({'Food': 4, 'Physical health':4, 'Public transport':4, 'Education':4,
             'House of worship':4, 'RSquared':4})
results_round.sort_values(by='RSquared', ascending=False).reset_index(drop=True)

Unnamed: 0,Health condition,Food,Physical health,Public transport,Education,Houses of worship,Poverty Rate,RSquared,MSE
0,Teeth loss prevalence,0.4567,0.786,0.3575,0.0066,-0.0,6.101372,0.7242,15.923705
1,Poor mental health prevalence,0.1986,0.152,0.0752,-0.0,0.019271,2.416911,0.6948,2.399281
2,Poor physical health,0.3,0.2321,0.2833,0.0703,0.057244,2.925468,0.6936,4.125421
3,General poor health prevalence,0.6026,0.4295,0.2954,0.1427,-0.02633,6.201759,0.6645,20.779763
4,Smoking prevalence,0.3833,0.4181,0.2916,0.0199,0.093216,3.223655,0.6458,5.67485
5,Dental checkups,-0.3747,-1.1963,-0.3082,-0.368,0.224924,-8.135274,0.6241,41.632377
6,No physical activity,0.8081,0.9055,0.6203,0.3741,-0.0,6.297549,0.5714,30.530788
7,Health insurance access,0.2609,0.4814,-0.0652,0.188,-0.245611,5.534593,0.5325,25.557763
8,Colon cancer screenings,-0.0,-0.6844,-0.1078,-0.1997,-0.0,-5.352157,0.5323,25.899134
9,COPD prevalence,0.153,0.1775,0.2719,0.0279,0.123352,1.31038,0.4874,1.997414


### Save Output:

In [18]:
results_round.to_csv(save_regression_path+save_lasso_name, index=False)