In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.linear_model import ElasticNet, ElasticNetCV, LassoCV, Lasso
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler, RobustScaler, scale
import seaborn as sns 

In [2]:
tracts_deserts_path = 'data/tract_desert_measures/atlanta_desert_tracts.csv'
place_abbr = 'atlanta'
save_data_name = 'data/tract_health' + place_abbr+'_tract_health.csv'
save_transf_data_name = 'data/tract_health' + place_abbr+'_tract_health_logt.csv'
save_results_name = 'data/tract_health/results' + place_abbr+'_tract_lasso_results.csv'

# Aggregation/Cleaning

In [3]:
tracts_deserts = pd.read_csv(tracts_deserts_path, dtype={'GEOID':'str'})
tracts_deserts = tracts_deserts.rename(columns={'GEOID':'TractFIPS'})

In [4]:
tracts_health = pd.read_csv("data/PLACES__Census_Tract_Data__GIS_Friendly_Format___2021_release.csv", dtype={'TractFIPS':'str'})
tracts_health = pd.merge(tracts_health, tracts_deserts, on='TractFIPS')
tracts_health.to_csv(save_data_name, index=False)

In [5]:
desert_measures = ['food_closest_travel_times', 'physical_closest_dist', 'transport_closest_dist', 'education_closest_travel_times', 'worship_closest_travel_times']

tracts_health_logt = tracts_health.copy()
tracts_health_logt[desert_measures] = tracts_health_logt[desert_measures].apply(lambda x: np.log(x+1))
tracts_health_logt.to_csv(save_transf_data_name, index=False)

# Analysis

In [6]:
name_mapping = {'access2': 'Health insurance access', 'arthritis': 'Arthritis prevalence', 'binge': 'Binge drinking prevalence',
               'bphigh': 'High blood pressure prevalence', 'bpmed': 'Medium blood pressure prevalence', 'cancer': 'Cancer prevalence',
               'casthma': 'Asthma prevalence', 'cervical': 'Cervical cancer screenings', 'chd': 'Coronary heart disease prevalence',
               'checkup': 'Routine checkups', 'cholscreen': 'Cholesterol screenings', 'colon_screen': 'Colon cancer screenings',
               'copd': 'COPD prevalence', 'corem': 'Core men\'s health', 'corew': 'Core women\'s health', 'csmoking': 'Smoking prevalence',
               'dental': 'Dental checkups', 'depression': 'Depression prevalence', 'diabetes': 'Diabetes prevalence', 'ghlth': 'General poor health prevalence',
               'highchol': 'High cholesterol prevalence', 'kidney': 'Chronic kidney disease', 'lpa': 'No physical activity', 'mammouse': 'Mammograms',
               'mhlth': 'Poor mental health prevalence', 'obesity': 'Obesity prevalence', 'phlth': 'Poor physical health', 'sleep': 'Poor sleep prevalence',
               'stroke': 'Stroke prevalence', 'teethlost': 'Teeth loss prevalence'}

results_nan = pd.DataFrame({'Health condition': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')], 'Food': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')],
                       'Physical health': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')], 'Public transport': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')],
                        'Education': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')], 'Houses of worship': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')],
                       'RSquared': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')], 'MSE': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')]})


In [7]:
tracts_health

Unnamed: 0,StateAbbr,StateDesc,CountyName,CountyFIPS,TractFIPS,TotalPopulation,ACCESS2_CrudePrev,ACCESS2_Crude95CI,ARTHRITIS_CrudePrev,ARTHRITIS_Crude95CI,...,STROKE_CrudePrev,STROKE_Crude95CI,TEETHLOST_CrudePrev,TEETHLOST_Crude95CI,Geolocation,food_closest_travel_times,physical_closest_dist,transport_closest_dist,education_closest_travel_times,worship_closest_travel_times
0,GA,Georgia,Fulton,13121,13121010204,4761,9.6,"( 7.5, 12.7)",24.8,"(23.4, 26.2)",...,2.6,"( 2.3, 3.1)",4.4,"( 2.6, 8.2)",POINT (-84.41716366 33.92519129),258.6,1.051088,4.476390,103.4,103.9
1,GA,Georgia,Fulton,13121,13121005501,2307,25.2,"(21.3, 29.6)",20.0,"(18.9, 20.9)",...,3.9,"( 3.4, 4.4)",23.5,"(14.5, 33.7)",POINT (-84.38372162 33.72987572),130.1,0.278257,0.821833,14.1,17.0
2,GA,Georgia,Fulton,13121,13121001202,3937,12.6,"(11.5, 14.2)",10.0,"( 9.6, 10.4)",...,1.1,"( 1.0, 1.2)",4.6,"( 3.4, 6.4)",POINT (-84.38515368 33.77682317),72.2,0.620474,0.266751,37.1,0.0
3,GA,Georgia,Fulton,13121,13121011425,6906,10.8,"( 8.9, 13.5)",20.0,"(18.9, 21.3)",...,2.1,"( 1.8, 2.4)",4.8,"( 2.8, 7.7)",POINT (-84.24164477 34.01149372),289.4,0.963302,3.946356,273.9,111.7
4,GA,Georgia,Fulton,13121,13121009403,4625,18.4,"(15.7, 21.9)",13.5,"(12.9, 14.1)",...,1.9,"( 1.7, 2.1)",9.8,"( 5.9, 17.5)",POINT (-84.35991176 33.83156797),169.2,0.299675,1.285113,195.7,141.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197,GA,Georgia,Fulton,13121,13121011623,11473,13.6,"(11.9, 15.4)",16.1,"(15.5, 16.9)",...,1.8,"( 1.7, 2.0)",5.8,"( 3.9, 8.6)",POINT (-84.19032903 34.03959659),289.4,0.963302,3.946356,273.9,111.7
198,GA,Georgia,Fulton,13121,13121000600,5203,20.0,"(17.9, 22.5)",8.4,"( 8.1, 8.7)",...,1.2,"( 1.1, 1.3)",13.4,"(10.3, 17.5)",POINT (-84.40584642 33.78496362),119.0,0.563022,0.387744,54.9,46.4
199,GA,Georgia,Fulton,13121,13121009603,4388,14.2,"(12.2, 17.5)",15.5,"(14.9, 16.2)",...,1.8,"( 1.6, 2.0)",5.9,"( 3.5, 11.6)",POINT (-84.37581823 33.83437332),50.4,0.151147,1.358243,31.6,74.0
200,GA,Georgia,Fulton,13121,13121003900,1331,29.2,"(25.2, 33.5)",26.8,"(25.9, 27.8)",...,6.4,"( 5.9, 7.1)",29.4,"(20.8, 39.1)",POINT (-84.42210552 33.75005537),115.2,0.180219,0.821719,20.4,22.3


In [8]:
alphas = []
l1_ratios = []
i=0
results = results_nan.copy()

plt.figure(figsize=(20, 15))
plt.subplots_adjust(hspace=0.5)

for c in tracts_health_logt.columns:
    if c.endswith('CrudePrev'):
        name = name_mapping[c[:-10].lower()]
        
        x = tracts_health_logt[desert_measures].to_numpy(copy=True)
        scaler = StandardScaler()
        xscale = scaler.fit_transform(x)
        y = tracts_health_logt[c].to_numpy(copy=True)
        xscale = xscale[~np.isnan(y)]
        y = y[~np.isnan(y)]
        #create function to center data
        center_function = lambda x: x - x.mean()

        #apply function to original NumPy array
        data_centered = center_function(y)
        
        X_train, X_test, y_train, y_test = train_test_split(xscale, 
                                                    y, 
                                                    test_size=0.25, 
                                                    random_state=42)
        
        #l1 ratio is from suggested values in ElasticNetCV documentation
        enet_cv = ElasticNetCV(l1_ratio = [.1, .5, .7, .9, .95, .99, 1], 
                                     cv = 10, normalize=True).fit(X_train,y_train)
        
        alpha = enet_cv.alpha_
        l1 = enet_cv.l1_ratio_
        alphas.append(alpha)
        l1_ratios.append(l1)
        
        regr = ElasticNet(alpha=alpha, l1_ratio = l1, normalize=True)  # Could try others, or other parameters?
        regr.fit(X_train, y_train.reshape(-1, 1))
        
        predictions = regr.predict(X_test)
        y_train_pred = regr.predict(X_train)
        mse_test = mean_squared_error(y_test, predictions)
        
        results.iat[i, 0] = name
        results.iat[i, 1] = regr.coef_[1]
        results.iat[i, 2] = regr.coef_[3]
        results.iat[i, 3] = regr.coef_[4]
        results.iat[i, 4] = regr.coef_[0]
        results.iat[i, 5] = regr.coef_[2]
        results.iat[i, 6] = regr.score(X_test, y_test)
        results.iat[i, 7] = mse_test
            
        i += 1

results_round = results.round({'Food': 4, 'Physical health':4, 'Public transport':4, 'Education':4,
             'House of worship':4, 'RSquared':4})
results_round.sort_values(by='RSquared', ascending=False).reset_index(drop=True)

Unnamed: 0,Health condition,Food,Physical health,Public transport,Education,Houses of worship,RSquared,MSE
0,Mammograms,-0.2212,-0.6048,-0.1667,-0.3088,0.338724,0.364,1.330052
1,Routine checkups,-0.0,-1.6251,0.074,0.6232,2.368616,0.1867,10.06247
2,Obesity prevalence,-0.1687,-2.9856,-1.283,1.1287,2.646939,0.1866,50.39589
3,Core men's health,0.3283,2.5598,1.458,-0.6289,-1.507327,0.1783,45.257121
4,Poor sleep prevalence,-0.4577,-2.1172,-1.4277,0.5053,2.083155,0.1567,36.461682
5,High blood pressure prevalence,0.0,-4.651,-0.0447,2.1743,4.183252,0.1551,73.709136
6,Cholesterol screenings,0.2699,-0.0,0.5549,0.1215,1.035238,0.1497,9.495159
7,Cervical cancer screenings,0.2752,0.3984,0.4731,-0.1084,0.474314,0.147,8.386475
8,Teeth loss prevalence,-0.3015,-2.6953,-1.6579,1.5909,1.010453,0.1453,75.071514
9,Dental checkups,0.3294,3.5585,2.6162,-1.5262,-1.847335,0.1423,162.545397


<Figure size 1440x1080 with 0 Axes>

In [9]:
alphas = []
l1_ratios = []
i=0
results = results_nan.copy()

plt.figure(figsize=(20, 15))
plt.subplots_adjust(hspace=0.5)

for c in tracts_health.columns:
    if c.endswith('CrudePrev'):
        name = name_mapping[c[:-10].lower()]
        
        x = tracts_health[desert_measures].to_numpy(copy=True)
        scaler = StandardScaler()
        xscale = scaler.fit_transform(x)
        y = tracts_health[c].to_numpy(copy=True)
        xscale = xscale[~np.isnan(y)]
        y = y[~np.isnan(y)]
        #create function to center data
        center_function = lambda x: x - x.mean()

        #apply function to original NumPy array
        y = center_function(y)
        
        X_train, X_test, y_train, y_test = train_test_split(xscale, 
                                                    y, 
                                                    test_size=0.3, 
                                                    random_state=42)
        
        #l1 ratio is from suggested values in ElasticNetCV documentation
        lasso_cv = LassoCV(cv = 5, normalize=True).fit(X_train,y_train)
        
        alpha = lasso_cv.alpha_
        alphas.append(alpha)
        
        regr = Lasso(alpha=alpha, normalize=True)  # Could try others, or other parameters?
        regr.fit(X_train, y_train.reshape(-1, 1))
        
        predictions = regr.predict(X_test)
        y_train_pred = regr.predict(X_train)
        mse_test = mean_squared_error(y_test, predictions)
        
        results.iat[i, 0] = name
        results.iat[i, 1] = regr.coef_[1]
        results.iat[i, 2] = regr.coef_[3]
        results.iat[i, 3] = regr.coef_[4]
        results.iat[i, 4] = regr.coef_[0]
        results.iat[i, 5] = regr.coef_[2]
        results.iat[i, 6] = regr.score(X_test, y_test)
        results.iat[i, 7] = mse_test

        i += 1

results_round = results.round({'Food': 4, 'Physical health':4, 'Public transport':4, 'Education':4,
             'House of worship':4, 'RSquared':4})
results_round.sort_values(by='RSquared', ascending=False).reset_index(drop=True)

Unnamed: 0,Health condition,Food,Physical health,Public transport,Education,Houses of worship,RSquared,MSE
0,Mammograms,-0.3298,-1.0164,-0.0,0.0691,0.705301,0.496,1.105436
1,Core women's health,0.0,3.4246,0.533,-1.4371,-1.756193,0.2726,28.951943
2,Routine checkups,0.1485,-2.3979,0.5169,1.515,1.916573,0.2528,9.792345
3,Obesity prevalence,0.0,-5.3847,-0.3848,3.3402,2.961345,0.2339,46.685688
4,High blood pressure prevalence,0.7142,-6.7168,0.0207,4.6741,3.45912,0.2299,68.003822
5,Core men's health,0.139,4.7395,0.8402,-2.6138,-2.172101,0.1927,42.586556
6,Poor sleep prevalence,-0.5867,-4.0842,-1.0984,2.5067,3.195386,0.1815,33.894499
7,Diabetes prevalence,0.3631,-3.7039,0.0,2.7296,1.39704,0.18,23.02588
8,Stroke prevalence,0.1357,-1.3538,0.0,0.9864,0.435177,0.163,2.919132
9,Asthma prevalence,-0.1491,-1.4195,-0.1684,0.8234,0.825877,0.1552,2.85923


<Figure size 1440x1080 with 0 Axes>

In [10]:
alphas = []
l1_ratios = []
i=0
results = results_nan.copy()

plt.figure(figsize=(20, 15))
plt.subplots_adjust(hspace=0.5)

for c in tracts_health_logt.columns:
    if c.endswith('CrudePrev'):
        name = name_mapping[c[:-10].lower()]
        
        x = np.log(tracts_health[desert_measures].to_numpy(copy=True)+1)
        scaler = StandardScaler()
        xscale = scaler.fit_transform(x)
        y = tracts_health_logt[c].to_numpy(copy=True)
        xscale = xscale[~np.isnan(y)]
        y = y[~np.isnan(y)]
        #create function to center data
        center_function = lambda x: x - x.mean()

        #apply function to original NumPy array
        y = center_function(y)
        
        X_train, X_test, y_train, y_test = train_test_split(xscale, 
                                                    y, 
                                                    test_size=0.3, 
                                                    random_state=42)
        
        #l1 ratio is from suggested values in ElasticNetCV documentation
        lasso_cv = LassoCV(cv = 5, normalize=True).fit(X_train,y_train)
        
        alpha = lasso_cv.alpha_
        alphas.append(alpha)
        
        regr = Lasso(alpha=alpha, normalize=True)  # Could try others, or other parameters?
        regr.fit(X_train, y_train.reshape(-1, 1))
        
        predictions = regr.predict(X_test)
        y_train_pred = regr.predict(X_train)
        mse_test = mean_squared_error(y_test, predictions)
        
        results.iat[i, 0] = name
        results.iat[i, 1] = regr.coef_[1]
        results.iat[i, 2] = regr.coef_[3]
        results.iat[i, 3] = regr.coef_[4]
        results.iat[i, 4] = regr.coef_[0]
        results.iat[i, 5] = regr.coef_[2]
        results.iat[i, 6] = regr.score(X_test, y_test)
        results.iat[i, 7] = mse_test

        i += 1

results_round = results.round({'Food': 4, 'Physical health':4, 'Public transport':4, 'Education':4,
             'House of worship':4, 'RSquared':4})
results_round.sort_values(by='RSquared', ascending=False).reset_index(drop=True)

Unnamed: 0,Health condition,Food,Physical health,Public transport,Education,Houses of worship,RSquared,MSE
0,Mammograms,-0.2812,-0.697,-0.177,-0.3498,0.463365,0.3413,1.444706
1,Cholesterol screenings,0.1393,-0.0,0.5067,0.0007,1.150573,0.1842,8.682611
2,Core men's health,0.0,3.2088,1.2172,-0.5095,-1.564761,0.1465,45.022378
3,Cervical cancer screenings,0.0,0.5272,0.4017,-0.0,0.600197,0.1262,8.05169
4,Routine checkups,0.0,-2.2174,0.2235,0.7024,2.627308,0.1156,11.591033
5,Poor mental health prevalence,-0.0,-0.9555,-0.7273,0.0602,0.0,0.1137,15.368493
6,Obesity prevalence,-0.0,-4.1302,-1.0365,1.1308,3.257497,0.1002,54.83232
7,High cholesterol prevalence,0.8529,-1.5876,0.8235,1.5285,1.67056,0.0976,18.325025
8,Dental checkups,0.0,5.36,2.2012,-1.6189,-2.573989,0.0776,165.236244
9,High blood pressure prevalence,0.0,-5.2841,-0.0,2.0366,4.358984,0.0765,81.549844


<Figure size 1440x1080 with 0 Axes>

In [11]:
alphas = []
l1_ratios = []
i=0
results = results_nan.copy()

plt.figure(figsize=(20, 15))
plt.subplots_adjust(hspace=0.5)

for c in tracts_health.columns:
    if c.endswith('CrudePrev'):
        name = name_mapping[c[:-10].lower()]
        
        x = tracts_health[desert_measures].to_numpy(copy=True)
        scaler = StandardScaler()
        xscale = scaler.fit_transform(x)
        y = tracts_health[c].to_numpy(copy=True)
        xscale = xscale[~np.isnan(y)]
        y = y[~np.isnan(y)]
        #create function to center data
        center_function = lambda x: x - x.mean()

        #apply function to original NumPy array
        y = center_function(y)
        
        #l1 ratio is from suggested values in ElasticNetCV documentation
        lasso_cv = LassoCV(cv = 5, normalize=True).fit(xscale,y)
        
        alpha = lasso_cv.alpha_
        alphas.append(alpha)
        
        regr = Lasso(alpha=alpha, normalize=True)  # Could try others, or other parameters?
        regr.fit(xscale, y.reshape(-1, 1))
        
        predictions = regr.predict(xscale)
        mse_test = mean_squared_error(y, predictions)
        
        results.iat[i, 0] = name
        results.iat[i, 1] = regr.coef_[1]
        results.iat[i, 2] = regr.coef_[3]
        results.iat[i, 3] = regr.coef_[4]
        results.iat[i, 4] = regr.coef_[0]
        results.iat[i, 5] = regr.coef_[2]
        results.iat[i, 6] = regr.score(X_test, y_test)
        results.iat[i, 7] = mse_test

        i += 1

results_round = results.round({'Food': 4, 'Physical health':4, 'Public transport':4, 'Education':4,
             'House of worship':4, 'RSquared':4}).sort_values(by='RSquared', ascending=False).reset_index(drop=True)
results_round.to_csv(save_results_name)
results_round

Unnamed: 0,Health condition,Food,Physical health,Public transport,Education,Houses of worship,RSquared,MSE
0,Poor mental health prevalence,-0.0,-2.0619,-0.7075,0.8717,0.750173,0.0937,14.641736
1,Smoking prevalence,0.2022,-3.1096,-1.0135,1.8386,1.611182,0.0549,28.092201
2,Health insurance access,0.5506,-3.6064,-1.4433,2.1183,1.752038,0.051,51.149097
3,Poor sleep prevalence,-0.126,-3.7958,-1.0859,1.7656,2.782301,0.0468,28.960101
4,Asthma prevalence,-0.0,-1.2328,-0.1878,0.5452,0.655569,0.0432,2.700116
5,Mammograms,-0.2452,-1.0111,0.0,0.0,0.665315,0.0394,1.217356
6,COPD prevalence,0.4366,-1.5438,-0.1121,1.0321,0.391139,0.0277,5.176842
7,Poor physical health,0.6465,-2.9319,-0.4417,1.8618,1.036275,0.026,16.908452
8,Stroke prevalence,0.2339,-1.3322,-0.0,0.8004,0.404737,0.0241,2.680505
9,Chronic kidney disease,0.163,-0.8695,-0.0,0.528,0.24782,0.0176,1.266482


<Figure size 1440x1080 with 0 Axes>