In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.linear_model import ElasticNet, ElasticNetCV, LassoCV, Lasso
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler, RobustScaler, scale
import seaborn as sns 

In [2]:
tracts_deserts_path = 'data/tract_desert_measures/atlanta_desert_tracts.csv'
place_abbr = 'atlanta'
save_data_name = 'data/tract_health' + place_abbr+'_tract_health.csv'
save_transf_data_name = 'data/tract_health' + place_abbr+'_tract_health_logt.csv'
save_results_name = 'data/tract_health/results' + place_abbr+'_tract_lasso_results.csv'

# Aggregation/Cleaning

In [3]:
tracts_deserts = pd.read_csv(tracts_deserts_path, dtype={'GEOID':'str'})
tracts_deserts = tracts_deserts.rename(columns={'GEOID':'TractFIPS'})

In [4]:
tracts_health = pd.read_csv("data/PLACES__Census_Tract_Data__GIS_Friendly_Format___2021_release.csv", dtype={'TractFIPS':'str'})
tracts_health = pd.merge(tracts_health, tracts_deserts, on='TractFIPS')
tracts_health.to_csv(save_data_name, index=False)

In [5]:
desert_measures = ['food_closest_travel_times', 'physical_closest_dist', 'transport_closest_dist', 'education_closest_travel_times', 'worship_closest_travel_times']

tracts_health_logt = tracts_health.copy()
tracts_health_logt[desert_measures] = tracts_health_logt[desert_measures].apply(lambda x: np.log(x+1))
tracts_health_logt.to_csv(save_transf_data_name, index=False)

# Analysis

In [6]:
name_mapping = {'access2': 'Health insurance access', 'arthritis': 'Arthritis prevalence', 'binge': 'Binge drinking prevalence',
               'bphigh': 'High blood pressure prevalence', 'bpmed': 'Medium blood pressure prevalence', 'cancer': 'Cancer prevalence',
               'casthma': 'Asthma prevalence', 'cervical': 'Cervical cancer screenings', 'chd': 'Coronary heart disease prevalence',
               'checkup': 'Routine checkups', 'cholscreen': 'Cholesterol screenings', 'colon_screen': 'Colon cancer screenings',
               'copd': 'COPD prevalence', 'corem': 'Core men\'s health', 'corew': 'Core women\'s health', 'csmoking': 'Smoking prevalence',
               'dental': 'Dental checkups', 'depression': 'Depression prevalence', 'diabetes': 'Diabetes prevalence', 'ghlth': 'General poor health prevalence',
               'highchol': 'High cholesterol prevalence', 'kidney': 'Chronic kidney disease', 'lpa': 'No physical activity', 'mammouse': 'Mammograms',
               'mhlth': 'Poor mental health prevalence', 'obesity': 'Obesity prevalence', 'phlth': 'Poor physical health', 'sleep': 'Poor sleep prevalence',
               'stroke': 'Stroke prevalence', 'teethlost': 'Teeth loss prevalence'}

results_nan = pd.DataFrame({'Health condition': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')], 'Food': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')],
                       'Physical health': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')], 'Public transport': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')],
                        'Education': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')], 'Houses of worship': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')],
                       'RSquared': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')], 'MSE': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')]})


In [7]:
tracts_health

Unnamed: 0,StateAbbr,StateDesc,CountyName,CountyFIPS,TractFIPS,TotalPopulation,ACCESS2_CrudePrev,ACCESS2_Crude95CI,ARTHRITIS_CrudePrev,ARTHRITIS_Crude95CI,...,STROKE_CrudePrev,STROKE_Crude95CI,TEETHLOST_CrudePrev,TEETHLOST_Crude95CI,Geolocation,food_closest_travel_times,physical_closest_dist,transport_closest_dist,education_closest_travel_times,worship_closest_travel_times
0,CA,California,San Francisco,6075,06075017601,7630,11.1,"( 9.6, 12.9)",17.4,"(16.6, 18.1)",...,2.7,"( 2.5, 3.0)",10.3,"( 7.3, 13.9)",POINT (-122.4107073 37.77944763),63.049397,0.000221,0.000221,59.742650,54.111760
1,CA,California,San Francisco,6075,06075016200,2541,7.9,"( 6.7, 9.4)",14.7,"(14.1, 15.3)",...,1.8,"( 1.7, 2.0)",6.0,"( 3.9, 9.0)",POINT (-122.4228953 37.77716215),92.836464,0.157229,0.052362,21.975210,38.325047
2,CA,California,San Francisco,6075,06075030202,4313,6.9,"( 5.7, 8.7)",12.9,"(12.4, 13.5)",...,1.4,"( 1.3, 1.6)",4.8,"( 2.9, 7.8)",POINT (-122.4673281 37.76308855),42.604160,0.282765,0.000159,0.000000,42.724552
3,CA,California,San Francisco,6075,06075016802,3264,8.5,"( 7.1, 10.5)",13.7,"(13.2, 14.4)",...,1.7,"( 1.6, 1.9)",7.6,"( 5.2, 11.1)",POINT (-122.4249484 37.77355128),61.082450,0.052283,0.104761,41.643530,13.327935
4,CA,California,San Francisco,6075,06075025500,8471,14.0,"(11.8, 16.2)",16.8,"(16.2, 17.5)",...,2.5,"( 2.3, 2.7)",8.3,"( 5.9, 10.9)",POINT (-122.4372418 37.7280668),129.039660,0.443646,0.201853,31.619670,31.619670
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,CA,California,San Francisco,6075,06075017700,2128,10.7,"( 8.9, 13.0)",12.2,"(11.7, 12.7)",...,1.5,"( 1.4, 1.7)",6.2,"( 4.3, 8.6)",POINT (-122.4124937 37.76804375),42.746006,0.339886,0.147534,60.830540,100.488400
191,CA,California,San Francisco,6075,06075025900,4261,12.5,"(10.6, 14.8)",19.3,"(18.6, 20.1)",...,3.1,"( 2.8, 3.4)",8.6,"( 5.8, 12.3)",POINT (-122.4109715 37.72348743),128.438160,0.182158,0.090931,47.924946,39.624947
192,CA,California,San Francisco,6075,06075022903,3384,13.9,"(11.1, 17.0)",13.4,"(12.8, 14.0)",...,1.8,"( 1.6, 1.9)",6.5,"( 4.1, 9.6)",POINT (-122.4062321 37.75175786),67.561650,0.094492,0.000205,28.268590,92.972630
193,CA,California,San Francisco,6075,06075015300,2040,7.2,"( 5.5, 9.6)",15.1,"(14.4, 15.8)",...,1.8,"( 1.7, 2.1)",5.7,"( 3.4, 9.1)",POINT (-122.439345 37.78676373),102.269714,0.178350,0.007977,0.000000,34.317577


In [8]:
alphas = []
l1_ratios = []
i=0
results = results_nan.copy()

plt.figure(figsize=(20, 15))
plt.subplots_adjust(hspace=0.5)

for c in tracts_health_logt.columns:
    if c.endswith('CrudePrev'):
        name = name_mapping[c[:-10].lower()]
        
        x = tracts_health_logt[desert_measures].to_numpy(copy=True)
        scaler = StandardScaler()
        xscale = scaler.fit_transform(x)
        y = tracts_health_logt[c].to_numpy(copy=True)
        xscale = xscale[~np.isnan(y)]
        y = y[~np.isnan(y)]
        #create function to center data
        center_function = lambda x: x - x.mean()

        #apply function to original NumPy array
        data_centered = center_function(y)
        
        X_train, X_test, y_train, y_test = train_test_split(xscale, 
                                                    y, 
                                                    test_size=0.25, 
                                                    random_state=42)
        
        #l1 ratio is from suggested values in ElasticNetCV documentation
        enet_cv = ElasticNetCV(l1_ratio = [.1, .5, .7, .9, .95, .99, 1], 
                                     cv = 10, normalize=True).fit(X_train,y_train)
        
        alpha = enet_cv.alpha_
        l1 = enet_cv.l1_ratio_
        alphas.append(alpha)
        l1_ratios.append(l1)
        
        regr = ElasticNet(alpha=alpha, l1_ratio = l1, normalize=True)  # Could try others, or other parameters?
        regr.fit(X_train, y_train.reshape(-1, 1))
        
        predictions = regr.predict(X_test)
        y_train_pred = regr.predict(X_train)
        mse_test = mean_squared_error(y_test, predictions)
        
        results.iat[i, 0] = name
        results.iat[i, 1] = regr.coef_[1]
        results.iat[i, 2] = regr.coef_[3]
        results.iat[i, 3] = regr.coef_[4]
        results.iat[i, 4] = regr.coef_[0]
        results.iat[i, 5] = regr.coef_[2]
        results.iat[i, 6] = regr.score(X_test, y_test)
        results.iat[i, 7] = mse_test
            
        i += 1

results_round = results.round({'Food': 4, 'Physical health':4, 'Public transport':4, 'Education':4,
             'House of worship':4, 'RSquared':4})
results_round.sort_values(by='RSquared', ascending=False).reset_index(drop=True)

Unnamed: 0,Health condition,Food,Physical health,Public transport,Education,Houses of worship,RSquared,MSE
0,Obesity prevalence,-0.1604,0.1417,0.1806,-0.0215,-0.06553237,0.0635,3.934956
1,Depression prevalence,-0.2942,-0.0105,0.2193,-0.065,-0.1982156,0.0307,3.30696
2,Cholesterol screenings,0.2164,-0.022,-0.1946,0.1369,0.3560348,0.0218,6.633779
3,Health insurance access,0.0,0.0,-0.0,-0.0,-0.0,-0.005,18.500785
4,Poor mental health prevalence,-0.1001,0.0498,0.0487,-0.0212,-0.09367039,-0.0084,2.419926
5,Asthma prevalence,-0.0442,0.0192,0.0237,0.0,-0.01631811,-0.0129,0.420385
6,Core men's health,-0.0,-0.0,0.0,0.0,0.0,-0.0155,14.13593
7,Core women's health,-0.0,-0.0,0.0,0.0,0.0,-0.0163,16.38478
8,Cancer prevalence,0.0,0.0,-0.0,-0.0,9.753021e-19,-0.0194,1.638241
9,Poor sleep prevalence,0.0,0.0,-0.0,0.0,-0.0,-0.02,7.460701


<Figure size 1440x1080 with 0 Axes>

In [9]:
alphas = []
l1_ratios = []
i=0
results = results_nan.copy()

plt.figure(figsize=(20, 15))
plt.subplots_adjust(hspace=0.5)

for c in tracts_health.columns:
    if c.endswith('CrudePrev'):
        name = name_mapping[c[:-10].lower()]
        
        x = tracts_health[desert_measures].to_numpy(copy=True)
        scaler = StandardScaler()
        xscale = scaler.fit_transform(x)
        y = tracts_health[c].to_numpy(copy=True)
        xscale = xscale[~np.isnan(y)]
        y = y[~np.isnan(y)]
        #create function to center data
        center_function = lambda x: x - x.mean()

        #apply function to original NumPy array
        y = center_function(y)
        
        X_train, X_test, y_train, y_test = train_test_split(xscale, 
                                                    y, 
                                                    test_size=0.3, 
                                                    random_state=42)
        
        #l1 ratio is from suggested values in ElasticNetCV documentation
        lasso_cv = LassoCV(cv = 5, normalize=True).fit(X_train,y_train)
        
        alpha = lasso_cv.alpha_
        alphas.append(alpha)
        
        regr = Lasso(alpha=alpha, normalize=True)  # Could try others, or other parameters?
        regr.fit(X_train, y_train.reshape(-1, 1))
        
        predictions = regr.predict(X_test)
        y_train_pred = regr.predict(X_train)
        mse_test = mean_squared_error(y_test, predictions)
        
        results.iat[i, 0] = name
        results.iat[i, 1] = regr.coef_[1]
        results.iat[i, 2] = regr.coef_[3]
        results.iat[i, 3] = regr.coef_[4]
        results.iat[i, 4] = regr.coef_[0]
        results.iat[i, 5] = regr.coef_[2]
        results.iat[i, 6] = regr.score(X_test, y_test)
        results.iat[i, 7] = mse_test

        i += 1

results_round = results.round({'Food': 4, 'Physical health':4, 'Public transport':4, 'Education':4,
             'House of worship':4, 'RSquared':4})
results_round.sort_values(by='RSquared', ascending=False).reset_index(drop=True)

Unnamed: 0,Health condition,Food,Physical health,Public transport,Education,Houses of worship,RSquared,MSE
0,Poor sleep prevalence,0.1286,0.2789,-0.6704,0.9596,-0.406477,0.1236,7.176381
1,Smoking prevalence,-0.2826,0.4479,-0.4581,0.6058,-0.456061,0.0638,4.66672
2,Colon cancer screenings,-0.3202,-0.4151,1.1461,-1.1366,0.981981,0.0393,26.213198
3,Dental checkups,0.0,-0.9576,1.4197,-1.2906,1.524499,0.0359,57.083825
4,Core men's health,-0.0355,-0.3671,0.8312,-0.7585,0.73996,0.0295,14.748725
5,Teeth loss prevalence,-0.3542,0.6331,-0.7267,0.8001,-0.74973,0.0203,10.356029
6,Depression prevalence,-0.2295,0.0,0.1055,0.0,-0.0,0.0095,3.228459
7,Poor mental health prevalence,-0.2177,0.0061,0.0,0.3865,-0.265812,0.0065,2.435878
8,Health insurance access,0.0,0.0,-0.0,0.0,-0.0,-0.0009,18.761068
9,Core women's health,-0.0,-0.0,0.0,-0.0,0.0,-0.0036,16.749602


<Figure size 1440x1080 with 0 Axes>

In [10]:
alphas = []
l1_ratios = []
i=0
results = results_nan.copy()

plt.figure(figsize=(20, 15))
plt.subplots_adjust(hspace=0.5)

for c in tracts_health_logt.columns:
    if c.endswith('CrudePrev'):
        name = name_mapping[c[:-10].lower()]
        
        x = np.log(tracts_health[desert_measures].to_numpy(copy=True)+1)
        scaler = StandardScaler()
        xscale = scaler.fit_transform(x)
        y = tracts_health_logt[c].to_numpy(copy=True)
        xscale = xscale[~np.isnan(y)]
        y = y[~np.isnan(y)]
        #create function to center data
        center_function = lambda x: x - x.mean()

        #apply function to original NumPy array
        y = center_function(y)
        
        X_train, X_test, y_train, y_test = train_test_split(xscale, 
                                                    y, 
                                                    test_size=0.3, 
                                                    random_state=42)
        
        #l1 ratio is from suggested values in ElasticNetCV documentation
        lasso_cv = LassoCV(cv = 5, normalize=True).fit(X_train,y_train)
        
        alpha = lasso_cv.alpha_
        alphas.append(alpha)
        
        regr = Lasso(alpha=alpha, normalize=True)  # Could try others, or other parameters?
        regr.fit(X_train, y_train.reshape(-1, 1))
        
        predictions = regr.predict(X_test)
        y_train_pred = regr.predict(X_train)
        mse_test = mean_squared_error(y_test, predictions)
        
        results.iat[i, 0] = name
        results.iat[i, 1] = regr.coef_[1]
        results.iat[i, 2] = regr.coef_[3]
        results.iat[i, 3] = regr.coef_[4]
        results.iat[i, 4] = regr.coef_[0]
        results.iat[i, 5] = regr.coef_[2]
        results.iat[i, 6] = regr.score(X_test, y_test)
        results.iat[i, 7] = mse_test

        i += 1

results_round = results.round({'Food': 4, 'Physical health':4, 'Public transport':4, 'Education':4,
             'House of worship':4, 'RSquared':4})
results_round.sort_values(by='RSquared', ascending=False).reset_index(drop=True)

Unnamed: 0,Health condition,Food,Physical health,Public transport,Education,Houses of worship,RSquared,MSE
0,Obesity prevalence,-0.2932,0.2425,0.3032,-0.0,-0.01125441,0.043,4.337909
1,Depression prevalence,-0.3289,-0.0,0.1858,-0.0,-0.0616412,0.0051,3.242945
2,Poor sleep prevalence,0.0,0.0,-0.0,-0.0,-0.0,-0.0001,8.189117
3,Colon cancer screenings,-0.0,-0.0,0.0,0.0,1.139132e-15,-0.0003,27.292972
4,Health insurance access,0.0,0.0,-0.0,-0.0,-0.0,-0.0009,18.761068
5,Cholesterol screenings,0.1954,-0.0,-0.2895,0.1797,0.7225145,-0.0032,6.713107
6,Core women's health,-0.0,-0.0,0.0,0.0,7.747646000000001e-17,-0.0036,16.749602
7,No physical activity,-0.0,0.0,-0.0,-0.0,-0.0,-0.0088,32.394534
8,High cholesterol prevalence,0.0,0.0,-0.0,-0.0,0.0,-0.0112,16.328643
9,Diabetes prevalence,0.0,0.0,-0.0,-0.0,-0.0,-0.0124,10.369728


<Figure size 1440x1080 with 0 Axes>

In [11]:
alphas = []
l1_ratios = []
i=0
results = results_nan.copy()

plt.figure(figsize=(20, 15))
plt.subplots_adjust(hspace=0.5)

for c in tracts_health.columns:
    if c.endswith('CrudePrev'):
        name = name_mapping[c[:-10].lower()]
        
        x = tracts_health[desert_measures].to_numpy(copy=True)
        scaler = StandardScaler()
        xscale = scaler.fit_transform(x)
        y = tracts_health[c].to_numpy(copy=True)
        xscale = xscale[~np.isnan(y)]
        y = y[~np.isnan(y)]
        #create function to center data
        center_function = lambda x: x - x.mean()

        #apply function to original NumPy array
        y = center_function(y)
        
        #l1 ratio is from suggested values in ElasticNetCV documentation
        lasso_cv = LassoCV(cv = 5, normalize=True).fit(xscale,y)
        
        alpha = lasso_cv.alpha_
        alphas.append(alpha)
        
        regr = Lasso(alpha=alpha, normalize=True)  # Could try others, or other parameters?
        regr.fit(xscale, y.reshape(-1, 1))
        
        predictions = regr.predict(xscale)
        mse_test = mean_squared_error(y, predictions)
        
        results.iat[i, 0] = name
        results.iat[i, 1] = regr.coef_[1]
        results.iat[i, 2] = regr.coef_[3]
        results.iat[i, 3] = regr.coef_[4]
        results.iat[i, 4] = regr.coef_[0]
        results.iat[i, 5] = regr.coef_[2]
        results.iat[i, 6] = regr.score(X_test, y_test)
        results.iat[i, 7] = mse_test

        i += 1

results_round = results.round({'Food': 4, 'Physical health':4, 'Public transport':4, 'Education':4,
             'House of worship':4, 'RSquared':4}).sort_values(by='RSquared', ascending=False).reset_index(drop=True)
results_round.to_csv(save_results_name)
results_round

Unnamed: 0,Health condition,Food,Physical health,Public transport,Education,Houses of worship,RSquared,MSE
0,Teeth loss prevalence,-0.1952,0.2385,-0.2013,0.8071,-0.7056058,0.0599,21.194859
1,No physical activity,-0.0,0.0,-0.2967,1.0125,-0.4962532,0.0561,40.342329
2,Health insurance access,0.0,0.0,-0.0,0.588,-0.405796,0.0512,19.706188
3,General poor health prevalence,-0.2406,0.3909,-0.5617,1.0449,-0.7359876,0.0368,33.859553
4,Smoking prevalence,-0.225,0.3269,-0.3214,0.6662,-0.4445138,0.0339,7.998354
5,Poor mental health prevalence,-0.2243,0.0401,0.0,0.3142,-0.3109198,0.0314,4.435516
6,Poor physical health,-0.2571,0.3015,-0.3014,0.4658,-0.3722701,0.0177,7.525791
7,Poor sleep prevalence,0.0501,0.0,-0.3128,0.8627,-0.1295721,0.0153,10.452071
8,Obesity prevalence,-0.5049,0.4574,-0.0734,0.3775,-0.252414,-0.0008,8.097444
9,Asthma prevalence,-0.1633,0.0572,0.0,0.1271,-0.09394009,-0.0086,0.845354


<Figure size 1440x1080 with 0 Axes>