In [102]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.linear_model import ElasticNet, ElasticNetCV, LassoCV, Lasso
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler, RobustScaler, scale, minmax_scale
import seaborn as sns 

In [138]:
#'atlanta'
#'houston'
#'nyc'
#'cinci'
#'seattle'
place_abbr = 'sf'

tracts_deserts_path = 'data/tract_desert_measures/sf_desert_tracts.csv'

save_data_dir = 'data/tract_health/'
save_data_name =  place_abbr+'_tract_health.csv'
save_transf_data_name =  place_abbr+'_tract_health_logt.csv'

save_regression_path = 'data/tract_health/results/'
save_lasso_name = place_abbr+'_tract_lasso_results'
save_enet_name = place_abbr+'_tract_enet_results'

# Aggregation/Cleaning

In [154]:
tracts_deserts.

Unnamed: 0,food_closest_travel_times,physical_closest_dist,transport_closest_dist,education_closest_travel_times,worship_closest_travel_times,TractFIPS
0,55.082780,0.024852,0.000259,0.000000,116.293950,06075010100
1,104.510900,0.106618,0.000181,109.609030,63.006050,06075010200
2,97.889350,0.000234,0.210090,16.621742,74.510390,06075010300
3,125.605340,0.046652,0.046652,66.053140,50.311584,06075010400
4,121.480840,0.000211,0.000211,87.040130,159.234540,06075010500
...,...,...,...,...,...,...
192,84.441340,0.285294,0.015445,133.434310,55.752556,06075980401
193,250.735890,0.000360,0.000360,54.229710,145.571400,06075980501
194,280.894650,0.279868,0.372384,159.898420,96.200000,06075980600
195,62.344883,0.118217,0.155635,99.950930,99.950930,06075980900


In [139]:
tracts_deserts = pd.read_csv(tracts_deserts_path, dtype={'GEOID':'str'})
tracts_deserts = tracts_deserts.rename(columns={'GEOID':'TractFIPS'})

In [140]:
tracts_health = pd.read_csv("data/PLACES__Census_Tract_Data__GIS_Friendly_Format___2021_release.csv", dtype={'TractFIPS':'str'})
tracts_health = pd.merge(tracts_health, tracts_deserts, on='TractFIPS')

income_vars = pd.read_csv('data/tract_incomevars.csv', dtype=str).astype({'PovertyRate':'float', 'TractFIPS':'str'})
tracts_health = pd.merge(tracts_health, income_vars, on='TractFIPS')

tracts_health.to_csv(save_data_dir+save_data_name, index=False)

In [141]:
desert_measures = ['food_closest_travel_times', 'physical_closest_dist', 'transport_closest_dist', 'education_closest_travel_times', 'worship_closest_travel_times']

tracts_health_logt = tracts_health.copy()
tracts_health_logt[desert_measures] = tracts_health_logt[desert_measures].apply(lambda x: np.log(x+1))
tracts_health_logt.to_csv(save_data_dir + save_transf_data_name, index=False)

In [152]:
tracts_health

Unnamed: 0,StateAbbr,StateDesc,CountyName,CountyFIPS,TractFIPS,TotalPopulation,ACCESS2_CrudePrev,ACCESS2_Crude95CI,ARTHRITIS_CrudePrev,ARTHRITIS_Crude95CI,...,TEETHLOST_Crude95CI,Geolocation,food_closest_travel_times,physical_closest_dist,transport_closest_dist,education_closest_travel_times,worship_closest_travel_times,LowIncomeTracts,PovertyRate,MedianFamilyIncome
0,CA,California,San Francisco,6075,06075017601,7630,11.1,"( 9.6, 12.9)",17.4,"(16.6, 18.1)",...,"( 7.3, 13.9)",POINT (-122.4107073 37.77944763),63.049397,0.000221,0.000221,59.742650,54.111760,1,22.977066,66154.0
1,CA,California,San Francisco,6075,06075016200,2541,7.9,"( 6.7, 9.4)",14.7,"(14.1, 15.3)",...,"( 3.9, 9.0)",POINT (-122.4228953 37.77716215),92.836464,0.157229,0.052362,21.975210,38.325047,0,8.665873,210042.0
2,CA,California,San Francisco,6075,06075030202,4313,6.9,"( 5.7, 8.7)",12.9,"(12.4, 13.5)",...,"( 2.9, 7.8)",POINT (-122.4673281 37.76308855),42.604160,0.282765,0.000159,0.000000,42.724552,0,12.881658,159695.0
3,CA,California,San Francisco,6075,06075016802,3264,8.5,"( 7.1, 10.5)",13.7,"(13.2, 14.4)",...,"( 5.2, 11.1)",POINT (-122.4249484 37.77355128),61.082450,0.052283,0.104761,41.643530,13.327935,0,13.579906,146875.0
4,CA,California,San Francisco,6075,06075025500,8471,14.0,"(11.8, 16.2)",16.8,"(16.2, 17.5)",...,"( 5.9, 10.9)",POINT (-122.4372418 37.7280668),129.039660,0.443646,0.201853,31.619670,31.619670,0,7.553366,116157.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,CA,California,San Francisco,6075,06075017700,2128,10.7,"( 8.9, 13.0)",12.2,"(11.7, 12.7)",...,"( 4.3, 8.6)",POINT (-122.4124937 37.76804375),42.746006,0.339886,0.147534,60.830540,100.488400,0,9.927984,111458.0
191,CA,California,San Francisco,6075,06075025900,4261,12.5,"(10.6, 14.8)",19.3,"(18.6, 20.1)",...,"( 5.8, 12.3)",POINT (-122.4109715 37.72348743),128.438160,0.182158,0.090931,47.924946,39.624947,0,5.406529,125430.0
192,CA,California,San Francisco,6075,06075022903,3384,13.9,"(11.1, 17.0)",13.4,"(12.8, 14.0)",...,"( 4.1, 9.6)",POINT (-122.4062321 37.75175786),67.561650,0.094492,0.000205,28.268590,92.972630,1,5.678233,93359.0
193,CA,California,San Francisco,6075,06075015300,2040,7.2,"( 5.5, 9.6)",15.1,"(14.4, 15.8)",...,"( 3.4, 9.1)",POINT (-122.439345 37.78676373),102.269714,0.178350,0.007977,0.000000,34.317577,0,5.812854,143750.0


# Analysis

In [143]:
name_mapping = {'access2': 'Health insurance access', 'arthritis': 'Arthritis prevalence', 'binge': 'Binge drinking prevalence',
               'bphigh': 'High blood pressure prevalence', 'bpmed': 'Medium blood pressure prevalence', 'cancer': 'Cancer prevalence',
               'casthma': 'Asthma prevalence', 'cervical': 'Cervical cancer screenings', 'chd': 'Coronary heart disease prevalence',
               'checkup': 'Routine checkups', 'cholscreen': 'Cholesterol screenings', 'colon_screen': 'Colon cancer screenings',
               'copd': 'COPD prevalence', 'corem': 'Core men\'s health', 'corew': 'Core women\'s health', 'csmoking': 'Smoking prevalence',
               'dental': 'Dental checkups', 'depression': 'Depression prevalence', 'diabetes': 'Diabetes prevalence', 'ghlth': 'General poor health prevalence',
               'highchol': 'High cholesterol prevalence', 'kidney': 'Chronic kidney disease', 'lpa': 'No physical activity', 'mammouse': 'Mammograms',
               'mhlth': 'Poor mental health prevalence', 'obesity': 'Obesity prevalence', 'phlth': 'Poor physical health', 'sleep': 'Poor sleep prevalence',
               'stroke': 'Stroke prevalence', 'teethlost': 'Teeth loss prevalence'}

results_all_nan = pd.DataFrame({'Health condition': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')], 'Food': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')],
                       'Physical health': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')], 'Public transport': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')],
                        'Education': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')], 'Houses of worship': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')],
                       'Poverty Rate': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')], 'RSquared': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')], 
                            'MSE': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')]})
results_nan = pd.DataFrame({'Health condition': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')], 'Food': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')],
                       'Physical health': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')], 'Poverty Rate': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')], 
                            'RSquared': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')], 
                            'MSE': [np.nan for c in tracts_health.columns if c.endswith('CrudePrev')]})
add_vars = ['PovertyRate']

In [144]:
def update_all_vars():
        results.iat[i, 0] = name
        results.iat[i, 1] = regr.coef_[0]
        results.iat[i, 2] = regr.coef_[1]
        results.iat[i, 3] = regr.coef_[2]
        results.iat[i, 4] = regr.coef_[3]
        results.iat[i, 5] = regr.coef_[4]
        results.iat[i, 6] = regr.coef_[5]
        results.iat[i, 7] = regr.score(X_test, y_test)
        results.iat[i, 8] = mse_test
def update_vars():
        results.iat[i, 0] = name
        results.iat[i, 1] = regr.coef_[0]
        results.iat[i, 2] = regr.coef_[1]
        results.iat[i, 3] = regr.coef_[2]
        results.iat[i, 4] = regr.score(X_test, y_test)
        results.iat[i, 5] = mse_test

## Run LASSO regression:
- For all deserts types; *all_deserts = True*
- Food and physical activity deserts only; *all_deserts = False*
- With log transformed desert measures; *log_transf = True*
- Without log transformed desert measures; *log_transf = False*

In [145]:
#set desert and log transformation params
all_deserts=True
log_transf = False

In [151]:
if all_deserts == True:
    desert_measures = ['food_closest_travel_times', 'physical_closest_dist', 'transport_closest_dist', 'education_closest_travel_times', 'worship_closest_travel_times']
    results = results_all_nan.copy()
    update_method = update_all_vars
else:
    desert_measures = ['food_closest_travel_times', 'physical_closest_dist']
    results = results_nan.copy()
    update_method = update_vars
if log_transf == True:
    df = tracts_health_logt
else:
    df = tracts_health
alphas = []
l1_ratios = []
i=0


for c in df.columns:
    if c.endswith('CrudePrev'):
        name = name_mapping[c[:-10].lower()]
        
        x = df[desert_measures+add_vars].to_numpy(copy=True)
        scaler = StandardScaler()
        xscale = scaler.fit_transform(x)
        y = df[c].to_numpy(copy=True)
        xscale = xscale[~np.isnan(y)]
        y = scale(y[~np.isnan(y)])
        #create function to center data
        center_function = lambda x: x - x.mean()

        #apply function to original NumPy array
        y = center_function(y)
        
        X_train, X_test, y_train, y_test = train_test_split(xscale, 
                                                    y, 
                                                    test_size=0.3, 
                                                    random_state=42)
        
        #l1 ratio is from suggested values in ElasticNetCV documentation
        lasso_cv = LassoCV(cv = 5, normalize=True).fit(X_train,y_train)
        
        alpha = lasso_cv.alpha_
        alphas.append(alpha)
        
        regr = Lasso(alpha=alpha, normalize=True)  # Could try others, or other parameters?
        regr.fit(X_train, y_train.reshape(-1, 1))
        
        predictions = regr.predict(X_test)
        y_train_pred = regr.predict(X_train)
        mse_test = mean_squared_error(y_test, predictions)
        
        if all_deserts==True:
            update_all_vars()
        else:
            update_vars()

        i += 1

results_round = results.round({'Food': 4, 'Physical health':4, 'Public transport':4, 'Education':4,
             'House of worship':4, 'RSquared':4})
results_round.sort_values(by='MSE').reset_index(drop=True)

Unnamed: 0,Health condition,Food,Physical health,Poverty Rate,RSquared,MSE
0,Poor mental health prevalence,0.0468,-0.0084,0.871347,0.5967,0.206538
1,Teeth loss prevalence,-0.0259,0.0532,0.839325,0.4309,0.265918
2,Smoking prevalence,-0.0126,0.0606,0.815526,0.4838,0.299235
3,Asthma prevalence,0.116,-0.0737,0.725573,0.2165,0.337702
4,Dental checkups,0.0713,-0.1341,-0.819184,0.3921,0.410343
5,Poor physical health,-0.0935,0.0313,0.748746,0.2717,0.440784
6,Core women's health,0.0449,-0.1531,-0.731534,0.5981,0.443409
7,Poor sleep prevalence,0.0,0.1715,0.730648,0.3534,0.46602
8,Obesity prevalence,0.1194,-0.0675,0.583161,0.0819,0.47502
9,COPD prevalence,-0.1031,0.0,0.654041,0.1527,0.475655


## LASSO on Whole Dataset

In [148]:
#set desert and log transformation params
all_deserts=False
log_transf = False

In [149]:
if all_deserts == True:
    desert_measures = ['food_closest_travel_times', 'physical_closest_dist', 'transport_closest_dist', 'education_closest_travel_times', 'worship_closest_travel_times']
    results = results_all_nan.copy()
    update_method = update_all_vars
else:
    desert_measures = ['food_closest_travel_times', 'physical_closest_dist']
    results = results_nan.copy()
    update_method = update_vars
if log_transf == True:
    df = tracts_health_logt
else:
    df = tracts_health
alphas = []
l1_ratios = []
i=0


for c in df.columns:
    if c.endswith('CrudePrev'):
        name = name_mapping[c[:-10].lower()]
        
        x = df[desert_measures+add_vars].to_numpy(copy=True)
        scaler = StandardScaler()
        xscale = scaler.fit_transform(x)
        y = df[c].to_numpy(copy=True)
        xscale = xscale[~np.isnan(y)]
        y = scale(y[~np.isnan(y)])
        #create function to center data
        center_function = lambda x: x - x.mean()

        #apply function to original NumPy array
        y = center_function(y)
        
        X_train, X_test, y_train, y_test = train_test_split(xscale, 
                                                    y, 
                                                    test_size=0.3, 
                                                    random_state=42)
        
        #l1 ratio is from suggested values in ElasticNetCV documentation
        lasso_cv = LassoCV(cv = 5, normalize=True).fit(X_train,y_train)
        
        alpha = lasso_cv.alpha_
        alphas.append(alpha)
        
        regr = Lasso(alpha=alpha, normalize=True)  # Could try others, or other parameters?
        regr.fit(xscale, y.reshape(-1, 1))
        
        predictions = regr.predict(xscale)
        mse_test = mean_squared_error(y, predictions)
        
        if all_deserts==True:
            update_all_vars()
        else:
            update_vars()

        i += 1

results_round = results.round({'Food': 4, 'Physical health':4, 'Public transport':4, 'Education':4,
             'House of worship':4, 'RSquared':4})
results_round.sort_values(by='MSE').reset_index(drop=True)

Unnamed: 0,Health condition,Food,Physical health,Poverty Rate,RSquared,MSE
0,Poor mental health prevalence,0.0366,-0.0149,0.845967,0.6126,0.268632
1,Teeth loss prevalence,0.0097,0.009,0.781683,0.5,0.321957
2,Dental checkups,-0.0214,-0.0813,-0.779813,0.5141,0.351581
3,Smoking prevalence,0.0573,0.0332,0.779897,0.5589,0.362814
4,Core men's health,-0.0209,-0.1299,-0.794118,0.4429,0.37877
5,Core women's health,0.0,-0.0943,-0.766239,0.6388,0.400689
6,Poor sleep prevalence,0.1049,0.1276,0.682932,0.4702,0.433442
7,Poor physical health,0.0,0.0,0.699944,0.404,0.450187
8,Colon cancer screenings,-0.0397,-0.1334,-0.68748,0.4262,0.482173
9,Asthma prevalence,0.0452,-0.0839,0.692264,0.3041,0.483503


### Save Output:

In [150]:
if all_deserts == True:
    save_lasso_name += '_all'
if log_transf == True:
    save_lasso_name += '_logt'
results_round.to_csv(save_regression_path+save_lasso_name+'.csv', index=False)