In [16]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.linear_model import  LassoCV, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler, scale

In [2]:
####################################### SET CITY VARS HERE ###########################################################

place_abbr = 'cinci'
#place_abbr = 'seattle'
#place_abbr = 'nyc'
#place_abbr = 'houston'
#place_abbr = 'sf'
#place_abbr = 'atlanta'

tracts_deserts_path = 'data/tract_desert_measures/cinci_desert_tracts.csv'

save_data_dir = 'data/zcta_health/'
save_data_name =  place_abbr+'_zcta_health.csv'
save_transf_data_name =  place_abbr+'_zcta_health_logt.csv'

save_regression_path = 'data/zcta_health/results/'
save_lasso_name = place_abbr+'_zcta_lasso_results'

# Aggregation/Cleaning

In [3]:
tracts_deserts = pd.read_csv(tracts_deserts_path, dtype={'GEOID':'str'})

In [4]:
zip_to_tract = pd.read_csv("data/zcta_to_tract10.csv", dtype=str)
zip_to_tract = zip_to_tract[['GEOID', 'ZCTA5']]

In [5]:
tracts_zcta_deserts = pd.merge(tracts_deserts, zip_to_tract, on='GEOID').groupby('ZCTA5', as_index=False).median()

In [6]:
zcta_health = pd.read_csv("data/PLACES__ZCTA_Data__GIS_Friendly_Format___2021_release.csv", dtype={'ZCTA5':'str', 'GEOID':'str'})
zcta_health = pd.merge(zcta_health, tracts_zcta_deserts, on='ZCTA5')

income_vars = pd.read_csv('data/ZCTA_median_incomes.csv', dtype=str).rename(columns={'ZCTA':'ZCTA5'})
zcta_health = pd.merge(zcta_health, income_vars, on='ZCTA5')

zcta_health.to_csv(save_data_dir+save_data_name, index=False)

In [7]:
zcta_health.loc[zcta_health['Household median income'] == '-', 'Household median income'] = 'nan'
zcta_health['Household median income'] = zcta_health['Household median income'].str.replace('+', '').str.replace(',','')
zcta_health = zcta_health.astype({'Household median income':'float'})
zcta_health['Household median income'] = zcta_health['Household median income'].fillna(zcta_health['Household median income'].median())

  zcta_health['Household median income'] = zcta_health['Household median income'].str.replace('+', '').str.replace(',','')


In [8]:
desert_measures = ['food_closest_travel_times', 'physical_closest_dist', 'transport_closest_dist', 'education_closest_travel_times', 'worship_closest_travel_times']

zcta_health_logt = zcta_health.copy()
zcta_health_logt[desert_measures] = zcta_health_logt[desert_measures].apply(lambda x: np.log(x+1))
zcta_health_logt.to_csv(save_transf_data_name, index=False)

# Analysis

In [9]:
name_mapping = {'access2': 'Health insurance access', 'arthritis': 'Arthritis prevalence', 'binge': 'Binge drinking prevalence',
               'bphigh': 'High blood pressure prevalence', 'bpmed': 'Medium blood pressure prevalence', 'cancer': 'Cancer prevalence',
               'casthma': 'Asthma prevalence', 'cervical': 'Cervical cancer screenings', 'chd': 'Coronary heart disease prevalence',
               'checkup': 'Routine checkups', 'cholscreen': 'Cholesterol screenings', 'colon_screen': 'Colon cancer screenings',
               'copd': 'COPD prevalence', 'corem': 'Core men\'s health', 'corew': 'Core women\'s health', 'csmoking': 'Smoking prevalence',
               'dental': 'Dental checkups', 'depression': 'Depression prevalence', 'diabetes': 'Diabetes prevalence', 'ghlth': 'General poor health prevalence',
               'highchol': 'High cholesterol prevalence', 'kidney': 'Chronic kidney disease', 'lpa': 'No physical activity', 'mammouse': 'Mammograms',
               'mhlth': 'Poor mental health prevalence', 'obesity': 'Obesity prevalence', 'phlth': 'Poor physical health', 'sleep': 'Poor sleep prevalence',
               'stroke': 'Stroke prevalence', 'teethlost': 'Teeth loss prevalence'}

results_all_nan = pd.DataFrame({'Health condition': [np.nan for c in zcta_health.columns if c.endswith('CrudePrev')], 'Food': [np.nan for c in zcta_health.columns if c.endswith('CrudePrev')],
                       'Physical health': [np.nan for c in zcta_health.columns if c.endswith('CrudePrev')], 'Public transport': [np.nan for c in zcta_health.columns if c.endswith('CrudePrev')],
                        'Education': [np.nan for c in zcta_health.columns if c.endswith('CrudePrev')], 'Houses of worship': [np.nan for c in zcta_health.columns if c.endswith('CrudePrev')],
                       'Household Median Income': [np.nan for c in zcta_health.columns if c.endswith('CrudePrev')], 'RSquared': [np.nan for c in zcta_health.columns if c.endswith('CrudePrev')], 
                            'MSE': [np.nan for c in zcta_health.columns if c.endswith('CrudePrev')]})

results_nan = pd.DataFrame({'Health condition': [np.nan for c in zcta_health.columns if c.endswith('CrudePrev')], 'Food': [np.nan for c in zcta_health.columns if c.endswith('CrudePrev')],
                       'Physical health': [np.nan for c in zcta_health.columns if c.endswith('CrudePrev')], 'Household Median Income': [np.nan for c in zcta_health.columns if c.endswith('CrudePrev')],
                       'RSquared': [np.nan for c in zcta_health.columns if c.endswith('CrudePrev')], 'MSE': [np.nan for c in zcta_health.columns if c.endswith('CrudePrev')]})

desert_measures = ['food_closest_travel_times', 'physical_closest_dist', 'transport_closest_dist', 'education_closest_travel_times', 'worship_closest_travel_times']
add_vars = ['Household median income']

In [10]:
def update_all_vars():
        results.iat[i, 0] = name
        results.iat[i, 1] = regr.coef_[0]
        results.iat[i, 2] = regr.coef_[1]
        results.iat[i, 3] = regr.coef_[2]
        results.iat[i, 4] = regr.coef_[3]
        results.iat[i, 5] = regr.coef_[4]
        results.iat[i, 6] = regr.coef_[5]
        results.iat[i, 7] = regr.score(X_test, y_test)
        results.iat[i, 8] = mse_test
def update_vars():
        results.iat[i, 0] = name
        results.iat[i, 1] = regr.coef_[0]
        results.iat[i, 2] = regr.coef_[1]
        results.iat[i, 3] = regr.coef_[2]
        results.iat[i, 4] = regr.score(X_test, y_test)
        results.iat[i, 5] = mse_test

## Run LASSO regression:
- For all deserts types; *all_deserts = True*
- Food and physical activity deserts only; *all_deserts = False*
- With log transformed desert measures; *log_transf = True*
- Without log transformed desert measures; *log_transf = False*

In [11]:
#set desert and log transformation params
all_deserts=False
log_transf = False

In [17]:
if all_deserts == True:
    desert_measures = ['food_closest_travel_times', 'physical_closest_dist', 'transport_closest_dist', 'education_closest_travel_times', 'worship_closest_travel_times']
    results = results_all_nan.copy()
else:
    desert_measures = ['food_closest_travel_times', 'physical_closest_dist']
    results = results_nan.copy()
if log_transf == True:
    df = zcta_health_logt
else:
    df = zcta_health
alphas = []
l1_ratios = []
i=0


for c in df.columns:
    if c.endswith('CrudePrev'):
        name = name_mapping[c[:-10].lower()]
        
        x = df[desert_measures+add_vars].to_numpy(copy=True)
        scaler = StandardScaler()
        xscale = scaler.fit_transform(x)
        
        y = df[c].to_numpy(copy=True)
        
        xscale = xscale[~np.isnan(y)]
        
        y = scale(y[~np.isnan(y)])
        center_function = lambda x: x - x.mean()
        y = center_function(y)
        
        X_train, X_test, y_train, y_test = train_test_split(xscale, 
                                                    y, 
                                                    test_size=0.3, 
                                                    random_state=42)
        
        lasso_cv = LassoCV(cv = 5, normalize=True).fit(X_train,y_train)
        
        alpha = lasso_cv.alpha_
        alphas.append(alpha)
        
        regr = Lasso(alpha=alpha, normalize=True)
        regr.fit(X_train, y_train.reshape(-1, 1))
        
        predictions = regr.predict(X_test)
        y_train_pred = regr.predict(X_train)
        mse_test = mean_squared_error(y_test, predictions)
        
        if all_deserts==True:
            update_all_vars()
        else:
            update_vars()

        i += 1

results_round = results.round({'Food': 4, 'Physical health':4, 'Public transport':4, 'Education':4,
             'House of worship':4, 'RSquared':4})
results_round.sort_values(by='RSquared', ascending=False).reset_index(drop=True)

Unnamed: 0,Health condition,Food,Physical health,Public transport,Education,Houses of worship,Household Median Income,RSquared,MSE
0,Core men's health,0.0,0.0,0.0,0.0,0.018749,0.753834,0.8346,0.18097
1,Core women's health,0.0,-0.0,0.0,0.0,0.032849,0.726033,0.8222,0.246308
2,Dental checkups,0.0,-0.0,0.0,0.0,0.0,0.748867,0.7962,0.215328
3,Asthma prevalence,-0.0,0.0,-0.0,0.0,-0.0,-0.70654,0.7882,0.217771
4,Poor sleep prevalence,-0.0,0.0,-0.0,-0.0,-0.0,-0.697416,0.7873,0.222434
5,Colon cancer screenings,0.0,-0.0,0.0301,0.0,0.0,0.734505,0.7864,0.216867
6,No physical activity,0.0,0.0,-0.078,-0.0,-0.0,-0.776141,0.7776,0.197986
7,Teeth loss prevalence,-0.0,0.0,-0.0,-0.0,-0.0,-0.713311,0.7772,0.255024
8,Health insurance access,-0.0,0.0,-0.0466,-0.0,-0.0,-0.71793,0.7658,0.253957
9,General poor health prevalence,0.0,0.0,-0.0777,-0.0,-0.0,-0.715087,0.746,0.222631


## Test LASSO on Whole Dataset

In [13]:
#set desert and log transformation params
all_deserts=True
log_transf = False

In [14]:
if all_deserts == True:
    desert_measures = ['food_closest_travel_times', 'physical_closest_dist', 'transport_closest_dist', 'education_closest_travel_times', 'worship_closest_travel_times']
    results = results_all_nan.copy()
else:
    desert_measures = ['food_closest_travel_times', 'physical_closest_dist']
    results = results_nan.copy()
if log_transf == True:
    df = zcta_health_logt
else:
    df = zcta_health
alphas = []
l1_ratios = []
i=0


for c in df.columns:
    if c.endswith('CrudePrev'):
        name = name_mapping[c[:-10].lower()]
        
        x = df[desert_measures+add_vars].to_numpy(copy=True)
        scaler = StandardScaler()
        xscale = scaler.fit_transform(x)
        
        y = df[c].to_numpy(copy=True)
        
        xscale = xscale[~np.isnan(y)]
        
        y = scale(y[~np.isnan(y)])
        center_function = lambda x: x - x.mean()
        y = center_function(y)
        
        X_train, X_test, y_train, y_test = train_test_split(xscale, 
                                                    y, 
                                                    test_size=0.3, 
                                                    random_state=42)
        
        lasso_cv = LassoCV(cv = 5, normalize=True).fit(X_train,y_train)
        
        alpha = lasso_cv.alpha_
        alphas.append(alpha)
        
        regr = Lasso(alpha=alpha, normalize=True)
        regr.fit(X_train, y_train.reshape(-1, 1))
        
        predictions = regr.predict(xscale)
        mse_test = mean_squared_error(y, predictions)
        
        if all_deserts==True:
            update_all_vars()
        else:
            update_vars()

        i += 1

results_round = results.round({'Food': 4, 'Physical health':4, 'Public transport':4, 'Education':4,
             'House of worship':4, 'RSquared':4})
results_round.sort_values(by='RSquared', ascending=False).reset_index(drop=True)

NameError: name 'StandardScaler' is not defined

# Save Output:

In [15]:
if all_deserts == True:
    save_lasso_name_ = save_lasso_name + '_all'
else:
    save_lasso_name_ = save_lasso_name
if log_transf == True:
    save_lasso_name_ = save_lasso_name + '_logt'
else:
    save_lasso_name_ = save_lasso_name
results_round.to_csv(save_regression_path+save_lasso_name_+'.csv', index=False)

NameError: name 'results_round' is not defined