In [1]:
from health_scores import HealthScores
from Model import Model
import pandas as pd
import numpy as np

#### Load the master dataset in raw format (all the variables we have collected). Many variables have nulls. In the code, null values are handled by imputing with the group means over the first 8 digits of the geoid (which represent the nearest census tracts).

In [2]:
# Raw master dataset; the file comes from the 'Data collection.ipynb' notebook.
df_raw = pd.read_csv("master_raw_data.csv")

# Report the frame's shape followed by its column index, one per line.
print(df_raw.shape, df_raw.columns, sep="\n")

(1172, 55)
Index(['Unnamed: 0', 'COI_FOOD', 'COI_GREEN', 'COI_WALK', 'COI_VACANCY',
       'COI_SUPRFND', 'COI_RSEI', 'COI_PM25', 'COI_OZONE', 'COI_HEAT',
       'COI_HLTHINS', 'latitude', 'longitude', 'geoid', 'countyfips',
       'TractFIPS', 'County', 'StateAbbr', 'PlaceName', 'PlaceFIPS',
       'Place_TractID', 'Population2010', 'ACCESS2_CrudePrev',
       'ARTHRITIS_CrudePrev', 'BINGE_CrudePrev', 'BPHIGH_CrudePrev',
       'BPMED_CrudePrev', 'CANCER_CrudePrev', 'CASTHMA_CrudePrev',
       'CHD_CrudePrev', 'CHECKUP_CrudePrev', 'CHOLSCREEN_CrudePrev',
       'COLON_SCREEN_CrudePrev', 'COPD_CrudePrev', 'COREM_CrudePrev',
       'COREW_CrudePrev', 'CSMOKING_CrudePrev', 'DENTAL_CrudePrev',
       'DIABETES_CrudePrev', 'HIGHCHOL_CrudePrev', 'KIDNEY_CrudePrev',
       'LPA_CrudePrev', 'MAMMOUSE_CrudePrev', 'MHLTH_CrudePrev',
       'OBESITY_CrudePrev', 'PAPTEST_CrudePrev', 'PHLTH_CrudePrev',
       'SLEEP_CrudePrev', 'STROKE_CrudePrev', 'TEETHLOST_CrudePrev',
       'life expectancy', '

From these variables, you can pick any to include in the model and get the results. (The method is explained below.)

### More advanced imputation method 

Set missing data equal to the value from the nearest neighborhood (the distance between latitude/longitude points was used; see https://www.movable-type.co.uk/scripts/latlong.html for details).

In [3]:
# Input file for the more advanced imputation method; load it and preview
# the first five rows.
df_imputed_x = pd.read_excel(io="20200420_input_final.xlsx")
df_imputed_x.head(n=5)

Unnamed: 0,geoid,longitude,latitude,ED_PRXECE,ED_PRXHQECE,ED_ECENROL,ED_READING,ED_MATH,ED_HSGRAD,ED_APENR,...,PAPTEST_CrudePrev,PHLTH_CrudePrev,SLEEP_CrudePrev,STROKE_CrudePrev,TEETHLOST_CrudePrev,Alcohol Test,Drug Test,Pedalcyclist,Pedastrian,life expectancy
0,48085030100,-96.3982,33.29592,1.343954,-13.815511,30.9,217.85674,250.22748,94.002556,0.278373,...,78.3,11.9,35.4,2.7,18.8,0.821918,0.821918,0.0,1.09589,76.7
1,48085030201,-96.53734,33.26331,2.069664,-13.815511,61.0,220.04181,246.44695,87.928993,0.28771,...,78.3,11.9,35.4,2.7,18.8,0.0,0.0,0.0,0.0,
2,48085030202,-96.64279,33.34124,1.751906,-13.815511,0.0,233.74568,262.12021,80.740799,0.568353,...,81.4,8.8,35.0,1.6,9.5,4.464286,1.116071,0.0,2.232143,78.8
3,48085030203,-96.54547,33.34176,2.338918,-13.815511,32.7,226.88499,261.4353,95.360466,0.290443,...,81.4,8.8,35.0,1.6,9.5,0.0,0.0,0.163881,0.327761,78.2
4,48085030302,-96.75005,33.24045,3.588619,-13.815511,62.3,250.81639,274.56683,96.399155,0.650187,...,82.05,9.15,32.95,1.95,7.75,0.0,0.0,0.407166,0.407166,82.0


### Steps to get the model output:

#### 1. Initialize Model() class by passing the data file.
#### 2. Call the model_output() method with the below arguments.
            
        columns_regress(list) : x variables to include in the regression model
        target(string) : target variable(y) in the regression model
        multiply_cols(dict): dictionary with columns and their multipliers (-1 or 1) as key-value pairs, used to rescale all variables
        so that high is good.

        Default Arguments : Has default values that can be changed as per requirement.
        
        columns_impute(list) : columns to be imputed (default is None)
        winsorize_outliers(dict) : dictionary of limits for the respective columns{'col' : limit} (default is None)
        winsorize_with_95(boolean) : winsorize all columns at the 95th percentile (True or False) (default is False)
        target_multiplier(int) : to change the direction of y variable if needed(default is 1)

####        IMPORTANT : Follow the arguments order or specify the argument name when calling the method.
Example : model1.model_output(columns_regress,target,multiply_cols,target_multiplier = target_multiplier)


        Returns :returns all census tracts transformed data(high is good).
                :prints model summary.
                :returns model weights.
                
#####        Store the data and model weights to get the cumulative health score

## Example :

In [4]:
# Work on a copy of the imputed frame and hand it to the Model class.
data = df_imputed_x.copy()
model1 = Model(data)

# Multiplier (-1 or 1) for each predictor, used to rescale the variables so
# that a higher value is better for the health score.
multiply_cols = {
    "HE_FOOD": -1,
    "HE_WALK": 1,
    "HE_VACANCY": -1,
    "HE_SUPRFND": -1,
    "HE_HLTHINS": 1,
    "BINGE_CrudePrev": -1,
    "CHECKUP_CrudePrev": 1,
    "BPHIGH_CrudePrev": -1,
    "SLEEP_CrudePrev": -1,
    "STROKE_CrudePrev": -1,
    "Drug Test": -1,
    "Pedalcyclist": -1,
}

# The regression uses exactly the predictors that carry a multiplier, in the
# same order as they are listed above.
columns_regress = list(multiply_cols)

# Target (y) variable of this regression.
target = "life expectancy"

# Keep both return values: the transformed data and the model weights are
# needed later to calculate the health score.
multiplied_zscore_data_le, params_le = model1.model_output(
    columns_regress,
    target,
    multiply_cols,
)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.558
Model:                            OLS   Adj. R-squared:                  0.553
Method:                 Least Squares   F-statistic:                     103.9
Date:                Tue, 28 Apr 2020   Prob (F-statistic):          1.35e-165
Time:                        09:32:30   Log-Likelihood:                -1009.2
No. Observations:                 999   AIC:                             2044.
Df Residuals:                     986   BIC:                             2108.
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const              5.117e-16      0.02

In [5]:
# Predictors paired with their multiplier (-1 or 1); the multiplier rescales
# each variable so that a higher value is better.
multiply_cols = dict(
    [
        ("HE_FOOD", -1),
        ("HE_WALK", 1),
        ("HE_VACANCY", -1),
        ("HE_SUPRFND", -1),
        ("HE_HLTHINS", 1),
        ("BINGE_CrudePrev", -1),
        ("CHECKUP_CrudePrev", 1),
        ("BPHIGH_CrudePrev", -1),
        ("SLEEP_CrudePrev", -1),
        ("STROKE_CrudePrev", -1),
        ("Drug Test", -1),
        ("Pedalcyclist", -1),
    ]
)

# Regress on every predictor listed above, in the same order.
columns_regress = [*multiply_cols]

# Target of this regression; the -1 multiplier changes the direction of the
# y variable.
target = "PHLTH_CrudePrev"
target_multiplier = -1

multiplied_zscore_data_1, params_1 = model1.model_output(
    columns_regress,
    target,
    multiply_cols,
    target_multiplier=target_multiplier,
)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.917
Model:                            OLS   Adj. R-squared:                  0.916
Method:                 Least Squares   F-statistic:                     906.4
Date:                Tue, 28 Apr 2020   Prob (F-statistic):               0.00
Time:                        09:32:30   Log-Likelihood:                -175.01
No. Observations:                 999   AIC:                             376.0
Df Residuals:                     986   BIC:                             439.8
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const             -4.337e-18      0.00

In [6]:
# Regression of MHLTH_CrudePrev on the same predictors as the previous models.
columns_regress = (
    ["HE_FOOD", "HE_WALK", "HE_VACANCY", "HE_SUPRFND", "HE_HLTHINS"]
    + ["BINGE_CrudePrev", "CHECKUP_CrudePrev", "BPHIGH_CrudePrev"]
    + ["SLEEP_CrudePrev", "STROKE_CrudePrev"]
    + ["Drug Test", "Pedalcyclist"]
)

# Multiplier per predictor: 1 for the three variables named below, -1 for all
# the others, so that every variable is rescaled as "high is good".
multiply_cols = {
    column: 1 if column in ("HE_WALK", "HE_HLTHINS", "CHECKUP_CrudePrev") else -1
    for column in columns_regress
}

# Target of this regression; the -1 multiplier changes the direction of the
# y variable.
target, target_multiplier = "MHLTH_CrudePrev", -1

multiplied_zscore_data_2, params_2 = model1.model_output(
    columns_regress,
    target,
    multiply_cols,
    target_multiplier=target_multiplier,
)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.880
Model:                            OLS   Adj. R-squared:                  0.878
Method:                 Least Squares   F-statistic:                     600.6
Date:                Tue, 28 Apr 2020   Prob (F-statistic):               0.00
Time:                        09:32:30   Log-Likelihood:                -359.89
No. Observations:                 999   AIC:                             745.8
Df Residuals:                     986   BIC:                             809.6
Df Model:                          12                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const              5.169e-16      0.01

## Steps to get the health scores :

#### 1. Initialize the HealthScores() by passing the below arguments.

        Arguments:
        weights_1(np array) : weights of model 1 (Physical health).
        weights_2(np array) : weights of model 2 (Mental health).
        weights_3(np array) : weights of model 3 (Life Expectancy).
        multiplied_data(df) : data for all census tracts, which is multiplied with the weights to get the health scores
        geoid(Series) : geoids of census tracts to concatenate with our health scores data.
        
        Default Arguments :
        
        is_weighted_average(boolean) : weights calculation methodology(default is True)
        weightage(list) : weightage for each y-variable(default : [0.25,0.25,0.5])
        
#### 2. Call the final_scaled_data() method

        Returns : 
            final_data(df) : dataframe with all health scores and geoids
            weights_tables(df) : dataframe with each y-variable weights and averaged weights

In [7]:
# Build the health scores from the three sets of model weights: params_1,
# params_2 and params_le come from the PHLTH_CrudePrev, MHLTH_CrudePrev and
# life expectancy regressions above.
# NOTE(review): only the transformed data from the PHLTH_CrudePrev run is
# passed; presumably all three runs return the same transformed predictors
# because they share columns_regress and multiply_cols — confirm.
health_scores1 = HealthScores(
    params_1, params_2, params_le, multiplied_zscore_data_1, data["geoid"]
)
# final_scaled_data() returns two objects, unpacked here as the scores table
# and the weights table.
final_data, weights_table = health_scores1.final_scaled_data()

health_scores1.weights  # The weights can also be read directly from the instance.

array([0.0117727 , 0.00751751, 0.01959428, 0.01042273, 0.128936  ,
       0.1184022 , 0.19513146, 0.14433487, 0.19330818, 0.16154886,
       0.00396387, 0.00506735])

In [8]:
# Preview the first rows of the scores table returned by final_scaled_data().
final_data.head()

Unnamed: 0,geoid,HE_FOOD,HE_WALK,HE_VACANCY,HE_SUPRFND,HE_HLTHINS,BINGE_CrudePrev,CHECKUP_CrudePrev,BPHIGH_CrudePrev,SLEEP_CrudePrev,STROKE_CrudePrev,Drug Test,Pedalcyclist,health_scores
0,48085030100,96.091281,2.544479,76.981603,50.0,13.890877,74.926254,9.292503,78.357236,78.62069,79.207921,98.520548,100.0,69.027647
1,48085030201,98.439143,11.52469,92.643096,50.0,15.042605,74.926254,9.292503,78.357236,78.62069,79.207921,100.0,100.0,71.151708
2,48085030202,95.919226,7.48286,67.903118,50.0,18.821577,57.227139,5.385428,85.658409,79.310345,90.09901,97.991071,100.0,71.239647
3,48085030203,98.935264,13.103215,83.265171,50.0,22.212623,57.227139,5.385428,85.658409,79.310345,90.09901,100.0,99.414712,74.373851
4,48085030302,96.612195,13.885353,79.53682,50.0,28.75872,70.058997,9.926082,80.312907,82.844828,86.633663,100.0,98.545835,83.92669


In [9]:
# Display the weights table returned by final_scaled_data().
weights_table

Unnamed: 0,phy_health_weights,mntl_health_weights,life_expectancy_weights,averaged_weights
HE_FOOD,-0.056542,-0.014846,0.08907,0.011773
HE_WALK,-0.073147,-0.043824,0.092569,0.007518
HE_VACANCY,-0.038049,-0.009083,0.112404,0.019594
HE_SUPRFND,-0.06099,-0.054371,0.104936,0.010423
HE_HLTHINS,0.2803,0.26671,0.311076,0.128936
BINGE_CrudePrev,0.328551,0.464386,0.140354,0.118402
CHECKUP_CrudePrev,0.469246,0.626203,0.33698,0.195131
BPHIGH_CrudePrev,0.530143,0.298016,0.240319,0.144335
SLEEP_CrudePrev,0.384115,0.753666,0.307548,0.193308
STROKE_CrudePrev,0.505871,0.410902,0.274058,0.161549
