In [2]:
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor


In [3]:
#this function will return a per-column count of missing values if any missing data is found. Returns False otherwise.
def dataframe_contain_missing_data(df):
    """Summarise missing values in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.Series or bool
        Per-column count of null cells when at least one cell is null,
        otherwise ``False``.
    """
    null_counts = df.isnull().sum()
    if null_counts.any():
        return null_counts
    # Previously this returned the undefined name `false`, which raised
    # NameError whenever the frame had no missing data.
    return False


#check if input is NaN. Old versions of python don't like this.
def isNaN(num):
    """Return the result of comparing ``num`` for inequality with itself.

    A float NaN compares unequal to itself, so the result is True for NaN
    and False for ordinary numbers, strings and None.
    """
    differs_from_itself = num != num
    return differs_from_itself


#this function will impute missing data using the sklearn.impute iterative imputer. For numerical data only.
def impute_missing_data(column_to_impute, df):
    """Fill missing values of one numeric column using ``IterativeImputer``.

    Parameters
    ----------
    column_to_impute : str
        Name of the numeric column in ``df`` whose NaNs should be filled.
    df : pandas.DataFrame
        Frame holding the column. It is modified in place and also returned.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``column_to_impute`` overwritten by the
        imputed values.

    Notes
    -----
    NOTE(review): only the single column is passed to the imputer, so no
    other feature is available to model it from; with default settings this
    presumably reduces to the imputer's initial (mean) fill - confirm this is
    the intent.
    """
    imputer = IterativeImputer()
    imputed_col = imputer.fit_transform(df[[column_to_impute]])

    # A column made up entirely of missing values is dropped by the imputer
    # (zero output columns); there is nothing to write back in that case.
    if imputed_col.shape[1] == 0:
        return df

    # Assign the raw array so values are written positionally. Wrapping it in
    # a new pd.Series gave it a fresh 0..n-1 index, and the assignment then
    # aligned on labels: on a frame whose index is not the default RangeIndex
    # (e.g. after row filtering) rows received NaN or another row's value.
    df[column_to_impute] = imputed_col.ravel()
    return df
    
    
#this function will return the fips code based on a given state abbreviation.
def get_fips(state):
    """Look up the numeric FIPS code for a two-letter state abbreviation.

    Parameters
    ----------
    state : str or missing value
        Upper-case abbreviation present in the table below (the 50 states,
        DC, and AS/GU/MP/PR/VI), or a missing value.

    Returns
    -------
    int or None
        The FIPS code, or ``None`` when ``state`` is missing
        (``None``, NaN or ``pd.NA``).

    Raises
    ------
    KeyError
        If ``state`` is not missing and is not a key of the table.
    """
    state_dict = {
        'AL':1, 'AK':2, 'AZ':4, 'AR':5, 'CA':6,
        'CO':8, 'CT':9,'DC':11, 'DE':10, 'FL':12, 'GA':13,
        'HI':15, 'ID':16, 'IL':17, 'IN':18, 'IA':19,
        'KS':20, 'KY':21, 'LA':22, 'ME':23, 'MD':24,
        'MA':25, 'MI':26, 'MN':27, 'MS':28, 'MO':29,
        'MT':30, 'NE':31, 'NV':32, 'NH':33, 'NJ':34,
        'NM':35, 'NY':36, 'NC':37, 'ND':38, 'OH':39,
        'OK':40, 'OR':41, 'PA':42, 'RI':44, 'SC':45,
        'SD':46, 'TN':47, 'TX':48, 'UT':49, 'VT':50,
        'VA':51, 'WA':53, 'WV':54, 'WI':55, 'WY':56,
        'AS':60, 'GU':66, 'MP':69, 'PR':72, 'VI':78
    }

    # pd.isna recognises None and pd.NA as well as float NaN. The earlier
    # self-inequality check only caught NaN, so a None cell fell through to
    # the dictionary lookup and raised KeyError.
    if pd.isna(state):
        return None
    return state_dict[state]

In [107]:
#this function will get the cleaned dataset that will be used for modeling.
def get_data(path="gs://additional-data/CummulatedClean_Nov22_with_lock/Update_CummulatedClean_Nov22_with_lock_0_CMaster2_HPS_CDC_CPS_Vaccinated_with_lock.csv"):
    """Load the cleaned modelling dataset from a CSV file.

    Parameters
    ----------
    path : str, optional
        Location of the CSV, passed straight to ``pandas.read_csv``.
        Defaults to the project's cloud-storage extract, so existing
        ``get_data()`` calls behave as before; pass a local path to work
        from a cached copy.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents.
    """
    return pd.read_csv(path)


#clean the data
def preprocess_data(df):
    """Drop the extra location column and rows with unanswered questions.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing ``LOCATION`` and the survey columns named below.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``LOCATION`` and without any row holding one of
        the -88 / -99 codes in the checked columns. The original row index
        is preserved; ``df`` itself is not modified.
    """
    #drop additional location data
    # `columns=` replaces the positional axis argument (`drop('LOCATION', 1)`),
    # which pandas 2.0 and later no longer accept.
    df = df.drop(columns='LOCATION')

    #remove missing/unanswered questions
    # Columns in which both the -88 and the -99 code disqualify a row.
    both_codes = ['ANXIOUS', 'WORRY', 'DOWN', 'MORTCONF', 'INCOME',
                  'KINDWORK', 'MORTLMTH']
    has_bad_code = df[both_codes].isin([-88, -99]).any(axis=1)

    # WRKLOSS is only filtered on -99, exactly as before.
    # NOTE(review): confirm that -88 is meant to be kept for WRKLOSS.
    wrkloss_answered = df['WRKLOSS'] != -99

    return df[~has_bad_code & wrkloss_answered]

#create summed output column
def create_mental_stress_index(df):
    """Replace the three stress items with their sum, ``MSI``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the ``WORRY``, ``DOWN`` and ``ANXIOUS`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame in which those three columns are removed and an ``MSI``
        column (their row-wise sum) is appended as the last column. The
        input frame is left untouched.
    """
    msi = df['WORRY'] + df['DOWN'] + df['ANXIOUS']

    #remove anxious/worry/depression columns, use index as output instead
    # Two changes from the earlier version: `columns=` replaces the positional
    # axis argument that pandas 2.0+ rejects, and MSI is attached with
    # `assign` instead of being written into the caller's (filtered) frame.
    return df.drop(columns=['ANXIOUS', 'WORRY', 'DOWN']).assign(MSI=msi)

    
#get the age
def get_age(x):
    """Return ``x['YEAR']`` minus ``x['TBIRTH_YEAR']`` for one record.

    ``x`` is anything indexable by those two keys, such as a DataFrame row.
    """
    survey_year = x['YEAR']
    birth_year = x['TBIRTH_YEAR']
    return survey_year - birth_year
   
    
#flag high risk: 1 when the summed output variable (MSI) is greater than or equal to 7, meaning the respondent selected at least 3 (out of 4) for at least one of down, worry and anxiety; 0 otherwise.
def get_high_risk(y):
    """Return 1 when ``y['MSI']`` is at least 7, otherwise 0."""
    return 1 if y['MSI'] >= 7 else 0
   

#returns the x,y data.
def split_covid_data(df):
    """Split the cleaned frame into model inputs and the high-risk label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``MSI``, ``YEAR``, ``TBIRTH_YEAR``, the vaccination
        and ``CDCCOUNT`` columns, and the identifier columns dropped below.
        It is not modified.

    Returns
    -------
    x1 : pandas.DataFrame
        Feature matrix: the remaining columns with ``age_group`` (and any
        other non-numeric column) one-hot encoded, and missing vaccination /
        ``CDCCOUNT`` values set to 0.
    y : pandas.Series
        1 where ``MSI >= 7`` and 0 otherwise, indexed like ``df``.
    """
    #define input dataset, x: everything except the target and the columns
    #that are identifiers or duplicate other features (YMFIPS is neither
    #scalar nor categorical).
    # `columns=` replaces the positional axis argument (`drop(name, 1)`),
    # which pandas 2.0 and later no longer accept.
    not_features = ['MSI', 'YMFIPS', 'field1', 'FIPS', 'STATE_C', 'MONTH',
                    'people_fully_vaccinated', 'people_vaccinated']
    x = df.drop(columns=not_features)

    # Vectorised column arithmetic; same values as a row-wise apply.
    x['age'] = x['YEAR'] - x['TBIRTH_YEAR']
    # Bins are right-closed, so age 65 lands in '50-65' and '65+' starts at
    # 66; ages outside (0, 110] get no group (all dummy columns 0).
    x['age_group'] = pd.cut(x['age'],bins=[0,17,34,49,65,110],labels=['Under 18','18-34','35-49','50-65','65+'])

    #create dummy variables for the categorical columns (age_group; the
    #state and month columns were dropped above)
    x1 = pd.get_dummies(x)

    #fill NaN vaccination values with 0, these are pre vaccine
    zero_fill = ['people_vaccinated_per_hundred',
                 'people_fully_vaccinated_per_hundred',
                 'CDCCOUNT']
    x1[zero_fill] = x1[zero_fill].fillna(0)

    # age and birth year are represented by the age_group dummies.
    x1 = x1.drop(columns=['age', 'TBIRTH_YEAR'])

    # High risk = summed stress index of 7 or more.
    y = pd.Series(np.where(df['MSI'] >= 7, 1, 0), index=df.index)

    return x1, y


In [106]:
#run me to get data!!!!
# Load the extract, drop unanswered rows, derive the MSI target, then split
# into the feature matrix x and the high-risk label y.
data = create_mental_stress_index(
    preprocess_data(
        get_data()
    )
)
x, y = split_covid_data(data)
x.head(15)


Unnamed: 0,YEAR,EEDUC,THHLD_NUMPER,THHLD_NUMKID,THHLD_NUMADLT,WRKLOSS,KINDWORK,MORTLMTH,MORTCONF,INCOME,REMPCT,CDCCOUNT,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,lockdown,age_group_Under 18,age_group_18-34,age_group_35-49,age_group_50-65,age_group_65+
1,2021,4,3,0,3,2,5,1,4,6,0.032,44878.0,52.69,42.62,0,0,0,1,0,0
13,2021,7,4,2,2,1,3,1,3,5,0.048,7180.0,57.28,50.59,0,0,1,0,0,0
15,2021,6,2,0,2,2,1,1,4,6,0.048,7180.0,57.28,50.59,0,0,0,0,1,0
25,2021,6,1,0,1,2,1,1,4,3,0.043,58816.0,54.9,45.43,0,0,0,0,0,1
27,2021,3,2,1,1,2,1,1,3,5,0.056,75722.0,59.79,49.78,0,0,0,0,1,0
29,2021,7,1,0,1,2,2,1,4,8,0.042,104.0,66.94,57.27,0,0,1,0,0,0
32,2021,6,4,3,1,2,2,1,4,6,0.088,113702.0,71.73,58.98,0,0,0,1,0,0
35,2021,6,1,0,1,2,3,1,4,4,0.067,20024.0,65.59,59.38,0,0,0,0,1,0
40,2021,6,2,0,2,2,3,2,4,3,0.067,20024.0,65.59,59.38,0,0,0,1,0,0
42,2021,7,2,0,2,2,1,1,4,8,0.067,20024.0,65.59,59.38,0,0,0,1,0,0


In [103]:
# Depth-limited random forest with a fixed seed so the fit is repeatable.
rf_settings = {'max_depth': 10, 'random_state': 0}
clf = RandomForestClassifier(**rf_settings)
clf.fit(x, y)

RandomForestClassifier(max_depth=10, random_state=0)

In [104]:
# NOTE(review): x, y are the same rows the forest was fitted on, so this is
# accuracy on the training data, not a held-out estimate.
train_accuracy = clf.score(x, y)
train_accuracy

0.7386975628745881

In [105]:
# Pair every feature name with its importance from the fitted forest, then
# show the 25 largest.
importance = pd.DataFrame({
    'column': x.columns.tolist(),
    'value': clf.feature_importances_,
})

importance.sort_values(by=['value'], ascending=False).head(25)

Unnamed: 0,column,value
8,MORTCONF,0.474794
5,WRKLOSS,0.124381
9,INCOME,0.084444
16,age_group_18-34,0.053781
7,MORTLMTH,0.052028
18,age_group_50-65,0.031637
19,age_group_65+,0.03116
12,people_vaccinated_per_hundred,0.022062
13,people_fully_vaccinated_per_hundred,0.020525
17,age_group_35-49,0.019869


In [173]:
#only run me if you want to push your computer to the limit

# 32 parameter combinations x 5 folds = 160 forest fits.
param_grid = {
    'bootstrap': [True],
    'max_depth': [20, 80],
    'max_features': [2, 6],
    'min_samples_leaf': [3, 6],
    'min_samples_split': [8, 12],
    'n_estimators': [100, 1000]
}

# y is the binary high-risk flag, so tune a classifier, matching the
# RandomForestClassifier fitted earlier in this notebook. The regressor used
# here before treated the 0/1 labels as a continuous target and was scored
# with R^2. random_state makes the search repeatable.
rf = RandomForestClassifier(random_state=0)

grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,
                          scoring = 'accuracy',
                          cv = 5, n_jobs = -1, verbose = 2)

grid_search.fit(x, y)