In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Cleaning mental health data subset, 2010 to 2014

https://www.icpsr.umich.edu/web/RCMD/search/studies?start=0&sort=score%20desc%2CTITLE_SORT%20asc&CLASSIF_FACET=RCMD.V.&ARCHIVE=RCMD&PUBLISH_STATUS=PUBLISHED&TIMEPERIOD_NEW=%5B2012%20TO%202022%5D&rows=50&q=Behavioral%20Risk%20Factor%20Surveillance%20System

In [12]:
#reading in 2010 to 2014 data
drug_2010 = pd.read_stata('32722-0001-Data.dta', convert_categoricals=False)
drug_2011 = pd.read_stata('34481-0001-Data.dta', convert_categoricals=False)
drug_2012 = pd.read_stata('34933-0001-Data.dta', convert_categoricals=False)
drug_2013 = pd.read_stata('35509-0001-Data.dta', convert_categoricals=False)
drug_2014 = pd.read_stata('36361-0001-Data.dta', convert_categoricals=False)

In [None]:
#testing for 2012
drug_2012 = pd.read_stata('data/2012.dta', convert_categoricals=False).set_index('CASEID')

In [3]:
#function to select mental health (and case id) variables, and rename if necessary
def get_variables(data):
    subset = data[['CASEID', 'AMHINP2', 'AMHOUTP3', 'AMHTXND2', 'AMHTXRC3', 
                   'SUICTHNK', 'SPDMON', 'SPDYR', 'MHSUITHK', 'MHSUIPLN', 
                   'MHSUITRY', 'WHODASC2', 'SMIYR_U', 'ADDPREV', 'AMDELT', 'AMDEYR']]
    renamed_subset = subset.rename(columns={'AMHINP2': 'inpatient', 'AMHOUTP3': 'outpatient', 
                                            'AMHTXND2': 'needed_not_rcvd', 'AMHTXRC3': 'rcvd_any_tmt', 
                                            'SUICTHNK': 'suic_thought_pst_12month', 'SPDMON': 'psych_distress_month', 
                                            'SPDYR': 'psych_distress_yr', 'MHSUITHK': 'suic_thought_pst_yr', 
                                            'MHSUIPLN': 'suic_plan', 'MHSUITRY': 'suic_attempt', 
                                            'WHODASC2': 'whodas_score', 'SMIYR_U': 'predicted_ment_ill', 
                                            'ADDPREV': 'svrl_days_depressed', 'AMDELT': 'lifetime_mde', 
                                            'AMDEYR': 'pst_yr_mde'})
    return renamed_subset

In [108]:
#selecting mental health subset from 2010 to 2014
mh_2010 = get_variables(drug_2010)
mh_2011 = get_variables(drug_2011)
mh_2012 = get_variables(drug_2012)
mh_2013 = get_variables(drug_2013)
mh_2014 = get_variables(drug_2014)

In [5]:
drug_2012[['CASEID', 'AMHINP2', 'AMHOUTP3', 'AMHTXND2', 'AMHTXRC3', 
                   'SUICTHNK', 'SPDMON', 'SPDYR', 'MHSUITHK', 'MHSUIPLN', 
                   'MHSUITRY', 'WHODASC2', 'SMIYR_U', 'ADDPREV', 'AMDELT', 'AMDEYR']]

KeyError: "['CASEID'] not in index"

In [134]:
#function to clean columns
def clean_cols(original):
    cleaned = original.copy()
    
    def probabilistic_imputation(col, to_null):
        cleaned[col].replace(to_null, np.nan, inplace=True)
        num_null = cleaned[col].isna().sum()
        fill_values = np.random.choice(cleaned[col].dropna(), num_null)
        cleaned.loc[cleaned[col].isna(), col] = fill_values
            
    #AMHINP2/inpatient (categorical)
    probabilistic_imputation('inpatient', -9)
    
    #AMHOUTP3/outpatient (categorical)
    probabilistic_imputation('outpatient', -9)
    
    #AMHTXND2/needed_not_rcvd (categorical)
    probabilistic_imputation('needed_not_rcvd', -9)
    
    #AMHTXRC3/rcvd_any_tmt
    probabilistic_imputation('rcvd_any_tmt', -9)
    
    #SUICTHNK/suic_thought_pst_12month (categorical)
    probabilistic_imputation('suic_thought_pst_12month', [94, 97, 98, 99])
    
    #SPDMON/psych_distress_month (categorical)
    probabilistic_imputation('psych_distress_month', -9)
    
    #SPDYR/psych_distress_yr (categorical)
    probabilistic_imputation('psych_distress_yr', -9)
    
    #MHSUITHK/suic_thought_pst_yr (categorical)
    probabilistic_imputation('suic_thought_pst_yr', -9)
    
    #MHSUIPLN/suic_plan (categorical)
    probabilistic_imputation('suic_plan', -9)
    
    #MHSUITRY/suic_attempt (categorical)
    probabilistic_imputation('suic_attempt', -9)
    
    #WHODASC2/whodas_score (numerical)
    probabilistic_imputation('whodas_score', -9)
    
    #SMIYR_U/predicted_ment_ill (categorical)
    probabilistic_imputation('predicted_ment_ill', -9)
    
    #ADDPREV/svrl_days_depressed (categorical)
    probabilistic_imputation('svrl_days_depressed', [94, 97, 98, 99])
    
    #AMDELT/lifetime_mde (categorical)
    probabilistic_imputation('lifetime_mde', -9)
    
    #AMDEYR/pst_yr_mde (categorical)
    probabilistic_imputation('pst_yr_mde', -9)
    
    #making binary variables 0/1
    for col in ['inpatient', 'outpatient', 'needed_not_rcvd', 'rcvd_any_tmt', 
                'suic_thought_pst_12month', 'svrl_days_depressed', 
                'lifetime_mde', 'pst_yr_mde']:
        cleaned[col].replace({1:0, 2:1}, inplace=True)
    
    return cleaned

In [141]:
clean_mh_2010 = clean_cols(mh_2010)

In [142]:
clean_mh_2011 = clean_cols(mh_2011)

In [143]:
clean_mh_2012 = clean_cols(mh_2012)

In [144]:
clean_mh_2013 = clean_cols(mh_2013)

In [145]:
clean_mh_2014 = clean_cols(mh_2014)

In [152]:
combined = pd.concat([clean_mh_2010, clean_mh_2011, clean_mh_2012, clean_mh_2013, clean_mh_2014])

In [153]:
combined

Unnamed: 0,CASEID,inpatient,outpatient,needed_not_rcvd,rcvd_any_tmt,suic_thought_pst_12month,psych_distress_month,psych_distress_yr,suic_thought_pst_yr,suic_plan,suic_attempt,whodas_score,predicted_ment_ill,svrl_days_depressed,lifetime_mde,pst_yr_mde
0,1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
1,2,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,20.0,0.0,0.0,1.0,1.0
2,3,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
3,4,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,1.0
4,5,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55266,55267,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,1.0,1.0,1.0
55267,55268,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,20.0,1.0,0.0,0.0,0.0
55268,55269,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,1.0,1.0
55269,55270,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,15.0,0.0,0.0,0.0,0.0
