## Load data

In [1]:
import pandas as pd
import ast
import numpy as np

# Read the admissions CSV from the Downloads folder two levels above the notebook
admissions = 'tedsa_puf_2019.csv'
admissions_path = f'../../Downloads/{admissions}'
df_raw = pd.read_csv(admissions_path)

## Filter out select rows and columns

In [2]:
# Get count of original number of rows
old_rows = len(df_raw)

# Drop the defined columns: year of admission (singular), case id (index), geographic metro area (high cardinality)
columns_to_drop = ['ADMYR', 'CASEID', 'CBSA2010']
df = df_raw.drop(columns=columns_to_drop)
print(f'Dropped {len(columns_to_drop)} columns ({len(df.columns)} remain)')

# Keep a row only when both conditions hold:
#   * the dependent variable is known (METHUSE is not the -9 missing code)
#   * the patient was admitted with self-described use of an opioid as their
#     primary substance (i.e., SUB1 = 5, 6, or 7)
known_outcome = df['METHUSE'] != -9
opioid_primary = df['SUB1'].between(5, 7)

# `drop=True` discards the old row labels; the option is a boolean, not a column name.
df = df[known_outcome & opioid_primary].reset_index(drop=True)

# Report how much of the original data was removed by the row filters
new_rows = len(df)
rows_dropped = old_rows - new_rows
percent_change = round(100*rows_dropped/old_rows, 1)
print(f'Dropped {rows_dropped:,} observations or {percent_change}% of the data ({new_rows:,} rows remain)')

Dropped 3 columns (59 remain)
Dropped 1,340,233 observations or 71.9% of the data (524,134 rows remain)


## Make dataset human-readable

In [9]:
# Load in variable dictionary: a Python-literal mapping of {column: {old_value: new_value}}
with open('VariableDictionary.txt') as file:
    variable_dict_string = file.read()
    variable_dict = ast.literal_eval(variable_dict_string)


def _fold_detail_into_parent(frame, parent, detail, parent_value, unknown_label):
    """Combine a coarse column with the detail column that refines one of its values.

    Parameters
    ----------
    frame : pd.DataFrame
        Frame holding both columns.
    parent : str
        Name of the coarse column (e.g. 'EMPLOY').
    detail : str
        Name of the column that refines ``parent_value`` (e.g. 'DETNLF').
    parent_value : object
        Value of ``parent`` that the detail column elaborates on.
    unknown_label : str
        Label used when ``parent`` equals ``parent_value`` but the detail is 'Unknown'.

    Returns
    -------
    pd.Series
        For rows where ``parent == parent_value``: the detail value, or
        ``unknown_label`` if the detail is 'Unknown'. For all other rows: the
        parent value unchanged.
    """
    detail_labels = frame[detail].mask(frame[detail] == 'Unknown', unknown_label)
    return frame[parent].mask(frame[parent] == parent_value, detail_labels)


# Rename entries in column according to dictionary
df2 = df.copy()
for col, col_dict in variable_dict.items():
    if col in df2.columns:
        # Replacements are applied one at a time, in dictionary order
        for old_value, new_value in col_dict.items():
            df2[col] = df2[col].replace(old_value, new_value)

# Rename "-9" values as "Unknown" (applied to every column at once)
df2 = df2.replace(-9, 'Unknown')

# Subsume DETNLF (detailed not in labor force) into EMPLOY==4 (not in labor force),
# then drop the two source columns
df2['DETEMPLOY'] = _fold_detail_into_parent(
    df2, parent='EMPLOY', detail='DETNLF',
    parent_value='NotInLaborForce', unknown_label='UnknownNotInLaborForce')
df2 = df2.drop(columns=['EMPLOY', 'DETNLF'])

# Subsume DETCRIM (detailed court referral) into PSOURCE==7 (court referral),
# then drop the two source columns
df2['DETPSOURCE'] = _fold_detail_into_parent(
    df2, parent='PSOURCE', detail='DETCRIM',
    parent_value='CourtReferral', unknown_label='UnknownCourtReferral')
df2 = df2.drop(columns=['PSOURCE', 'DETCRIM'])

In [10]:
# Display the relabelled dataset (the rendered table is truncated for large frames)
df2

Unnamed: 0,STFIPS,EDUC,MARSTAT,SERVICES,NOPRIOR,ARRESTS,METHUSE,PSYPROB,PREG,GENDER,...,SEDHPFLG,INHFLG,OTCFLG,OTHERFLG,DIVISION,REGION,IDU,ALCDRUG,DETEMPLOY,DETPSOURCE
0,AK,Grade9To11,NeverMarried,AmbulatoryNonIntensiveOutpatient,3PriorTreatments,0Arrest,NoMethUse,No,Unknown,Male,...,NotReported,NotReported,NotReported,NotReported,Pacific,West,IDU,AlcoholAndDrugs,Unemployed,Individual
1,AK,Grade12OrGED,NeverMarried,AmbulatoryNonIntensiveOutpatient,2PriorTreatments,0Arrest,MethUse,No,NotPregnant,Female,...,NotReported,NotReported,NotReported,NotReported,Pacific,West,IDU,OtherDrugs,OtherNotInLaborForce,OtherHealthCareProvider
2,AK,Grade8OrLess,NowMarried,AmbulatoryNonIntensiveOutpatient,5PlusPriorTreatments,0Arrest,MethUse,Yes,NotPregnant,Female,...,NotReported,NotReported,NotReported,NotReported,Pacific,West,IDU,OtherDrugs,RetiredOrDisabled,DrugCareProvider
3,AK,Grade12OrGED,NeverMarried,AmbulatoryNonIntensiveOutpatient,3PriorTreatments,0Arrest,MethUse,No,NotPregnant,Female,...,NotReported,NotReported,NotReported,NotReported,Pacific,West,IDU,OtherDrugs,OtherNotInLaborForce,OtherHealthCareProvider
4,AK,Grade12OrGED,NeverMarried,AmbulatoryNonIntensiveOutpatient,1PriorTreatments,0Arrest,MethUse,Yes,NotPregnant,Female,...,NotReported,NotReported,NotReported,NotReported,Pacific,West,IDU,OtherDrugs,PartTime,DrugCareProvider
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
524129,WY,Unknown,NowMarried,AmbulatoryIntensiveOutpatient,0PriorTreatments,0Arrest,NoMethUse,No,Unknown,Male,...,NotReported,NotReported,NotReported,NotReported,Mountain,West,NoIDU,OtherDrugs,Unemployed,OtherReferral
524130,WY,Unknown,NowMarried,AmbulatoryIntensiveOutpatient,2PriorTreatments,0Arrest,NoMethUse,No,Unknown,Male,...,NotReported,NotReported,NotReported,NotReported,Mountain,West,NoIDU,OtherDrugs,Unemployed,Court
524131,WY,Grade12OrGED,NeverMarried,AmbulatoryNonIntensiveOutpatient,3PriorTreatments,0Arrest,NoMethUse,Yes,Unknown,Male,...,Reported,NotReported,NotReported,NotReported,Mountain,West,NoIDU,AlcoholAndDrugs,Unemployed,DrugCareProvider
524132,WY,Grade8OrLess,NeverMarried,AmbulatoryNonIntensiveOutpatient,0PriorTreatments,0Arrest,NoMethUse,No,Unknown,Male,...,NotReported,NotReported,NotReported,NotReported,Mountain,West,NoIDU,OtherDrugs,InstitutionResident,ProbationOrParole


## Percent receiving MOUD by variable subgroup

In [11]:
# Calculate the treatment rate for each subgroup
total = len(df2)
df3 = df2.copy()

# Recode only the outcome column to 1/0 so that its mean is the share receiving MOUD.
# to_numeric guarantees a numeric dtype and fails loudly if an unexpected label remains.
df3['METHUSE'] = pd.to_numeric(df3['METHUSE'].replace({'MethUse':1, 'NoMethUse':0}))

# Loop through each variable and collect one summary frame per variable;
# the frames are concatenated once at the end instead of growing a frame in the loop.
output_columns = ['group', 'subgroup', 'total_admissions', 'group_share_of_admissions', 'percent_receiving_moud']
freq_frames = []
for col in df3.columns:
    # Admission counts and treatment rates for every value of this variable, in one pass
    df_running = (
        df3.groupby(col)['METHUSE']
        .agg(total_admissions='count', percent_receiving_moud='mean')
        .reset_index()
        .rename(columns={col: 'subgroup'})
    )

    # Calculate share of admissions, add group, reorder columns
    df_running['group_share_of_admissions'] = df_running['total_admissions']/total
    df_running.insert(0, 'group', col)
    freq_frames.append(df_running[output_columns])

# Add a row for all observations
df_freq2 = pd.DataFrame(data={'group':'ALL',
                            'subgroup':'ALL',
                            'total_admissions':len(df3),
                            'group_share_of_admissions':1.0,
                            'percent_receiving_moud':df3['METHUSE'].mean()
                            }, index=[0])

# ignore_index gives a clean 0..n-1 index, so the ALL row needs no hand-picked label
df_freq = pd.concat(freq_frames + [df_freq2], ignore_index=True)

df_freq

Unnamed: 0,group,subgroup,total_admissions,group_share_of_admissions,percent_receiving_moud
0,STFIPS,AK,1603,0.003058,0.492826
1,STFIPS,AL,5774,0.011016,0.297194
2,STFIPS,AR,1947,0.003715,0.337956
3,STFIPS,AZ,10189,0.019440,0.181176
4,STFIPS,CA,42031,0.080191,0.587804
...,...,...,...,...,...
349,DETPSOURCE,ProbationOrParole,24201,0.046173,0.232180
350,DETPSOURCE,School,188,0.000359,0.250000
351,DETPSOURCE,Unknown,10708,0.020430,0.395405
352,DETPSOURCE,UnknownCourtReferral,17957,0.034260,0.140502


In [12]:
# Uncomment to save results
# df_freq.to_csv('percent_receiving_moud.csv', index=False)

# Describe MOUD treatment frequency for each variable's subgroup and living status
Calculate the percent difference in MOUD receipt for homeless vs. independent and for dependent vs. independent living arrangements, and test whether each difference is statistically significant.

In [13]:
# Define helper function
def significant(p_value):
    """Return the star marker that corresponds to a p-value.

    '****' below 0.001, '***' below 0.01, '**' below 0.05, '*' below 0.1,
    and an empty string otherwise. A NaN p-value fails every comparison and
    therefore also yields an empty string.
    """
    # Cutoffs are checked from strictest to loosest; the first match wins
    star_levels = ((0.001, '****'), (0.01, '***'), (0.05, '**'), (0.1, '*'))
    for cutoff, stars in star_levels:
        if p_value < cutoff:
            return stars
    return ''

In [14]:
# Test out calculate treatment difference for each group
import pandas as pd
from scipy.stats import ttest_ind

# Simplify dataset: drop admissions with an unknown living arrangement, shorten the
# remaining labels, and recode the outcome to 1/0 so that its mean is a rate.
# Copying after the filter keeps the column assignments below off a slice of df2.
df_ttest = df2[df2['LIVARAG'] != 'Unknown'].copy()
df_ttest['LIVARAG'] = df_ttest['LIVARAG'].replace({'DependLiving':'Dependent', 'IndependentLiving':'Independent'})
df_ttest['METHUSE'] = pd.to_numeric(df_ttest['METHUSE'].replace({'MethUse':1, 'NoMethUse':0}))

LIVING_ARRANGEMENTS = ('Dependent', 'Homeless', 'Independent')
COMPARED_TO_INDEPENDENT = ('Dependent', 'Homeless')


def _living_arrangement_stats(subset):
    """Summarise MOUD receipt by living arrangement for one slice of ``df_ttest``.

    Parameters
    ----------
    subset : pd.DataFrame
        Rows to summarise; must contain 'LIVARAG' and a numeric 'METHUSE' column.

    Returns
    -------
    dict
        ``<Arrangement>Admissions`` and ``<Arrangement>MOUD`` for each living
        arrangement, plus ``PercentDifference``, ``TStat``, ``PValue`` and
        ``Significance`` for Dependent and Homeless, each measured against
        Independent. Arrangements absent from ``subset`` get a count of 0 and a
        NaN rate.
    """
    by_living = subset.groupby('LIVARAG')['METHUSE']
    counts = by_living.count()
    rates = by_living.mean()

    stats = {}
    for name in LIVING_ARRANGEMENTS:
        # .get supplies the fallback when the arrangement has no observations
        stats[f'{name}Admissions'] = counts.get(name, 0)
        stats[f'{name}MOUD'] = rates.get(name, np.nan)

    independent_rate = stats['IndependentMOUD']
    independent_values = subset.loc[subset['LIVARAG'] == 'Independent', 'METHUSE']

    for name in COMPARED_TO_INDEPENDENT:
        # Percent difference needs a positive independent-living rate
        # (the comparison is False for NaN as well as for 0)
        if independent_rate > 0:
            percent_difference = (stats[f'{name}MOUD'] - independent_rate)/independent_rate
        else:
            percent_difference = np.nan

        # A t-test on an empty sample is undefined; report NaN directly instead of
        # letting the test emit runtime warnings
        group_values = subset.loc[subset['LIVARAG'] == name, 'METHUSE']
        if len(group_values) > 0 and len(independent_values) > 0:
            t_stat, p_value = ttest_ind(group_values, independent_values)
        else:
            t_stat, p_value = np.nan, np.nan

        stats[f'{name}PercentDifference'] = percent_difference
        stats[f'{name}TStat'] = t_stat
        stats[f'{name}PValue'] = p_value
        # Significance is derived from the unrounded p-value (using helper function)
        stats[f'{name}Significance'] = significant(p_value)

    return stats


df_columns = [c for c in df_ttest.columns if c != 'LIVARAG']
df_columns.append('ALL')

# One result row per (variable, subgroup), plus a single 'ALL' row for the whole dataset.
# Rows are collected in a list and turned into a frame once, after the loop.
result_rows = []
for col in df_columns:
    if col == 'ALL':
        col_list = ['ALL']
    else:
        col_list = df_ttest[col].sort_values().unique().tolist()

    # Loop through each subgroup within each column
    for s in col_list:
        # Filter observations based on subset (unless "ALL")
        dft_temp = df_ttest if col == 'ALL' else df_ttest[df_ttest[col] == s]
        result_rows.append({'Group': col, 'Subgroup': s, **_living_arrangement_stats(dft_temp)})

# Assemble the final frame in the reporting column order
df_all = pd.DataFrame(result_rows, columns=[
    'Group', 'Subgroup', 'DependentAdmissions', 'HomelessAdmissions', 'IndependentAdmissions',
    'DependentMOUD', 'HomelessMOUD', 'IndependentMOUD',
    'DependentPercentDifference', 'DependentTStat', 'DependentPValue', 'DependentSignificance',
    'HomelessPercentDifference', 'HomelessTStat', 'HomelessPValue', 'HomelessSignificance'])

# Nullable integers keep the counts integral even if a later merge introduces missing rows
count_columns = ['DependentAdmissions', 'HomelessAdmissions', 'IndependentAdmissions']
df_all[count_columns] = df_all[count_columns].astype('Int64')

# Round the numeric statistics to six decimals
stat_columns = ['DependentMOUD', 'HomelessMOUD', 'IndependentMOUD',
                'DependentPercentDifference', 'DependentTStat', 'DependentPValue',
                'HomelessPercentDifference', 'HomelessTStat', 'HomelessPValue']
df_all[stat_columns] = df_all[stat_columns].astype(float).round(6)

  **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [16]:
# Join the subgroup frequencies with the living-arrangement comparison, keeping
# subgroups that appear in either table, then drop the duplicated key columns
df_combo = df_freq.merge(
    df_all,
    how='outer',
    left_on=['group', 'subgroup'],
    right_on=['Group', 'Subgroup'],
).drop(columns=['Group', 'Subgroup'])

# For dependent and homeless: build a display column such as '-33.26%****'
# (percent difference with two decimals, followed by the significance stars)
for living in ('Dependent', 'Homeless'):
    as_percent = df_combo[f'{living}PercentDifference']*100
    as_text = as_percent.round(2).map('{:.2f}'.format)
    with_stars = as_text + '%' + df_combo[f'{living}Significance']
    # Entries that are exactly 'nan%' (no percent difference, no stars) become empty strings
    df_combo[f'{living}PctDiff'] = with_stars.replace('nan%', '')

# df_combo.to_csv('frequencies.csv', index=False) #uncomment to save file
df_combo

Unnamed: 0,group,subgroup,total_admissions,group_share_of_admissions,percent_receiving_moud,DependentAdmissions,HomelessAdmissions,IndependentAdmissions,DependentMOUD,HomelessMOUD,...,DependentPercentDifference,DependentTStat,DependentPValue,DependentSignificance,HomelessPercentDifference,HomelessTStat,HomelessPValue,HomelessSignificance,DependentPctDiff,HomelessPctDiff
0,STFIPS,AK,1603,0.003058,0.492826,258,333,956,0.372093,0.387387,...,-0.332606,-5.347493,0.000000,****,-0.305174,-5.405923,0.000000,****,-33.26%****,-30.52%****
1,STFIPS,AL,5774,0.011016,0.297194,1952,379,3218,0.296619,0.118734,...,-0.080424,-1.950745,0.051141,*,-0.631903,-8.256698,0.000000,****,-8.04%*,-63.19%****
2,STFIPS,AR,1947,0.003715,0.337956,604,203,1137,0.261589,0.103448,...,-0.375153,-6.558287,0.000000,****,-0.752898,-8.801753,0.000000,****,-37.52%****,-75.29%****
3,STFIPS,AZ,10189,0.019440,0.181176,,,,,,...,,,,,,,,,,
4,STFIPS,CA,42031,0.080191,0.587804,9353,10568,22090,0.555330,0.344436,...,-0.226140,-28.314917,0.000000,****,-0.520023,-68.824720,0.000000,****,-22.61%****,-52.00%****
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
349,DETPSOURCE,ProbationOrParole,24201,0.046173,0.232180,5176,2416,16248,0.181221,0.215646,...,-0.286533,-10.760649,0.000000,****,-0.151003,-4.069065,0.000047,****,-28.65%****,-15.10%****
350,DETPSOURCE,School,188,0.000359,0.250000,26,24,133,0.192308,0.291667,...,-0.200721,-0.530104,0.596789,,0.212240,0.529958,0.596899,,-20.07%,21.22%
351,DETPSOURCE,Unknown,10708,0.020430,0.395405,1203,895,5049,0.457190,0.211173,...,0.704841,12.970883,0.000000,****,-0.212545,-3.588038,0.000336,****,70.48%****,-21.25%****
352,DETPSOURCE,UnknownCourtReferral,17957,0.034260,0.140502,3729,1447,10986,0.179673,0.163787,...,0.422108,8.130652,0.000000,****,0.296373,3.974403,0.000071,****,42.21%****,29.64%****
