## Load data

In [1]:
import pandas as pd
import ast
import numpy as np

# Load in data
admissions = 'tedsa_puf_2019.csv'
df_raw = pd.read_csv(f'../../Downloads/{admissions}')

## Filter out select rows and columns

In [2]:
# Get count of original number of rows
old_rows = len(df_raw)

# Drop defined columns (year of admission, case id, geographic metro area, geographic division, geographic region)
columns_to_drop = ['ADMYR', 'CASEID', 'CBSA2010', 'DIVISION', 'REGION']
df = df_raw.drop(columns=columns_to_drop)
print(f'Dropped {len(columns_to_drop)} columns ({len(df.columns)} remain)')

# Drop values where dependent variable is unknown
df = df[df['METHUSE'] != -9]

# Only keep patients admitted with self-described use of an opioid as their primary substance use (i.e., SUB1 = 5, 6, or 7)
df = df[df['SUB1'].between(5, 7)]
new_rows = len(df)
percent_change = round(100*(old_rows-new_rows)/old_rows, 1)
print(f'Dropped {"{:,}".format(old_rows-new_rows)} observations or {percent_change}% of the data ({"{:,}".format(new_rows)} rows remain)')

df = df.reset_index(drop='index')

Dropped 5 columns (57 remain)
Dropped 1,340,233 observations or 71.9% of the data (524,134 rows remain)


## Make dataset human-readable

In [3]:
# Load in variable dictionary
with open('VariableDictionary.txt') as file:
    variable_dict_string = file.read()
    variable_dict = ast.literal_eval(variable_dict_string)

# Rename entries in column according to dictionary
df2 = df.copy()
for col, col_dict in variable_dict.items():
    if col in df2.columns:
        for old_value, new_value in variable_dict[col].items():
            df2[col] = df2[col].replace(old_value, new_value)

# Rename "-9" values as "Unknown"
for col in df2.columns:
    df2[col] = df2[col].replace(-9, 'Unknown')

# Merge DETNLF (detailed not in labor force) into EMPLOY==4 (not in labor force)
detailed_employ = []

for idx, value in df2.iterrows():
    if value['EMPLOY'] == 'NotInLaborForce':
        if value['DETNLF'] == 'Unknown':
            # Assign 'UnknownNotInLaborForce' if 'NotInLaborForce' and 'Unknown'
            detailed_employ.append('UnknownNotInLaborForce')
        else:
            # Otherwise, assign as the DETNLF value
            detailed_employ.append(value['DETNLF'])
    else:
        # Assign the EMPLOY value if not 'NotInLaborForce'
        detailed_employ.append(value['EMPLOY'])

# Add a new column for detailed employment and drop the two source columns
df2['DETEMPLOY'] = detailed_employ
df2 = df2.drop(columns=['EMPLOY', 'DETNLF'])

## Percent receiving MOUD by variable subgroup

In [105]:
total = len(df2)
df3 = df2.copy()
df3 = df3.replace({'MethUse':1, 'NoMethUse':0})

df_freq = pd.DataFrame()

for col in df3.columns:

    dff = df3.groupby([col])['METHUSE'].mean()
    dff = pd.DataFrame(dff)

    dff2 = df3.groupby([col])['METHUSE'].count()
    dff2 = pd.DataFrame(dff2)

    dff3 = pd.merge(dff, dff2, how='inner', left_index=True, right_index=True)
    dff3 = dff3.reset_index()
    dff3 = dff3.rename(columns={'METHUSE_x': 'percent_receiving_moud', 'METHUSE_y': 'total_admissions', col: 'subgroup'})
    dff3['group_share_of_admissions'] = dff3['total_admissions']/total
    dff3 = dff3[['subgroup', 'total_admissions', 'group_share_of_admissions', 'percent_receiving_moud']]
    dff3.insert(0, 'group', col)

    df_freq = pd.concat([df_freq, dff3])

df_freq = df_freq.reset_index(drop=True)
df_freq

Unnamed: 0,group,subgroup,total_admissions,group_share_of_admissions,percent_receiving_moud
0,STFIPS,AK,1603,0.003058,0.492826
1,STFIPS,AL,5774,0.011016,0.297194
2,STFIPS,AR,1947,0.003715,0.337956
3,STFIPS,AZ,10189,0.019440,0.181176
4,STFIPS,CA,42031,0.080191,0.587804
...,...,...,...,...,...
334,DETEMPLOY,RetiredOrDisabled,38813,0.074052,0.489759
335,DETEMPLOY,Student,1784,0.003404,0.314462
336,DETEMPLOY,Unemployed,191417,0.365206,0.372052
337,DETEMPLOY,Unknown,23056,0.043989,0.573603


In [107]:
df_freq.to_csv('percent_receiving_moud.csv', index=False)

# Describe MOUD treatment frequency by state and living status
Calculate the percent difference between the homeless and housed as well as its statistical difference

In [18]:
import pandas as pd
from scipy.stats import ttest_ind

# Simplify dataset
df_ttest = df2[['STFIPS', 'LIVARAG', 'METHUSE']]
df_ttest = df_ttest[df_ttest['LIVARAG'] != 'Unknown']
# df_ttest = df_ttest[df_ttest['LIVARAG'] != 'DependLiving']
df_ttest['LIVARAG'] = df_ttest['LIVARAG'].replace({'DependLiving':'Housed', 'IndependentLiving':'Housed'})
df_ttest['METHUSE'] = df_ttest['METHUSE'].replace({'MethUse':1, 'NoMethUse':0})

# Create loop assets
state_array = df_ttest['STFIPS'].sort_values().unique()
state_list = state_array.tolist()
state_list.append('USA')
df_state = pd.DataFrame(columns=['Homeless', 'Housed', 'PercentDifference', 'TStat', 'PValue', 'Significance'])

# Loop through each state
for s in state_list:
    # Define dataframe based on whether state or national
    if s == 'USA':
        dft_temp = df_ttest.copy()
    else:
        dft_temp = df_ttest[df_ttest['STFIPS'] == s]

    # Group by living arrangement and MOUD, then store those variables for later (if they exist)
    # dft_grouped = dft_temp.groupby('LIVARAG')['METHUSE'].mean()
    # dft_grouped = dft_temp.groupby('LIVARAG')['METHUSE'].sum()
    dft_grouped = dft_temp.groupby('LIVARAG')['METHUSE'].count()
    homeless_moud = dft_grouped.loc['Homeless']
    independent_moud = dft_grouped.loc['Housed']

    # Calculate the percent difference (if there is a homeless group in that state)
    if independent_moud > 0:
       percent_difference = (homeless_moud - independent_moud)/independent_moud
    # elif homeless_moud = 0:
    #     percent_difference
    else:
        percent_difference = np.nan

    # Perform a t-test between the two groups
    group_A_values = dft_temp[dft_temp['LIVARAG'] == 'Homeless']['METHUSE']
    group_B_values = dft_temp[dft_temp['LIVARAG'] == 'Housed']['METHUSE']
    t_stat, p_value = ttest_ind(group_A_values, group_B_values)

    # Add significance based on p-value
    if p_value < 0.001:
        significance = '****'
    elif p_value < 0.01:
        significance = '***'
    elif p_value < 0.05:
        significance = '**'
    elif p_value <0.1:
        significance = '*'
    else:
        significance = ''

    # Add values to a dictionary and round
    dict_results = {'Homeless':homeless_moud, 'Housed':independent_moud, 'PercentDifference':percent_difference, 'TStat':t_stat, 'PValue':p_value}
    for key, value in dict_results.items():
        dict_results[key] = round(value, 6)

    # Add significance and to dictionary and then dictionary to dataframe
    dict_results['Significance'] = significance
    df_state.loc[s] = dict_results

# df_state.to_csv('frequencies.csv') #uncomment to save file
df_state

Unnamed: 0,Homeless,Housed,PercentDifference,TStat,PValue,Significance
AK,333,1214,-0.7257,-4.249484,2.3e-05,****
AL,379,5170,-0.926692,-8.004596,0.0,****
AR,203,1741,-0.8834,-7.541382,0.0,****
CA,10568,31443,-0.6639,-61.263606,0.0,****
CO,1861,11203,-0.833884,-0.181478,0.855995,
CT,2867,19592,-0.853665,-17.040277,0.0,****
DC,703,835,-0.158084,1.725765,0.084591,*
DE,887,7907,-0.887821,-9.880705,0.0,****
GA,327,2514,-0.869928,-1.155031,0.248175,
HI,66,179,-0.631285,-0.327188,0.743808,
