## Load data

In [131]:
import pandas as pd
import ast
import numpy as np

# Read the admissions CSV into the raw dataframe that the rest of the notebook filters
admissions = 'tedsa_puf_2019.csv'
admissions_path = '../../Downloads/' + admissions  # NOTE(review): path is relative to the notebook's working directory
df_raw = pd.read_csv(admissions_path)

## Filter out select rows and columns

In [132]:
# Remember the original row count so the size of the filter can be reported below
old_rows = len(df_raw)

# Drop defined columns (year of admission, case id, geographic metro area, geographic division, geographic region)
columns_to_drop = ['ADMYR', 'CASEID', 'CBSA2010', 'DIVISION', 'REGION']
df = df_raw.drop(columns=columns_to_drop)
print(f'Dropped {len(columns_to_drop)} columns ({len(df.columns)} remain)')

# Keep a row only if both conditions hold:
#   * the dependent variable (METHUSE) is known, i.e. not coded -9
#   * the primary substance is a self-described opioid (i.e., SUB1 = 5, 6, or 7)
known_outcome = df['METHUSE'] != -9
opioid_primary = df['SUB1'].between(5, 7)

# drop=True discards the old row labels so the filtered frame is numbered 0..n-1
df = df[known_outcome & opioid_primary].reset_index(drop=True)

# Report how many observations the two filters removed
new_rows = len(df)
rows_dropped = old_rows - new_rows
percent_change = round(100*rows_dropped/old_rows, 1)
print(f'Dropped {rows_dropped:,} observations or {percent_change}% of the data ({new_rows:,} rows remain)')

Dropped 5 columns (57 remain)
Dropped 1,340,233 observations or 71.9% of the data (524,134 rows remain)


In [134]:
# Display the filtered dataframe produced by the previous cell
df

Unnamed: 0,STFIPS,EDUC,MARSTAT,SERVICES,DETCRIM,NOPRIOR,PSOURCE,ARRESTS,EMPLOY,METHUSE,...,STIMFLG,BENZFLG,TRNQFLG,BARBFLG,SEDHPFLG,INHFLG,OTCFLG,OTHERFLG,IDU,ALCDRUG
0,2,2,1,7,-9,3,1,0,3,2,...,0,0,0,0,0,0,0,0,1,3
1,2,3,1,7,-9,2,3,0,4,1,...,0,0,0,0,0,0,0,0,1,2
2,2,1,2,7,-9,5,2,0,4,1,...,0,0,0,0,0,0,0,0,1,2
3,2,3,1,7,-9,3,3,0,4,1,...,0,0,0,0,0,0,0,0,1,2
4,2,3,1,7,-9,1,2,0,2,1,...,0,0,0,0,0,0,0,0,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
524129,56,-9,2,6,-9,0,6,0,3,2,...,0,0,0,0,0,0,0,0,0,2
524130,56,-9,2,6,1,2,7,0,3,2,...,0,0,0,0,0,0,0,0,0,2
524131,56,3,1,7,-9,3,2,0,3,2,...,0,0,0,0,1,0,0,0,0,3
524132,56,1,1,7,3,0,7,0,4,2,...,0,0,0,0,0,0,0,0,0,2


## Make dataset human-readable

In [135]:
# Load in variable dictionary. The file holds a Python-literal dict of the form
# {column_name: {coded_value: readable_label}}; ast.literal_eval only parses
# literals, so nothing in the file is executed.
with open('VariableDictionary.txt') as file:
    variable_dict_string = file.read()
    variable_dict = ast.literal_eval(variable_dict_string)

# Rename entries in each column according to its dictionary. Passing the whole
# mapping to replace() relabels every code in one pass over the column instead of
# one pass per code, and values missing from the mapping are left unchanged.
df2 = df.copy()
for col, col_dict in variable_dict.items():
    if col in df2.columns:
        df2[col] = df2[col].replace(col_dict)

# Rename any remaining "-9" values as "Unknown" across the whole frame at once
df2 = df2.replace(-9, 'Unknown')

In [136]:
# Merge DETNLF (detailed not in labor force) into EMPLOY==4 (not in labor force).
# Vectorised equivalent of a row-by-row loop:
#   * EMPLOY is not 'NotInLaborForce'                   -> keep the EMPLOY value
#   * EMPLOY is 'NotInLaborForce', DETNLF is 'Unknown'  -> 'UnknownNotInLaborForce'
#   * EMPLOY is 'NotInLaborForce', DETNLF is known      -> use the DETNLF value
not_in_labor_force = df2['EMPLOY'] == 'NotInLaborForce'

# where() keeps values where the condition is True and substitutes the second argument elsewhere
detailed_nlf = df2['DETNLF'].where(df2['DETNLF'] != 'Unknown', 'UnknownNotInLaborForce')
detailed_employ = df2['EMPLOY'].where(~not_in_labor_force, detailed_nlf)

# Add a new column for detailed employment and drop the two source columns
df2['DETEMPLOY'] = detailed_employ
df2 = df2.drop(columns=['EMPLOY', 'DETNLF'])

In [66]:
# Display the relabelled dataframe.
# NOTE(review): the saved output below still lists DIVISION and REGION (columns dropped
# earlier in this notebook) and ends at row 423484, so it appears to come from an
# earlier run — re-run this cell to refresh it.
df2

Unnamed: 0,STFIPS,EDUC,MARSTAT,SERVICES,DETCRIM,NOPRIOR,PSOURCE,ARRESTS,METHUSE,PSYPROB,...,BARBFLG,SEDHPFLG,INHFLG,OTCFLG,OTHERFLG,DIVISION,REGION,IDU,ALCDRUG,DETEMPLOY
0,MA,Grade12OrGED,NowMarried,AmbulatoryNonIntensiveOutpatient,Unknown,0PriorTreatments,OtherHealthCareProvider,0Arrest,0,No,...,NotReported,NotReported,NotReported,NotReported,NotReported,NewEngland,Northeast,IDU,Alcohol&Drugs,OtherNotInLaborForce
1,NY,1To3yCollege,Unknown,Detox24hHospitalInpatient,Unknown,Unknown,Individual,Unknown,0,Unknown,...,NotReported,NotReported,NotReported,NotReported,NotReported,MiddleAtlantic,Northeast,NoIDU,Alcohol&Drugs,OtherNotInLaborForce
2,MA,1To3yCollege,NeverMarried,RehabResShortTerm,Unknown,5PlusPriorTreatments,DrugCareProvider,1Arrest,0,No,...,NotReported,NotReported,NotReported,NotReported,NotReported,NewEngland,Northeast,IDU,OtherDrugs,OtherNotInLaborForce
3,MI,Grade12OrGED,NeverMarried,AmbulatoryIntensiveOutpatient,ProbationOrParole,0PriorTreatments,CourtReferral,0Arrest,0,No,...,NotReported,NotReported,NotReported,NotReported,NotReported,EastNorthCentral,Midwest,IDU,OtherDrugs,Unemployed
4,PR,Grade12OrGED,NowMarried,AmbulatoryNonIntensiveOutpatient,Unknown,1PriorTreatments,Individual,0Arrest,0,No,...,NotReported,NotReported,NotReported,NotReported,NotReported,U.S. territories,USTerritory,IDU,OtherDrugs,UnknownNotInLaborForce
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
423481,CT,4yCollegePlus,NeverMarried,AmbulatoryNonIntensiveOutpatient,Unknown,5PlusPriorTreatments,Individual,0Arrest,1,Yes,...,NotReported,NotReported,NotReported,NotReported,NotReported,NewEngland,Northeast,NoIDU,OtherDrugs,Unemployed
423482,AZ,Unknown,NeverMarried,AmbulatoryNonIntensiveOutpatient,Unknown,1PriorTreatments,Individual,2PlusArrest,0,No,...,NotReported,NotReported,NotReported,NotReported,NotReported,Mountain,West,NoIDU,Alcohol&Drugs,OtherNotInLaborForce
423483,NY,1To3yCollege,Unknown,Detox24hFreeStandingRes,Unknown,Unknown,DrugCareProvider,Unknown,0,Unknown,...,NotReported,NotReported,NotReported,NotReported,NotReported,MiddleAtlantic,Northeast,IDU,OtherDrugs,OtherNotInLaborForce
423484,NJ,Grade12OrGED,NowMarried,AmbulatoryIntensiveOutpatient,DiversionaryProgram,5PlusPriorTreatments,CourtReferral,0Arrest,1,Yes,...,NotReported,NotReported,NotReported,NotReported,NotReported,MiddleAtlantic,Northeast,NoIDU,OtherDrugs,FullTime


# Approach 1 results
For each state, calculate the percent difference in the METHUSE rate between homeless and housed patients, and test whether that difference is statistically significant.

In [138]:
# Work on a copy of df2 for this analysis, then display it
dfg = df2.copy()
dfg

Unnamed: 0,STFIPS,EDUC,MARSTAT,SERVICES,DETCRIM,NOPRIOR,PSOURCE,ARRESTS,METHUSE,PSYPROB,...,BENZFLG,TRNQFLG,BARBFLG,SEDHPFLG,INHFLG,OTCFLG,OTHERFLG,IDU,ALCDRUG,DETEMPLOY
0,AK,Grade9To11,NeverMarried,AmbulatoryNonIntensiveOutpatient,Unknown,3PriorTreatments,Individual,0Arrest,NoMethUse,No,...,NotReported,NotReported,NotReported,NotReported,NotReported,NotReported,NotReported,IDU,AlcoholAndDrugs,Unemployed
1,AK,Grade12OrGED,NeverMarried,AmbulatoryNonIntensiveOutpatient,Unknown,2PriorTreatments,OtherHealthCareProvider,0Arrest,MethUse,No,...,NotReported,NotReported,NotReported,NotReported,NotReported,NotReported,NotReported,IDU,OtherDrugs,OtherNotInLaborForce
2,AK,Grade8OrLess,NowMarried,AmbulatoryNonIntensiveOutpatient,Unknown,5PlusPriorTreatments,DrugCareProvider,0Arrest,MethUse,Yes,...,NotReported,NotReported,NotReported,NotReported,NotReported,NotReported,NotReported,IDU,OtherDrugs,RetiredOrDisabled
3,AK,Grade12OrGED,NeverMarried,AmbulatoryNonIntensiveOutpatient,Unknown,3PriorTreatments,OtherHealthCareProvider,0Arrest,MethUse,No,...,NotReported,NotReported,NotReported,NotReported,NotReported,NotReported,NotReported,IDU,OtherDrugs,OtherNotInLaborForce
4,AK,Grade12OrGED,NeverMarried,AmbulatoryNonIntensiveOutpatient,Unknown,1PriorTreatments,DrugCareProvider,0Arrest,MethUse,Yes,...,NotReported,NotReported,NotReported,NotReported,NotReported,NotReported,NotReported,IDU,OtherDrugs,PartTime
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
524129,WY,Unknown,NowMarried,AmbulatoryIntensiveOutpatient,Unknown,0PriorTreatments,OtherReferral,0Arrest,NoMethUse,No,...,NotReported,NotReported,NotReported,NotReported,NotReported,NotReported,NotReported,NoIDU,OtherDrugs,Unemployed
524130,WY,Unknown,NowMarried,AmbulatoryIntensiveOutpatient,Court,2PriorTreatments,CourtReferral,0Arrest,NoMethUse,No,...,NotReported,NotReported,NotReported,NotReported,NotReported,NotReported,NotReported,NoIDU,OtherDrugs,Unemployed
524131,WY,Grade12OrGED,NeverMarried,AmbulatoryNonIntensiveOutpatient,Unknown,3PriorTreatments,DrugCareProvider,0Arrest,NoMethUse,Yes,...,NotReported,NotReported,NotReported,Reported,NotReported,NotReported,NotReported,NoIDU,AlcoholAndDrugs,Unemployed
524132,WY,Grade8OrLess,NeverMarried,AmbulatoryNonIntensiveOutpatient,ProbationOrParole,0PriorTreatments,CourtReferral,0Arrest,NoMethUse,No,...,NotReported,NotReported,NotReported,NotReported,NotReported,NotReported,NotReported,NoIDU,OtherDrugs,InstitutionResident


In [265]:
import pandas as pd
from scipy.stats import ttest_ind

# Simplify dataset: keep the state, living-arrangement, and outcome columns for rows
# with a known living arrangement. .copy() makes df_ttest an independent frame, so
# the column assignments below do not raise SettingWithCopyWarning.
df_ttest = dfg.loc[dfg['LIVARAG'] != 'Unknown', ['STFIPS', 'LIVARAG', 'METHUSE']].copy()

# Dependent living is grouped with independent living as 'Housed' (not excluded)
df_ttest['LIVARAG'] = df_ttest['LIVARAG'].replace({'DependLiving':'Housed', 'IndependentLiving':'Housed'})

# Encode the outcome as 1/0. map() yields a numeric column directly instead of
# relying on replace() to downcast an object column (deprecated in recent pandas).
# NOTE(review): any label other than these two becomes NaN — confirm none exist.
df_ttest['METHUSE'] = df_ttest['METHUSE'].map({'MethUse':1, 'NoMethUse':0})

# Create loop assets
state_array = df_ttest['STFIPS'].sort_values().unique()
result_columns = ['Homeless', 'Housed', 'PercentDifference', 'TStat', 'PValue', 'Significance']
state_records = []

# Loop through each state
for s in state_array:
    dft_temp = df_ttest[df_ttest['STFIPS'] == s]

    # Mean outcome per living arrangement; .get() returns NaN instead of raising
    # KeyError when a state has no rows for one of the two groups
    dft_grouped = dft_temp.groupby('LIVARAG')['METHUSE'].mean()
    homeless_moud = dft_grouped.get('Homeless', np.nan)
    independent_moud = dft_grouped.get('Housed', np.nan)

    # Percent difference relative to the housed group; undefined (NaN) when the
    # housed mean is zero or either group is missing
    if independent_moud > 0:
        percent_difference = (homeless_moud - independent_moud)/independent_moud
    else:
        percent_difference = np.nan

    # Perform a t-test between the two groups (skipped when either group is empty)
    group_A_values = dft_temp[dft_temp['LIVARAG'] == 'Homeless']['METHUSE']
    group_B_values = dft_temp[dft_temp['LIVARAG'] == 'Housed']['METHUSE']
    if len(group_A_values) > 0 and len(group_B_values) > 0:
        t_stat, p_value = ttest_ind(group_A_values, group_B_values)
    else:
        t_stat, p_value = np.nan, np.nan

    # Add significance based on p-value: the first threshold the p-value falls
    # under wins; a NaN p-value fails every comparison and gets no stars
    significance = ''
    for threshold, stars in ((0.001, '****'), (0.01, '***'), (0.05, '**'), (0.1, '*')):
        if p_value < threshold:
            significance = stars
            break

    # Collect the rounded numeric results, then attach the significance label
    dict_results = {'Homeless':homeless_moud, 'Housed':independent_moud, 'PercentDifference':percent_difference, 'TStat':t_stat, 'PValue':p_value}
    dict_results = {key: round(value, 6) for key, value in dict_results.items()}
    dict_results['Significance'] = significance
    state_records.append(dict_results)

# Build the results table once (one row per state) rather than growing it row by row
df_state = pd.DataFrame(state_records, index=state_array, columns=result_columns)

df_state.to_csv('ttest.csv')
df_state

Unnamed: 0,Homeless,Housed,PercentDifference,TStat,PValue,Significance
AK,0.387387,0.518122,-0.252324,-4.249484,2.3e-05,****
AL,0.118734,0.312766,-0.620376,-8.004596,0.0,****
AR,0.103448,0.364159,-0.715925,-7.541382,0.0,****
CA,0.344436,0.669338,-0.485408,-61.263606,0.0,****
CO,0.36432,0.366509,-0.005972,-0.181478,0.855995,
CT,0.17684,0.333963,-0.47048,-17.040277,0.0,****
DC,0.079659,0.057485,0.385728,1.725765,0.084591,*
DE,0.104848,0.253067,-0.585691,-9.880705,0.0,****
GA,0.207951,0.236675,-0.121363,-1.155031,0.248175,
HI,0.060606,0.072626,-0.165501,-0.327188,0.743808,
