## Load data

In [56]:
import pandas as pd
import ast
import numpy as np

# Load in data
admissions = 'tedsa_puf_2019.csv'
df = pd.read_csv(f'../../Downloads/{admissions}')

## Compare SUB1 groups with DSMCRIT

In [57]:
sub1 = df['SUB1']
for i in [1,2,3,4,8,9,10,11,12,13,14,15,16,17,18,19]:
    sub1 = sub1.replace(i, 'Other')
# sub1 = sub1.replace(-9, np.nan)

dsmcrit = df['DSMCRIT']
for i in [1,2,3,4,6,7,8,9,10,11,13,14,15,16,17,18,19]:
    dsmcrit = dsmcrit.replace(i, 'Other')
# dsmcrit = dsmcrit.replace(-9, np.nan)

In [62]:
df4 = pd.DataFrame()
df4['SUB1'] = sub1
df4['DSMCRIT'] = dsmcrit

df4.value_counts()

print('See "SUB1 DSMCRIT overlap.xlsx" file for a better breakdown')

See "SUB1 DSMCRIT overlap.xlsx" file for a better breakdown


## Filter out select rows and columns

In [1]:


# Drop defined columns
columns_to_drop = ['ADMYR', 'CASEID', 'DETNLF', 'DETCRIM']
df = df.drop(columns=columns_to_drop)
print(f'Dropped {len(columns_to_drop)} columns ({len(df.columns)} remain)')

# Drop Puerto Rico
df = df[df['STFIPS'] != 72]

# Only keep patients admitted with self-described use of an opioid as their primary substance use
old_rows = len(df)
df = df[df['SUB1'].between(5, 7)]
new_rows = len(df)
percent_change = round((old_rows-new_rows)/old_rows, 1)*100
print(f'Dropped {old_rows-new_rows} observations or {percent_change}% of the data {new_rows} rows remain)')

df = df.reset_index(drop='index')

Dropped 4 columns (58 remain)
Dropped 1296849 observations or 70.0% of the data 564425 rows remain)


In [2]:
df

Unnamed: 0,STFIPS,CBSA2010,EDUC,MARSTAT,SERVICES,NOPRIOR,PSOURCE,ARRESTS,EMPLOY,METHUSE,...,TRNQFLG,BARBFLG,SEDHPFLG,INHFLG,OTCFLG,OTHERFLG,DIVISION,REGION,IDU,ALCDRUG
0,2,-9,2,1,7,3,1,0,3,2,...,0,0,0,0,0,0,9,4,1,3
1,2,-9,3,1,7,2,3,0,4,1,...,0,0,0,0,0,0,9,4,1,2
2,2,-9,1,2,7,5,2,0,4,1,...,0,0,0,0,0,0,9,4,1,2
3,2,-9,3,1,7,3,3,0,4,1,...,0,0,0,0,0,0,9,4,1,2
4,2,-9,3,1,7,1,2,0,2,1,...,0,0,0,0,0,0,9,4,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564420,56,-9,-9,2,6,0,6,0,3,2,...,0,0,0,0,0,0,8,4,0,2
564421,56,-9,-9,2,6,2,7,0,3,2,...,0,0,0,0,0,0,8,4,0,2
564422,56,-9,3,1,7,3,2,0,3,2,...,0,0,1,0,0,0,8,4,0,3
564423,56,-9,1,1,7,0,7,0,4,2,...,0,0,0,0,0,0,8,4,0,2


## Make dataset human-readable

In [3]:
# Load in variable dictionary
with open('variable_dictionary.txt') as file:
    variable_dict_string = file.read()
    variable_dict = ast.literal_eval(variable_dict_string)

# Rename entries in column according to dictionary
df2 = df.copy()
for col, col_dict in variable_dict.items():
    for old_value, new_value in variable_dict[col].items():
        df2[col] = df2[col].replace(old_value, new_value)

# Rename "-9" values as "Unknown"
for col in df2.columns:
    df2[col] = df2[col].replace(-9, 'Unknown')

In [4]:
df2

Unnamed: 0,STFIPS,CBSA2010,EDUC,MARSTAT,SERVICES,NOPRIOR,PSOURCE,ARRESTS,EMPLOY,METHUSE,...,TRNQFLG,BARBFLG,SEDHPFLG,INHFLG,OTCFLG,OTHERFLG,DIVISION,REGION,IDU,ALCDRUG
0,StateAK,Unknown,Grade9To11,NeverMarried,AmbulatoryNonIntensiveOutpatient,3PriorTreatments,Individual,0Arrest,Unemployed,NoMethUse,...,NotReported,NotReported,NotReported,NotReported,NotReported,NotReported,Pacific,West,IDU,Alcohol&Drugs
1,StateAK,Unknown,Grade12OrGED,NeverMarried,AmbulatoryNonIntensiveOutpatient,2PriorTreatments,OtherHealthCareProvider,0Arrest,NotInLaborForce,MethUse,...,NotReported,NotReported,NotReported,NotReported,NotReported,NotReported,Pacific,West,IDU,OtherDrugs
2,StateAK,Unknown,GradeOrLess,NowMarried,AmbulatoryNonIntensiveOutpatient,5PlusPriorTreatments,DrugCareProvider,0Arrest,NotInLaborForce,MethUse,...,NotReported,NotReported,NotReported,NotReported,NotReported,NotReported,Pacific,West,IDU,OtherDrugs
3,StateAK,Unknown,Grade12OrGED,NeverMarried,AmbulatoryNonIntensiveOutpatient,3PriorTreatments,OtherHealthCareProvider,0Arrest,NotInLaborForce,MethUse,...,NotReported,NotReported,NotReported,NotReported,NotReported,NotReported,Pacific,West,IDU,OtherDrugs
4,StateAK,Unknown,Grade12OrGED,NeverMarried,AmbulatoryNonIntensiveOutpatient,1PriorTreatments,DrugCareProvider,0Arrest,PartTime,MethUse,...,NotReported,NotReported,NotReported,NotReported,NotReported,NotReported,Pacific,West,IDU,OtherDrugs
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564420,StateWY,Unknown,Unknown,NowMarried,AmbulatoryIntensiveOutpatient,0PriorTreatments,OtherReferral,0Arrest,Unemployed,NoMethUse,...,NotReported,NotReported,NotReported,NotReported,NotReported,NotReported,Mountain,West,NoIDU,OtherDrugs
564421,StateWY,Unknown,Unknown,NowMarried,AmbulatoryIntensiveOutpatient,2PriorTreatments,CourtReferral,0Arrest,Unemployed,NoMethUse,...,NotReported,NotReported,NotReported,NotReported,NotReported,NotReported,Mountain,West,NoIDU,OtherDrugs
564422,StateWY,Unknown,Grade12OrGED,NeverMarried,AmbulatoryNonIntensiveOutpatient,3PriorTreatments,DrugCareProvider,0Arrest,Unemployed,NoMethUse,...,NotReported,NotReported,Reported,NotReported,NotReported,NotReported,Mountain,West,NoIDU,Alcohol&Drugs
564423,StateWY,Unknown,GradeOrLess,NeverMarried,AmbulatoryNonIntensiveOutpatient,0PriorTreatments,CourtReferral,0Arrest,NotInLaborForce,NoMethUse,...,NotReported,NotReported,NotReported,NotReported,NotReported,NotReported,Mountain,West,NoIDU,OtherDrugs


## Compare

In [5]:
# Compare SUB1-567 with DSMMCRIT

In [10]:
df3 = df2.copy()

pd.get_dummies(df3)

Unnamed: 0,STFIPS_StateAK,STFIPS_StateAL,STFIPS_StateAR,STFIPS_StateAZ,STFIPS_StateCA,STFIPS_StateCO,STFIPS_StateCT,STFIPS_StateDC,STFIPS_StateDE,STFIPS_StateFL,...,DIVISION_WestNorthCentral,DIVISION_WestSouthCentral,REGION_Midwest,REGION_Northeast,REGION_South,REGION_West,IDU_IDU,IDU_NoIDU,ALCDRUG_Alcohol&Drugs,ALCDRUG_OtherDrugs
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,1,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,1
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,1
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,1
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564420,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,1
564421,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,1
564422,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,1,0
564423,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,1
