In [1]:
import os
import sys

import pandas as pd

In [2]:
# Project root. Defaults to the original author's checkout; set the CLABSI_HOME
# environment variable to run the notebook from another location.
# os.path.join(..., "") guarantees the trailing separator that the later
# string concatenations (home_dir + "raw_data/...") rely on.
home_dir = os.path.join(os.environ.get("CLABSI_HOME", '/Users/trungpq/Workspace/clabsi/'), "")
# Make the local `clabsi` package importable. The guard keeps re-running this
# cell from appending the same entry to sys.path again.
if home_dir not in sys.path:
    sys.path.append(home_dir)

In [3]:
from clabsi.utils import lookup

## Grouping guideline files

These files list the unique values of the `LineType` and `TherapyType` columns, after applying the standard string processing shown below. Their purpose is to map the many raw values of these columns onto a few categories.

In [4]:
# Locations of the two grouping guideline CSVs (one per column to be grouped).
guideline_dir = f"{home_dir}data_processing/data_processing_guideline/"
linetype_group_file = f"{guideline_dir}CLABSI-grouping-LineType.csv"
therapytype_group_file = f"{guideline_dir}CLABSI-grouping-TherapyType.csv"

## HjImczH population data

In [5]:
# Base name used when writing the cleaned population CSV files.
pop_fname = "HjImczH_all_patients"

In [6]:
col_names = ['Age','Gender','Ethinicity','Race','State','PrimaryInsurer','TherapyType','LineType','LineType2']
df = pd.read_csv(
    home_dir + "raw_data/FY_2022_data_from_HjImczH_all_patients_edited-Inf_Detail_FY22-Table 1.csv",
    names=col_names,
)
# Every column except the leading 'Age' gets the text clean-up.
for col in col_names[1:]:
    # This is the standard string processing for this project:
    # lower-case, strip surrounding whitespace, then normalise separators.
    cleaned = df[col].str.lower().str.strip()
    # regex=False makes each pattern a literal on every pandas version (the
    # default changed in pandas 2.0). The original chain repeated the "," -> "/"
    # step twice; the second pass was a no-op and is dropped.
    # NOTE(review): a newline becomes a single backslash, not "/". Kept as-is
    # because the grouping files are keyed on the processed strings - confirm.
    for old, new in ((",", "/"), (";", "/"), ("\n", "\\")):
        cleaned = cleaned.str.replace(old, new, regex=False)
    df[col] = cleaned
df

Unnamed: 0,Age,Gender,Ethinicity,Race,State,PrimaryInsurer,TherapyType,LineType,LineType2
1,49,female,unknown,unknown,md,contract - medicare denial,miscellaneous,hohn double lumen,central catheter (non-tunneled)
2,76,female,unknown,white,md,mc,antibiotics,hohn,central catheter (non-tunneled)
3,26,male,unknown,white,md,blue cross,total parenteral nutrition,unknown,central catheter (non-tunneled)
4,18,male,unknown,black or african-american,md,pp,catheter care,bard picc,central catheter (non-tunneled)
5,69,female,african american,black or african-american,md,mc,total parenteral nutrition,bard picc,central catheter (non-tunneled)
...,...,...,...,...,...,...,...,...,...
3151,75,male,unknown,white,md,contract - medicare denial,chemotherapy,port-a-cath,port-chest
3152,52,female,unknown,unknown,md,bcbs ghmsi,chemotherapy,port-a-cath,port-chest
3153,45,male,unknown,unknown,md,hospice,pain management,port-a-cath,port-chest
3154,63,female,african american,black or african-american,md,family (healthcare),steroid therapy,peripheral cath/ protective,port-peripheral


In [7]:
# Please remove all patients for whom the only method of infusion is “subcutaneous” or “peripheral.”
# NOTE(review): only "peripheral" is matched here, and only in LineType. Confirm
# that "subcutaneous" lines are excluded elsewhere (e.g. through the grouping file).
# case=False replaces the explicit lower-casing; na=False keeps the mask strictly
# boolean when LineType is missing (such rows are kept rather than breaking `~`).
to_remove = df["LineType"].str.contains("peripheral", case=False, na=False)
df = df.loc[~to_remove].reset_index(drop=True)

In [8]:
# Grouping: replace the raw LineType / TherapyType values with their group
# names using the guideline tables and the project's `lookup` helper.
linetype_group = pd.read_csv(linetype_group_file)
therapytype_group = pd.read_csv(therapytype_group_file)

for col, table, raw_col in (
    ("LineType", linetype_group, "LineTypeRaw"),
    ("TherapyType", therapytype_group, "TherapyTypeRaw"),
):
    df[col] = lookup(df, table, col, raw_col)

# Every value must have been assigned to a group (no missing results).
assert df["LineType"].notna().all()
assert df["TherapyType"].notna().all()

In [9]:
# Drop rows grouped as "Exclude" in either column, plus rows whose LineType
# group is "Pheresis".
excluded_line = df["LineType"].isin(["Exclude", "Pheresis"])
excluded_therapy = df["TherapyType"].eq("Exclude")
df = df[~(excluded_line | excluded_therapy)].reset_index(drop=True)

In [10]:
# One hot encode "LineType" and "TherapyType": one indicator column per group,
# appended after the existing columns (LineType groups first).
# dtype=int pins the indicators to 0/1 integers like the other binary columns
# (Peds, EPIC, CRISP). Without it the dtype depends on the pandas version
# (uint8 before 2.0, bool since), which changes what ends up in the CSV.
df = pd.concat(
    [
        df,
        pd.get_dummies(df["LineType"], dtype=int),
        pd.get_dummies(df["TherapyType"], dtype=int),
    ],
    axis=1,
)

In [11]:
# Binary pediatric indicator: 1 when Age <= 18, otherwise 0.
df["Peds"] = df["Age"].le(18).astype(int)

In [12]:
# Every row of this frame gets the negative outcome (NHSN_CLABSI = 0).
df = df.assign(NHSN_CLABSI=0)

In [13]:
# Label every row with the (masked) site name.
df = df.assign(SiteNameMasked='HjImczH')

In [14]:
# Proxy values: set both the EPIC and CRISP flags to 1 on every row.
df = df.assign(EPIC=1, CRISP=1)

In [15]:
# output: (W, X, C)
output_cols = [
    'SiteNameMasked', 'EPIC', 'CRISP', 'NHSN_CLABSI', 'Peds',
    'Chemotherapy', 'OPAT', 'TPN', 'OtherTherapy',
    'PICC', 'Port', 'TunneledCVC',
]
# Keep only the output columns, in this order, and write them without the index.
df = df[output_cols]
df.to_csv(f"{home_dir}processed_data/clean_{pop_fname}.csv", index=False)
# Keep a handle on the cleaned population frame: the name `df` is reused for
# the patient-level data below, and `df2` is read again in the final cell.
df2 = df

## Positive clabsi data

### Processing `PatientData_20230315`

In [16]:
# Base name of the patient-level input file (raw_data/<pat_fname>.tsv); also
# used when naming the cleaned output files.
pat_fname = "PatientData_20230315"

In [17]:
# Alternative input kept from an earlier revision, for reference only:
# "~/Workspace/clabsi/clabsi/data/PatientData_20231120.csv", sep=",", encoding='unicode_escape'
df = pd.read_table(home_dir + f"raw_data/{pat_fname}.tsv", sep="\t")
# Short aliases for the two long column headers.
df["EPIC"] = df["Has access to data due to accessing the same EHR platform; that is, EPIC"]
df["CRISP"] = df["Has access to data due to being in the same state (MD or regional info network)"]
col_names = ['SiteNameMasked', 'Age', 'PrimaryInsurer', 'LineType', 'TherapyType', 'EPIC', 'CRISP', 'NHSN_CLABSI']
# Explicit copy so the column assignments below operate on an independent frame.
df = df.loc[:, col_names].copy()
# Skips 'SiteNameMasked' and 'Age' at the front and 'NHSN_CLABSI' at the end.
for col in col_names[2:-1]:
    # This is the standard string processing for this project:
    # lower-case, strip surrounding whitespace, then normalise separators.
    cleaned = df[col].str.lower().str.strip()
    # regex=False makes each pattern a literal on every pandas version (the
    # default changed in pandas 2.0). The original chain repeated the "," -> "/"
    # step twice; the second pass was a no-op and is dropped.
    # NOTE(review): a newline becomes a single backslash, not "/". Kept as-is
    # because the grouping files are keyed on the processed strings - confirm.
    for old, new in ((",", "/"), (";", "/"), ("\n", "\\")):
        cleaned = cleaned.str.replace(old, new, regex=False)
    df[col] = cleaned
df

Unnamed: 0,SiteNameMasked,Age,PrimaryInsurer,LineType,TherapyType,EPIC,CRISP,NHSN_CLABSI
0,AsMINto,66,private,port,blood draw/access only,y,n,1
1,AsMINto,72,va/tricare,picc,blood draw/access only,y,n,1
2,AsMINto,58,private,picc,tpn,y,n,1
3,AsMINto,52,private,picc,blood draw/access only,y,n,1
4,AsMINto,60,private,picc,blood draw/access only,y,n,1
...,...,...,...,...,...,...,...,...
647,RKBWBAB,2,private,tunneled,tpn,n,n,1
648,RKBWBAB,51,private,port,opat,n,n,0
649,RKBWBAB,34,medicare,implanted port,hyd,n,n,1
650,RKBWBAB,5,medicare,tunneled,tpn,n,n,1


In [18]:
# Grouping: load both guideline tables, then replace the raw LineType /
# TherapyType values with their group names via the project's `lookup` helper.
linetype_group, therapytype_group = (
    pd.read_csv(path) for path in (linetype_group_file, therapytype_group_file)
)
df["LineType"] = lookup(df, linetype_group, "LineType", "LineTypeRaw")
df["TherapyType"] = lookup(df, therapytype_group, "TherapyType", "TherapyTypeRaw")
# Every value must have been assigned to a group (no missing results).
for grouped_col in ("LineType", "TherapyType"):
    assert not df[grouped_col].isna().any()

In [19]:
# Remove rows grouped as "Exclude" in either column. "Pheresis" lines are
# dropped as well, mirroring the filter applied to the population data:
# there is no Pheresis column among the output columns, so such rows would
# otherwise be kept with all line-type indicators equal to 0.
# NOTE(review): confirm the Pheresis exclusion is wanted for this dataset too.
to_remove = df["LineType"].isin(["Exclude", "Pheresis"]) | (df["TherapyType"] == "Exclude")
df = df.loc[~to_remove, :].reset_index(drop=True)

In [20]:
# One hot encode "LineType" and "TherapyType": one indicator column per group,
# appended after the existing columns (LineType groups first).
# dtype=int pins the indicators to 0/1 integers like the other binary columns
# (Peds, EPIC, CRISP). Without it the dtype depends on the pandas version
# (uint8 before 2.0, bool since), which changes what ends up in the CSV.
df = pd.concat(
    [
        df,
        pd.get_dummies(df["LineType"], dtype=int),
        pd.get_dummies(df["TherapyType"], dtype=int),
    ],
    axis=1,
)

In [21]:
# Binary pediatric indicator: 1 when Age <= 18, otherwise 0.
df["Peds"] = df["Age"].le(18).astype(int)

In [22]:
# Binary-encode the EPIC and CRISP flags: 1 where the value is 'y', else 0.
for flag in ("EPIC", "CRISP"):
    df[flag] = df[flag].eq('y').astype(int)

In [23]:
# output: (W, X, C)
output_cols = [
    'SiteNameMasked', 'EPIC', 'CRISP', 'NHSN_CLABSI', 'Peds',
    'Chemotherapy', 'OPAT', 'TPN', 'OtherTherapy',
    'PICC', 'Port', 'TunneledCVC',
]
# Keep only the output columns, in this order, and write them without the index.
df = df[output_cols]
df.to_csv(f"{home_dir}processed_data/clean_{pat_fname}.csv", index=False)

## Combine population data from JHHCG with positive CLABSI data from JHHCG

In [24]:
# Keep the rows of the patient-level frame whose masked site name is 'HjImczH'.
# The filter is on the site only; it does not look at NHSN_CLABSI.
site_mask = df['SiteNameMasked'].eq('HjImczH')
# Stack the cleaned population frame (df2) underneath those rows.
# NOTE: `df` is overwritten here, so re-running this cell on its own would
# append df2 a second time (the population rows carry the same site name).
df = pd.concat([df[site_mask], df2], axis=0)
# Write the combined frame without the index.
df.to_csv(f"{home_dir}processed_data/clean_{pop_fname}_{pat_fname}.csv", index=False)