# Preprocess the TJH Dataset

## Import packages

In [1]:
import os
from pathlib import Path
import numpy as np
import pandas as pd

# Root folder of the TJH dataset; every path used in this notebook is built from it.
data_dir = "./tjh/"

# Create the output folders up front; an already-existing folder is not an error.
(Path(data_dir) / 'processed').mkdir(parents=True, exist_ok=True)
(Path(data_dir) / 'statistics').mkdir(parents=True, exist_ok=True)

## Read data from files

In [2]:
# Load the raw spreadsheet from <data_dir>/raw into a DataFrame.
# The file name (including its spelling) must match the file on disk.
df = pd.read_excel(Path(data_dir) / 'raw' / 'time_series_375_prerpocess_en.xlsx')

## Preprocess Data

### Rename columns

In [3]:
# Map the raw spreadsheet headers to the column names used in the rest of the notebook.
df = df.rename(
    columns={
        "PATIENT_ID": "PatientID",
        "outcome": "Outcome",
        "gender": "Sex",
        "age": "Age",
        "RE_DATE": "RecordTime",
        "Admission time": "AdmissionTime",
        "Discharge time": "DischargeTime",
    }
)

### Fill PatientID column

In [4]:
# Forward-fill PatientID so that every row carries the most recent non-missing
# ID above it.
# Assign the result back instead of calling `fillna(method='ffill', inplace=True)`
# on the selected column: `method=` is deprecated in recent pandas, and an
# in-place call on `df['PatientID']` is chained assignment, which does not
# update `df` when Copy-on-Write is enabled.
df['PatientID'] = df['PatientID'].ffill()

### Format data values

In [5]:
# Recode Sex: the value 2 becomes 0, giving the coding 1--male, 0--female.
# Assign the result back rather than using `inplace=True` on the selected
# column, which is chained assignment and does not update `df` when
# Copy-on-Write is enabled.
df['Sex'] = df['Sex'].replace(2, 0)

# Keep only year-month-day precision for the `RecordTime`, `DischargeTime` and
# `AdmissionTime` columns. After this step the three columns hold
# 'YYYY-MM-DD' strings instead of datetime values.
df['RecordTime'] = df['RecordTime'].dt.strftime('%Y-%m-%d')
df['DischargeTime'] = df['DischargeTime'].dt.strftime('%Y-%m-%d')
df['AdmissionTime'] = df['AdmissionTime'].dt.strftime('%Y-%m-%d')

### Exclude records with missing PatientID, RecordTime, or DischargeTime

In [6]:
# Remove every row in which at least one of the listed columns is missing.
df = df.dropna(
    how='any',
    subset=['PatientID', 'RecordTime', 'DischargeTime'],
)

### Calculate the Length-of-Stay (LOS) label

In [7]:
# LOS = number of whole days from the record date to the discharge date.
df['LOS'] = (pd.to_datetime(df['DischargeTime']) - pd.to_datetime(df['RecordTime'])).dt.days

# Notice: Set negative LOS values to 0 (records dated after discharge).
# `clip` is the vectorized equivalent of a per-row `0 if x < 0 else x`.
df['LOS'] = df['LOS'].clip(lower=0)

### Drop columns whose values are all the same or all NaN

In [8]:
# Remove the '2019-nCoV nucleic acid detection' column from the frame.
df = df.drop('2019-nCoV nucleic acid detection', axis='columns')

### Record feature names

In [9]:
# Identifier and date columns that describe a record.
basic_records = [
    'PatientID',
    'RecordTime',
    'AdmissionTime',
    'DischargeTime',
]

# Prediction targets.
target_features = [
    'Outcome',
    'LOS',
]

# Static patient attributes.
demographic_features = [
    'Sex',
    'Age',
]

# Lab-test columns. The names are used to select DataFrame columns, so they
# must be kept exactly as written -- including the trailing space that a few
# of them carry.
labtest_features = [
    'Hypersensitive cardiac troponinI',
    'hemoglobin',
    'Serum chloride',
    'Prothrombin time',
    'procalcitonin',
    'eosinophils(%)',
    'Interleukin 2 receptor',
    'Alkaline phosphatase',
    'albumin',
    'basophil(%)',
    'Interleukin 10',
    'Total bilirubin',
    'Platelet count',
    'monocytes(%)',
    'antithrombin',
    'Interleukin 8',
    'indirect bilirubin',
    'Red blood cell distribution width ',
    'neutrophils(%)',
    'total protein',
    'Quantification of Treponema pallidum antibodies',
    'Prothrombin activity',
    'HBsAg',
    'mean corpuscular volume',
    'hematocrit',
    'White blood cell count',
    'Tumor necrosis factorα',
    'mean corpuscular hemoglobin concentration',
    'fibrinogen',
    'Interleukin 1β',
    'Urea',
    'lymphocyte count',
    'PH value',
    'Red blood cell count',
    'Eosinophil count',
    'Corrected calcium',
    'Serum potassium',
    'glucose',
    'neutrophils count',
    'Direct bilirubin',
    'Mean platelet volume',
    'ferritin',
    'RBC distribution width SD',
    'Thrombin time',
    '(%)lymphocyte',
    'HCV antibody quantification',
    'D-D dimer',
    'Total cholesterol',
    'aspartate aminotransferase',
    'Uric acid',
    'HCO3-',
    'calcium',
    'Amino-terminal brain natriuretic peptide precursor(NT-proBNP)',
    'Lactate dehydrogenase',
    'platelet large cell ratio ',
    'Interleukin 6',
    'Fibrin degradation products',
    'monocytes count',
    'PLT distribution width',
    'globulin',
    'γ-glutamyl transpeptidase',
    'International standard ratio',
    'basophil count(#)',
    'mean corpuscular hemoglobin ',
    'Activation of partial thromboplastin time',
    'Hypersensitive c-reactive protein',
    'HIV antibody quantification',
    'serum sodium',
    'thrombocytocrit',
    'ESR',
    'glutamic-pyruvic transaminase',
    'eGFR',
    'creatinine',
]

### Set negative values to NaN

In [10]:
# Blank out (NaN) every negative entry in the demographic and lab-test columns.
# The boolean mask only spans those feature columns; all other columns of `df`
# are left as they are.
df[df[demographic_features + labtest_features].lt(0)] = np.nan

### Merge by date

In [11]:
# Collapse all rows that share the same patient and dates into a single row,
# averaging every remaining column. Group keys stay as regular columns
# (as_index=False); groups with a missing key are dropped (dropna=True).
df = (
    df.groupby(
        by=['PatientID', 'RecordTime', 'AdmissionTime', 'DischargeTime'],
        as_index=False,
        dropna=True,
    )
    .mean()
)

### Change the order of columns

In [12]:
# Column order of the exported table: record info, targets, demographics, lab tests.
df = df[[*basic_records, *target_features, *demographic_features, *labtest_features]]

### Export data to files

In [13]:
# Write the formatted table to <data_dir>/processed, without the index column.
df.to_csv(Path(data_dir) / 'processed' / 'tjh_dataset_formatted.csv', index=False)