# Imports

In [45]:
import pandas as pd
import numpy as np

# Lift the column cap so wide frames are displayed in full.
pd.options.display.max_columns = None

In [46]:
import warnings
warnings.filterwarnings("ignore")

In [47]:
# Load the raw workbook; no sheet is named, so pandas reads its default sheet.
df = pd.read_excel(io='data.xls')

In [48]:
# Map the workbook's original column headers to the English names used by
# every later cell. The frame is renamed in place; headers that are not
# present in the frame are skipped (rename's default behaviour).
english_column_names = dict([
    ('Chieucao', 'Height'),
    ('Cannang', 'Weight'),
    ('Duongvao', 'Route of Entry'),
    ('THA', 'Hypertension'),
    ('DTD', 'Diabetes'),
    ('Thomay', 'Mechanical Ventilation'),
    ('Mach', 'Pulse'),
    ('Nhietdo', 'Temperature'),
    ('HATB', 'Mean Arterial Pressure'),
    ('Nhiptho', 'Respiratory Rate'),
    ('Lactac0', 'Lactate'),
    ('Ure', 'Urea'),
    ('Creatinin', 'Creatinine'),
    ('PCT0', 'Procalcitonin'),
    ('BiLIrubin', 'Bilirubin'),
    ('BC0', 'White Blood Cell Count'),
    ('Kết cục tổn thương thận cấp', 'Outcome of acute kidney injury'),
    ('Điều trị lọc máu', 'Dialysis treatment'),
])
df.rename(mapper=english_column_names, axis='columns', inplace=True)

In [49]:
# Remove the two columns that are not used downstream ('Unnamed: 25' and 'STT').
df.drop(['Unnamed: 25', 'STT'], axis='columns', inplace=True)

In [50]:
# Column name -> divisor applied to that column.
# NOTE(review): these look like unit-conversion constants (e.g. 88.4 for
# creatinine, 17.1 for bilirubin) — confirm the source and target units.
scaling_factors = dict([
    ("Procalcitonin", 1000),
    ("White Blood Cell Count", 10),
    ("Creatinine", 88.4),
    ("Urea", 2.14),
    ("Bilirubin", 17.1),
    ("Albumin", 10),
])

# Each listed column is overwritten with its rescaled values, so running this
# cell a second time divides the data again.
for feature in scaling_factors:
    df[feature] = df[feature].div(scaling_factors[feature])


In [51]:
# Names of the features singled out as having a different distribution.
# No other cell shown in this notebook reads this list.
different_distribution_features = [
    "SOFA", "APACHEII", "Mean Arterial Pressure", "Lactate",
    "Creatinine", "Procalcitonin", "Bilirubin", "White Blood Cell Count",
]

In [52]:
# Columns whose missing values are imputed with the median of the row's
# (Gender, Hypertension, Outcome of acute kidney injury) group.
columns_to_impute = ["Respiratory Rate", "Albumin", 'Bilirubin', 'Procalcitonin', 'HCO3']

# Per-row group median, aligned on the frame's index. Rows whose group key is
# missing belong to no group, so they get no median here.
group_medians = (
    df.groupby(["Gender", "Hypertension", "Outcome of acute kidney injury"])[columns_to_impute]
    .transform("median")
)

# Fill only the NaN cells. Assigning the transform output straight back to the
# frame would also replace *observed* values with NaN on rows that have a
# missing group key. Cells whose group has no observed value stay NaN.
df[columns_to_impute] = df[columns_to_impute].fillna(group_medians)

In [53]:
# Snapshot of the frame; it receives the same transformed columns as `df`.
df_transformed = df.copy()

log_transform_cols = ['Procalcitonin', 'Creatinine', 'Urea', 'Lactate', 'HCO3', 'Mean Arterial Pressure']
sqrt_transform_cols = ['White Blood Cell Count', 'APACHEII', 'SOFA']

# log(1 + x) on the first group of columns, written to both frames.
for feature in log_transform_cols:
    logged = np.log1p(df[feature])
    df_transformed[feature] = logged
    df[feature] = logged.copy()

# Square root on the second group; negative values are clipped to 0 first.
for feature in sqrt_transform_cols:
    rooted = np.sqrt(df[feature].clip(lower=0))
    df_transformed[feature] = rooted
    df[feature] = rooted.copy()

In [54]:
def remove_outliers(df, cols=None, k=2.5):
    """Return a copy of ``df`` without rows outside the IQR fences.

    Columns are screened one after another. For each column, a row is kept
    only if its value lies in ``[Q1 - k * IQR, Q3 + k * IQR]`` (bounds
    inclusive). The quartiles of a later column are computed on the rows
    that survived the earlier columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    cols : iterable of str, optional
        Columns to screen, in order. When omitted, the five columns this
        function has always screened are used.
    k : float, default 2.5
        Multiplier applied to the IQR to build the fences.

    Returns
    -------
    pandas.DataFrame
        Filtered copy that keeps the original index labels.

    Raises
    ------
    KeyError
        If a requested column is not present in ``df``.

    Notes
    -----
    * A row with NaN in any screened column is dropped, because NaN fails
      the range comparison.
    * When a column's IQR is 0 both fences equal the quartile value, so only
      rows equal to that value survive.
    """
    if cols is None:
        cols = ['Mechanical Ventilation', 'Procalcitonin', 'Creatinine',
                'Bilirubin', 'White Blood Cell Count']

    df_clean = df.copy()

    for col in cols:
        q1 = df_clean[col].quantile(0.25)
        q3 = df_clean[col].quantile(0.75)
        spread = k * (q3 - q1)

        # between() is inclusive on both ends and evaluates to False for NaN.
        within_fences = df_clean[col].between(q1 - spread, q3 + spread)
        df_clean = df_clean[within_fences]

    return df_clean


In [55]:
# Split the frame into four positional cohorts that are filtered separately.
# iloc excludes the stop position, so each stop equals the next cohort's
# start. The earlier bounds (:98, 99:210, 211:411, 412:531) left positions
# 98, 210 and 411 out of every cohort, dropping those rows from the output.
# NOTE(review): confirm those rows are real records and that the last cohort
# ends at position 531. A stop past the end of the frame is harmless.
df1 = df.iloc[:99].copy()
df2 = df.iloc[99:211].copy()
df3 = df.iloc[211:412].copy()
df4 = df.iloc[412:532].copy()

In [56]:
# Filter each cohort on its own quartiles, then stack the survivors back into
# one frame in cohort order (original index labels are kept).
df1, df2, df3, df4 = (remove_outliers(cohort) for cohort in (df1, df2, df3, df4))

df = pd.concat([df1, df2, df3, df4])

In [57]:
# Write the cleaned frame to disk without the index column.
df.to_csv(path_or_buf="Cleaned.csv", index=False)