In [2]:
import pandas as pd
import numpy as np

In [3]:
raw_admissions = pd.read_csv('data/HDHI Admission data.csv')
print(raw_admissions.shape)

(15757, 56)


In [5]:
admissions = raw_admissions.copy()

# Account for repeated visits with Visit number based on MRD No.
admissions.sort_values(by='MRD No.', inplace=True)
admissions['Visit_Number'] = admissions.groupby('MRD No.').cumcount() + 1

# Remove columns that are not needed.
columns_to_discard = ['SNO', 'MRD No.', 'D.O.A', 'D.O.D', 'month year']
admissions.drop(columns=columns_to_discard, inplace=True)

# Fill missing values with 0 and replace occurrences of 'EMPTY' with 0.
admissions.fillna(0, inplace=True)
admissions.replace('EMPTY', 0, inplace=True)

# Remove rows with invalid entires.
admissions = admissions[~admissions['CHEST INFECTION'].astype(str).str.contains(r"\\")]
admissions = admissions[admissions['DURATION OF STAY'] >= admissions['duration of intensive unit stay']]

# Exclude unrepresentative data (Discharge against Medical Adivce)
admissions = admissions[admissions['OUTCOME'] != 'DIMA']

# Remove outliers in target variable.
from scipy.stats import zscore
admissions = admissions[np.abs(zscore(admissions['DURATION OF STAY'])) <= 3]

# Convert specific columns to their appropriate data types.
columns_to_float = ['HB', 'TLC', 'CREATININE', 'BNP', 'PLATELETS', 'GLUCOSE', 'UREA', 'EF']
columns_to_int = ['CHEST INFECTION']
# Ensure columns exist before conversion.
columns_to_float = [col for col in columns_to_float if col in admissions.columns]
columns_to_int = [col for col in columns_to_int if col in admissions.columns]
admissions[columns_to_float] = admissions[columns_to_float].astype(float)
admissions[columns_to_int] = admissions[columns_to_int].astype(int)

# Save the cleaned DataFrame and display the first few rows as a preview.
admissions.to_csv('data/HDHI Admission data_cleaned.csv', index=False)
admissions.head()


Unnamed: 0,AGE,GENDER,RURAL,TYPE OF ADMISSION-EMERGENCY/OPD,DURATION OF STAY,duration of intensive unit stay,OUTCOME,SMOKING,ALCOHOL,DM,...,UTI,NEURO CARDIOGENIC SYNCOPE,ORTHOSTATIC,INFECTIVE ENDOCARDITIS,DVT,CARDIOGENIC SHOCK,SHOCK,PULMONARY EMBOLISM,CHEST INFECTION,Visit_Number
9466,64,F,U,O,10,2,DISCHARGE,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6656,70,M,U,O,1,0,DISCHARGE,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6416,74,F,U,O,3,0,DISCHARGE,0,0,0,...,0,0,0,0,0,0,0,0,0,1
8169,74,F,U,E,3,0,DISCHARGE,0,0,0,...,0,0,0,0,0,0,0,0,1,2
5194,74,F,U,O,2,1,DISCHARGE,0,0,0,...,0,0,0,0,0,0,0,0,0,3


In [6]:
admissions = pd.read_csv('data/HDHI Admission data_cleaned.csv')

admissions['GENDER'].replace(['M','F'],[0,1],inplace=True)
admissions['RURAL'].replace(['R','U'],[0,1],inplace=True)
admissions['TYPE OF ADMISSION-EMERGENCY/OPD'].replace(['E','O'],[0,1],inplace=True)
admissions.drop(['OUTCOME', 'duration of intensive unit stay'],axis=1,inplace=True)

admissions.to_csv('data/HDHI Admission data_prepared.csv', index=False)
admissions.head()

Unnamed: 0,AGE,GENDER,RURAL,TYPE OF ADMISSION-EMERGENCY/OPD,DURATION OF STAY,SMOKING,ALCOHOL,DM,HTN,CAD,...,UTI,NEURO CARDIOGENIC SYNCOPE,ORTHOSTATIC,INFECTIVE ENDOCARDITIS,DVT,CARDIOGENIC SHOCK,SHOCK,PULMONARY EMBOLISM,CHEST INFECTION,Visit_Number
0,64,1,1,1,10,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,1
1,70,0,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,74,1,1,1,3,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,74,1,1,0,3,0,0,0,1,1,...,0,0,0,0,0,0,0,0,1,2
4,74,1,1,1,2,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,3
