In [1]:
# Importing dependencies

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

### ***Research based on Chat-GPT data***

### Useful columns from given dataseta that can help in Sepsis prediction:

1. Demographics (person_demographics_episode.csv)
Useful columns:
  1. age_in_months – Sepsis risk varies by age (infants & elderly are more vulnerable).
  2. gender – Some studies suggest gender may influence sepsis outcomes.


2. Lab & Vital Signs (measurement_lab.csv, measurement_meds.csv, measurement_observation.csv)
[**--Highly Relevant for Sepsis Prediction--**]
Useful columns

**Inflammatory Markers-
  1. C reactive protein [Mass/volume] in Serum or Plasma.
  2. Interleukin 6 [Mass/volume] in Body fluid.
  3. Procalcitonin [Mass/volume] in Serum or Plasma.
  4. Organ Dysfunction Indicators.
  5. Creatinine [Mass/volume] in Blood (Kidney function).
  6. Bilirubin.total [Moles/volume] in Serum or Plasma (Liver function).
  7. Lactate [Moles/volume] in Blood (Tissue oxygenation).
  8. Blood Gas & Electrolytes (Shock Indicators).
  9. Base excess in Arterial blood.
  10. Bicarbonate [Moles/volume] in Arterial/Venous blood.
  11. Carbon dioxide [Partial pressure] in Arterial/Venous blood.
  12. Oxygen [Partial pressure] in Arterial/Venous blood.
  13. Blood arterial pH, Blood venous pH.
  14. Coagulation & Blood Counts (Sepsis-related Clotting Issues).
  15. Platelet count.
  16. Prothrombin time (PT).
  17. Partial thromboplastin time, activated.
  18. D-dimer level.
  19. Fibrinogen measurement.
  20. Total white blood count.
  21. Vital Signs (Sepsis Symptoms).
  22. Systolic blood pressure, Diastolic blood pressure (Sepsis often leads to hypotension).
  23. Body temperature (Fever or hypothermia are key symptoms).
  24. Respiratory rate (Sepsis often causes rapid breathing).
  25. Heart rate (Tachycardia is common).
  26. Measurement of oxygen saturation at periphery (Low oxygen saturation is concerning).

### Patients Demographics dataset

In [2]:
# Reading and formatting patients demographics train
patient_demographics = pd.read_csv('/kaggle/input/phems-hackathon-early-sepsis-prediction/training_data/person_demographics_episode_train.csv')

# Necessary columns from patients demographics
patient_demographics = patient_demographics[['person_id', 'age_in_months', 'gender']]

# Label encoding gender column
patient_demographics['gender'] = patient_demographics['gender'].map({'MALE':0, 'FEMALE':1}).astype(np.int64)

# Grouping by 'person_id'
patient_demographics = patient_demographics.groupby('person_id').mean()

# Ressting index
patient_demographics = patient_demographics.reset_index()

# Checing dataset shape
patient_demographics.shape

(2649, 3)

### Lab Measurement dataset

In [3]:
# Reading and formatting patients lab measurements
lab_measure_train = pd.read_csv('/kaggle/input/phems-hackathon-early-sepsis-prediction/training_data/measurement_lab_train.csv')


# Necessary columns from lab measure
necessary_columns = ['person_id',
'C reactive protein [Mass/volume] in Serum or Plasma',
'Interleukin 6 [Mass/volume] in Body fluid',
'Procalcitonin [Mass/volume] in Serum or Plasma',
'Bilirubin.total [Moles/volume] in Serum or Plasma',
'Lactate [Moles/volume] in Blood',
'Creatinine [Mass/volume] in Blood',
'Base excess in Arterial blood by calculation',
'Bicarbonate [Moles/volume] in Arterial blood',
'Carbon dioxide [Partial pressure] in Venous blood',
'Oxygen [Partial pressure] in Venous blood',
'Blood arterial pH', 'Blood venous pH',
'Platelet count', 'Prothrombin time (PT)',
'Partial thromboplastin time',
' activated', 'D-dimer level', 'Fibrinogen measurement',
'Total white blood count'
]

# formatting with important cols
lab_measure_train = lab_measure_train[necessary_columns]

# Imputing null values
lab_measure_train.fillna(lab_measure_train.mean(), inplace=True)

# Groping by 'person_id'
lab_measure_train = lab_measure_train.groupby('person_id').mean()

# Checking dataset shape
lab_measure_train.shape

(2497, 19)

### Merge 1: Merging Patient demographics data with Lab measurement data on the basis of 'person_id' column.

In [4]:
# Merging dataset into a main dataset
main_data = patient_demographics.merge(lab_measure_train, on='person_id', how='inner')

# Checking main dataset shape so far
main_data.shape

(2497, 22)

### Meds Measurement dataset.

In [5]:
# Reading Meds measurement train
meds_measurement = pd.read_csv('/kaggle/input/phems-hackathon-early-sepsis-prediction/training_data/measurement_meds_train.csv')

# Necessary columns from meds measure
necessary_columns = ['person_id', 'Systolic blood pressure', 'Diastolic blood pressure', 'Body temperature','Respiratory rate', 'Heart rate', 'Measurement of oxygen saturation at periphery']

# Dormatting with important cols
meds_measurement = meds_measurement[necessary_columns]

# Dropping null values
meds_measurement = meds_measurement.fillna(meds_measurement.mean())

# Groping by 'person_id'
meds_measurement = meds_measurement.groupby('person_id').mean()

# Checking dataset shape
meds_measurement.shape

(2556, 6)

### Merge 2: Main data with Meds measurement on the basis of 'person_id' column.

In [6]:
# Merging main data with Meds measurement
main_data = main_data.merge(meds_measurement, on='person_id', how='outer')

# Imputing null values
main_data.fillna(main_data.mean(), inplace=True)

### Sepsis label Datase.

In [7]:
# Reading and formatting Sepsis label data
sepsis_label = pd.read_csv('/kaggle/input/phems-hackathon-early-sepsis-prediction/training_data/SepsisLabel_train.csv')
sepsis_label.head()

Unnamed: 0,person_id,measurement_datetime,SepsisLabel
0,274096387,2024-12-03 20:00:00,0
1,1719359031,2024-04-20 09:00:00,0
2,2024544816,2021-07-14 07:00:00,0
3,213710896,2022-05-24 07:00:00,0
4,1335786468,2024-08-25 22:00:00,0


### Merge 3: Merging Main data with Sepsis label data on the basis of 'person_id' column.

In [8]:
# Merging main data formed so far with sepsis label data
main_data = main_data.merge(sepsis_label, on='person_id', how='inner')

# Converting 'measurement_datetime' columns to date-time datatype
main_data['measurement_datetime'] = pd.to_datetime(main_data['measurement_datetime'])

# Using forward fill to impute missing date values
main_data['measurement_datetime'] = main_data['measurement_datetime'].ffill()

# Dropping unnecessary columns
main_data = main_data.drop(['person_id', 'measurement_datetime'], axis=1)

# Resetting Index
main_data = main_data.reset_index(drop=True)

# Checking data distribution in label column
main_data['SepsisLabel'].value_counts()

SepsisLabel
0    324760
1      6874
Name: count, dtype: int64

### The dataset is highly imbalanced. There is way more '0' category than '1'. Have to use resampler.

### Splitting data into dependent and independent variables.

In [9]:
X = main_data.drop('SepsisLabel', axis=1)  # Independent
y = main_data['SepsisLabel']               # Dependent

# Checking shapes
print(X.shape)
print(y.shape)

(331634, 27)
(331634,)


### Resampling and Train test split.

In [10]:
# Define undersampler
undersampler = RandomUnderSampler(random_state=42)

# Apply undersampling
X_balanced, y_balanced = undersampler.fit_resample(X, y)

In [11]:
# X = Features, y = Target variable
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)

In [12]:
# Checking distribution after resampling
y_train.value_counts()

SepsisLabel
1    5520
0    5478
Name: count, dtype: int64

Scaling data

In [13]:
# Creating instance of Standard Scaler
scaler = StandardScaler()

# Fitting data (X_train)
scaler.fit(X_train)

# Scaling X_train and X_test
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Predictive modelling.

In [14]:
# Creating model instance
rf_model = RandomForestClassifier(class_weight='balanced')

# Training ML algorithm
rf_model.fit(X_train_scaled, y_train)

# Making predictions
rf_pred = rf_model.predict(X_test_scaled)

### Scores

In [15]:
# Accuracy score
rf_score = accuracy_score(y_test, rf_pred)
print(f'Accuracy score: {rf_score}')
print()

# Precision score
rf_precision = precision_score(y_test, rf_pred)
print(f'Precision score: {rf_precision}')
print()

# Recall Score
rf_recall = recall_score(y_test, rf_pred)
print(f'Recall score: {rf_recall}')
print()

# F1 score
rf_f1 = f1_score(y_test, rf_pred)
print(f'F1 Score: {rf_f1}')

Accuracy score: 0.9156363636363636

Precision score: 0.8800813008130082

Recall score: 0.9593796159527327

F1 Score: 0.9180212014134275


# Submission

### Reading test data

In [16]:
# Patient Demographics data
patient_demographics_test = pd.read_csv('/kaggle/input/phems-hackathon-early-sepsis-prediction/testing_data/person_demographics_episode_test.csv')

# Lab Measurement data
lab_measure_test = pd.read_csv('/kaggle/input/phems-hackathon-early-sepsis-prediction/testing_data/measurement_lab_test.csv')

# Meds maesurement data
meds_measurement_test = pd.read_csv('/kaggle/input/phems-hackathon-early-sepsis-prediction/testing_data/measurement_meds_test.csv')

# Sepsis label data
sepsis_label_test = pd.read_csv('/kaggle/input/phems-hackathon-early-sepsis-prediction/testing_data/SepsisLabel_test.csv')

# Formatting the datasets

Formatting 'patient_demographics_test' dataset

In [17]:
# Necessary columns from patients demographics
patient_demographics_test = patient_demographics_test[['person_id', 'age_in_months', 'gender']]

# Label encoding gender column
patient_demographics_test['gender'] = patient_demographics_test['gender'].map({'MALE':0, 'FEMALE':1}).astype(np.int64)

# Grouping by 'person_id'
patient_demographics_test = patient_demographics_test.groupby('person_id').mean()

# Ressting index
patient_demographics_test = patient_demographics_test.reset_index()

Formatting 'lab_measure_test' dataset

In [18]:
# Necessary columns from lab measure
necessary_columns = ['person_id',
'C reactive protein [Mass/volume] in Serum or Plasma',
'Interleukin 6 [Mass/volume] in Body fluid',
'Procalcitonin [Mass/volume] in Serum or Plasma',
'Bilirubin.total [Moles/volume] in Serum or Plasma',
'Lactate [Moles/volume] in Blood',
'Creatinine [Mass/volume] in Blood',
'Base excess in Arterial blood by calculation',
'Bicarbonate [Moles/volume] in Arterial blood',
'Carbon dioxide [Partial pressure] in Venous blood',
'Oxygen [Partial pressure] in Venous blood',
'Blood arterial pH', 'Blood venous pH',
'Platelet count', 'Prothrombin time (PT)',
'Partial thromboplastin time',
' activated', 'D-dimer level', 'Fibrinogen measurement',
'Total white blood count'
]

# formatting with important cols
lab_measure_test = lab_measure_test[necessary_columns]

# Imputing null values
lab_measure_test.fillna(lab_measure_test.mean(), inplace=True)

# Groping by 'person_id'
lab_measure_test = lab_measure_test.groupby('person_id').mean()

Merge 1: 'patient_demographics_test' and 'lab_measure_test'.

In [19]:
# Merging main data with Meds measurement
main_test_data = patient_demographics_test.merge(lab_measure_test, on='person_id', how='outer')

Formatting 'meds_measurement_test' dataset.

In [20]:
# Necessary columns from meds measure
necessary_columns = ['person_id', 'Systolic blood pressure', 'Diastolic blood pressure', 'Body temperature','Respiratory rate', 'Heart rate', 'Measurement of oxygen saturation at periphery']

# Dormatting with important cols
meds_measurement_test = meds_measurement_test[necessary_columns]

# Dropping null values
meds_measurement_test = meds_measurement_test.dropna()

# Groping by 'person_id'
meds_measurement_test = meds_measurement_test.groupby('person_id').mean()

Merge 2: Merging 'main_test_data' with 'meds_measurement_test'.

In [21]:
# Merging main data with Meds measurement
main_test_data = main_test_data.merge(meds_measurement_test, on='person_id', how='outer')

Formatting 'sepsis_label_test' dataset.

In [22]:
sepsis_label_test['measurement_datetime'] = pd.to_datetime(sepsis_label_test['measurement_datetime'])

Merge 3: Merging 'main_test_data' with 'sepsis_label_test'.

In [23]:
# Merging main data formed so far with sepsis label data
main_test_data = main_test_data.merge(sepsis_label_test, on='person_id', how='inner')

# Imputing null values
main_test_data.fillna(main_test_data.mean(), inplace=True)

# Checking shape
main_test_data.shape

(130483, 29)

In [24]:
# Creating a copy of the dataset
main_test_data_copy = main_test_data.copy()

In [25]:
# Dropping unnecessary columns
main_test_data_copy = main_test_data_copy.drop(['person_id', 'measurement_datetime'], axis=1)

# Imputing null values
main_test_data_copy = main_test_data_copy.fillna(main_test_data_copy.mean())

# Resetting Index
main_test_data_copy = main_test_data_copy.reset_index(drop=True)

In [26]:
# Scaling test data
main_test_data_copy_scaled = scaler.transform(main_test_data_copy)

# Making prediction
test_data_pred = rf_model.predict(main_test_data_copy_scaled)

In [27]:
# Creating submission file from test data
submission_file = sepsis_label_test

# A column with concatenation of 'person id' and 'datetime' columns  
submission_file['person_id_datetime'] = submission_file['person_id'].astype(str) + '_' + submission_file['measurement_datetime'].astype(str)


# Dropping 'person_id' and 'measurement_datetime'
submission_file = submission_file.drop(['person_id', 'measurement_datetime'], axis=1)

# Adding predictions column
submission_file['SepsisLabel'] = test_data_pred

# Checking how submission file looks
submission_file.head()

Unnamed: 0,person_id_datetime,SepsisLabel
0,1416048048_2021-03-25 10:00:00,0
1,280531880_2024-01-22 18:00:00,0
2,1127023302_2023-12-29 21:00:00,0
3,2065909112_2021-07-07 05:00:00,0
4,264445818_2024-08-23 22:00:00,0


In [28]:
# Writing submission file
submission_file.to_csv("submission.csv", index=False)