In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_parquet('final-data/final-set-a.parquet')
df = df.fillna(0)
X = df.groupby("RecordID").last(numeric_only=True).reset_index()
X = X.drop(columns=["RecordID"])
X = X[sorted(X.columns)]
y_df = pd.read_parquet('processed-data/processed-outcomes-a.parquet')
y = y_df["In-hospital_death"].to_numpy().flatten()
print(len(X))
print(len(y))

4000
4000


In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Split into train and test sets for an unbiased assessment of performance
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=42, 
                                                    stratify=y)

# Initialize and fit a Random Forest model
rf = RandomForestClassifier(n_estimators=100, 
                            random_state=42,
                            n_jobs=-1)  # use all cores for speed
rf.fit(X_train, y_train)

In [13]:
importances = rf.feature_importances_

# Combine with feature names into a DataFrame
feature_importance_df = pd.DataFrame({
    'feature': X_train.columns,
    'importance': importances
})

# Sort by descending importance
feature_importance_df.sort_values(by='importance', ascending=False, inplace=True)

print(feature_importance_df)

        feature  importance
29    Platelets    0.040230
16           HR    0.036983
39       Weight    0.036424
38          WBC    0.034834
23    NIDiasABP    0.034720
15          HCT    0.034715
13      Glucose    0.034642
20          MAP    0.033584
5           BUN    0.033571
3           Age    0.032960
37        Urine    0.031984
25     NISysABP    0.031341
24        NIMAP    0.030907
26           Na    0.030878
28         PaO2    0.030827
33         Temp    0.029533
9       DiasABP    0.029026
18            K    0.028736
8    Creatinine    0.028687
22           Mg    0.027716
14         HCO3    0.027456
19      Lactate    0.026737
32       SysABP    0.025746
40           pH    0.024497
17       Height    0.023698
27        PaCO2    0.022682
4       Albumin    0.020216
0           ALP    0.020114
1           ALT    0.019382
2           AST    0.017808
30     RespRate    0.017606
10         FiO2    0.017224
36    TroponinT    0.015859
6     Bilirubin    0.015792
31         SaO2    0

## With tfresh 

In [5]:
from tsfresh import extract_features
from tsfresh.feature_extraction import MinimalFCParameters
from tsfresh.utilities.dataframe_functions import impute
import numpy as np
import pandas as pd

In [18]:
extraction_settings = MinimalFCParameters()

# Extracting features of concatenated training and test dataset (need to do this in one go so the feature extraction is consistent)
df_train = pd.read_parquet('final-data/final-set-a.parquet')
df_test = pd.read_parquet('final-data/final-set-c.parquet').drop(columns=["ICUType"])

print(df_train.shape)
print(df_test.shape)

df_train = df_train.fillna(0)
df_test = df_test.fillna(0)

X = extract_features(df_train, column_id='RecordID', column_sort='Time', default_fc_parameters=extraction_settings, impute_function=impute)
X= X[sorted(X.columns)]
y = pd.read_parquet('processed-data/processed-outcomes-a.parquet')["In-hospital_death"].to_numpy().flatten()


(196000, 42)
(196000, 42)


Feature Extraction: 100%|██████████| 50/50 [00:35<00:00,  1.42it/s]


In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=42, 
                                                    stratify=y)

# Initialize and fit a Random Forest model
rf = RandomForestClassifier(n_estimators=100, 
                            random_state=42,
                            n_jobs=-1)  # use all cores for speed
rf.fit(X_train, y_train)

In [20]:
importances = rf.feature_importances_

# Combine with feature names into a DataFrame
feature_importance_df = pd.DataFrame({
    'feature': X_train.columns,
    'importance': importances
})

# Sort by descending importance
feature_importance_df.sort_values(by='importance', ascending=False, inplace=True)

print(feature_importance_df)

                             feature  importance
367        Urine__standard_deviation    0.006292
369                  Urine__variance    0.006006
299              Platelets__variance    0.005845
163                         HR__mean    0.005720
228                   Mg__sum_values    0.005586
88            Creatinine__sum_values    0.005562
186              K__root_mean_square    0.005506
223                         Mg__mean    0.005358
290      Platelets__absolute_maximum    0.005357
284                     PaO2__median    0.005347
134                  Glucose__median    0.005202
297    Platelets__standard_deviation    0.005126
56             BUN__root_mean_square    0.005108
360          Urine__absolute_maximum    0.005052
188                    K__sum_values    0.005044
266             Na__root_mean_square    0.004845
339                   Temp__variance    0.004832
239              NIDiasABP__variance    0.004789
138              Glucose__sum_values    0.004745
267           Na__st

In [21]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Now print the DataFrame in full
print(feature_importance_df)

                             feature  importance
367        Urine__standard_deviation    0.006292
369                  Urine__variance    0.006006
299              Platelets__variance    0.005845
163                         HR__mean    0.005720
228                   Mg__sum_values    0.005586
88            Creatinine__sum_values    0.005562
186              K__root_mean_square    0.005506
223                         Mg__mean    0.005358
290      Platelets__absolute_maximum    0.005357
284                     PaO2__median    0.005347
134                  Glucose__median    0.005202
297    Platelets__standard_deviation    0.005126
56             BUN__root_mean_square    0.005108
360          Urine__absolute_maximum    0.005052
188                    K__sum_values    0.005044
266             Na__root_mean_square    0.004845
339                   Temp__variance    0.004832
239              NIDiasABP__variance    0.004789
138              Glucose__sum_values    0.004745
267           Na__st

In [22]:
from sklearn.metrics import accuracy_score, roc_auc_score

y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)[:, 1]  # probability of class=1

acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)

print(f"Accuracy: {acc:.3f}")
print(f"AUC: {auc:.3f}")

Accuracy: 0.861
AUC: 0.490


## New try : Accuracy stays the same as before, but AUC gets slightly better

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from tsfresh import extract_features
from tsfresh.feature_extraction import MinimalFCParameters
from tsfresh.utilities.dataframe_functions import impute
import numpy as np
import pandas as pd

### My idea here: 
1. Ask Chatgpt what values are good for each metric
2. get the statistics that indicates the range
3. For static variables, don't add any
- Even for low is bad kind of substances => Variance are more important features than minimum 

In [4]:


# Extracting features of concatenated training and test dataset (need to do this in one go so the feature extraction is consistent)
df_train = pd.read_parquet('final-data/final-set-a.parquet')
df_test = pd.read_parquet('final-data/final-set-c.parquet').drop(columns=["ICUType"])

print(df_train.shape)
print(df_test.shape)

df_train = df_train.fillna(0)
df_test = df_test.fillna(0)
custom_fc_parameters = {
    "sum_values": None,
    "mean": None,
    "standard_deviation": None,
    "variance": None,
    "root_mean_square": None,
    "absolute_maximum": None,
    "maximum": None,
    "minimum": None,
    "median": None
}
# kind_to_fc_parameters = {
#   "Albumin": "too low bad",
#   "ALP": "too high bad",
#   "ALT": "too high bad",
#   "AST": "too high bad",
#   "Bilirubin": "too high bad",
#   "BUN": "normal",
#   "Cholesterol": "too high bad",
#   "Creatinine": "too high bad",
#   "DiasABP": "normal",
#   "FiO2": "too high bad",
#   "GCS": "too low bad",
#   "Glucose": "normal",
#   "HCO3": "normal",
#   "HCT": "normal",
#   "HR": "normal",
#   "K": "normal",
#   "Lactate": "too high bad",
#   "Mg": "normal",
#   "MAP": "normal",
#   "MechVent": "too high bad",
#   "Na": "normal",
#   "NIDiasABP": "normal",
#   "NIMAP": "normal",
#   "NISysABP": "normal",
#   "PaCO2": "normal",
#   "PaO2": "normal",
#   "pH": "normal",
#   "Platelets": "normal",
#   "RespRate": "normal",
#   "SaO2": "too low bad",
#   "SysABP": "normal",
#   "Temp": "normal",
#   "TropI": "too high bad",
#   "TropT": "too high bad",
#   "Urine": "normal",
#   "WBC": "normal",
kind_to_fc_parameters = {
  "Weight": {"mean":None},
    "Age":{"maximum":None},
    "Gender":{"maximum":None},
    "Height":{"maximum":None},
    "MechVent":{"maximum":None}
    
}
X = extract_features(df_train, column_id='RecordID', column_sort='Time', default_fc_parameters=custom_fc_parameters, kind_to_fc_parameters=kind_to_fc_parameters, impute_function=impute)
X= X[sorted(X.columns)]
y = pd.read_parquet('processed-data/processed-outcomes-a.parquet')["In-hospital_death"].to_numpy().flatten()


(196000, 42)
(196000, 42)


Feature Extraction: 100%|██████████| 50/50 [00:32<00:00,  1.52it/s]


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=42, 
                                                    stratify=y)

# Initialize and fit a Random Forest model
rf = RandomForestClassifier(n_estimators=100, 
                            random_state=42,
                            n_jobs=-1)  # use all cores for speed
rf.fit(X_train, y_train)

In [6]:
importances = rf.feature_importances_

# Combine with feature names into a DataFrame
feature_importance_df = pd.DataFrame({
    'feature': X_train.columns,
    'importance': importances
})

# Sort by descending importance
feature_importance_df.sort_values(by='importance', ascending=False, inplace=True)

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Now print the DataFrame in full
print(feature_importance_df)

                             feature  importance
299                Urine__sum_values    0.006210
168                         Mg__mean    0.006146
237              Platelets__variance    0.006095
292          Urine__absolute_maximum    0.006090
89                  FiO2__sum_values    0.006088
210                     Na__variance    0.006062
235    Platelets__standard_deviation    0.006057
208           Na__standard_deviation    0.005821
308                  WBC__sum_values    0.005775
270           Temp__root_mean_square    0.005677
229      Platelets__absolute_maximum    0.005673
173                   Mg__sum_values    0.005655
135                   HR__sum_values    0.005563
39                         BUN__mean    0.005529
297          Urine__root_mean_square    0.005451
27                      Age__maximum    0.005434
227                 PaO2__sum_values    0.005415
298        Urine__standard_deviation    0.005406
122                      HCT__median    0.005356
103                 

In [None]:
#calculations group by your RecordID

In [7]:
from sklearn.metrics import accuracy_score, roc_auc_score

y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)[:, 1]  # probability of class=1

acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)

print(f"Accuracy: {acc:.3f}")
print(f"AUC: {auc:.3f}")

Accuracy: 0.861
AUC: 0.509


## Another try considering normal distribution - mean and std variance matters a lot there 
Just using mean and variance - already 0.861 accuracy
Based on their distribution : For variables that doesn't have normal distribution put in other values 
For values where variance is important - put also ranges like maximum, minimimum
sum_values are also important for some variables 

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from tsfresh import extract_features
from tsfresh.feature_extraction import MinimalFCParameters
from tsfresh.utilities.dataframe_functions import impute
import numpy as np
import pandas as pd
# Extracting features of concatenated training and test dataset (need to do this in one go so the feature extraction is consistent)
df_train = pd.read_parquet('final-data/final-set-a.parquet')
df_test = pd.read_parquet('final-data/final-set-c.parquet').drop(columns=["ICUType"])

print(df_train.shape)
print(df_test.shape)

df_train = df_train.fillna(0)
df_test = df_test.fillna(0)
# For most param - distribution is normal distribution 
custom_fc_parameters = {
    "mean": None,
    "standard_deviation": None,
}

# things that are not : AST, ALT,Bilirubin- spike at lower values, high is bad 
# GCS means alert - low is unconscious 
# MechVent 1 is bad - on ventilator 
kind_to_fc_parameters = {
    "Weight": {"mean":None},
    "Age":{"maximum":None},
    "Gender":{"maximum":None},
    "Height":{"maximum":None},
    "MechVent":{"maximum":None},
    "ALT":{"maximum":None},
    "AST":{"maximum":None},
    "Bilirubin":{"maximum":None},
    "GCS":{"minimum":None},
    "Lactate":{"maximum":None},
    "Pa02":{
        "mean": None,
        "variance": None,
    },
    "PaCO2":{
        "mean": None,
        "variance": None,
    },
    "FiO2":{
        "mean": None,
        "variance": None,
    },
    "Glucose":{
        "sum_values":None,
        "mean": None,
        "standard_deviation": None,
    },
    "Platelets":{
        "mean": None,
        "variance": None,
        "maximum":None
    },
    "SaO2":{"minimum":None},
    "TroponinI" :{"maximum":None},
    "TroponinT": {"maximum":None},
    "Urine":{
        "mean": None,
        "standard_deviation": None,
        "sum_values":None
    },
    "RespRate":{
        "variance":None
    }, 
    "HR":{
        "variance":None,
        "maximum":None
    }, "BP":{
    "variance":None,
        "maximum":None
    }
}
X = extract_features(df_train, column_id='RecordID', column_sort='Time', default_fc_parameters=custom_fc_parameters, kind_to_fc_parameters=kind_to_fc_parameters, impute_function=impute)
X= X[sorted(X.columns)]
y = pd.read_parquet('processed-data/processed-outcomes-a.parquet')["In-hospital_death"].to_numpy().flatten()


FileNotFoundError: [Errno 2] No such file or directory: 'ML4H_project1/processed-data/processed-set-c.parquet'

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=42, 
                                                    stratify=y)

# Initialize and fit a Random Forest model
rf = RandomForestClassifier(n_estimators=100, 
                            random_state=42,
                            n_jobs=-1)  # use all cores for speed
rf.fit(X_train, y_train)

In [5]:
importances = rf.feature_importances_

# Combine with feature names into a DataFrame
feature_importance_df = pd.DataFrame({
    'feature': X_train.columns,
    'importance': importances
})

# Sort by descending importance
feature_importance_df.sort_values(by='importance', ascending=False, inplace=True)

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Now print the DataFrame in full
print(feature_importance_df)

                            feature  importance
62        Urine__standard_deviation    0.022571
36                         Mg__mean    0.022036
31            K__standard_deviation    0.020942
44                         Na__mean    0.020687
45           Na__standard_deviation    0.020637
30                          K__mean    0.020599
25                        HCT__mean    0.020163
28                     HR__variance    0.019868
7                         BUN__mean    0.019816
52              Platelets__variance    0.019707
13   Creatinine__standard_deviation    0.019706
12                 Creatinine__mean    0.019635
22              Glucose__sum_values    0.019624
26          HCT__standard_deviation    0.019453
57                       Temp__mean    0.019018
64                        WBC__mean    0.018700
8           BUN__standard_deviation    0.018691
39    NIDiasABP__standard_deviation    0.018582
24         HCO3__standard_deviation    0.018564
58         Temp__standard_deviation    0

In [6]:
from sklearn.metrics import accuracy_score, roc_auc_score

y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)[:, 1]  # probability of class=1

acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)

print(f"Accuracy: {acc:.3f}")
print(f"AUC: {auc:.3f}")

Accuracy: 0.861
AUC: 0.520


## With not scaled dataset - AUC Much better
Accuracy: 0.861
AUC: 0.826

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from tsfresh import extract_features
from tsfresh.feature_extraction import MinimalFCParameters
from tsfresh.utilities.dataframe_functions import impute
import numpy as np
import pandas as pd
# Extracting features of concatenated training and test dataset (need to do this in one go so the feature extraction is consistent)
df_train = pd.read_parquet('processed-data/processed-set-a.parquet')
df_test = pd.read_parquet('processed-data/processed-set-c.parquet').drop(columns=["ICUType"])

print(df_train.shape)
print(df_test.shape)

df_train = df_train.fillna(0)
df_test = df_test.fillna(0)
# For most param - distribution is normal distribution 
custom_fc_parameters = {
    "mean": None,
    "standard_deviation": None,
}

# things that are not : AST, ALT,Bilirubin- spike at lower values, high is bad 
# GCS means alert - low is unconscious 
# MechVent 1 is bad - on ventilator 
kind_to_fc_parameters = {
    "Weight": {"mean":None},
    "Age":{"maximum":None},
    "Gender":{"maximum":None},
    "Height":{"maximum":None},
    "MechVent":{"maximum":None},
    "ALT":{"maximum":None},
    "AST":{"maximum":None},
    "Bilirubin":{"maximum":None},
    "GCS":{"minimum":None},
    "Lactate":{"maximum":None},
    "Pa02":{
        "mean": None,
        "variance": None,
    },
    "PaCO2":{
        "mean": None,
        "variance": None,
    },
    "FiO2":{
        "mean": None,
        "variance": None,
    },
    "Glucose":{
        "sum_values":None,
        "mean": None,
        "standard_deviation": None,
    },
    "Platelets":{
        "mean": None,
        "variance": None,
        "maximum":None
    },
    "SaO2":{"minimum":None},
    "TroponinI" :{"maximum":None},
    "TroponinT": {"maximum":None},
    "Urine":{
        "mean": None,
        "standard_deviation": None,
        "sum_values":None
    },
    "RespRate":{
        "variance":None
    }, 
    "HR":{
        "variance":None,
        "maximum":None
    }, "BP":{
    "variance":None,
        "maximum":None
    }
}
X = extract_features(df_train, column_id='RecordID', column_sort='Time', default_fc_parameters=custom_fc_parameters, kind_to_fc_parameters=kind_to_fc_parameters, impute_function=impute)
X= X[sorted(X.columns)]
y = pd.read_parquet('processed-data/processed-outcomes-a.parquet')["In-hospital_death"].to_numpy().flatten()


(196000, 43)
(196000, 42)


Feature Extraction: 100%|██████████| 50/50 [00:25<00:00,  1.97it/s]


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=42, 
                                                    stratify=y)

# Initialize and fit a Random Forest model
rf = RandomForestClassifier(n_estimators=100, 
                            random_state=42,
                            n_jobs=-1)  # use all cores for speed
rf.fit(X_train, y_train)

In [10]:
importances = rf.feature_importances_

# Combine with feature names into a DataFrame
feature_importance_df = pd.DataFrame({
    'feature': X_train.columns,
    'importance': importances
})

# Sort by descending importance
feature_importance_df.sort_values(by='importance', ascending=False, inplace=True)

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Now print the DataFrame in full
print(feature_importance_df)

                            feature  importance
63                      Urine__mean    0.031405
65                Urine__sum_values    0.031109
8           BUN__standard_deviation    0.030890
64        Urine__standard_deviation    0.024163
16                       FiO2__mean    0.024100
17                   FiO2__variance    0.022910
7                         BUN__mean    0.022765
4                      Age__maximum    0.022708
24         HCO3__standard_deviation    0.020973
27                      HR__maximum    0.020912
47           Na__standard_deviation    0.020276
9                Bilirubin__maximum    0.019883
34                 Lactate__maximum    0.019659
21      Glucose__standard_deviation    0.019446
67          WBC__standard_deviation    0.019027
52               Platelets__maximum    0.018549
26          HCT__standard_deviation    0.018066
68                     Weight__mean    0.017937
13   Creatinine__standard_deviation    0.017880
20                    Glucose__mean    0

In [11]:
from sklearn.metrics import accuracy_score, roc_auc_score

y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)[:, 1]  # probability of class=1

acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_prob)

print(f"Accuracy: {acc:.3f}")
print(f"AUC: {auc:.3f}")

Accuracy: 0.861
AUC: 0.826
