In [1]:
import pandas as pd
import numpy as np
import sklearn

In [2]:
#for colab reading files
path = 'drive/My Drive/Projects/EHR_record/'


In [None]:
path=""

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
demographic_train = pd.read_csv(path+'Train/demographics.csv')
labs_train = pd.read_csv(path+'Train/labs.csv')
vitals_train = pd.read_csv(path+'Train/vitals.csv')

In [None]:
demographic_test = pd.read_csv(path+'Test/demographics.csv')
labs_test = pd.read_csv(path+'Test/labs.csv')
vitals_test = pd.read_csv(path+'Test/vitals.csv')

In [None]:
demo_20 = demographic_train.sample(20).to_csv('demo.csv')
lab_20 = labs_train.sample(20).to_csv('labs.csv')
vital_20 = vitals_train.sample(20).to_csv('vitals.csv')

## 1. Exloratory analysis

In [5]:
def perform_eda(dataframes, names):
    for df, name in zip(dataframes, names):
        print(f"EDA for {name}:")
        print(df.describe())
        print(f"Size of {name}:", df.shape)
        print(f"Null values in {name}:\n", df.isnull().sum())
        print("\n")

# Usage:
dataframes = [demographic_train, labs_train, vitals_train]
names = ["demographic", "labs", "vitals"]
perform_eda(dataframes, names)


EDA for demographic:
          patient_id   hospital_id  hospital_death           age  \
count   64384.000000  64384.000000    64384.000000  61263.000000   
mean    65628.023003    104.891153        0.082179     62.475524   
std     37789.609303     61.335116        0.274639     16.788310   
min         1.000000      2.000000        0.000000     16.000000   
25%     32967.750000     51.000000        0.000000     53.000000   
50%     65571.500000    116.000000        0.000000     65.000000   
75%     98343.750000    158.000000        0.000000     75.000000   
max    131051.000000    204.000000        1.000000     89.000000   

                bmi  elective_surgery        height        weight  \
count  61554.000000      64384.000000  63477.000000  62004.000000   
mean      29.209720          0.185823    169.523406     83.984488   
std        8.353251          0.388966     10.780263     25.227411   
min       14.844926          0.000000    137.200000     38.600000   
25%       23.625053  

In [6]:
# Define merge dataset function
def merge_datasets(df1, df2, df3, key):

    merged_df = pd.merge(df1, df2, on=key, how='inner')
    merged_df = pd.merge(merged_df, df3, on=key, how='inner')
    return merged_df

In [7]:
train = merge_datasets(demographic_train,vitals_train,labs_train,'patient_id')
#test = merge_datasets(demographic_test,vitals_test,labs_test,'patient_id')
y_train = train['hospital_death']
X_train = train.drop('hospital_death', axis=1)


## 2. Data cleaning

In [8]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

3. Fit on models

In [9]:
#sampling data, straitify on the target column, also split group columns
from sklearn.model_selection import train_test_split

class StratifiedGroupSampler:
    def __init__(self, df, target_col, group_col, fraction, random_state=42):
        self.df = df
        self.target_col = target_col
        self.group_col = group_col
        self.fraction = fraction
        self.random_state = random_state

    def sample(self):
        sampled_df = pd.DataFrame()

        for target in self.df[self.target_col].unique():
            group_ids = self.df[self.df[self.target_col] == target][self.group_col].unique()
            _, sampled_ids = train_test_split(group_ids, test_size=self.fraction, random_state=self.random_state)

            sampled_data = self.df[self.df[self.group_col].isin(sampled_ids)]
            sampled_df = pd.concat([sampled_df, sampled_data])

        return sampled_df



In [10]:
def sample_data(df, fraction):

    sampler = StratifiedGroupSampler(df, 'hospital_death', 'patient_id', fraction)
    sampled_train = sampler.sample()
    X_sampled = sampled_train.drop('hospital_death', axis = 1)
    y_sampled = sampled_train['hospital_death']


    return X_sampled, y_sampled

In [11]:
from sklearn.model_selection import RandomizedSearchCV

# Logistic Regressy hyperparamerters
logistic_params = {
    'classifier__max_iter': [100, 500, 1000]
}


# Random Forest hyperparameters
rf_params = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# Gradient Boosting hyperparameters
gb_params = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__learning_rate': [0.01, 0.05, 0.1],
    'classifier__max_depth': [3, 5, 7]
}




In [12]:
%pip install wandb



In [13]:
import wandb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import classification_report

import seaborn as sns
import matplotlib.pyplot as plt

In [14]:
# Get numerical features
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Get categorical features
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()


# Define the preprocessing steps with all pipeline
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine numerical and categorical preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the pipelines to try
pipelines = [
    {"name": "Logistic Regression", "pipeline": Pipeline([('preprocessor', preprocessor),
                                                          ('classifier', LogisticRegression(max_iter=1000, class_weight='balanced'))]),
                                                            "params_grid": {'classifier__max_iter': [100, 500, 1000]}},
   {"name": "Random Forest", "pipeline": Pipeline([('preprocessor', preprocessor),
                                                   ('classifier', RandomForestClassifier())]),
                                                    "params_grid": rf_params},

    {"name": "Gradient Boost", "pipeline": Pipeline([('preprocessor', preprocessor),
                                                     ('classifier', GradientBoostingClassifier())]),
                                                     "params_grid": gb_params},

]


In [15]:


# Define the fractions to try
fractions = [0.1, 0.2, 0.3]

# Define scoring dictionary
scoring = {
    'precision': 'precision',
    'recall': 'recall',
    'f1_score': 'f1'
}

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.metrics import roc_curve, auc

# Helper function to fit the model with error handling
def fit_model(pipeline_dict, X_train, y_train):
    try:
        random_search = RandomizedSearchCV(pipeline_dict['pipeline'], pipeline_dict.get('params_grid', {}),
                                           n_iter=10, cv=5, scoring=scoring, refit=False, random_state=42)
        random_search.fit(X_train, y_train)
        return random_search
    except Exception as e:
        error_message = f"An error occurred while fitting the model {pipeline_dict['name']}: {e}"
        print(error_message)
        wandb.log({"error": error_message}) # Logging the error to W&B
        return None

# Helper function to evaluate the model with error handling
def evaluate_model(random_search, X_val, y_val, model_name, fraction):
    if random_search is None:
        error_message = f"Skipping evaluation for {model_name} due to an error during fitting."
        print(error_message)
        wandb.log({"error": error_message}) # Logging the error to W&B
        return

    try:
        y_pred = random_search.predict(X_val)
        y_prob = random_search.predict_proba(X_val)[:, 1]
        cm = confusion_matrix(y_val, y_pred)
        fpr, tpr, _ = roc_curve(y_val, y_prob)
        roc_auc = auc(fpr, tpr)
        log_results(random_search, cm, fpr, tpr, roc_auc, model_name, fraction)
    except Exception as e:
        error_message = f"An error occurred while evaluating the model {model_name}: {e}"
        print(error_message)
        wandb.log({"error": error_message}) # Logging the error to W&B

def log_results(random_search, cm, fpr, tpr, roc_auc, model_name, fraction):
    """Log results to W&B, including confusion matrix, ROC curve, and feature importances."""
    # Plot confusion matrix
    plt.figure(figsize=(10, 7))
    sns.heatmap(cm, annot=True, cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f"Confusion Matrix for {model_name} with fraction {fraction}")
    confusion_matrix_image = plt.gcf()
    plt.close()

    # Plot ROC curve
    plt.figure(figsize=(10, 7))
    plt.plot(fpr, tpr, lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], lw=2, linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve for {model_name} with fraction {fraction}')
    plt.legend(loc="lower right")
    roc_curve_image = plt.gcf()
    plt.close()

    # Log the results
    wandb.log({
        "model_name": model_name,
        "best_params": random_search.best_params_,
        "best_score": random_search.best_score_,
        "confusion_matrix": wandb.Image(confusion_matrix_image),
        "roc_curve": wandb.Image(roc_curve_image),
        "precision": random_search.cv_results_['mean_test_precision'].mean(),
        "recall": random_search.cv_results_['mean_test_recall'].mean(),
        "f1_score": random_search.cv_results_['mean_test_f1_score'].mean(),
        "fraction": fraction
    })


In [17]:
def train_and_evaluate(pipelines, fractions, train_data):
    # Initialize W&B
    wandb.init(project="EHR_record")

    # Loop over the pipelines and fractions
    for pipeline_dict in pipelines:
        for fraction in fractions:
            print(f"Training {pipeline_dict['name']} with fraction {fraction}")
            X_sampled, y_sampled = sample_data(train_data, fraction)
            X_train, X_val, y_train, y_val = train_test_split(X_sampled, y_sampled, test_size=0.2)

            # Fit the model
            random_search = fit_model(pipeline_dict, X_train, y_train)

            # Evaluate the model
            evaluate_model(random_search, X_val, y_val, pipeline_dict['name'], fraction)



In [None]:

#Train and evaluate model
train_and_evaluate(pipelines, fractions, train)

[34m[1mwandb[0m: Currently logged in as: [33mxin0558[0m. Use [1m`wandb login --relogin`[0m to force relogin


Training Logistic Regression with fraction 0.1


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

An error occurred while evaluating the model Logistic Regression: This RandomizedSearchCV instance was initialized with `refit=False`. predict is available only after refitting on the best parameters. You can refit an estimator manually using the `best_params_` attribute
Training Logistic Regression with fraction 0.2


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt