In [None]:
import pandas as pd
import numpy as np
import sklearn

In [None]:
# Data root to use when the notebook runs on Google Colab.
path = "drive/My Drive/Projects/EHR_record/"


In [None]:
# Empty prefix: the 'Train/...' and 'Test/...' files are resolved relative to the
# current working directory.
path = ""

In [None]:
# Read the three training tables (demographics, labs, vitals), in that order;
# `path` is prepended to every relative file name.
demographic_train, labs_train, vitals_train = map(
    pd.read_csv,
    (
        path + 'Train/demographics.csv',
        path + 'Train/labs.csv',
        path + 'Train/vitals.csv',
    ),
)

In [None]:
# Mount Google Drive when running on Colab; skip the step everywhere else.
# The `google.colab` package only exists inside Colab, so without this guard a
# local "Restart & Run All" stops here with ImportError.
# NOTE(review): the training CSVs are read in an earlier cell; when the data is
# read from Drive this mount has to be executed before that cell.
try:
    from google.colab import drive
except ImportError:
    drive = None  # not on Colab: nothing to mount
else:
    drive.mount('/content/drive')

In [None]:
# Read the three test tables (demographics, labs, vitals), in that order;
# `path` is prepended to every relative file name.
demographic_test, labs_test, vitals_test = map(
    pd.read_csv,
    (
        path + 'Test/demographics.csv',
        path + 'Test/labs.csv',
        path + 'Test/vitals.csv',
    ),
)

## 1. Exploratory analysis

In [None]:
# Print summary statistics for each training table: demographics, labs, vitals.
for table in (demographic_train, labs_train, vitals_train):
    print(table.describe())

In [None]:
# Print the (rows, columns) shape of each training table.
for caption, table in (
    ("size of demographic:", demographic_train),
    ("size of vitals:", vitals_train),
    ("size of labs:", labs_train),
):
    print(caption, table.shape)

In [None]:
# Count the missing values in every column of each training table.
for table in (demographic_train, labs_train, vitals_train):
    print(table.isnull().sum())

1.1 handling numerical null values

## 2. Data cleaning

2.1 Data merging

In [None]:
# Define merge dataset function
def merge_datasets(df1, df2, df3, key):
    """Inner-join three DataFrames on a shared key.

    Parameters
    ----------
    df1, df2, df3 : pandas.DataFrame
        Tables to combine; they are joined left to right.
    key : label or list of labels
        Column name(s) to join on; must exist in all three tables.

    Returns
    -------
    pandas.DataFrame
        Only the rows whose ``key`` value occurs in every input table.
    """
    join_args = {"on": key, "how": "inner"}
    return df1.merge(df2, **join_args).merge(df3, **join_args)

In [None]:
# Join demographics, vitals and labs on `patient_id` for both splits, then separate
# the `hospital_death` outcome from the predictors of the training table.
train = merge_datasets(demographic_train, vitals_train, labs_train, key='patient_id')
test = merge_datasets(demographic_test, vitals_test, labs_test, key='patient_id')
X_train = train.drop(columns='hospital_death')
y_train = train['hospital_death']


2.2 Data cleaning

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Split the predictor columns by dtype so that each group can be routed to its own
# preprocessing pipeline.
# NOTE(review): X_train still carries the `patient_id` join key; if it is stored as
# int64/float64 it is selected below as a numerical feature — confirm this is intended.
numerical_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include='object').columns

In [None]:
# Numerical columns: replace missing values with the column mean, then standardise.
Numerical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

In [None]:
# Categorical columns: fill gaps with the most frequent level, then one-hot encode.
# handle_unknown="ignore" makes a level that was not seen during fit encode as an
# all-zero block instead of raising, so transforming data that contains a new
# category (the test table, a validation fold) no longer fails.
# The variable keeps its original spelling because other cells refer to it by name.
Categorical_transofrmer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [None]:
# Route each column group to its transformer. Columns that appear in neither list
# are dropped (ColumnTransformer's default `remainder`).
preprocessor = ColumnTransformer([
    ("numerical", Numerical_transformer, numerical_features),
    ("categorical", Categorical_transofrmer, categorical_features),
])

In [None]:
# Wrap the column preprocessing in a one-step pipeline.
processor_pipeline = Pipeline([('preprocessor', preprocessor)])


In [None]:
# Fit the preprocessing on the training predictors only, then apply the already
# fitted transformation to the test table.
train_processed = processor_pipeline.fit_transform(X_train)
test_processed = processor_pipeline.transform(test)

## 3. Fit models

In [None]:
# Sample the data: stratify on the target column and sample whole groups (all rows of a group together)
from sklearn.model_selection import train_test_split

class StratifiedGroupSampler:
    """Draw a fraction of the groups from every target class.

    For each distinct value of ``target_col`` the unique ``group_col`` ids that
    occur with that value are split with ``train_test_split``; the ``fraction``
    share is kept and every row of ``df`` belonging to a kept group is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to sample from.
    target_col : str
        Column whose distinct values define the strata.
    group_col : str
        Column identifying the groups that are sampled as a whole.
    fraction : float or int
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int, default 42
        Seed forwarded to ``train_test_split`` (the same seed for every stratum).
    """

    def __init__(self, df, target_col, group_col, fraction, random_state=42):
        self.df = df
        self.target_col = target_col
        self.group_col = group_col
        self.fraction = fraction
        self.random_state = random_state

    def sample(self):
        """Return the rows of the sampled groups, one stratum after the other.

        Returns
        -------
        pandas.DataFrame
            Rows of ``df`` (original index kept) in stratum order. When ``df``
            has no rows, an empty frame with the same columns is returned.

        Notes
        -----
        NOTE(review): rows are selected by group id alone, so a group whose rows
        carry more than one target value is eligible in several strata and its
        rows can then appear more than once in the result.
        """
        strata = []
        for target in self.df[self.target_col].unique():
            in_stratum = self.df[self.target_col] == target
            group_ids = self.df.loc[in_stratum, self.group_col].unique()
            _, sampled_ids = train_test_split(
                group_ids, test_size=self.fraction, random_state=self.random_state
            )
            strata.append(self.df[self.df[self.group_col].isin(sampled_ids)])

        if not strata:
            # No target values at all: keep the schema so callers can still
            # select or drop columns by name.
            return self.df.iloc[0:0]

        # One concat at the end instead of growing a frame inside the loop.
        return pd.concat(strata)



In [None]:
def sample_data(df, fraction, target_col='hospital_death', group_col='patient_id', random_state=42):
    """Sample whole groups from ``df`` per target class and split off the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing ``target_col`` and ``group_col``.
    fraction : float or int
        Share (or number) of groups kept in each target class; passed on to
        ``StratifiedGroupSampler``.
    target_col : str, default 'hospital_death'
        Outcome column used for stratification and returned as ``y``.
    group_col : str, default 'patient_id'
        Column identifying the groups that are sampled as a whole.
    random_state : int, default 42
        Seed passed on to ``StratifiedGroupSampler``.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The sampled rows without ``target_col``, and the matching ``target_col``
        values.
    """
    sampler = StratifiedGroupSampler(
        df, target_col, group_col, fraction, random_state=random_state
    )
    sampled = sampler.sample()
    return sampled.drop(columns=target_col), sampled[target_col]

In [None]:
import wandb
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import classification_report

def sample_and_validate(pipeline, train_data, fraction):
    """Cross-validate ``pipeline`` on a stratified group sample of ``train_data``.

    Parameters
    ----------
    pipeline : estimator
        Model (or pipeline of models) that is fitted on preprocessed features.
    train_data : pandas.DataFrame
        Merged training table; handed to ``sample_data``.
    fraction : float or int
        Sampling fraction handed to ``sample_data``.

    Returns
    -------
    dict
        Result of ``cross_validate`` with 5 folds; the scores are stored under
        'test_precision', 'test_recall' and 'test_f1_score'.
    """
    X_sampled, y_sampled = sample_data(train_data, fraction)

    # Preprocessing is part of the cross-validated estimator, so imputation
    # statistics, scaling and the encoder vocabulary are learned from each fold's
    # training rows only. Fitting the preprocessor on the whole sample first would
    # leak validation rows into training. cross_validate clones the estimator for
    # every fold, so the shared `preprocessor` object is not refitted here.
    # NOTE(review): a category that is missing from a fold's training rows makes a
    # OneHotEncoder with default settings raise at transform time, which
    # cross_validate reports as a NaN score for that fold — confirm the encoder is
    # configured to tolerate unknown categories.
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('estimator', pipeline),
    ])

    scoring = {
        'precision': 'precision',
        'recall': 'recall',
        'f1_score': 'f1'
    }

    return cross_validate(model, X_sampled, y_sampled, cv=5, scoring=scoring)


from sklearn.base import clone
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Seed for the hold-out split and the randomised models, so a re-run reproduces
# the same numbers.
SEED = 42

# Candidate models. "params" holds the settings that are logged to W&B with each run.
pipelines = [
    {
        "name": "Logistic Regression",
        "pipeline": Pipeline([('classifier', LogisticRegression(max_iter=1000, class_weight='balanced'))]),
        "params": {'max_iter': 1000},
    },
    {
        "name": "Random Forest",
        "pipeline": Pipeline([('classifier', RandomForestClassifier(random_state=SEED))]),
        "params": {},
    },
    {
        "name": "Gradient Boost",
        "pipeline": Pipeline([('classifier', GradientBoostingClassifier(random_state=SEED))]),
        "params": {},
    },
]

# Sampling fractions to try
fractions = [0.1, 0.2, 0.3]

# Initialize W&B
wandb.init(project="EHR_record")

# Loop over the pipelines and fractions
for pipeline_dict in pipelines:
    for fraction in fractions:
        print(f"Training {pipeline_dict['name']} with fraction {fraction}")

        # `train` is the merged training table; the name used here before
        # (`train_data`) was never defined at notebook level.
        X_sampled, y_sampled = sample_data(train, fraction)

        # Fit a fresh copy so the shared `preprocessor` (also used inside
        # `processor_pipeline`) is not refitted on a sample.
        # NOTE(review): the copy is still fitted on the whole sample before the
        # hold-out split, so imputation/scaling statistics include hold-out rows.
        X_processed_sample = clone(preprocessor).fit_transform(X_sampled)

        # Loop-local names: reusing X_train / y_train here would overwrite the
        # notebook-level training frame that later cells still use.
        X_fit, X_holdout, y_fit, y_holdout = train_test_split(
            X_processed_sample, y_sampled, test_size=0.2, random_state=SEED
        )

        pipeline = pipeline_dict['pipeline']
        pipeline.fit(X_fit, y_fit)
        y_pred = pipeline.predict(X_holdout)
        cv_scores = sample_and_validate(pipeline, train, fraction)

        # Compute and plot the confusion matrix (integer counts, hence fmt='d')
        cm = confusion_matrix(y_holdout, y_pred)
        fig, ax = plt.subplots(figsize=(10, 7))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_xlabel('Predicted')
        ax.set_ylabel('Actual')
        ax.set_title(f"Confusion Matrix for {pipeline_dict['name']} with fraction {fraction}")

        # Log the plot and the cross-validated metrics to W&B
        wandb.log({
            "confusion_matrix": wandb.Image(fig),
            "precision": cv_scores['test_precision'].mean(),
            "recall": cv_scores['test_recall'].mean(),
            "f1_score": cv_scores['test_f1_score'].mean(),
            "fraction": fraction,
            "model_name": pipeline_dict['name'],
            **pipeline_dict['params']
        })
        # Close the figure so the loop does not accumulate open figures.
        plt.close(fig)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

def plot_confusion_matrix(cv_scores):
    """Plot the fold-averaged confusion matrix stored in cross-validation results.

    Parameters
    ----------
    cv_scores : mapping
        Must provide the entries 'test_tn', 'test_fp', 'test_fn' and 'test_tp';
        each one is averaged with ``np.mean``.
        NOTE(review): the scoring dict defined in ``sample_and_validate`` does not
        produce these entries — confirm where such results are meant to come from.

    Returns
    -------
    None
        The heatmap is shown with ``plt.show()``.
    """
    # Average every confusion-matrix cell over the folds.
    tn, fp, fn, tp = (
        np.mean(cv_scores[key])
        for key in ('test_tn', 'test_fp', 'test_fn', 'test_tp')
    )

    # Rows are the actual classes, columns the predicted classes.
    cm = np.array([[tn, fp], [fn, tp]])

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt='.0f',
        cmap='Blues',
        xticklabels=['Predicted Negative', 'Predicted Positive'],
        yticklabels=['Actual Negative', 'Actual Positive'],
        ax=ax,
    )
    ax.set_title('Confusion Matrix')
    plt.show()




2.2 Check the linear correlation between predictors

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Compute the correlation matrix of the numeric predictors only. X_train can also
# hold object (categorical) columns, and DataFrame.corr() raises on those in recent
# pandas versions.
corr = X_train.select_dtypes(include='number').corr()

# Generate a mask for the upper triangle (the matrix is symmetric)
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio on the axes created above
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
ax.set_title('Correlation between numerical predictors')
f.savefig('correlation graph.png')
plt.show()
