Importing Packages 

In [None]:
import pandas as pd
from pathlib import Path
import time
import tqdm
from datetime import datetime
import os
from sklearn.model_selection import KFold
import sys
import pytorch_lightning as plit
import pandas as pd
import math
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler, normalize, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_recall_curve, confusion_matrix, roc_curve
from sklearn.metrics import auc as skauc
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
import numpy as np
from multiprocessing import Pool
import multiprocessing
from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from xgboost import XGBClassifier, XGBRegressor
import torch.nn as nn
from sklearn.metrics import roc_auc_score
from torch.utils.data import DataLoader, TensorDataset
import torch
import argparse
import dateutil
import shap
import re

Loading the Prescriptions and Admissions Files

In [None]:
# Directory that holds the two CSV extracts. It defaults to the location the
# notebook was originally written against; set the NOTEBOOK_DATA_DIR
# environment variable to run the notebook on another machine without editing
# this cell.
DATA_DIR = Path(os.environ.get("NOTEBOOK_DATA_DIR", "C:/Users/calin/Desktop"))

# Load the prescriptions CSV file into a pandas DataFrame.
prescriptions_df = pd.read_csv(DATA_DIR / "PRESCRIPTIONS.csv")
# Load the admissions CSV file into a DataFrame. low_memory=False makes pandas
# read the file in a single pass so each column gets one consistent dtype.
admissions_df = pd.read_csv(DATA_DIR / "ADMISSIONS.csv", low_memory=False)


print("Admissions DataFrame:")
print(admissions_df.head())
print("\nPrescriptions DataFrame:")
print(prescriptions_df.head())

In [None]:
#CREATING NEW DATAFRAME AND DEFINING NEW FEATURES

#DOSAGE AS A FEATURE
# Multiplicative factors used to express a dose in milligrams, keyed by the
# unit string found in DOSE_UNIT_RX.
# NOTE(review): only the pure mass units (mg, mcg, g, gm) are true unit
# conversions. The factors for 'pill', 'tab', 'mEq', 'mL', 'UNIT' and the
# per-hour rates are drug-dependent approximations - confirm them clinically.
conversion_factors = {

    'mg': 1,
    'mcg': 0.001,
    'g' : 1000,
    'pill': 50,
    'tab': 50,
    'gm': 1000,
    'mEq': 74.5,
    'mL': 1,
    'UNIT': 100,
    'mcg/hr':0.001,
    'mg/hr':1,

}


def _parse_dose(value):
    """Return ``value`` as a float, or NaN when it cannot be read as a number.

    Strings are cleaned first: thousands separators are removed and a range
    such as "1-2" resolves to its upper bound, which is the same rule the
    preprocessing cell of this notebook applies to ranges.
    """
    if isinstance(value, str):
        cleaned = value.replace(',', '').strip()
        range_match = re.fullmatch(r'(\d+(?:\.\d+)?)\s*-\s*(\d+(?:\.\d+)?)', cleaned)
        value = range_match.group(2) if range_match else cleaned
    try:
        # float() also accepts numpy scalars, which the previous
        # isinstance(..., (int, float)) check rejected for integer dtypes.
        return float(value)
    except (TypeError, ValueError):
        return float('nan')


def convert_to_mg(row):
    """Convert one prescription row's dose to milligrams.

    :param row: Mapping or pandas Series exposing 'DOSE_UNIT_RX' and 'DOSE_VAL_RX'.
    :return: The dose as a float multiplied by the factor for its unit. When the
             unit is not in ``conversion_factors`` the parsed dose is returned
             unscaled. NaN is returned when the dose is missing or not numeric,
             so the resulting column is always numeric.
    """
    dose = _parse_dose(row['DOSE_VAL_RX'])
    factor = conversion_factors.get(row['DOSE_UNIT_RX'])

    if factor is None:
        return dose  # unknown unit: keep the original magnitude
    return dose * factor
    
# Convert every prescription dose to milligrams in a single pass. Anything that
# is still non-numeric after conversion is coerced to NaN and then treated as a
# zero dose, so 'dose_mg' is guaranteed to be a numeric column before summing.
prescriptions_df['dose_mg'] = pd.to_numeric(
    prescriptions_df.apply(convert_to_mg, axis=1), errors='coerce'
).fillna(0)

#GROUP BY UNIQUE SUBJECT ID

# One row per patient, one column per drug; each cell holds the number of
# prescription rows for that (patient, drug) pair (0 when never prescribed).
drug_counts = prescriptions_df.groupby(['SUBJECT_ID', 'DRUG']).size().unstack(fill_value=0)

# Same layout holding the total dose in mg. Computing it with one groupby keeps
# the result indexed by SUBJECT_ID; the previous per-drug loop assigned
# SUBJECT_ID-indexed sums to a frame with a 0..n-1 index, so doses landed on
# the wrong patients (and a spurious 'SUBJECT_ID_dose' column was created).
drug_doses = (
    prescriptions_df.groupby(['SUBJECT_ID', 'DRUG'])['dose_mg']
    .sum()
    .unstack(fill_value=0)
    .rename(columns=lambda drug: f"{str(drug).strip('_')}_dose")
)

# Kept for compatibility: the count table with 'SUBJECT_ID' as a regular column.
grouped_df = drug_counts.reset_index()

#EACH DRUG AS AN INDIVIDUAL FEATURE
# Both tables share the SUBJECT_ID index, so the column-wise concat is aligned.
drug_features = pd.concat([drug_counts, drug_doses], axis=1)

# Number of distinct drugs per patient, counted over drug columns only (the
# SUBJECT_ID column used to be counted as a drug).
drug_features['num_unique_drugs'] = drug_counts.gt(0).sum(axis=1)

# 1 if the patient was prescribed at least one drug, 0 otherwise.
drug_features['any_drug_taken'] = drug_features['num_unique_drugs'].gt(0).astype(int)

# Final per-patient feature table with 'SUBJECT_ID' back as a regular column.
final_df = drug_features.reset_index()

print(final_df.head())

In [None]:
#ADMISSIONS FILE

# Parse the admission, discharge and death timestamps (once each).
for time_col in ('ADMITTIME', 'DISCHTIME', 'DEATHTIME'):
    admissions_df[time_col] = pd.to_datetime(admissions_df[time_col])

# Convenience handles on the parsed timestamp columns.
admit_times = admissions_df['ADMITTIME']
discharge_times = admissions_df['DISCHTIME']
death_times = admissions_df['DEATHTIME']
# NOTE(review): despite its name this maps SUBJECT_ID -> ADMITTIME (for a
# subject with several admissions the last row wins). Confirm whether
# DISCHTIME was intended before relying on it.
mapping_discharge = dict(zip(admissions_df['SUBJECT_ID'], admissions_df['ADMITTIME']))


#MERGING PRESCRIPTIONS WITH ADMISSIONS FILES
merged_df = pd.merge(final_df, admissions_df, on='SUBJECT_ID', how='inner')

print("\nMergedDataframe:")
print(merged_df.head())

# Mortality label: 1 when a death time is recorded for the admission, 0 for
# patients who survived. The previous code looked up a lower-case 'dischtime'
# column (KeyError) and only ever assigned 0, leaving no positive class.
merged_df['mortality_label'] = merged_df['DEATHTIME'].notnull().astype(int)

print(merged_df.head())

duplicate_subject_ids_admissions = admissions_df[admissions_df.duplicated('SUBJECT_ID', keep=False)]

# Check for duplicate subject IDs in final_df
duplicate_subject_ids_final = final_df[final_df.duplicated('SUBJECT_ID', keep=False)]

# Check for duplicate subject IDs in merged_df
duplicate_subject_ids_merged = merged_df[merged_df.duplicated('SUBJECT_ID', keep=False)]

# Print the lengths of the original dataframes and the merged dataframe
print("Length of admissions_df:", len(admissions_df))
print("Length of final_df:", len(final_df))
print("Length of merged_df:", len(merged_df))

# Print the number of duplicate subject IDs in each dataframe
print("Number of duplicate subject IDs in admissions_df:", len(duplicate_subject_ids_admissions))
print("Number of duplicate subject IDs in final_df:", len(duplicate_subject_ids_final))
print("Number of duplicate subject IDs in merged_df:", len(duplicate_subject_ids_merged))

# Optionally, print the duplicate subject IDs for further investigation
print("Duplicate subject IDs in admissions_df:", duplicate_subject_ids_admissions['SUBJECT_ID'].unique())
print("Duplicate subject IDs in final_df:", duplicate_subject_ids_final['SUBJECT_ID'].unique())
print("Duplicate subject IDs in merged_df:", duplicate_subject_ids_merged['SUBJECT_ID'].unique())


# The inner merge already guarantees every SUBJECT_ID exists in admissions_df,
# so no extra isin() filter is needed here.
# NOTE(review): keep=False removes EVERY patient that has more than one
# admission (not just the extra rows). Confirm this cohort restriction is
# intended; keeping one admission per patient would retain those patients.
# Reassigning (instead of inplace=True on a filtered slice) avoids pandas'
# SettingWithCopyWarning and keeps the cell safe to re-run.
merged_df = merged_df.drop_duplicates(subset='SUBJECT_ID', keep=False)


In [None]:
#FEATURE ANALYSIS + FILTRATION


def _plot_top_importances(importances, feature_names, n_top, title):
    """Draw a horizontal bar chart (log x-axis) of the ``n_top`` largest importances."""
    order = np.argsort(importances)[::-1][:n_top]
    positions = range(len(order))
    plt.figure(figsize=(10, 6))
    plt.title(title)
    plt.barh(positions, importances[order], color='b', align='center')
    plt.yticks(positions, [feature_names[i] for i in order])
    plt.xscale('log')
    plt.gca().invert_yaxis()  # Invert y-axis to have largest values at the top


def analyze_features(features_scaled, # Feature dataframe
                     feature_names, # Name of every column of features_scaled, in order
                     y, # Mortality labels
                     top_n_features=10, # How many top features each method contributes / plots
                     return_top_features = False, # Return a list of filtered feature names
                     random_state=None): # Optional seed for the tree models
    """
    Analyze features using SHAP values, Linear Regression, and Random Forest,
    select the top features from each method without duplicates, and visualize the results.

    :param features_scaled: Feature matrix with one column per entry of
        ``feature_names`` (the notebook passes a DataFrame).
    :param feature_names: List of feature names, aligned with the columns of
        ``features_scaled``
    :param y: True labels
    :param top_n_features: Number of top features to retain from each method.
        Values larger than the number of features are clamped, so the plots
        no longer fail on narrow feature sets.
    :param return_top_features: When True, return the combined feature names
    :param random_state: Seed forwarded to the XGBoost and Random Forest
        models; None (default) keeps the previous unseeded behaviour
    :return: List of selected feature names if ``return_top_features`` is
        True, otherwise None
    """
    # Never ask for more features than exist (and never a negative count).
    n_top = max(0, min(top_n_features, len(feature_names)))

    # Trains XGBoost classifier for SHAP
    xgb_model = XGBClassifier(random_state=random_state)
    xgb_model.fit(features_scaled, y)

    # Trains Random Forest classifier
    rf_model = RandomForestClassifier(random_state=random_state)
    rf_model.fit(features_scaled, y)

    # Linear Regression for feature importance
    lr_model = LinearRegression()
    lr_model.fit(features_scaled, y)

    # SHAP values: the full data set drives feature selection, while a sample
    # of 1000 rows is used for the summary plot and the printed ranking.
    explainer_shap = shap.Explainer(xgb_model)
    shap_values = explainer_shap(features_scaled)
    sampled_features = shap.sample(features_scaled, 1000)
    explainer_tree = shap.TreeExplainer(xgb_model)
    shap_values_tree = explainer_tree.shap_values(sampled_features)

    # Importance per method. Rankings are taken in descending order and then
    # truncated, which (unlike slicing with [-n:]) also behaves for n == 0.
    feature_importance_shap = np.abs(shap_values.values).mean(axis=0)
    top_features_shap = np.argsort(feature_importance_shap)[::-1][:n_top]

    feature_importance_rf = rf_model.feature_importances_
    top_features_rf = np.argsort(feature_importance_rf)[::-1][:n_top]

    feature_importance_lr = np.abs(lr_model.coef_)
    top_features_lr = np.argsort(feature_importance_lr)[::-1][:n_top]

    # Visualization

    # Visualize SHAP values using summary plot
    shap.summary_plot(shap_values_tree, sampled_features, feature_names=feature_names)

    # Print feature importance based on SHAP
    feature_importance_shap_tree = np.abs(shap_values_tree).mean(axis=0)
    feature_importance_shap_tree = pd.Series(feature_importance_shap_tree, index=feature_names).sort_values(ascending=False)
    print("\nFeature importance based on SHAP TreeExplainer:")
    print(feature_importance_shap_tree.head(n_top))
    print("\n" + "-"*50 + "\n")

    # RF and LR bar charts
    _plot_top_importances(feature_importance_rf, feature_names, n_top,
                          "Top Random Forest Feature Importances")
    _plot_top_importances(feature_importance_lr, feature_names, n_top,
                          "Top Linear Regression Coefficients")

    # Combine and remove duplicates
    combined_feature_indices = np.unique(np.concatenate((top_features_shap, top_features_rf, top_features_lr)))
    combined_feature_names = [feature_names[i] for i in combined_feature_indices]

    print(f"Number of selected features: {len(combined_feature_names)}")
    print("Selected features:", combined_feature_names)

    if return_top_features:
        return combined_feature_names

In [None]:
#Preprocessing & Scaling

# The target must never be part of the feature groups. It is an integer
# column, so dtype-based selection used to pull it into the numeric list and
# then request it from a frame that does not contain it.
label_col = 'mortality_label'
candidate_cols = [col for col in merged_df.columns if col != label_col]

# Separate the features into three DISJOINT groups so no column is transformed
# (and emitted) twice:
#   binary      - numeric columns with exactly two distinct values, left unscaled
#   numeric     - remaining numeric columns, standardised
#   categorical - object columns, one-hot encoded
# NOTE(review): identifier-like numeric columns such as SUBJECT_ID are still
# treated as features here, and columns coming from the admissions table may
# encode the outcome - confirm which columns should be excluded.
numeric_candidates = merged_df[candidate_cols].select_dtypes(include=['float64', 'int64']).columns.tolist()
distinct_counts = merged_df[numeric_candidates].nunique()
binary_features = [col for col in numeric_candidates if distinct_counts[col] == 2]
binary_lookup = set(binary_features)
numeric_features = [col for col in numeric_candidates if col not in binary_lookup]
categorical_features = merged_df[candidate_cols].select_dtypes(include=['object']).columns.tolist()

selected_features = numeric_features + categorical_features + binary_features

# Filter explore_df to include only selected features
explore_df = merged_df[selected_features].copy()

# Initialize transformers for preprocessing
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(drop='first')  # Dropping first category to avoid multicollinearity

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),  # Apply StandardScaler to numeric features
        ('cat', categorical_transformer, categorical_features),  # OneHotEncode categorical features
        ('binary', 'passthrough', binary_features)  # Leave binary features unchanged
    ],
    remainder='drop',  # Drop any columns not explicitly transformed
    sparse_threshold=0,  # Always return a dense array so it can be wrapped in a DataFrame
)

# Fit and transform the features
X_processed = preprocessor.fit_transform(explore_df)

# Ask the FITTED transformer for the output names; this guarantees one name per
# output column (one-hot columns included). Names carry the transformer prefix
# ('num__', 'cat__', 'binary__'), which also keeps them unique.
feature_names = preprocessor.get_feature_names_out().tolist()
numeric_feature_names = [name for name in feature_names if name.startswith('num__')]
categorical_feature_names = [name for name in feature_names if name.startswith('cat__')]

# Create a DataFrame with the scaled features, keeping merged_df's row index so
# rows stay aligned with the labels.
features_scaled = pd.DataFrame(X_processed, columns=feature_names, index=explore_df.index)

# Now you can call your analyze_features function with the scaled features and mortality labels
analyze_features(features_scaled=features_scaled, feature_names=feature_names, y=merged_df[label_col], top_n_features=10, return_top_features=False)