In [33]:
import pandas as pd
import numpy as np
import nltk
import gensim
import tensorflow as tf
import random

from datetime import timedelta
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.utils import resample
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.feature_selection import chi2
from sklearn.metrics import classification_report


# Setup seed
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Load pre-trained PubMed word2vec vectors
w2v_model = gensim.models.KeyedVectors.load_word2vec_format("/kaggle/input/pubmed-w2v/PubMed-w2v.bin", binary=True)

In [34]:
# Load the data
admissions = pd.read_csv('/kaggle/input/cs598-heart-failure-data/ADMISSIONS.csv')
notes = pd.read_csv('/kaggle/input/cs598-heart-failure-data/NOTEEVENTS.csv')
diag = pd.read_csv('/kaggle/input/cs598-heart-failure-data/DIAGNOSES_ICD.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [35]:
# List of ICD-9 codes for heart failure
heart_failure_codes = ['39891', '40201', '40211', '40291', '40401', '40403', '40411', '40413', '40491', '40493', '4280', '4281', '42820', '42821', '42822', '42823', '42830', '42831', '42832', '42833', '42840', '42841', '42842', '42843', '4289']

# Filter heart failure admissions based on ICD-9 codes
heart_failure_admissions = diag[diag['ICD9_CODE'].isin(heart_failure_codes)]

# Merge heart failure admissions with notes and filter discharge summaries
hf_admissions_notes = heart_failure_admissions.merge(notes, on=['HADM_ID', 'SUBJECT_ID'])
hf_admissions_discharge_summaries = hf_admissions_notes[hf_admissions_notes['CATEGORY'] == 'Discharge summary']

# Keep only the longest discharge summary for each admission
hf_admissions_discharge_summaries['TEXT_LENGTH'] = hf_admissions_discharge_summaries['TEXT'].str.len()
hf_admissions_discharge_summaries = hf_admissions_discharge_summaries.loc[hf_admissions_discharge_summaries.groupby('HADM_ID')['TEXT_LENGTH'].idxmax()]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == "":


In [36]:
hf_admissions_discharge_summaries = pd.merge(hf_admissions_discharge_summaries, admissions[['HADM_ID', 'ADMITTIME', 'DISCHTIME']], on='HADM_ID')
hf_admissions = pd.merge(heart_failure_admissions, admissions[['HADM_ID', 'ADMITTIME', 'DISCHTIME']], on='HADM_ID')

In [37]:
# Function to label general readmissions and 30-day readmissions
def label_readmissions(df):
    df['ADMITTIME'] = pd.to_datetime(df['ADMITTIME'])
    df['DISCHTIME'] = pd.to_datetime(df['DISCHTIME'])
    df.sort_values(['SUBJECT_ID', 'ADMITTIME'], inplace=True)
    df['READMISSION_TIME'] = df.groupby('SUBJECT_ID')['ADMITTIME'].diff().dt.total_seconds().div(60)

    df['GENERAL_READMISSION'] = df['READMISSION_TIME'].apply(lambda x: 1 if x > 0 else 0)
    df['30_DAY_READMISSION'] = df['READMISSION_TIME'].apply(lambda x: 1 if 0 < x <= 30 * 24 * 60 else 0)
    
    return df

# B. Labeling Readmissions (with discharge summaries)
hf_admissions_discharge_summaries = label_readmissions(hf_admissions_discharge_summaries)

# B. Labeling Readmissions (without discharge summaries)
hf_admissions = label_readmissions(hf_admissions)




In [38]:
def split_train_test_dataset(df):
    # Separate positive and negative samples
    general_readmission_pos = df[df['GENERAL_READMISSION'] == 1]
    general_readmission_neg = df[df['GENERAL_READMISSION'] == 0]
    thirty_day_readmission_pos = df[df['30_DAY_READMISSION'] == 1]
    thirty_day_readmission_neg = df[df['30_DAY_READMISSION'] == 0]

    # Perform under-sampling
    general_readmission_neg = general_readmission_neg.sample(len(general_readmission_pos))
    thirty_day_readmission_neg = thirty_day_readmission_neg.sample(len(thirty_day_readmission_pos))

    # Combine and shuffle the balanced samples
    general_readmission_data = pd.concat([general_readmission_pos, general_readmission_neg]).sample(frac=1).reset_index(drop=True)
    thirty_day_readmission_data = pd.concat([thirty_day_readmission_pos, thirty_day_readmission_neg]).sample(frac=1).reset_index(drop=True)

    # Split the data into training and test sets (90-10)
    train_general, test_general = train_test_split(general_readmission_data, test_size=0.1, random_state=SEED)
    train_thirty_day, test_thirty_day = train_test_split(thirty_day_readmission_data, test_size=0.1, random_state=SEED)
    
    # Perform 10-fold cross-validation on the training data
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
    train_general_cv_splits = list(skf.split(train_general.drop('GENERAL_READMISSION', axis=1), train_general['GENERAL_READMISSION']))
    train_thirty_day_cv_splits = list(skf.split(train_thirty_day.drop('30_DAY_READMISSION', axis=1), train_thirty_day['30_DAY_READMISSION']))
    
    return train_general, test_general, train_thirty_day, test_thirty_day, train_general_cv_splits, train_thirty_day_cv_splits

train_general_ds, test_general_ds, train_thirty_day_ds, test_thirty_day_ds, train_general_ds_cv_splits, train_thirty_day_ds_cv_splits = split_train_test_dataset(hf_admissions_discharge_summaries)


In [39]:
with open("/kaggle/input/nltk-data/nltk_data/corpora/stopwords/english", "r") as f:
    stop_words = [line.strip() for line in f]

In [40]:
## Preprocessing the text: tokenize first,then remove stopwords and non-alphabetic tokens in text(like numbers), and re-join all tokens into the text

def preprocess_text(text):
    tokens = nltk.word_tokenize(text)
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]

    return " ".join(tokens)

# Preprocess text for general readmission
train_general_ds["TEXT"] = train_general_ds["TEXT"].apply(preprocess_text)
test_general_ds["TEXT"] = test_general_ds["TEXT"].apply(preprocess_text)

# Preprocess text for 30-day readmission
train_thirty_day_ds["TEXT"] = train_thirty_day_ds["TEXT"].apply(preprocess_text)
test_thirty_day_ds["TEXT"] = test_thirty_day_ds["TEXT"].apply(preprocess_text)


In [41]:
# Tokenize words in text and padding each text to max_length
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_general_ds["TEXT"]) # fit_on_texts creates the vocabulary index based on word frequency. 
vocab_size = len(tokenizer.word_index) + 1
max_length = 256

X_train = pad_sequences(tokenizer.texts_to_sequences(train_general_ds["TEXT"]), maxlen=max_length)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_general_ds["TEXT"]), maxlen=max_length)

y_train = train_general_ds["GENERAL_READMISSION"]
y_test = test_general_ds["GENERAL_READMISSION"]

# Tokenize and pad sequences for 30-day readmission
X_train_30d = pad_sequences(tokenizer.texts_to_sequences(train_thirty_day_ds["TEXT"]), maxlen=max_length)
X_test_30d = pad_sequences(tokenizer.texts_to_sequences(test_thirty_day_ds["TEXT"]), maxlen=max_length)

y_train_30d = train_thirty_day_ds["30_DAY_READMISSION"]
y_test_30d = test_thirty_day_ds["30_DAY_READMISSION"]


In [42]:
# Create a weight matrix for words in the training set
embedding_dim = 200  # Change this to the dimension of the pre-trained vectors in word2vec
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, index in tokenizer.word_index.items():
    if word in w2v_model:
        embedding_vector = w2v_model[word]
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector

In [43]:
from tensorflow.keras.layers import MaxPooling1D, Dropout, Concatenate, Input
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from tensorflow.keras.models import Model
from sklearn.metrics import classification_report

# Define the CNN model
def create_model():
    inputs = Input(shape=(max_length,))
    
    embedded_sequences = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_length, trainable=True)(inputs)

    conv_outputs = []
    for filter_size in [1, 2, 3]:
        conv = Conv1D(128, filter_size, activation='relu', kernel_regularizer=l2(0.001))(embedded_sequences)
        conv = MaxPooling1D(pool_size=2)(conv)
        conv_outputs.append(conv)

    concatenated = Concatenate(axis=1)(conv_outputs)

    conv1 = Conv1D(64, 5, activation='relu', kernel_regularizer=l2(0.001))(concatenated)
    conv1 = MaxPooling1D(pool_size=2)(conv1)
    
    conv2 = Conv1D(32, 5, activation='relu', kernel_regularizer=l2(0.001))(conv1)
    pooled = GlobalMaxPooling1D()(conv2)
    
    dense1 = Dense(64, activation='relu', kernel_regularizer=l2(0.001))(pooled)
    dropout1 = Dropout(0.6)(dense1)
    
    dense2 = Dense(32, activation='relu', kernel_regularizer=l2(0.001))(dropout1)
    dropout2 = Dropout(0.6)(dense2)
    
    outputs = Dense(1, activation='sigmoid')(dropout2)

    model = Model(inputs=inputs, outputs=outputs)

    # Compile the model
    optimizer = Adam(lr=0.0005)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    return model

# Set up the callbacks
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=0.0001)
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)


In [44]:
accuracy_general_cv = []
precision_general_cv = []
recall_general_cv = []
f1_general_cv = []

for train_index, val_index in train_general_ds_cv_splits:
    X_train_cv = X_train[train_index]
    X_val_cv = X_train[val_index]
    y_train_cv = y_train.values[train_index]
    y_val_cv = y_train.values[val_index]

    model_general_cv = create_model()

    history_general_cv = model_general_cv.fit(
        X_train_cv, y_train_cv,
        epochs=20,
        batch_size=40,
        validation_data=(X_val_cv, y_val_cv),
        callbacks=[reduce_lr, early_stopping]
    )

    y_pred_general_cv = model_general_cv.predict(X_val_cv)
    y_pred_general_cv = [1 if p >= 0.5 else 0 for p in y_pred_general_cv]

    report_general_cv = classification_report(y_val_cv, y_pred_general_cv, output_dict=True)

    accuracy_general_cv.append(report_general_cv['accuracy'])
    precision_general_cv.append(report_general_cv['1']['precision'])
    recall_general_cv.append(report_general_cv['1']['recall'])
    f1_general_cv.append(report_general_cv['1']['f1-score'])

print(f"General readmission - Mean CV accuracy: {np.mean(accuracy_general_cv):.4f}")
print(f"General readmission - Mean CV precision: {np.mean(precision_general_cv):.4f}")
print(f"General readmission - Mean CV recall: {np.mean(recall_general_cv):.4f}")
print(f"General readmission - Mean CV F1 score: {np.mean(f1_general_cv):.4f}")


In [45]:
# Create, train, and evaluate the model for 30-day readmission using 10-fold cross-validation
accuracy_30d_cv = []
precision_30d_cv = []
recall_30d_cv = []
f1_30d_cv = []

for train_index, val_index in train_thirty_day_ds_cv_splits:
    X_train_30d_cv = X_train_30d[train_index]
    X_val_30d_cv = X_train_30d[val_index]
    y_train_30d_cv = y_train_30d.values[train_index]
    y_val_30d_cv = y_train_30d.values[val_index]

    model_30d_cv = create_model()

    history_30d_cv = model_30d_cv.fit(
        X_train_30d_cv, y_train_30d_cv,
        epochs=20,
        batch_size=45,
        validation_data=(X_val_30d_cv, y_val_30d_cv),
        callbacks=[reduce_lr, early_stopping]
    )

    y_pred_30d_cv = model_30d_cv.predict(X_val_30d_cv)
    y_pred_30d_cv = [1 if p >= 0.5 else 0 for p in y_pred_30d_cv]

    report_30d_cv = classification_report(y_val_30d_cv, y_pred_30d_cv, output_dict=True)

    accuracy_30d_cv.append(report_30d_cv['accuracy'])
    precision_30d_cv.append(report_30d_cv['1']['precision'])
    recall_30d_cv.append(report_30d_cv['1']['recall'])
    f1_30d_cv.append(report_30d_cv['1']['f1-score'])

print(f"30-day readmission - Mean CV accuracy: {np.mean(accuracy_30d_cv):.4f}")
print(f"30-day readmission - Mean CV precision: {np.mean(precision_30d_cv):.4f}")
print(f"30-day readmission - Mean CV recall: {np.mean(recall_30d_cv):.4f}")
print(f"30-day readmission - Mean CV F1 score: {np.mean(f1_30d_cv):.4f}")


In [46]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import chi2

reverse_word_index = {index: word for word, index in tokenizer.word_index.items()}

def revert_to_text(encoded_text):
    return ' '.join([reverse_word_index[index] for index in encoded_text if index != 0])

X_train_text = [revert_to_text(encoded_text) for encoded_text in X_train]

vectorizer = CountVectorizer(tokenizer=nltk.word_tokenize, stop_words='english')
X_chi2 = vectorizer.fit_transform(X_train_text)

# For general readmission
y_general_chi2 = train_general_ds['GENERAL_READMISSION']
chi2_general_score, p_general_value = chi2(X_chi2, y_general_chi2)

# For 30-day readmission
y_30day_chi2 = train_general_ds['30_DAY_READMISSION']
chi2_30day_score, p_30day_value = chi2(X_chi2, y_30day_chi2)

feature_names = vectorizer.get_feature_names()

# Top 20 words for general readmission
chi2_general_sorted_indices = chi2_general_score.argsort()[::-1]
top_20_general_words = [feature_names[i] for i in chi2_general_sorted_indices[:20]]

# Top 20 words for 30-day readmission
chi2_30day_sorted_indices = chi2_30day_score.argsort()[::-1]
top_20_30day_words = [feature_names[i] for i in chi2_30day_sorted_indices[:20]]

print("Top 20 words related to general readmission:")
print(top_20_general_words)

print("\nTop 20 words related to 30-day readmission:")
print(top_20_30day_words)


In [66]:
## Random Forest model

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# Extract features from clinical notes using TF-IDF vectorizer for general readmission
vectorizer = TfidfVectorizer(analyzer='word')
X_train = vectorizer.fit_transform(train_general_ds["TEXT"]).toarray()
X_test = vectorizer.transform(test_general_ds["TEXT"]).toarray()

y_train = train_general_ds["GENERAL_READMISSION"]
y_test = test_general_ds["GENERAL_READMISSION"]

# Train the Random Forest model for general readmission
model = RandomForestClassifier(n_estimators=40, random_state=42)
model.fit(X_train, y_train)

# predict on test data for general readmission
y_pred = model.predict(X_test)


print(f"General readmission - Accuracy Score: {accuracy_score(y_test, y_pred)}")
print(f"General readmission - Precision Score: {precision_score(y_test, y_pred,average='weighted')}")
print(f"General readmission - Recall Score: {recall_score(y_test, y_pred,average='weighted')}")
print(f"General readmission - F1 Score: {f1_score(y_test, y_pred,average='weighted')}")




General readmission - Accuracy Score: 0.6847360912981455
General readmission - Precision Score: 0.6854072413558862
General readmission - Recall Score: 0.6847360912981455
General readmission - F1 Score: 0.6848362042120734


In [59]:
# Extract features from clinical notes using TF-IDF vectorizer for 30-day readmission
vectorizer_30 = TfidfVectorizer(analyzer='word')
X_train_30 = vectorizer_30.fit_transform(train_thirty_day_ds["TEXT"]).toarray()
X_test_30 = vectorizer_30.transform(test_thirty_day_ds["TEXT"]).toarray()

y_train_30 = train_general_ds["30_DAY_READMISSION"]
y_test_30 = test_general_ds["30_DAY_READMISSION"]

# Train the Random Forest model for 30-day readmission
model_30 = RandomForestClassifier(n_estimators=40, random_state=100)
model_30.fit(X_train, y_train)

# predict on test data for 30-day readmission
y_pred_30 = model.predict(X_test)

print(f"30-day readmission - Accuracy Score: {accuracy_score(y_test_30, y_pred_30)}")
print(f"30-day readmission - Precision Score: {precision_score(y_test_30, y_pred_30,average='weighted')}")
print(f"30-day readmission - Recall Score: {recall_score(y_test_30, y_pred_30,average='weighted')}")
print(f"30-day readmission - F1 Score: {f1_score(y_test_30, y_pred_30,average='weighted')}")



30-day readmission - Accuracy Score: 0.5378031383737518
30-day readmission - Precision Score: 0.8619966243932007
30-day readmission - Recall Score: 0.5378031383737518
30-day readmission - F1 Score: 0.6274822207709208
