In [3]:
import numpy as np
import pandas as pd
import glob
import os
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Masking
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [5]:
# Step 1: Load and Preprocess Data
def load_data_from_psv(folder_path):
    """Load all .psv files and concatenate into a single DataFrame."""
    all_files = glob.glob(os.path.join(folder_path, '*.psv'))
    data_list = []
    for file in all_files:
        df = pd.read_csv(file, sep='|')
        data_list.append(df)
    return pd.concat(data_list, ignore_index=True)

# Replace with your folder path
data_folder = r'C:\Users\uSER\source\repos\alternative-assignment-aml\training_setA\training'
data = load_data_from_psv(data_folder)

In [6]:
# Step 2: Handle Missing Values
def preprocess_data(df):
    """Preprocess the data: handle missing values and normalize."""
    # Fill missing values (forward fill, then backward fill as fallback)
    df.fillna(method='ffill', inplace=True)
    df.fillna(method='bfill', inplace=True)
    df.fillna(0, inplace=True)  # Replace remaining NaNs with 0

    # Normalize continuous features
    scaler = MinMaxScaler()
    continuous_features = [col for col in df.columns if df[col].dtype in [np.float64, np.int64] and col != 'SepsisLabel']
    df[continuous_features] = scaler.fit_transform(df[continuous_features])

    return df, continuous_features

# Preprocess the data
data, feature_columns = preprocess_data(data)

  df.fillna(method='ffill', inplace=True)
  df.fillna(method='bfill', inplace=True)


In [13]:
def create_sequences_optimized(df, features, target, seq_length):
    """Efficiently create sequences of data for LSTM input."""
    data_array = df[features].values  # Convert to NumPy array
    target_array = df[target].values  # Convert target to NumPy array

    # Ensure sufficient data for sequences
    if len(data_array) <= seq_length:
        raise ValueError("Data length must be greater than the sequence length.")

    num_samples = len(data_array) - seq_length + 1  # Total sequences
    X = np.lib.stride_tricks.sliding_window_view(data_array, (seq_length, len(features)))
    X = X.reshape(num_samples, seq_length, len(features))  # Adjust shape

    y = target_array[seq_length - 1:]  # Align target labels

    return X, y

# Define sequence length and prepare data
sequence_length = 10
X, y = create_sequences_optimized(data, feature_columns, 'SepsisLabel', sequence_length)




In [14]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
# Step 4: Build LSTM Model
def build_lstm_model(input_shape):
    """Build and compile an LSTM model."""
    model = Sequential([
        Masking(mask_value=0.0, input_shape=input_shape),
        LSTM(128, return_sequences=True),
        Dropout(0.2),
        LSTM(64),
        Dropout(0.2),
        Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Build the model
input_shape = (X_train.shape[1], X_train.shape[2])
model = build_lstm_model(input_shape)

  super().__init__(**kwargs)


In [16]:
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=32)

Epoch 1/10
[1m19756/19756[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 9ms/step - accuracy: 0.9785 - loss: 0.1001 - val_accuracy: 0.9780 - val_loss: 0.0949
Epoch 2/10
[1m19756/19756[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m175s[0m 9ms/step - accuracy: 0.9787 - loss: 0.0932 - val_accuracy: 0.9780 - val_loss: 0.0931
Epoch 3/10
[1m19756/19756[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m186s[0m 9ms/step - accuracy: 0.9784 - loss: 0.0931 - val_accuracy: 0.9780 - val_loss: 0.0937
Epoch 4/10
[1m19756/19756[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m155s[0m 8ms/step - accuracy: 0.9784 - loss: 0.0921 - val_accuracy: 0.9780 - val_loss: 0.0928
Epoch 5/10
[1m19756/19756[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m152s[0m 8ms/step - accuracy: 0.9783 - loss: 0.0920 - val_accuracy: 0.9780 - val_loss: 0.0921
Epoch 6/10
[1m19756/19756[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m149s[0m 8ms/step - accuracy: 0.9781 - loss: 0.0918 - val_accuracy: 0.9780 - val_loss:

In [17]:
# Step 6: Evaluate the Model
def evaluate_model(model, X_test, y_test):
    """Evaluate the LSTM model."""
    loss, accuracy = model.evaluate(X_test, y_test)
    print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

evaluate_model(model, X_test, y_test)

# Step 7: Save the Model
model.save('sepsis_lstm_model.h5')

[1m4939/4939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 3ms/step - accuracy: 0.9783 - loss: 0.0864




Test Loss: 0.08781257271766663, Test Accuracy: 0.977961540222168
