In [1]:
from io import StringIO
import sagemaker
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sagemaker.pytorch import PyTorch
import os

# One explicit session pins the region. Every client below is created from it
# so that the IAM, S3 and SageMaker calls share the same region/credential
# configuration instead of falling back to the ambient default session.
boto_session = boto3.Session(region_name='us-east-1')
sagemaker_session = sagemaker.Session(boto_session=boto_session)

role = "arn:aws:iam::211125439249:role/service-role/AmazonSageMaker-ExecutionRole-20250314T153928"
role_name = role.split('/')[-1]  # Extract just the role name from the ARN

# SECURITY NOTE(review): this grants AdministratorAccess to the execution role
# every time the cell runs. Nothing in the visible cells needs more than read
# access to the data bucket; the attachment is kept only because later cells
# may rely on it. Replace it with a narrowly scoped policy and detach this one
# once the required permissions are known.
iam_client = boto_session.client('iam')
iam_client.attach_role_policy(
    RoleName=role_name,
    PolicyArn="arn:aws:iam::aws:policy/AdministratorAccess"
)
print(f"Attached AdministratorAccess policy to role: {role}")

# NOTE(review): this URI names final_df.csv, whereas the object actually
# loaded below is synthetic_data.csv -- confirm which file is intended.
input_data_s3_uri = "s3://blue-blood-data/final_df.csv"

# Define your bucket name and file key (file path in S3)
BUCKET_NAME = "blue-blood-data"
FILE_KEY = "synthetic_data.csv"  # Change to your actual file path in S3

# Create an S3 client from the shared session
s3 = boto_session.client("s3")

# Fetch the file from S3
response = s3.get_object(Bucket=BUCKET_NAME, Key=FILE_KEY)

# Read the CSV file into a pandas DataFrame
csv_content = response["Body"].read().decode("utf-8")
df = pd.read_csv(StringIO(csv_content))



sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/ubaid/Library/Application Support/sagemaker/config.yaml


Attached AdministratorAccess policy to role: arn:aws:iam::211125439249:role/service-role/AmazonSageMaker-ExecutionRole-20250314T153928


In [2]:
# Preview the first rows of the loaded frame; the bare expression is rendered
# as this cell's output.
df.head()

Unnamed: 0,prescription_dose_val_rx,prescription_dose_unit_rx,pre_ph,pre_pco2,pre_po2,pre_bicarbonate,pre_baseexcess,pre_totalco2,pre_hematocrit,pre_hemoglobin,...,P118,P119,P120,P121,P122,P123,P124,P125,P126,P127
0,0.005328,4,0.640623,0.44423,0.142852,3.6e-05,0.454737,0.114972,-0.008645,-0.009179,...,-0.202378,0.511646,-0.493158,-0.770388,-0.135687,-0.53341,-0.800806,-2.897656,0.671631,-0.349779
1,0.005181,3,0.587083,0.219048,0.049375,0.002394,0.463701,0.391629,0.012237,-0.004657,...,-3.433339,-0.893766,1.843868,-0.289809,1.505239,-0.003083,-0.0357,-1.704398,1.108773,0.253392
2,0.003282,2,0.687024,0.39286,0.126544,3.7e-05,0.531376,0.110763,-0.002508,-0.000985,...,1.299199,-0.256489,1.052623,0.269009,-0.114862,-0.109867,-0.953661,1.074243,0.97876,-0.476479
3,-0.003511,10,0.615906,0.533115,0.151793,-0.00124,0.48881,0.516084,-0.004123,0.013007,...,-0.016925,-0.723175,-0.679679,0.046822,-0.200794,-0.25874,-2.700658,0.824595,0.128626,1.317439
4,0.008858,3,0.664016,0.14003,0.120467,0.003667,0.476179,0.204719,0.003625,0.006369,...,-0.956035,-1.675039,0.17992,-1.134438,0.575927,0.467633,-0.63817,-0.052155,0.88093,-0.600562


In [3]:
def get_presc_cols(df):
    """Return the names of the prescription columns of ``df``.

    The result lists every column whose name starts with ``'P'`` (in the
    frame's column order), followed by ``'prescription_dose_val_rx'`` and
    ``'prescription_dose_unit_rx'``. The two dose names are always appended,
    whether or not ``df`` actually contains them.
    """
    p_prefixed = [name for name in df.columns if name.startswith('P')]
    return p_prefixed + ['prescription_dose_val_rx', 'prescription_dose_unit_rx']



In [4]:
# function that gets the prescription array with the proper format
def get_presc_input(df):
    """Return the prescription columns of ``df`` as a 2-D NumPy array.

    Args:
        df: DataFrame containing every column named by ``get_presc_cols(df)``.

    Returns:
        Array with one row per row of ``df`` and one column per prescription
        column, in the order given by ``get_presc_cols``.

    Raises:
        KeyError: if a prescription column is missing from ``df``.

    Side effects:
        Prints the shape of the returned array.
    """
    presc_cols = get_presc_cols(df)

    # Select all prescription columns in one step instead of looping over
    # df.iterrows(). Besides being much faster, this derives the dtype from
    # the selected columns only (a row-wise loop upcasts each row across
    # *all* columns) and yields a (0, n_cols) array for an empty frame.
    prescriptions = df[presc_cols].to_numpy()
    print(prescriptions.shape)

    return prescriptions

In [5]:
def add_padding(pre_treatment, post_treatment, target_len=130):
    """Right-pad both vectors with zeros up to ``target_len`` entries.

    Args:
        pre_treatment: NumPy array; its first dimension is the current length.
        post_treatment: NumPy array; its first dimension is the current length.
        target_len: Length to pad to. Defaults to 130, the width previously
            hard-coded here.

    Returns:
        ``(padded_pre_treatment, padded_post_treatment)``.

    Raises:
        ValueError: if either input is already longer than ``target_len``.
    """
    def _pad_to_target(vec, label):
        # Number of trailing zeros still needed for this vector.
        missing = target_len - vec.shape[0]
        if missing < 0:
            # np.pad would also raise ValueError here, but with a message
            # that does not say which input was too long.
            raise ValueError(
                f"{label} has {vec.shape[0]} values; cannot pad to {target_len}"
            )
        return np.pad(vec, (0, missing), mode='constant')

    padded_pre_treatment = _pad_to_target(pre_treatment, "pre_treatment")
    padded_post_treatment = _pad_to_target(post_treatment, "post_treatment")

    return padded_pre_treatment, padded_post_treatment

In [6]:
def prepare_training_data(df):
    """Build the ``(X, y)`` training arrays from ``df``, one sample per row.

    For each row, three vectors are stacked as time steps: the zero-padded
    ``pre_*`` values, the prescription values (columns from
    ``get_presc_cols``), and the zero-padded ``post_*`` values. The target
    for the row is the padded ``post_*`` vector.

    Args:
        df: DataFrame with ``pre_*``, ``post_*`` and prescription columns.

    Returns:
        ``(X, y)`` as NumPy arrays. With the default padding width of
        ``add_padding`` and a prescription vector of the same width, ``X`` is
        (n_rows, 3, 130) and ``y`` is (n_rows, 130).
        # assumes the prescription vector has 130 entries -- TODO confirm

    NOTE(review): ``y`` is identical to the last time step of ``X``, so the
    target is also visible in the input. Confirm this is intended; if the
    goal is to predict post-treatment values, this looks like target leakage.
    """
    presc_cols = get_presc_cols(df)
    # The column selections depend only on the frame, not on the row, so they
    # are computed once here rather than on every loop iteration.
    pre_cols = [col for col in df.columns if col.startswith('pre_')]
    post_cols = [col for col in df.columns if col.startswith('post_')]

    X_train_list = []
    y_train_list = []

    for _, row in df.iterrows():
        # Get values for the current row
        pre_treatment = np.array(row[pre_cols].values)
        post_treatment = np.array(row[post_cols].values)
        prescriptions = np.array(row[presc_cols].values)

        # Add padding to the inputs
        padded_pre_treatment, padded_post_treatment = add_padding(pre_treatment, post_treatment)

        # One sample: 3 time steps, each a feature vector
        sample = np.array([
            padded_pre_treatment,     # Time Step 1: Pre-Treatment
            prescriptions,            # Time Step 2: Prescription
            padded_post_treatment     # Time Step 3: Post-Treatment
        ])

        X_train_list.append(sample)
        y_train_list.append(sample[-1])  # Target is the last time step (Post-Treatment)

    return np.array(X_train_list), np.array(y_train_list)


In [7]:
from scipy.stats import chi2_contingency
from scipy.stats import skew, kurtosis

# Means of two frames, returned side by side
def calculate_mean(df1, df2):
    """Return ``(df1.mean(), df2.mean())`` for the two given frames."""
    first_mean = df1.mean()
    second_mean = df2.mean()
    return first_mean, second_mean

# Medians of two frames, returned side by side
def calculate_median(df1, df2):
    """Return ``(df1.median(), df2.median())`` for the two given frames."""
    first_median = df1.median()
    second_median = df2.median()
    return first_median, second_median

# Standard deviations of two frames, returned side by side
def calculate_std(df1, df2):
    """Return ``(df1.std(), df2.std())`` for the two given frames."""
    first_std = df1.std()
    second_std = df2.std()
    return first_std, second_std

# Skewness of two frames, returned side by side
def calculate_skewness(df1, df2):
    """Apply ``scipy.stats.skew`` (NaNs omitted) to each frame.

    Returns ``(df1_result, df2_result)``, each produced by ``DataFrame.apply``.
    """
    def _skew_ignoring_nan(values):
        return skew(values, nan_policy='omit')

    first_skew = df1.apply(_skew_ignoring_nan)
    second_skew = df2.apply(_skew_ignoring_nan)
    return first_skew, second_skew

# Kurtosis of two frames, returned side by side
def calculate_kurtosis(df1, df2):
    """Apply ``scipy.stats.kurtosis`` (NaNs omitted) to each frame.

    Returns ``(df1_result, df2_result)``, each produced by ``DataFrame.apply``.
    """
    def _kurtosis_ignoring_nan(values):
        return kurtosis(values, nan_policy='omit')

    first_kurtosis = df1.apply(_kurtosis_ignoring_nan)
    second_kurtosis = df2.apply(_kurtosis_ignoring_nan)
    return first_kurtosis, second_kurtosis

# Chi-Square test of independence between two categorical columns, run
# separately on each of the two frames
def calculate_chi_square(df1, df2, cat_column1, cat_column2):
    """Run ``chi2_contingency`` on the same column pair in ``df1`` and ``df2``.

    A contingency table of ``cat_column1`` against ``cat_column2`` is built
    for each frame with ``pd.crosstab`` and then tested.

    Returns:
        dict holding the statistic, p-value, degrees of freedom and expected
        frequencies, keyed with a ``(df)`` suffix for ``df1`` and a
        ``(new df)`` suffix for ``df2``.
    """
    # Both tables are built before either test runs.
    table_first, table_second = (
        pd.crosstab(frame[cat_column1], frame[cat_column2])
        for frame in (df1, df2)
    )

    (stat_first, p_first, dof_first, expected_first), \
        (stat_second, p_second, dof_second, expected_second) = (
            chi2_contingency(table) for table in (table_first, table_second)
        )

    report = {
        "Chi2 Statistic (df)": stat_first,
        "P-value (df)": p_first,
        "Degrees of Freedom (df)": dof_first,
        "Expected Frequencies (df)": expected_first,
    }
    report.update({
        "Chi2 Statistic (new df)": stat_second,
        "P-value (new df)": p_second,
        "Degrees of Freedom (new df)": dof_second,
        "Expected Frequencies (new df)": expected_second,
    })
    return report

In [8]:
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping

# Update train_model function to use these new functions
def train_model(df, model, epochs=10, job_name=None, batch_size=1, patience=10):
    """Fit ``model`` on data prepared from ``df`` and return the loss history.

    The arrays from ``prepare_training_data(df)`` are split 80/20 into train
    and validation sets (fixed ``random_state=42``), and the model is fitted
    with an ``EarlyStopping`` callback that monitors ``val_loss``.

    Args:
        df: DataFrame accepted by ``prepare_training_data``.
        model: Compiled model exposing a Keras-style ``fit`` method.
        epochs: Maximum number of training epochs.
        job_name: Currently unused; kept so existing callers keep working.
        batch_size: Samples per gradient update. Defaults to 1, the value
            previously hard-coded here.
        patience: Epochs without ``val_loss`` improvement before stopping.
            Defaults to 10, the value previously hard-coded here. With the
            default ``epochs`` of 10 this budget cannot run out before
            training ends, so pass a smaller value (or more epochs) if early
            stopping should actually take effect.

    Returns:
        ``history.history``, the per-epoch metrics dict recorded by ``fit``.

    Side effects:
        Prints the history dict.
    """
    X, y = prepare_training_data(df)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    early_stop = EarlyStopping(
        monitor='val_loss',
        patience=patience,
        restore_best_weights=True
    )

    history = model.fit(
        X_train, y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_val, y_val),
        callbacks=[early_stop],
    )

    # The earlier version also ran model.predict(X_val) here and discarded
    # the result; that unused inference pass has been dropped.

    print(f"History: {history.history}")

    return history.history


In [9]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Bidirectional, Flatten, Reshape, Input

# Each sample is a sequence of 3 time steps with 130 features per step; the
# network maps it to a single 130-value output vector.
# The input shape is declared with an explicit Input layer rather than an
# `input_shape=` argument on the first layer, which Keras warns against in
# Sequential models (the recorded cell output appears to show that warning).
model = Sequential([
    Input(shape=(3, 130)),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.2),
    LSTM(64 // 2, return_sequences=False),
    Dropout(0.2),
    Dense(32, activation="relu"),
    Dense(130)
])

# Compile the model
model.compile(optimizer='adam', loss='mse')

# Print the model summary
model.summary()

# Train on the loaded frame with train_model's defaults. Note that
# train_model already prints the history, so the print below repeats it.
history = train_model(df, model)
print(history)

  super().__init__(**kwargs)


Epoch 1/10
[1m1600/1600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - loss: 0.0133 - val_loss: 0.0075
Epoch 2/10
[1m1600/1600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.0067 - val_loss: 0.0041
Epoch 3/10
[1m1600/1600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 0.0040 - val_loss: 0.0032
Epoch 4/10
[1m1600/1600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 0.0031 - val_loss: 0.0024
Epoch 5/10
[1m1600/1600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 0.0026 - val_loss: 0.0020
Epoch 6/10
[1m1600/1600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 0.0024 - val_loss: 0.0017
Epoch 7/10
[1m1600/1600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 0.0021 - val_loss: 0.0016
Epoch 8/10
[1m1600/1600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 0.0021 - val_loss: 0.0014
Epoch 9/10
[1m1600/1600