In [1]:
from io import StringIO
import sagemaker
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sagemaker.pytorch import PyTorch
import os

# Pin the AWS region and build the SageMaker session on top of that boto3 session.
boto_session = boto3.Session(region_name='us-east-1')
sagemaker_session = sagemaker.Session(boto_session=boto_session)

# NOTE(review): the execution-role ARN (account id included) is hard-coded here;
# consider reading it from configuration or the environment instead.
role = "arn:aws:iam::211125439249:role/service-role/AmazonSageMaker-ExecutionRole-20250314T153928"
role_name = role.split('/')[-1]  # Extract just the role name from the ARN

# Attach AdministratorAccess policy to your existing role
# NOTE(review): AdministratorAccess is the broadest AWS-managed policy, and this call is
# issued every time the cell runs. A policy scoped to the S3 bucket / SageMaker actions this
# notebook needs would follow least privilege -- confirm whether admin rights are required.
# The client below comes from the default boto3 session, not from `boto_session` above.
iam_client = boto3.client('iam')
iam_client.attach_role_policy(
    RoleName=role_name,
    PolicyArn="arn:aws:iam::aws:policy/AdministratorAccess"
)
print(f"Attached AdministratorAccess policy to role: {role}")

# Full S3 URI of the input CSV; it names the same bucket/key as BUCKET_NAME/FILE_KEY below.
# NOTE(review): not referenced again in the visible cells -- confirm it is still needed.
input_data_s3_uri = "s3://blue-blood-data/final_df.csv"

# Define your bucket name and file key (file path in S3)
BUCKET_NAME = "blue-blood-data"
FILE_KEY = "final_df.csv"  # Change to your actual file path in S3

# Create an S3 client (default boto3 session, so the region comes from the local AWS config)
s3 = boto3.client("s3")

# Fetch the file from S3
response = s3.get_object(Bucket=BUCKET_NAME, Key=FILE_KEY)

# Read the CSV file into a pandas DataFrame.
# The whole object body is read into memory and decoded as UTF-8 before parsing.
csv_content = response["Body"].read().decode("utf-8")
df = pd.read_csv(StringIO(csv_content))



sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/ubaid/Library/Application Support/sagemaker/config.yaml


Attached AdministratorAccess policy to role: arn:aws:iam::211125439249:role/service-role/AmazonSageMaker-ExecutionRole-20250314T153928


In [2]:
# Preview the first rows of the DataFrame loaded from S3.
df.head()

Unnamed: 0,subject_id,prescription_start,prescription_rx_embeddings,prescription_dose_val_rx,prescription_dose_unit_rx,pre_charttime,pre_ph,pre_pco2,pre_po2,pre_bicarbonate,...,post_fio2_chartevents,post_aado2_calc,post_pao2fio2,post_temperature,post_fio2,post_aado2,post_carboxyhemoglobin,post_methemoglobin,post_calcium,post_intubated
0,10013,2125-10-05T00:00:00,[ 3.5185558e-01 1.2351961e-01 -1.2304356e-01 ...,0.010317,3,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10013,2125-10-05T00:00:00,[ 0.45182744 0.3218944 -0.5210766 0.315588...,0.011905,4,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10013,2125-10-05T00:00:00,[ 4.5976555e-01 1.9232908e-01 -5.7382131e-01 ...,0.009921,2,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10013,2125-10-05T00:00:00,[ 6.26637757e-01 2.61670560e-01 -2.40684357e-...,0.017857,3,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10013,2125-10-05T00:00:00,[ 4.15423244e-01 -1.28793076e-01 -2.01883331e-...,0.020833,3,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
import pandas as pd
import numpy as np
import re

def clean_and_convert(x):
    """Parse a stringified embedding such as ``"[ 0.35  1.2e-01 -3 ]"`` into a float array.

    Parameters
    ----------
    x : object
        The cell value to convert. Strings are parsed; anything else is passed through.

    Returns
    -------
    numpy.ndarray or object
        * a 1-D float array when ``x`` is a string of whitespace-separated numbers,
          optionally wrapped in square brackets;
        * ``x`` itself, untouched, when it is already an ``ndarray``, is not a string
          (e.g. NaN / None), or is a string that cannot be parsed as numbers
          (including a string with no numeric tokens at all).
    """
    if isinstance(x, np.ndarray):  # Already converted: nothing to do
        return x
    if not isinstance(x, str):  # NaN or unexpected type: return as-is
        return x

    # Work on a separate name so the caller's value is still available if parsing fails.
    # Brackets are removed, then the text is split on any run of whitespace.
    tokens = re.sub(r'[\[\]]', '', x).split()
    if not tokens:
        return x

    try:
        return np.array([float(token) for token in tokens])
    except ValueError:
        # A non-numeric token: hand back the original string, brackets included.
        return x

# Check the values before applying the function (raw strings as read from the CSV)
print(df['prescription_rx_embeddings'].head())

# Apply the function to the 'prescription_rx_embeddings' column.
# This overwrites the column in place; clean_and_convert returns ndarray inputs unchanged.
df['prescription_rx_embeddings'] = df['prescription_rx_embeddings'].apply(clean_and_convert)

# Check the result: display the converted embedding stored under index label 0
df['prescription_rx_embeddings'][0]

0    [ 3.5185558e-01  1.2351961e-01 -1.2304356e-01 ...
1    [ 0.45182744  0.3218944  -0.5210766   0.315588...
2    [ 4.5976555e-01  1.9232908e-01 -5.7382131e-01 ...
3    [ 6.26637757e-01  2.61670560e-01 -2.40684357e-...
4    [ 4.15423244e-01 -1.28793076e-01 -2.01883331e-...
Name: prescription_rx_embeddings, dtype: object


array([ 3.5185558e-01,  1.2351961e-01, -1.2304356e-01, -3.3200896e-01,
       -1.0614018e+00,  1.9121327e-01,  2.7952591e-01,  1.0901182e-02,
       -4.5522165e-01, -6.9268858e-01,  1.5272896e-01, -6.2518787e-01,
       -5.0267065e-01,  2.3799901e-01,  3.2225102e-01,  3.5973153e-01,
       -8.7883368e-02, -6.6698366e-01,  1.2195622e+00,  3.2569757e-01,
       -9.7169526e-02, -9.6203506e-01,  7.8181475e-01,  3.1520061e+00,
        1.0999583e+00,  8.8330621e-01, -1.3076260e+00, -1.1109836e+00,
       -2.1281254e+00,  5.1991540e-01, -1.2142467e+00, -9.5872365e-02,
       -2.3115242e-01,  9.2357832e-01,  7.1854466e-01, -4.1259676e-01,
       -7.4190390e-01,  1.6999608e+00,  4.8031932e-01,  8.1066114e-01,
       -1.0656835e+00, -2.8141283e-03, -5.6771725e-01,  4.5298275e-01,
        2.2276033e-01,  4.5889392e-02, -2.4509761e-01, -1.4576931e-01,
        6.5086001e-01,  1.1289541e-01,  4.4198048e-01, -8.2376510e-01,
       -4.1519284e-01,  4.0492430e-01,  1.8494433e-01, -5.1815975e-01,
      

In [4]:
# function that gets the unique pairs of subject_id and prescription_start
def get_unique_pairs(df):
    """Map every subject to the distinct prescription start dates recorded for it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``subject_id`` and ``prescription_start``.

    Returns
    -------
    dict
        ``{subject_id: [prescription_start, ...]}``. Subjects and dates both appear in
        order of first occurrence in ``df``, so the result is identical from run to run
        (collecting the dates in a ``set`` made the order depend on string hashing).
        Rows with a missing ``subject_id`` are ignored.
    """
    # One pass over the frame: group once instead of re-filtering it for every subject.
    dates_by_subject = df.groupby('subject_id', sort=False)['prescription_start'].unique()
    return {subject: list(dates) for subject, dates in dates_by_subject.items()}

In [5]:
# function that gets the prescription array with the proper format
def get_presc_input(df):
    """Build one feature vector per row: the embedding followed by dose value and dose unit.

    Reads the columns ``prescription_rx_embeddings``, ``prescription_dose_val_rx`` and
    ``prescription_dose_unit_rx`` of ``df`` and returns the per-row vectors stacked into
    a single NumPy array (one row of the result per row of ``df``).
    """
    def _row_vector(row):
        # Embedding first, then the two scalar dose fields appended at the end.
        dose_fields = np.array([row['prescription_dose_val_rx'], row['prescription_dose_unit_rx']])
        return np.concatenate((row['prescription_rx_embeddings'], dose_fields))

    return np.array([_row_vector(row) for _, row in df.iterrows()])

In [6]:
# function that adds the proper padding to our input arrays
def add_padding(prescriptions, pre_treatment, post_treatment, max_rows=20, feature_pad=105):
    """Zero-pad one sample's three inputs so they can be stacked into a single tensor.

    Parameters
    ----------
    prescriptions : numpy.ndarray
        2-D array, one row per prescription. It is truncated to ``max_rows`` rows or
        zero-padded at the bottom up to ``max_rows`` rows; its width is left unchanged.
    pre_treatment, post_treatment : numpy.ndarray
        1-D measurement vectors. Each becomes row 0 of a ``max_rows``-row block whose
        remaining rows are zeros, with ``feature_pad`` zero columns appended on the right.
    max_rows : int, default 20
        Number of rows in every returned array.
    feature_pad : int, default 105
        Number of zero columns appended to the pre/post blocks. With the defaults a
        25-value vector becomes 130 wide (25 + 105).
        NOTE(review): this must equal the prescription width minus the vector length for
        the three outputs to share a shape -- confirm against the actual column counts.

    Returns
    -------
    tuple of numpy.ndarray
        ``(padded_prescriptions, padded_pre_treatment, padded_post_treatment)``.
    """
    # Keep at most max_rows prescriptions, then zero-fill up to exactly max_rows rows.
    prescriptions = prescriptions[:max_rows]
    missing_rows = max_rows - prescriptions.shape[0]
    padded_prescriptions = np.pad(prescriptions, ((0, missing_rows), (0, 0)), mode='constant')

    def _vector_to_block(vector):
        # Row 0 holds the measurements; extra rows and extra columns are all zeros.
        return np.pad(vector.reshape(1, -1), ((0, max_rows - 1), (0, feature_pad)), mode='constant')

    padded_pre_treatment = _vector_to_block(pre_treatment)
    padded_post_treatment = _vector_to_block(post_treatment)

    return padded_prescriptions, padded_pre_treatment, padded_post_treatment

In [7]:
def prepare_training_data(df):
    """Turn the prescription table into stacked model inputs and targets.

    Rows are grouped by ``(subject_id, prescription_start)``; every group yields one
    sample made of three equally shaped blocks stacked in this order: padded
    pre-treatment values, padded prescription matrix, padded post-treatment values.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``subject_id``, ``prescription_start``, ``pre_charttime``,
        ``post_charttime``, the three prescription columns read by ``get_presc_input``
        and the ``pre_*`` / ``post_*`` measurement columns. The frame is not modified:
        embedding strings are parsed on a copy.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)`` where ``X[i]`` is the stacked 3-block sample and ``y[i]`` is its last
        block (post-treatment). Samples follow the order in which each
        (subject, date) pair first appears in ``df``.

    Raises
    ------
    KeyError
        If one of the four identifier/timestamp columns is missing.

    NOTE(review): the target ``y`` is the same block that is fed in as the third input
    step, so the model can see the values it is asked to predict -- confirm this is the
    intended training setup.
    """
    # Parse embeddings on a copy so the caller's DataFrame keeps its original column.
    frame = df.assign(
        prescription_rx_embeddings=df['prescription_rx_embeddings'].apply(clean_and_convert)
    )

    # Identifier / timestamp columns are excluded from the feature columns.
    feature_columns = frame.drop(
        ['subject_id', 'prescription_start', 'pre_charttime', 'post_charttime'], axis=1
    ).columns
    pre_columns = [col for col in feature_columns if col.startswith('pre_')]
    post_columns = [col for col in feature_columns if col.startswith('post_')]

    X_train_list = []
    y_train_list = []

    # A single groupby replaces one full-frame boolean mask per (patient, date) pair.
    # sort=False keeps first-appearance order; groups with a missing key are skipped,
    # matching the pairs that previously selected zero rows.
    for _, visit in frame.groupby(['subject_id', 'prescription_start'], sort=False):
        # 2-D array: one row per prescription in this visit
        prescriptions = get_presc_input(visit)

        # The pre/post measurements are taken from the first row of the group.
        pre_treatment = np.array(visit[pre_columns].values[0])
        post_treatment = np.array(visit[post_columns].values[0])

        padded_prescriptions, padded_pre_treatment, padded_post_treatment = add_padding(
            prescriptions, pre_treatment, post_treatment
        )

        sample = np.array([
            padded_pre_treatment,     # Time Step 1: Pre-Treatment
            padded_prescriptions,     # Time Step 2: Prescription
            padded_post_treatment     # Time Step 3: Post-Treatment
        ])

        X_train_list.append(sample)
        y_train_list.append(sample[-1])  # Target is the last time step (Post-Treatment)

    return np.array(X_train_list), np.array(y_train_list)


In [8]:
from scipy.stats import chi2_contingency
from scipy.stats import skew, kurtosis

# Function to calculate mean
def calculate_mean(df1, df2):
    """Return ``(df1.mean(), df2.mean())``: the per-column means of both frames."""
    first_means = df1.mean()
    second_means = df2.mean()
    return first_means, second_means

# Function to calculate median
def calculate_median(df1, df2):
    """Return ``(df1.median(), df2.median())``: the per-column medians of both frames."""
    first_medians = df1.median()
    second_medians = df2.median()
    return first_medians, second_medians

# Function to calculate standard deviation
def calculate_std(df1, df2):
    """Return ``(df1.std(), df2.std())``: the per-column standard deviations of both frames."""
    first_spread = df1.std()
    second_spread = df2.std()
    return first_spread, second_spread

# Function to calculate skewness
def calculate_skewness(df1, df2):
    """Return the per-column skewness of ``df1`` and ``df2`` as a 2-tuple.

    Each column is passed to ``scipy.stats.skew`` with ``nan_policy='omit'``.
    """
    def _column_skew(column):
        return skew(column, nan_policy='omit')

    first_skew = df1.apply(_column_skew)
    second_skew = df2.apply(_column_skew)
    return first_skew, second_skew

# Function to calculate kurtosis
def calculate_kurtosis(df1, df2):
    """Return the per-column kurtosis of ``df1`` and ``df2`` as a 2-tuple.

    Each column is passed to ``scipy.stats.kurtosis`` with ``nan_policy='omit'``.
    """
    def _column_kurtosis(column):
        return kurtosis(column, nan_policy='omit')

    first_kurtosis = df1.apply(_column_kurtosis)
    second_kurtosis = df2.apply(_column_kurtosis)
    return first_kurtosis, second_kurtosis

# Function to calculate Chi-Square Test for categorical data
def calculate_chi_square(df1, df2, cat_column1, cat_column2):
    """Run a chi-square test on the same pair of columns in two DataFrames.

    For each frame a contingency table of ``cat_column1`` against ``cat_column2`` is
    built with ``pd.crosstab`` and passed to ``scipy.stats.chi2_contingency``.

    Returns a dict with the statistic, p-value, degrees of freedom and expected
    frequencies for ``df1`` (keys ending in ``(df)``) and for ``df2`` (keys ending in
    ``(new df)``).
    """
    # Both contingency tables are built before either test is run.
    tables = [pd.crosstab(frame[cat_column1], frame[cat_column2]) for frame in (df1, df2)]
    (stat_a, p_a, dof_a, expected_a), (stat_b, p_b, dof_b, expected_b) = (
        chi2_contingency(table) for table in tables
    )

    results = {}
    results["Chi2 Statistic (df)"] = stat_a
    results["P-value (df)"] = p_a
    results["Degrees of Freedom (df)"] = dof_a
    results["Expected Frequencies (df)"] = expected_a
    results["Chi2 Statistic (new df)"] = stat_b
    results["P-value (new df)"] = p_b
    results["Degrees of Freedom (new df)"] = dof_b
    results["Expected Frequencies (new df)"] = expected_b
    return results

# Function to find columns with error % greater than threshold across all statistics
def find_error_range_columns(df1, df2, threshold=0.02):
    """Report the columns whose relative error reaches ``threshold`` for every statistic.

    For mean, median, standard deviation, skewness and kurtosis, the relative error
    ``|(stat(df1) - stat(df2)) / stat(df1)|`` is computed per column. A column is kept
    only if that error is ``>= threshold`` for all five statistics.

    The outcome is printed and the surviving column labels are returned as a set.
    """
    statistic_functions = (
        calculate_mean,
        calculate_median,
        calculate_std,
        calculate_skewness,
        calculate_kurtosis,
    )

    # Start from every column of df1 and narrow the set down one statistic at a time.
    range_error_columns = set(df1.columns)
    for compute_statistic in statistic_functions:
        reference, candidate = compute_statistic(df1, df2)
        relative_error = abs((reference - candidate) / reference)
        exceeds_threshold = relative_error >= threshold
        range_error_columns &= set(relative_error[exceeds_threshold].index)

    if not range_error_columns:
        print(f"\nNo columns have error greater than {threshold} across all statistics.")
        return range_error_columns

    print(f"\nColumns with Error Greater than {threshold} Across All Statistics:")
    for col in sorted(range_error_columns):
        print(f"- {col}")
    return range_error_columns

In [9]:
from sklearn.model_selection import train_test_split

# Update train_model function to use these new functions
def train_model(df, model, epochs=10, job_name=None):
    """Fit ``model`` on sequences built from ``df`` and report per-sample validation errors.

    The data from ``prepare_training_data`` is split 80/20 (``random_state=42``), the
    model is fitted with ``batch_size=1``, and each validation prediction is compared
    with its target through ``find_error_range_columns`` at a 0.02 threshold.

    Parameters
    ----------
    df : DataFrame handed to ``prepare_training_data``.
    model : object exposing ``fit`` and ``predict``.
    epochs : number of training epochs.
    job_name : accepted but not used inside this function.

    Returns
    -------
    The ``history`` attribute of the object returned by ``model.fit``.
    """
    features, targets = prepare_training_data(df)
    X_train, X_val, y_train, y_val = train_test_split(
        features, targets, test_size=0.2, random_state=42
    )

    fit_options = dict(epochs=epochs, batch_size=1, validation_data=(X_val, y_val))
    history = model.fit(X_train, y_train, **fit_options)

    y_pred = model.predict(X_val)

    # Compare every validation target with its prediction, one sample at a time.
    # The calculate_* helpers can be called on the two frames here when more verbose
    # per-statistic output is needed.
    for i, predicted in enumerate(y_pred):
        actual_frame = pd.DataFrame(y_val[i])
        predicted_frame = pd.DataFrame(predicted)

        print(f"\nSample {i} Comparison:")
        find_error_range_columns(actual_frame, predicted_frame, 0.02)

    print(f"History: {history.history}")

    return history.history


In [10]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, TimeDistributed, Flatten, Reshape

# Input shape is (3, 20, 130): 3 time steps, each a 20 x 130 block.
# We flatten each time step (20 * 130 = 2600 values), process the 3 time steps with an LSTM,
# and then use a Dense layer to output 20*130 values reshaped to (20, 130)
model = Sequential([
    TimeDistributed(Flatten(), input_shape=(3, 20, 130)),
    LSTM(64, return_sequences=False),
    Dropout(0.2),
    Dense(20 * 130, activation="linear"),
    Reshape((20, 130))
])

# Compile the model with the Adam optimizer and mean-squared-error loss
model.compile(optimizer='adam', loss='mse')

# Print the model summary
model.summary()

# Train the model on df (default 10 epochs) and print the returned loss history.
# train_model already prints the history itself, so it is shown twice.
history = train_model(df, model)
print(history)

  super().__init__(**kwargs)


Epoch 1/10
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0015 - val_loss: 5.4563e-04
Epoch 2/10
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 4.8257e-04 - val_loss: 4.6287e-04
Epoch 3/10
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 3.6930e-04 - val_loss: 3.8066e-04
Epoch 4/10
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 3.0300e-04 - val_loss: 3.2204e-04
Epoch 5/10
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 2.2926e-04 - val_loss: 2.9333e-04
Epoch 6/10
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.4259e-04 - val_loss: 2.6871e-04
Epoch 7/10
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 1.4313e-04 - val_loss: 2.6734e-04
Epoch 8/10
[1m224/224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 1.5154e-04 - v