In [36]:
import boto3
import pandas as pd
import numpy as np
from io import StringIO

import sagemaker
from sagemaker.pytorch import PyTorch
import os

boto_session = boto3.Session(region_name='us-east-1')
sagemaker_session = sagemaker.Session(boto_session=boto_session)
role = "arn:aws:iam::211125439249:role/service-role/AmazonSageMaker-ExecutionRole-20250314T153928"

# Define your bucket name and file key (file path in S3)
BUCKET_NAME = "blue-blood-data"
FILE_KEY = "BigQuery Blue Blood DB Data.csv"  # Change to your actual file path in S3

# Create an S3 client
s3 = boto3.client("s3")

# Fetch the file from S3
response = s3.get_object(Bucket=BUCKET_NAME, Key=FILE_KEY)

# Read the CSV file into a pandas DataFrame
csv_content = response["Body"].read().decode("utf-8")
df = pd.read_csv(StringIO(csv_content))

# Print DataFrame
print(df)

      subject_id   prescription_start           prescription_drug  \
0          10013  2125-10-05T00:00:00           Phenylephrine HCl   
1          10013  2125-10-05T00:00:00                 Vasopressin   
2          10013  2125-10-05T00:00:00        Iso-Osmotic Dextrose   
3          10013  2125-10-05T00:00:00                  Dobutamine   
4          10013  2125-10-05T00:00:00                     Aspirin   
...          ...                  ...                         ...   
2177       44212  2123-11-25T00:00:00                  Vancomycin   
2178       44212  2123-11-25T00:00:00        0.9% Sodium Chloride   
2179       44212  2123-11-25T00:00:00  Piperacillin-Tazobactam Na   
2180       44212  2123-11-25T00:00:00         Prismasate (B22 K4)   
2181       44212  2123-11-25T00:00:00        Iso-Osmotic Dextrose   

     prescription_dose_val_rx prescription_dose_unit_rx        pre_charttime  \
0                          60                        mg  2125-10-04T23:59:00   
1          

In [37]:
cols = df.columns
print(cols)

df.head()

Index(['subject_id', 'prescription_start', 'prescription_drug',
       'prescription_dose_val_rx', 'prescription_dose_unit_rx',
       'pre_charttime', 'pre_ph', 'pre_pco2', 'pre_po2', 'pre_bicarbonate',
       'pre_baseexcess', 'pre_totalco2', 'pre_hematocrit', 'pre_hemoglobin',
       'pre_sodium', 'pre_potassium', 'pre_chloride', 'pre_glucose',
       'pre_lactate', 'pre_so2', 'pre_spo2', 'pre_fio2_chartevents',
       'pre_aado2_calc', 'pre_pao2fio2', 'pre_temperature', 'pre_fio2',
       'pre_aado2', 'pre_carboxyhemoglobin', 'pre_methemoglobin',
       'pre_calcium', 'pre_intubated', 'post_charttime', 'post_ph',
       'post_pco2', 'post_po2', 'post_bicarbonate', 'post_baseexcess',
       'post_totalco2', 'post_hematocrit', 'post_hemoglobin', 'post_sodium',
       'post_potassium', 'post_chloride', 'post_glucose', 'post_lactate',
       'post_so2', 'post_spo2', 'post_fio2_chartevents', 'post_aado2_calc',
       'post_pao2fio2', 'post_temperature', 'post_fio2', 'post_aado2',
      

Unnamed: 0,subject_id,prescription_start,prescription_drug,prescription_dose_val_rx,prescription_dose_unit_rx,pre_charttime,pre_ph,pre_pco2,pre_po2,pre_bicarbonate,...,post_fio2_chartevents,post_aado2_calc,post_pao2fio2,post_temperature,post_fio2,post_aado2,post_carboxyhemoglobin,post_methemoglobin,post_calcium,post_intubated
0,10013,2125-10-05T00:00:00,Phenylephrine HCl,60,mg,2125-10-04T23:59:00,7.3,63.0,60.0,,...,60.000002,231.550017,191.666659,,,,,,,
1,10013,2125-10-05T00:00:00,Vasopressin,100,UNIT,2125-10-04T23:59:00,7.3,63.0,60.0,,...,60.000002,231.550017,191.666659,,,,,,,
2,10013,2125-10-05T00:00:00,Iso-Osmotic Dextrose,50,ml,2125-10-04T23:59:00,7.3,63.0,60.0,,...,60.000002,231.550017,191.666659,,,,,,,
3,10013,2125-10-05T00:00:00,Dobutamine,250,mg,2125-10-04T23:59:00,7.3,63.0,60.0,,...,60.000002,231.550017,191.666659,,,,,,,
4,10013,2125-10-05T00:00:00,Aspirin,325,mg,2125-10-04T23:59:00,7.3,63.0,60.0,,...,60.000002,231.550017,191.666659,,,,,,,


In [38]:
#PREPROCESSING --> normalizing unit labels + one hot encoding

units = df['prescription_dose_unit_rx'].to_list()

#Normalize the units
units = [unit.lower() for unit in units]
print("All units in sample DF:")
print(units)

set_of_units = set()
for unit in units:
    if unit not in set_of_units:
        set_of_units.add(unit)
print("Set of UNITS: ")
print(set_of_units)

#Mapping for One Hot Encoding
label_encoding = {}
mapping = 0
for elem in set_of_units:
    label_encoding[elem] = mapping
    mapping += 1
print(label_encoding)

All units in sample DF:
['mg', 'unit', 'ml', 'mg', 'mg', 'gm', 'mg', 'mg', 'mg', 'mg', 'ml', 'mg', 'ml', 'neb', 'mg', 'mg', 'unit', 'gm', 'mg', 'mg', 'ml', 'mg', 'ml', 'ml', 'gm', 'mg', 'ml', 'ml', 'mg', 'meq', 'mg', 'mg', 'meq', 'mg', 'mg', 'appl', 'ml', 'meq', 'ml', 'meq', 'ml', 'mg', 'mg', 'unit', 'mg', 'gm', 'unit', 'mg', 'mg', 'meq', 'gm', 'cap', 'mg', 'tab', 'mg', 'mg', 'mg', 'mg', 'mg', 'mg', 'gm', 'tab', 'mg', 'gm', 'mg', 'mg', 'unit', 'gm', 'mg', 'ml', 'mg', 'meq', 'mg', 'mg', 'mg', 'mg', 'mg', 'ml', 'mg', 'ml', 'mg', 'ml', 'mg', 'mg', 'mg', 'mg', 'ml', 'mg', 'ml', 'mg', 'mg', 'mg', 'ml', 'mg', 'ml', 'mg', 'ml', 'ml', 'ml', 'ml', 'ml', 'tab', 'gm', 'mcg', 'mg', 'unit', 'drop', 'mg', 'gm', 'ml', 'puff', 'meq', 'ml', 'mg', 'mcg', 'mg', 'puff', 'unit', 'ml', 'mg', 'syr', 'mg', 'ml', 'mg', 'mg', 'mg', 'mg', 'mg', 'mg', 'mg', 'ml', 'mcg', 'meq', 'mg', 'mg', 'ml', 'ml', 'ml', 'ml', 'ml', 'mg', 'mg', 'mg', 'mg', 'mg', 'mg', 'mg', 'mg', 'ml', 'ml', 'ml', 'mg', 'mg', 'mg', 'unit', 'meq

In [39]:
df['prescription_dose_unit_rx'] = df['prescription_dose_unit_rx'].str.lower().map(label_encoding)

In [40]:
df.head(10)

Unnamed: 0,subject_id,prescription_start,prescription_drug,prescription_dose_val_rx,prescription_dose_unit_rx,pre_charttime,pre_ph,pre_pco2,pre_po2,pre_bicarbonate,...,post_fio2_chartevents,post_aado2_calc,post_pao2fio2,post_temperature,post_fio2,post_aado2,post_carboxyhemoglobin,post_methemoglobin,post_calcium,post_intubated
0,10013,2125-10-05T00:00:00,Phenylephrine HCl,60,24,2125-10-04T23:59:00,7.3,63.0,60.0,,...,60.000002,231.550017,191.666659,,,,,,,
1,10013,2125-10-05T00:00:00,Vasopressin,100,6,2125-10-04T23:59:00,7.3,63.0,60.0,,...,60.000002,231.550017,191.666659,,,,,,,
2,10013,2125-10-05T00:00:00,Iso-Osmotic Dextrose,50,19,2125-10-04T23:59:00,7.3,63.0,60.0,,...,60.000002,231.550017,191.666659,,,,,,,
3,10013,2125-10-05T00:00:00,Dobutamine,250,24,2125-10-04T23:59:00,7.3,63.0,60.0,,...,60.000002,231.550017,191.666659,,,,,,,
4,10013,2125-10-05T00:00:00,Aspirin,325,24,2125-10-04T23:59:00,7.3,63.0,60.0,,...,60.000002,231.550017,191.666659,,,,,,,
5,10013,2125-10-05T00:00:00,Magnesium Sulfate,2,15,2125-10-04T23:59:00,7.3,63.0,60.0,,...,60.000002,231.550017,191.666659,,,,,,,
6,10013,2125-10-05T00:00:00,Morphine Sulfate,100,24,2125-10-04T23:59:00,7.3,63.0,60.0,,...,60.000002,231.550017,191.666659,,,,,,,
7,10013,2125-10-05T00:00:00,Pantoprazole Sodium,40,24,2125-10-04T23:59:00,7.3,63.0,60.0,,...,60.000002,231.550017,191.666659,,,,,,,
8,10013,2125-10-05T00:00:00,Furosemide,20,24,2125-10-04T23:59:00,7.3,63.0,60.0,,...,60.000002,231.550017,191.666659,,,,,,,
9,10013,2125-10-05T00:00:00,Azithromycin,250,24,2125-10-04T23:59:00,7.3,63.0,60.0,,...,60.000002,231.550017,191.666659,,,,,,,


In [41]:
#BERT FOR VECTOR EMBEDDINGS!
import tensorflow_hub as hub
import tensorflow_text as text

preprocess_url = "https://kaggle.com/models/tensorflow/bert/TensorFlow2/en-uncased-preprocess/3"
encoder_url = "https://www.kaggle.com/models/tensorflow/bert/TensorFlow2/bert-en-uncased-l-10-h-128-a-2/2"

#BERT Preprocessor
bert_preprocess_model = hub.KerasLayer(preprocess_url)
# Encoder layer
bert_encoder = hub.KerasLayer(encoder_url, trainable=True)

In [42]:
from sklearn.preprocessing import StandardScaler
import umap

# Iterate through the prescription_drug col
rx_vector_embeddings = []
for individual_rx in df['prescription_drug']:
    # print(type(individual_rx))
    preprocessed_rx_text = bert_preprocess_model([str(individual_rx)])

    #Acts like a function, pass preprocessed input as a param
    bert_results = bert_encoder(preprocessed_rx_text)

    #Reverse the process above --> embeddings are as a list of tensors
    #get last encoder layer's tensor --> most abstract
    encoder_output = bert_results["encoder_outputs"][-1]

    # print(f"Shape of raw encoder output for {individual_rx}: {encoder_output.shape}")

    # Convert the TensorFlow tensor to a NumPy array
    encoder_output_np = encoder_output.numpy()

    # Flatten the NumPy array
    encoder_output_flat = encoder_output_np.flatten()

    # print(f"Shape of flattened encoder output for {individual_rx}: {encoder_output_flat.shape}")
    rx_vector_embeddings.append(encoder_output_flat)

# Convert the list of flattened embeddings to a NumPy array
rx_vector_embeddings_array = np.array(rx_vector_embeddings)

print(f"Shape of rx_vector_embeddings_array before UMAP: {rx_vector_embeddings_array.shape}")

# Apply UMAP
n_components = 128  # Or whatever dimension you want
reducer = umap.UMAP(n_components=n_components, random_state=42) # add random state for reproducibility
reduced_embeddings_umap = reducer.fit_transform(rx_vector_embeddings_array)

print(f"Shape of reduced embeddings after UMAP: {reduced_embeddings_umap.shape}")

# Normalize/standardize (optional but recommended)
scaler = StandardScaler()
normalized_reduced_embeddings_umap = scaler.fit_transform(reduced_embeddings_umap)

print(f"Shape of normalized reduced embeddings: {normalized_reduced_embeddings_umap.shape}")

Shape of rx_vector_embeddings_array before UMAP: (2182, 16384)


  warn(


Shape of reduced embeddings after UMAP: (2182, 128)
Shape of normalized reduced embeddings: (2182, 128)


In [43]:
df.insert(df.columns.get_loc("prescription_drug") + 1, "prescription_rx_embeddings", list(normalized_reduced_embeddings_umap))


In [44]:
df

Unnamed: 0,subject_id,prescription_start,prescription_drug,prescription_rx_embeddings,prescription_dose_val_rx,prescription_dose_unit_rx,pre_charttime,pre_ph,pre_pco2,pre_po2,...,post_fio2_chartevents,post_aado2_calc,post_pao2fio2,post_temperature,post_fio2,post_aado2,post_carboxyhemoglobin,post_methemoglobin,post_calcium,post_intubated
0,10013,2125-10-05T00:00:00,Phenylephrine HCl,"[0.35185558, 0.12351961, -0.12304356, -0.33200...",60,24,2125-10-04T23:59:00,7.30,63.0,60.0,...,60.000002,231.550017,191.666659,,,,,,,
1,10013,2125-10-05T00:00:00,Vasopressin,"[0.45182744, 0.3218944, -0.5210766, 0.3155888,...",100,6,2125-10-04T23:59:00,7.30,63.0,60.0,...,60.000002,231.550017,191.666659,,,,,,,
2,10013,2125-10-05T00:00:00,Iso-Osmotic Dextrose,"[0.45976555, 0.19232908, -0.5738213, 1.3497484...",50,19,2125-10-04T23:59:00,7.30,63.0,60.0,...,60.000002,231.550017,191.666659,,,,,,,
3,10013,2125-10-05T00:00:00,Dobutamine,"[0.62663776, 0.26167056, -0.024068436, 0.16359...",250,24,2125-10-04T23:59:00,7.30,63.0,60.0,...,60.000002,231.550017,191.666659,,,,,,,
4,10013,2125-10-05T00:00:00,Aspirin,"[0.41542324, -0.12879308, -0.20188333, 0.05329...",325,24,2125-10-04T23:59:00,7.30,63.0,60.0,...,60.000002,231.550017,191.666659,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2177,44212,2123-11-25T00:00:00,Vancomycin,"[0.35067722, 0.1657952, -0.119397685, 0.529637...",500,24,2123-11-24T21:56:00,7.24,48.0,76.0,...,80.000000,343.900000,198.750000,,,,,,,
2178,44212,2123-11-25T00:00:00,0.9% Sodium Chloride,"[-1.6720766, 0.06268573, -0.3970921, -3.601229...",100,19,2123-11-24T21:56:00,7.24,48.0,76.0,...,80.000000,343.900000,198.750000,,,,,,,
2179,44212,2123-11-25T00:00:00,Piperacillin-Tazobactam Na,"[0.34622693, 0.13159652, -0.39089522, 0.683583...",2.25,12,2123-11-24T21:56:00,7.24,48.0,76.0,...,80.000000,343.900000,198.750000,,,,,,,
2180,44212,2123-11-25T00:00:00,Prismasate (B22 K4),"[0.44693142, 0.00798898, -0.14514387, 1.489820...",5000,19,2123-11-24T21:56:00,7.24,48.0,76.0,...,80.000000,343.900000,198.750000,,,,,,,
