In [44]:
import boto3
import pandas as pd
import numpy as np
from io import StringIO

import sagemaker
from sagemaker.pytorch import PyTorch
import os

boto_session = boto3.Session(region_name='us-east-1')
sagemaker_session = sagemaker.Session(boto_session=boto_session)
role = "arn:aws:iam::211125439249:role/service-role/AmazonSageMaker-ExecutionRole-20250314T153928"

# Define your bucket name and file key (file path in S3)
BUCKET_NAME = "blue-blood-data"
FILE_KEY = "BigQuery Blue Blood DB Data.csv"  # Change to your actual file path in S3

# Create an S3 client
s3 = boto3.client("s3")

# Fetch the file from S3
response = s3.get_object(Bucket=BUCKET_NAME, Key=FILE_KEY)

# Read the CSV file into a pandas DataFrame
csv_content = response["Body"].read().decode("utf-8")
df = pd.read_csv(StringIO(csv_content))

# Print DataFrame
df

Unnamed: 0,subject_id,prescription_start,prescription_drug,prescription_dose_val_rx,prescription_dose_unit_rx,pre_charttime,pre_ph,pre_pco2,pre_po2,pre_bicarbonate,...,post_fio2_chartevents,post_aado2_calc,post_pao2fio2,post_temperature,post_fio2,post_aado2,post_carboxyhemoglobin,post_methemoglobin,post_calcium,post_intubated
0,10013,2125-10-05T00:00:00,Phenylephrine HCl,60,mg,2125-10-04T23:59:00,7.30,63.0,60.0,,...,60.000002,231.550017,191.666659,,,,,,,
1,10013,2125-10-05T00:00:00,Vasopressin,100,UNIT,2125-10-04T23:59:00,7.30,63.0,60.0,,...,60.000002,231.550017,191.666659,,,,,,,
2,10013,2125-10-05T00:00:00,Iso-Osmotic Dextrose,50,ml,2125-10-04T23:59:00,7.30,63.0,60.0,,...,60.000002,231.550017,191.666659,,,,,,,
3,10013,2125-10-05T00:00:00,Dobutamine,250,mg,2125-10-04T23:59:00,7.30,63.0,60.0,,...,60.000002,231.550017,191.666659,,,,,,,
4,10013,2125-10-05T00:00:00,Aspirin,325,mg,2125-10-04T23:59:00,7.30,63.0,60.0,,...,60.000002,231.550017,191.666659,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2177,44212,2123-11-25T00:00:00,Vancomycin,500,mg,2123-11-24T21:56:00,7.24,48.0,76.0,,...,80.000000,343.900000,198.750000,,,,,,,
2178,44212,2123-11-25T00:00:00,0.9% Sodium Chloride,100,mL,2123-11-24T21:56:00,7.24,48.0,76.0,,...,80.000000,343.900000,198.750000,,,,,,,
2179,44212,2123-11-25T00:00:00,Piperacillin-Tazobactam Na,2.25,g,2123-11-24T21:56:00,7.24,48.0,76.0,,...,80.000000,343.900000,198.750000,,,,,,,
2180,44212,2123-11-25T00:00:00,Prismasate (B22 K4),5000,mL,2123-11-24T21:56:00,7.24,48.0,76.0,,...,80.000000,343.900000,198.750000,,,,,,,


## Data Pre-Processing

### Numerical Values (Scaling & Normalization)

In [45]:
cols = df.columns
print(cols)

df.head()

Index(['subject_id', 'prescription_start', 'prescription_drug',
       'prescription_dose_val_rx', 'prescription_dose_unit_rx',
       'pre_charttime', 'pre_ph', 'pre_pco2', 'pre_po2', 'pre_bicarbonate',
       'pre_baseexcess', 'pre_totalco2', 'pre_hematocrit', 'pre_hemoglobin',
       'pre_sodium', 'pre_potassium', 'pre_chloride', 'pre_glucose',
       'pre_lactate', 'pre_so2', 'pre_spo2', 'pre_fio2_chartevents',
       'pre_aado2_calc', 'pre_pao2fio2', 'pre_temperature', 'pre_fio2',
       'pre_aado2', 'pre_carboxyhemoglobin', 'pre_methemoglobin',
       'pre_calcium', 'pre_intubated', 'post_charttime', 'post_ph',
       'post_pco2', 'post_po2', 'post_bicarbonate', 'post_baseexcess',
       'post_totalco2', 'post_hematocrit', 'post_hemoglobin', 'post_sodium',
       'post_potassium', 'post_chloride', 'post_glucose', 'post_lactate',
       'post_so2', 'post_spo2', 'post_fio2_chartevents', 'post_aado2_calc',
       'post_pao2fio2', 'post_temperature', 'post_fio2', 'post_aado2',
      

Unnamed: 0,subject_id,prescription_start,prescription_drug,prescription_dose_val_rx,prescription_dose_unit_rx,pre_charttime,pre_ph,pre_pco2,pre_po2,pre_bicarbonate,...,post_fio2_chartevents,post_aado2_calc,post_pao2fio2,post_temperature,post_fio2,post_aado2,post_carboxyhemoglobin,post_methemoglobin,post_calcium,post_intubated
0,10013,2125-10-05T00:00:00,Phenylephrine HCl,60,mg,2125-10-04T23:59:00,7.3,63.0,60.0,,...,60.000002,231.550017,191.666659,,,,,,,
1,10013,2125-10-05T00:00:00,Vasopressin,100,UNIT,2125-10-04T23:59:00,7.3,63.0,60.0,,...,60.000002,231.550017,191.666659,,,,,,,
2,10013,2125-10-05T00:00:00,Iso-Osmotic Dextrose,50,ml,2125-10-04T23:59:00,7.3,63.0,60.0,,...,60.000002,231.550017,191.666659,,,,,,,
3,10013,2125-10-05T00:00:00,Dobutamine,250,mg,2125-10-04T23:59:00,7.3,63.0,60.0,,...,60.000002,231.550017,191.666659,,,,,,,
4,10013,2125-10-05T00:00:00,Aspirin,325,mg,2125-10-04T23:59:00,7.3,63.0,60.0,,...,60.000002,231.550017,191.666659,,,,,,,


In [46]:
# convert prescription_dose_val_rx to numeric
print(df['prescription_dose_val_rx'].dtype)
df['prescription_dose_val_rx'] = pd.to_numeric(df['prescription_dose_val_rx'], errors='coerce')
print(df['prescription_dose_val_rx'].dtype)

# get numeric columns
numeric_cols = df.select_dtypes(include=[np.number]).columns
numeric_cols = numeric_cols[1:]  # exclude the first column which is the patient_id

# fill null values in numeric columns with -200
df[numeric_cols] = df[numeric_cols].fillna(-200)
df[numeric_cols].head()

object
float64


Unnamed: 0,prescription_dose_val_rx,pre_ph,pre_pco2,pre_po2,pre_bicarbonate,pre_baseexcess,pre_totalco2,pre_hematocrit,pre_hemoglobin,pre_sodium,...,post_fio2_chartevents,post_aado2_calc,post_pao2fio2,post_temperature,post_fio2,post_aado2,post_carboxyhemoglobin,post_methemoglobin,post_calcium,post_intubated
0,60.0,7.3,63.0,60.0,-200.0,2.0,32.0,-200.0,-200.0,-200.0,...,60.000002,231.550017,191.666659,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0
1,100.0,7.3,63.0,60.0,-200.0,2.0,32.0,-200.0,-200.0,-200.0,...,60.000002,231.550017,191.666659,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0
2,50.0,7.3,63.0,60.0,-200.0,2.0,32.0,-200.0,-200.0,-200.0,...,60.000002,231.550017,191.666659,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0
3,250.0,7.3,63.0,60.0,-200.0,2.0,32.0,-200.0,-200.0,-200.0,...,60.000002,231.550017,191.666659,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0
4,325.0,7.3,63.0,60.0,-200.0,2.0,32.0,-200.0,-200.0,-200.0,...,60.000002,231.550017,191.666659,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0


In [47]:
filtered_df = df[numeric_cols]

filtered_df.head()

Unnamed: 0,prescription_dose_val_rx,pre_ph,pre_pco2,pre_po2,pre_bicarbonate,pre_baseexcess,pre_totalco2,pre_hematocrit,pre_hemoglobin,pre_sodium,...,post_fio2_chartevents,post_aado2_calc,post_pao2fio2,post_temperature,post_fio2,post_aado2,post_carboxyhemoglobin,post_methemoglobin,post_calcium,post_intubated
0,60.0,7.3,63.0,60.0,-200.0,2.0,32.0,-200.0,-200.0,-200.0,...,60.000002,231.550017,191.666659,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0
1,100.0,7.3,63.0,60.0,-200.0,2.0,32.0,-200.0,-200.0,-200.0,...,60.000002,231.550017,191.666659,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0
2,50.0,7.3,63.0,60.0,-200.0,2.0,32.0,-200.0,-200.0,-200.0,...,60.000002,231.550017,191.666659,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0
3,250.0,7.3,63.0,60.0,-200.0,2.0,32.0,-200.0,-200.0,-200.0,...,60.000002,231.550017,191.666659,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0
4,325.0,7.3,63.0,60.0,-200.0,2.0,32.0,-200.0,-200.0,-200.0,...,60.000002,231.550017,191.666659,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0


In [48]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
ndArray = scaler.fit_transform(filtered_df)
df_scaled = pd.DataFrame(ndArray, columns = filtered_df.columns)
df[numeric_cols] = df_scaled

df.head()

Unnamed: 0,subject_id,prescription_start,prescription_drug,prescription_dose_val_rx,prescription_dose_unit_rx,pre_charttime,pre_ph,pre_pco2,pre_po2,pre_bicarbonate,...,post_fio2_chartevents,post_aado2_calc,post_pao2fio2,post_temperature,post_fio2,post_aado2,post_carboxyhemoglobin,post_methemoglobin,post_calcium,post_intubated
0,10013,2125-10-05T00:00:00,Phenylephrine HCl,0.010317,mg,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10013,2125-10-05T00:00:00,Vasopressin,0.011905,UNIT,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10013,2125-10-05T00:00:00,Iso-Osmotic Dextrose,0.009921,ml,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10013,2125-10-05T00:00:00,Dobutamine,0.017857,mg,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10013,2125-10-05T00:00:00,Aspirin,0.020833,mg,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Text Values: BERT (Embedded Vectorization) and Label Encoding 

In [49]:
#PREPROCESSING --> normalizing unit labels + one hot encoding
# standardize units by converting to lowercase
df['prescription_dose_unit_rx'] = df['prescription_dose_unit_rx'].str.lower()

# Standardize some redundant units
df['prescription_dose_unit_rx'] = df['prescription_dose_unit_rx'].replace({
    'g': 'gm',
    'mcg/h': 'mcg',
    'mcg/hr': 'mcg',
    'mcg/hour': 'mcg',
    'puff': 'unit',
    'syr': 'unit',
    'puff': 'unit',
    'pkt': 'unit',
    'tab': 'unit',
    'vial': 'unit',
    'bag': 'unit',
    'drop': 'unit',
    'inh': 'unit',
    'cap': 'unit',
})

units_set = set(df['prescription_dose_unit_rx'])

print(len(units_set))
print(units_set)

#Mapping for Ordinal Encoding
label_encoding = {}
mapping = 0
for elem in units_set:
    label_encoding[elem] = mapping
    mapping += 1

print(label_encoding)
print("Mapping for RX Labels:\n")
print(label_encoding)

15
{'loz', 'amp', 'ml', 'mg', 'unit', 'troc', 'enema', 'in', 'gm', 'meq', 'appl', 'mcg', 'mmol', 'ptch', 'neb'}
{'loz': 0, 'amp': 1, 'ml': 2, 'mg': 3, 'unit': 4, 'troc': 5, 'enema': 6, 'in': 7, 'gm': 8, 'meq': 9, 'appl': 10, 'mcg': 11, 'mmol': 12, 'ptch': 13, 'neb': 14}
Mapping for RX Labels:

{'loz': 0, 'amp': 1, 'ml': 2, 'mg': 3, 'unit': 4, 'troc': 5, 'enema': 6, 'in': 7, 'gm': 8, 'meq': 9, 'appl': 10, 'mcg': 11, 'mmol': 12, 'ptch': 13, 'neb': 14}


In [50]:
df['prescription_dose_unit_rx'] = df['prescription_dose_unit_rx'].str.lower().map(label_encoding)

In [51]:
df.head(10)

Unnamed: 0,subject_id,prescription_start,prescription_drug,prescription_dose_val_rx,prescription_dose_unit_rx,pre_charttime,pre_ph,pre_pco2,pre_po2,pre_bicarbonate,...,post_fio2_chartevents,post_aado2_calc,post_pao2fio2,post_temperature,post_fio2,post_aado2,post_carboxyhemoglobin,post_methemoglobin,post_calcium,post_intubated
0,10013,2125-10-05T00:00:00,Phenylephrine HCl,0.010317,3,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10013,2125-10-05T00:00:00,Vasopressin,0.011905,4,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10013,2125-10-05T00:00:00,Iso-Osmotic Dextrose,0.009921,2,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10013,2125-10-05T00:00:00,Dobutamine,0.017857,3,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10013,2125-10-05T00:00:00,Aspirin,0.020833,3,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,10013,2125-10-05T00:00:00,Magnesium Sulfate,0.008016,8,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,10013,2125-10-05T00:00:00,Morphine Sulfate,0.011905,3,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,10013,2125-10-05T00:00:00,Pantoprazole Sodium,0.009524,3,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,10013,2125-10-05T00:00:00,Furosemide,0.00873,3,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,10013,2125-10-05T00:00:00,Azithromycin,0.017857,3,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [52]:
#BERT FOR VECTOR EMBEDDINGS!
import tensorflow_hub as hub
import tensorflow_text as text

preprocess_url = "https://kaggle.com/models/tensorflow/bert/TensorFlow2/en-uncased-preprocess/3"
encoder_url = "https://www.kaggle.com/models/tensorflow/bert/TensorFlow2/bert-en-uncased-l-10-h-128-a-2/2"

#BERT Preprocessor
bert_preprocess_model = hub.KerasLayer(preprocess_url)
# Encoder layer
bert_encoder = hub.KerasLayer(encoder_url, trainable=True)

In [53]:
from sklearn.preprocessing import StandardScaler
import umap

# Iterate through the prescription_drug col
rx_vector_embeddings = []
for individual_rx in df['prescription_drug']:
    # print(type(individual_rx))
    preprocessed_rx_text = bert_preprocess_model([str(individual_rx)])

    #Acts like a function, pass preprocessed input as a param
    bert_results = bert_encoder(preprocessed_rx_text)

    #Reverse the process above --> embeddings are as a list of tensors
    #get last encoder layer's tensor --> most abstract
    encoder_output = bert_results["encoder_outputs"][-1]

    # print(f"Shape of raw encoder output for {individual_rx}: {encoder_output.shape}")

    # Convert the TensorFlow tensor to a NumPy array
    encoder_output_np = encoder_output.numpy()

    # Flatten the NumPy array
    encoder_output_flat = encoder_output_np.flatten()

    # print(f"Shape of flattened encoder output for {individual_rx}: {encoder_output_flat.shape}")
    rx_vector_embeddings.append(encoder_output_flat)

# Convert the list of flattened embeddings to a NumPy array
rx_vector_embeddings_array = np.array(rx_vector_embeddings)

print(f"Shape of rx_vector_embeddings_array before UMAP: {rx_vector_embeddings_array.shape}")

# Apply UMAP
n_components = 128  # Or whatever dimension you want
reducer = umap.UMAP(n_components=n_components, random_state=42) # add random state for reproducibility
reduced_embeddings_umap = reducer.fit_transform(rx_vector_embeddings_array)

print(f"Shape of reduced embeddings after UMAP: {reduced_embeddings_umap.shape}")

# Normalize/standardize (optional but recommended)
scaler = StandardScaler()
normalized_reduced_embeddings_umap = scaler.fit_transform(reduced_embeddings_umap)

print(f"Shape of normalized reduced embeddings: {normalized_reduced_embeddings_umap.shape}")

Shape of rx_vector_embeddings_array before UMAP: (2182, 16384)


  warn(


Shape of reduced embeddings after UMAP: (2182, 128)
Shape of normalized reduced embeddings: (2182, 128)


In [54]:
df.insert(df.columns.get_loc("prescription_drug") + 1, "prescription_rx_embeddings", list(normalized_reduced_embeddings_umap))


In [55]:
df

Unnamed: 0,subject_id,prescription_start,prescription_drug,prescription_rx_embeddings,prescription_dose_val_rx,prescription_dose_unit_rx,pre_charttime,pre_ph,pre_pco2,pre_po2,...,post_fio2_chartevents,post_aado2_calc,post_pao2fio2,post_temperature,post_fio2,post_aado2,post_carboxyhemoglobin,post_methemoglobin,post_calcium,post_intubated
0,10013,2125-10-05T00:00:00,Phenylephrine HCl,"[0.35185558, 0.12351961, -0.12304356, -0.33200...",0.010317,3,2125-10-04T23:59:00,0.500,0.741935,0.023018,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10013,2125-10-05T00:00:00,Vasopressin,"[0.45182744, 0.3218944, -0.5210766, 0.3155888,...",0.011905,4,2125-10-04T23:59:00,0.500,0.741935,0.023018,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10013,2125-10-05T00:00:00,Iso-Osmotic Dextrose,"[0.45976555, 0.19232908, -0.5738213, 1.3497484...",0.009921,2,2125-10-04T23:59:00,0.500,0.741935,0.023018,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10013,2125-10-05T00:00:00,Dobutamine,"[0.62663776, 0.26167056, -0.024068436, 0.16359...",0.017857,3,2125-10-04T23:59:00,0.500,0.741935,0.023018,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10013,2125-10-05T00:00:00,Aspirin,"[0.41542324, -0.12879308, -0.20188333, 0.05329...",0.020833,3,2125-10-04T23:59:00,0.500,0.741935,0.023018,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2177,44212,2123-11-25T00:00:00,Vancomycin,"[0.35067722, 0.1657952, -0.119397685, 0.529637...",0.027778,3,2123-11-24T21:56:00,0.375,0.500000,0.063939,...,0.933333,0.664914,0.558473,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2178,44212,2123-11-25T00:00:00,0.9% Sodium Chloride,"[-1.6720766, 0.06268573, -0.3970921, -3.601229...",0.011905,2,2123-11-24T21:56:00,0.375,0.500000,0.063939,...,0.933333,0.664914,0.558473,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2179,44212,2123-11-25T00:00:00,Piperacillin-Tazobactam Na,"[0.34622693, 0.13159652, -0.39089522, 0.683583...",0.008026,8,2123-11-24T21:56:00,0.375,0.500000,0.063939,...,0.933333,0.664914,0.558473,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2180,44212,2123-11-25T00:00:00,Prismasate (B22 K4),"[0.44693142, 0.00798898, -0.14514387, 1.489820...",0.206349,2,2123-11-24T21:56:00,0.375,0.500000,0.063939,...,0.933333,0.664914,0.558473,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
df = df.drop('prescription_drug', axis=1)

In [57]:
df

Unnamed: 0,subject_id,prescription_start,prescription_rx_embeddings,prescription_dose_val_rx,prescription_dose_unit_rx,pre_charttime,pre_ph,pre_pco2,pre_po2,pre_bicarbonate,...,post_fio2_chartevents,post_aado2_calc,post_pao2fio2,post_temperature,post_fio2,post_aado2,post_carboxyhemoglobin,post_methemoglobin,post_calcium,post_intubated
0,10013,2125-10-05T00:00:00,"[0.35185558, 0.12351961, -0.12304356, -0.33200...",0.010317,3,2125-10-04T23:59:00,0.500,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10013,2125-10-05T00:00:00,"[0.45182744, 0.3218944, -0.5210766, 0.3155888,...",0.011905,4,2125-10-04T23:59:00,0.500,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10013,2125-10-05T00:00:00,"[0.45976555, 0.19232908, -0.5738213, 1.3497484...",0.009921,2,2125-10-04T23:59:00,0.500,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10013,2125-10-05T00:00:00,"[0.62663776, 0.26167056, -0.024068436, 0.16359...",0.017857,3,2125-10-04T23:59:00,0.500,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10013,2125-10-05T00:00:00,"[0.41542324, -0.12879308, -0.20188333, 0.05329...",0.020833,3,2125-10-04T23:59:00,0.500,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2177,44212,2123-11-25T00:00:00,"[0.35067722, 0.1657952, -0.119397685, 0.529637...",0.027778,3,2123-11-24T21:56:00,0.375,0.500000,0.063939,0.0,...,0.933333,0.664914,0.558473,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2178,44212,2123-11-25T00:00:00,"[-1.6720766, 0.06268573, -0.3970921, -3.601229...",0.011905,2,2123-11-24T21:56:00,0.375,0.500000,0.063939,0.0,...,0.933333,0.664914,0.558473,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2179,44212,2123-11-25T00:00:00,"[0.34622693, 0.13159652, -0.39089522, 0.683583...",0.008026,8,2123-11-24T21:56:00,0.375,0.500000,0.063939,0.0,...,0.933333,0.664914,0.558473,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2180,44212,2123-11-25T00:00:00,"[0.44693142, 0.00798898, -0.14514387, 1.489820...",0.206349,2,2123-11-24T21:56:00,0.375,0.500000,0.063939,0.0,...,0.933333,0.664914,0.558473,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [58]:
df.head()

Unnamed: 0,subject_id,prescription_start,prescription_rx_embeddings,prescription_dose_val_rx,prescription_dose_unit_rx,pre_charttime,pre_ph,pre_pco2,pre_po2,pre_bicarbonate,...,post_fio2_chartevents,post_aado2_calc,post_pao2fio2,post_temperature,post_fio2,post_aado2,post_carboxyhemoglobin,post_methemoglobin,post_calcium,post_intubated
0,10013,2125-10-05T00:00:00,"[0.35185558, 0.12351961, -0.12304356, -0.33200...",0.010317,3,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10013,2125-10-05T00:00:00,"[0.45182744, 0.3218944, -0.5210766, 0.3155888,...",0.011905,4,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10013,2125-10-05T00:00:00,"[0.45976555, 0.19232908, -0.5738213, 1.3497484...",0.009921,2,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10013,2125-10-05T00:00:00,"[0.62663776, 0.26167056, -0.024068436, 0.16359...",0.017857,3,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10013,2125-10-05T00:00:00,"[0.41542324, -0.12879308, -0.20188333, 0.05329...",0.020833,3,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [59]:
print(df['prescription_rx_embeddings'].dtype)

object


In [60]:
#Converting all prescription vectors to NumPy arrays

import re

def clean_and_convert(x):
    if isinstance(x, np.ndarray):  # If already a NumPy array, return as-is
        return x
    if isinstance(x, str):  # Only process strings
        try:
            x = re.sub(r'[\[\]]', '', x)  # Remove square brackets
            cleaned = re.sub(r'\s+', ' ', x.strip())  # Remove extra spaces
            return np.array([float(i) for i in cleaned.split(' ')])  # Convert to NumPy array
        except Exception as e:
            print(f"Error processing value: {x}. Error: {e}")
            return x  # Return original value in case of error
    return x  # If NaN or unexpected type, return as-is

# Apply the function
df['prescription_rx_embeddings'] = df['prescription_rx_embeddings'].apply(clean_and_convert)

# Verify the type of the first few elements
print(type(df['prescription_rx_embeddings'].iloc[0]))


<class 'numpy.ndarray'>


In [61]:
df.head()

Unnamed: 0,subject_id,prescription_start,prescription_rx_embeddings,prescription_dose_val_rx,prescription_dose_unit_rx,pre_charttime,pre_ph,pre_pco2,pre_po2,pre_bicarbonate,...,post_fio2_chartevents,post_aado2_calc,post_pao2fio2,post_temperature,post_fio2,post_aado2,post_carboxyhemoglobin,post_methemoglobin,post_calcium,post_intubated
0,10013,2125-10-05T00:00:00,"[0.35185558, 0.12351961, -0.12304356, -0.33200...",0.010317,3,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10013,2125-10-05T00:00:00,"[0.45182744, 0.3218944, -0.5210766, 0.3155888,...",0.011905,4,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10013,2125-10-05T00:00:00,"[0.45976555, 0.19232908, -0.5738213, 1.3497484...",0.009921,2,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10013,2125-10-05T00:00:00,"[0.62663776, 0.26167056, -0.024068436, 0.16359...",0.017857,3,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10013,2125-10-05T00:00:00,"[0.41542324, -0.12879308, -0.20188333, 0.05329...",0.020833,3,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [62]:
df.to_csv("final_df.csv", index=False)