In [1]:
import boto3
import pandas as pd
import numpy as np
from io import StringIO

# Define your bucket name and file key (file path in S3)
BUCKET_NAME = "blue-blood-data"
FILE_KEY = "BigQuery Blue Blood DB Data.csv"  # Change to your actual file path in S3

# Create an S3 client
s3 = boto3.client("s3")

# Fetch the file from S3
response = s3.get_object(Bucket=BUCKET_NAME, Key=FILE_KEY)

# Read the CSV file into a pandas DataFrame
csv_content = response["Body"].read().decode("utf-8")
df = pd.read_csv(StringIO(csv_content))

# Print DataFrame
df.head()

Unnamed: 0,subject_id,prescription_start,prescription_drug,prescription_dose_val_rx,prescription_dose_unit_rx,pre_charttime,pre_ph,pre_pco2,pre_po2,pre_bicarbonate,...,post_fio2_chartevents,post_aado2_calc,post_pao2fio2,post_temperature,post_fio2,post_aado2,post_carboxyhemoglobin,post_methemoglobin,post_calcium,post_intubated
0,10013,2125-10-05T00:00:00,Phenylephrine HCl,60,mg,2125-10-04T23:59:00,7.3,63.0,60.0,,...,60.000002,231.550017,191.666659,,,,,,,
1,10013,2125-10-05T00:00:00,Vasopressin,100,UNIT,2125-10-04T23:59:00,7.3,63.0,60.0,,...,60.000002,231.550017,191.666659,,,,,,,
2,10013,2125-10-05T00:00:00,Iso-Osmotic Dextrose,50,ml,2125-10-04T23:59:00,7.3,63.0,60.0,,...,60.000002,231.550017,191.666659,,,,,,,
3,10013,2125-10-05T00:00:00,Dobutamine,250,mg,2125-10-04T23:59:00,7.3,63.0,60.0,,...,60.000002,231.550017,191.666659,,,,,,,
4,10013,2125-10-05T00:00:00,Aspirin,325,mg,2125-10-04T23:59:00,7.3,63.0,60.0,,...,60.000002,231.550017,191.666659,,,,,,,


In [2]:
# convert prescription_dose_val_rx to numeric
print(df['prescription_dose_val_rx'].dtype)
df['prescription_dose_val_rx'] = pd.to_numeric(df['prescription_dose_val_rx'], errors='coerce')
print(df['prescription_dose_val_rx'].dtype)

object
float64


In [3]:
# get numeric columns
numeric_cols = df.select_dtypes(include=[np.number]).columns
numeric_cols = numeric_cols[1:]  # exclude the first column which is the patient_id

# fill null values in numeric columns with -200
df[numeric_cols] = df[numeric_cols].fillna(-200)
df[numeric_cols].head()

Unnamed: 0,prescription_dose_val_rx,pre_ph,pre_pco2,pre_po2,pre_bicarbonate,pre_baseexcess,pre_totalco2,pre_hematocrit,pre_hemoglobin,pre_sodium,...,post_fio2_chartevents,post_aado2_calc,post_pao2fio2,post_temperature,post_fio2,post_aado2,post_carboxyhemoglobin,post_methemoglobin,post_calcium,post_intubated
0,60.0,7.3,63.0,60.0,-200.0,2.0,32.0,-200.0,-200.0,-200.0,...,60.000002,231.550017,191.666659,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0
1,100.0,7.3,63.0,60.0,-200.0,2.0,32.0,-200.0,-200.0,-200.0,...,60.000002,231.550017,191.666659,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0
2,50.0,7.3,63.0,60.0,-200.0,2.0,32.0,-200.0,-200.0,-200.0,...,60.000002,231.550017,191.666659,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0
3,250.0,7.3,63.0,60.0,-200.0,2.0,32.0,-200.0,-200.0,-200.0,...,60.000002,231.550017,191.666659,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0
4,325.0,7.3,63.0,60.0,-200.0,2.0,32.0,-200.0,-200.0,-200.0,...,60.000002,231.550017,191.666659,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0


In [4]:
filtered_df = df[numeric_cols]

filtered_df.head()

Unnamed: 0,prescription_dose_val_rx,pre_ph,pre_pco2,pre_po2,pre_bicarbonate,pre_baseexcess,pre_totalco2,pre_hematocrit,pre_hemoglobin,pre_sodium,...,post_fio2_chartevents,post_aado2_calc,post_pao2fio2,post_temperature,post_fio2,post_aado2,post_carboxyhemoglobin,post_methemoglobin,post_calcium,post_intubated
0,60.0,7.3,63.0,60.0,-200.0,2.0,32.0,-200.0,-200.0,-200.0,...,60.000002,231.550017,191.666659,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0
1,100.0,7.3,63.0,60.0,-200.0,2.0,32.0,-200.0,-200.0,-200.0,...,60.000002,231.550017,191.666659,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0
2,50.0,7.3,63.0,60.0,-200.0,2.0,32.0,-200.0,-200.0,-200.0,...,60.000002,231.550017,191.666659,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0
3,250.0,7.3,63.0,60.0,-200.0,2.0,32.0,-200.0,-200.0,-200.0,...,60.000002,231.550017,191.666659,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0
4,325.0,7.3,63.0,60.0,-200.0,2.0,32.0,-200.0,-200.0,-200.0,...,60.000002,231.550017,191.666659,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0


In [5]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

ndArray = scaler.fit_transform(filtered_df)

df_scaled = pd.DataFrame(ndArray, columns = filtered_df.columns)

df[numeric_cols] = df_scaled

df.head()

Unnamed: 0,subject_id,prescription_start,prescription_drug,prescription_dose_val_rx,prescription_dose_unit_rx,pre_charttime,pre_ph,pre_pco2,pre_po2,pre_bicarbonate,...,post_fio2_chartevents,post_aado2_calc,post_pao2fio2,post_temperature,post_fio2,post_aado2,post_carboxyhemoglobin,post_methemoglobin,post_calcium,post_intubated
0,10013,2125-10-05T00:00:00,Phenylephrine HCl,0.010317,mg,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10013,2125-10-05T00:00:00,Vasopressin,0.011905,UNIT,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10013,2125-10-05T00:00:00,Iso-Osmotic Dextrose,0.009921,ml,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10013,2125-10-05T00:00:00,Dobutamine,0.017857,mg,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10013,2125-10-05T00:00:00,Aspirin,0.020833,mg,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# standardize units by converting to lowercase
df['prescription_dose_unit_rx'] = df['prescription_dose_unit_rx'].str.lower()

# Standardize some redundant units
df['prescription_dose_unit_rx'] = df['prescription_dose_unit_rx'].replace({
    'g': 'gm',
    'mcg/h': 'mcg',
    'mcg/hr': 'mcg',
    'mcg/hour': 'mcg',
    'puff': 'unit',
    'syr': 'unit',
    'puff': 'unit',
    'pkt': 'unit',
    'tab': 'unit',
    'vial': 'unit',
    'bag': 'unit',
    'drop': 'unit',
    'inh': 'unit',
    'cap': 'unit',
})

units_set = set(df['prescription_dose_unit_rx'])

print(len(units_set))
print(units_set)

#Mapping for Ordinal Encoding
label_encoding = {}

mapping = 0

for elem in units_set:
    label_encoding[elem] = mapping
    mapping += 1

print(label_encoding)

15
{'mg', 'unit', 'loz', 'in', 'meq', 'neb', 'mmol', 'ptch', 'gm', 'enema', 'appl', 'mcg', 'troc', 'amp', 'ml'}
{'mg': 0, 'unit': 1, 'loz': 2, 'in': 3, 'meq': 4, 'neb': 5, 'mmol': 6, 'ptch': 7, 'gm': 8, 'enema': 9, 'appl': 10, 'mcg': 11, 'troc': 12, 'amp': 13, 'ml': 14}


In [7]:
df['prescription_dose_unit_rx'] = df['prescription_dose_unit_rx'].str.lower().map(label_encoding)

In [8]:
df.head()

Unnamed: 0,subject_id,prescription_start,prescription_drug,prescription_dose_val_rx,prescription_dose_unit_rx,pre_charttime,pre_ph,pre_pco2,pre_po2,pre_bicarbonate,...,post_fio2_chartevents,post_aado2_calc,post_pao2fio2,post_temperature,post_fio2,post_aado2,post_carboxyhemoglobin,post_methemoglobin,post_calcium,post_intubated
0,10013,2125-10-05T00:00:00,Phenylephrine HCl,0.010317,0,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10013,2125-10-05T00:00:00,Vasopressin,0.011905,1,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10013,2125-10-05T00:00:00,Iso-Osmotic Dextrose,0.009921,14,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10013,2125-10-05T00:00:00,Dobutamine,0.017857,0,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10013,2125-10-05T00:00:00,Aspirin,0.020833,0,2125-10-04T23:59:00,0.5,0.741935,0.023018,0.0,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
import tensorflow_hub as hub
import tensorflow_text as text

preprocess_url = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
encoder_url = "https://www.kaggle.com/models/tensorflow/bert/TensorFlow2/bert-en-uncased-l-12-h-128-a-2/2"

# initialize the bert model for preprocessing
bert_preprocess_model = hub.KerasLayer(preprocess_url)

# initialize the bert model for encoding
bert_model = hub.KerasLayer(encoder_url)

In [10]:
# preprocess prescription_drug column using bert
presc_list = df['prescription_drug'].tolist()
text_preprocessed = bert_preprocess_model(presc_list)
text_preprocessed.keys()

dict_keys(['input_type_ids', 'input_mask', 'input_word_ids'])

In [11]:
text_preprocessed['input_mask']

<tf.Tensor: shape=(2182, 128), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>

In [12]:
# encode the preprocessed text
bert_results = bert_model(text_preprocessed)
bert_results.keys()

dict_keys(['pooled_output', 'encoder_outputs', 'default', 'sequence_output'])

In [13]:
bert_results['encoder_outputs']

[<tf.Tensor: shape=(2182, 128, 128), dtype=float32, numpy=
 array([[[-0.03468557,  0.22849199, -0.29577625, ..., -0.26350754,
          -0.4811728 ,  1.0460556 ],
         [ 1.0766841 , -0.07381418, -1.2135781 , ..., -0.9146468 ,
          -1.0113329 ,  0.9478685 ],
         [ 1.5926559 ,  0.6435773 , -0.6780475 , ..., -0.5222764 ,
          -0.485065  ,  1.3186146 ],
         ...,
         [-0.7937397 ,  0.4682079 , -1.2585518 , ..., -0.792781  ,
          -0.33005115,  0.32805738],
         [-0.6651609 ,  0.21233012, -1.1617501 , ..., -1.103208  ,
          -0.0560322 ,  0.3251611 ],
         [-0.7515336 ,  0.09815576, -1.0182279 , ..., -1.1122723 ,
          -0.11814952,  0.3653853 ]],
 
        [[ 0.09850979, -0.17477775, -0.21922813, ..., -0.208524  ,
          -0.18891326,  1.0060207 ],
         [ 1.1623632 , -0.12712122, -1.8168228 , ..., -0.4215275 ,
           0.8916415 ,  0.88549805],
         [-0.1249817 ,  0.56517255, -1.0744267 , ..., -2.5206535 ,
           0.01011104,  1

In [14]:
import torch
import torch.nn as nn

# Create an AdaptiveAvgPool2d layer to pool to (height=1, width=124)
adaptive_pool = nn.AdaptiveAvgPool2d((1, 124))

# Convert encoder output to a pytorch tensor
encoder_output = torch.tensor(bert_results['encoder_outputs'][-1].numpy())

x = encoder_output.unsqueeze(1) 

# Apply adaptive pooling
pooled_output = adaptive_pool(x)

# Squeeze the extra dimension 
pooled_output = pooled_output.squeeze(2)

print("Pooled Output Shape:", pooled_output.shape)



Pooled Output Shape: torch.Size([2182, 1, 124])


In [15]:
final_output = pooled_output.squeeze(1)  # Now shape is (2182, 124)

# Convert the tensor to a NumPy array
vectors_np = final_output.cpu().detach().numpy()

# insert the embeddings into the df
df.insert(df.columns.get_loc('prescription_drug') + 1, 'presc_embedding', [vector for vector in vectors_np])

df.head()

Unnamed: 0,subject_id,prescription_start,prescription_drug,presc_embedding,prescription_dose_val_rx,prescription_dose_unit_rx,pre_charttime,pre_ph,pre_pco2,pre_po2,...,post_fio2_chartevents,post_aado2_calc,post_pao2fio2,post_temperature,post_fio2,post_aado2,post_carboxyhemoglobin,post_methemoglobin,post_calcium,post_intubated
0,10013,2125-10-05T00:00:00,Phenylephrine HCl,"[0.89859825, 0.5929598, -0.103197895, -0.27054...",0.010317,0,2125-10-04T23:59:00,0.5,0.741935,0.023018,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10013,2125-10-05T00:00:00,Vasopressin,"[0.17285958, 0.29352757, -0.00074351765, -0.17...",0.011905,1,2125-10-04T23:59:00,0.5,0.741935,0.023018,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10013,2125-10-05T00:00:00,Iso-Osmotic Dextrose,"[0.95027816, 0.3103889, -0.38511172, -0.291505...",0.009921,14,2125-10-04T23:59:00,0.5,0.741935,0.023018,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10013,2125-10-05T00:00:00,Dobutamine,"[0.27488533, 0.40669948, 0.12023548, -0.101919...",0.017857,0,2125-10-04T23:59:00,0.5,0.741935,0.023018,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10013,2125-10-05T00:00:00,Aspirin,"[0.20339411, 0.47839478, 0.05848709, -0.107378...",0.020833,0,2125-10-04T23:59:00,0.5,0.741935,0.023018,...,0.866667,0.527567,0.548553,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
df.drop('prescription_drug', axis=1, inplace=True)

In [18]:
df.columns

Index(['subject_id', 'prescription_start', 'presc_embedding',
       'prescription_dose_val_rx', 'prescription_dose_unit_rx',
       'pre_charttime', 'pre_ph', 'pre_pco2', 'pre_po2', 'pre_bicarbonate',
       'pre_baseexcess', 'pre_totalco2', 'pre_hematocrit', 'pre_hemoglobin',
       'pre_sodium', 'pre_potassium', 'pre_chloride', 'pre_glucose',
       'pre_lactate', 'pre_so2', 'pre_spo2', 'pre_fio2_chartevents',
       'pre_aado2_calc', 'pre_pao2fio2', 'pre_temperature', 'pre_fio2',
       'pre_aado2', 'pre_carboxyhemoglobin', 'pre_methemoglobin',
       'pre_calcium', 'pre_intubated', 'post_charttime', 'post_ph',
       'post_pco2', 'post_po2', 'post_bicarbonate', 'post_baseexcess',
       'post_totalco2', 'post_hematocrit', 'post_hemoglobin', 'post_sodium',
       'post_potassium', 'post_chloride', 'post_glucose', 'post_lactate',
       'post_so2', 'post_spo2', 'post_fio2_chartevents', 'post_aado2_calc',
       'post_pao2fio2', 'post_temperature', 'post_fio2', 'post_aado2',
       '

In [19]:
df.to_csv('processed_data.csv', index=False)