In [1]:
import pandas as pd
import numpy as np
import glob
import os
import scipy.stats as ss
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from __future__ import print_function
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.metrics.cluster import homogeneity_score
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.cm as cm
%matplotlib inline

import pandas as pd
# Load the cleaned dataset
df_filled = pd.read_csv('/content/drive/MyDrive/smaller_sepsis_data.csv')

# Create a new dataframe with unique patient IDs and their sepsis labels
df_one = df_filled.groupby('Identifier')['SepsisLabel'].max().reset_index()

# Rename the columns for clarity
df_one.columns = ['Identifier', 'SepsisLabel']

import pandas as pd

def create_narrative_paragraph(patient_id, patient_data):
    """
    Generate a narrative paragraph for a specific patient.
    """
    narrative_segments = []
    for _, row in patient_data.iterrows():
        # Extract required fields from the row
        hour = row['Hour']
        hr = row['HR']
        o2sat = row['O2Sat']
        sbp = row['SBP']
        map_value = row['MAP']
        resp = row['Resp']

        # Create a narrative sentence for the row
        segment = (
            f"At hour {hour}, the patient's vitals were as follows: "
            f"Heart Rate (HR) was {hr}, Oxygen Saturation (O2Sat) was {o2sat}, "
            f"Systolic Blood Pressure (SBP) was {sbp}, Mean Arterial Pressure (MAP) was {map_value}, "
            f"and Respiratory Rate (Resp) was {resp}."
        )
        narrative_segments.append(segment)

    # Combine all segments into a single paragraph
    narrative_paragraph = (
        f"Patient {patient_id} has the following recorded data. " +
        " ".join(narrative_segments)
    )

    return narrative_paragraph

# Generate df_two
def generate_df_two(df_filled):
    """
    Generate df_two containing narrative_paragraph and PatientID for each patient.
    """
    df_two_data = []

    # Group the dataframe by 'Identifier' (PatientID)
    grouped = df_filled.groupby('Identifier')

    for patient_id, patient_data in grouped:
        # Create a narrative paragraph for each patient
        narrative_paragraph = create_narrative_paragraph(patient_id, patient_data)

        # Append the result to the list
        df_two_data.append({'Identifier': patient_id, 'Prompts': narrative_paragraph})

    # Convert the list of dictionaries to a dataframe
    df_two = pd.DataFrame(df_two_data)

    return df_two

# Use the function to create df_two
df_two = generate_df_two(df_filled)

# Merge the two dataframes on the 'Identifier' column
df_combined = pd.merge(df_one, df_two, left_on='Identifier', right_on='Identifier', how='inner')

# Convert the dataset into a list of prompts
# Convert 'Prompts' column to a list
prompts = df_combined['Prompts'].tolist()
# Convert 'SepsisLabel' column to a list
labels = df_combined['SepsisLabel'].tolist()


In [2]:
from transformers import TFAutoModelForCausalLM, AutoTokenizer

# Load the GPT-2 tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token  # Add padding token
tokenizer.padding_side = "right"

# Load the GPT-2 model for TensorFlow
model = TFAutoModelForCausalLM.from_pretrained("distilgpt2")

# Set the model's padding token ID
model.config.pad_token_id = tokenizer.pad_token_id

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [5]:
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Tokenize the data
tokenized_inputs = tokenizer(prompts, truncation=True, padding=True, max_length=512, return_tensors="tf")
input_ids = tokenized_inputs["input_ids"]
attention_mask = tokenized_inputs["attention_mask"]

# Convert TensorFlow tensors to NumPy arrays
input_ids_np = input_ids.numpy()
attention_mask_np = attention_mask.numpy()
labels_np = tf.convert_to_tensor(labels).numpy()

# Use train_test_split with NumPy arrays
X_train, X_val, y_train, y_val = train_test_split(input_ids_np, labels_np, test_size=0.2, random_state=42)

# Convert back to TensorFlow tensors for further processing
X_train = tf.convert_to_tensor(X_train)
X_val = tf.convert_to_tensor(X_val)
y_train = tf.convert_to_tensor(y_train)
y_val = tf.convert_to_tensor(y_val)

train_dataset = tf.data.Dataset.from_tensor_slices((
    {"input_ids": X_train, "attention_mask": attention_mask[:len(y_train)], "labels": X_train},
    y_train
)).batch(4)

val_dataset = tf.data.Dataset.from_tensor_slices((
    {"input_ids": X_val, "attention_mask": attention_mask[len(y_train):], "labels": X_val},
    y_val
)).batch(4)


In [6]:
# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer)

# Fine-tune the model
model.fit(train_dataset, validation_data=val_dataset, epochs=1)



<tf_keras.src.callbacks.History at 0x7e6b596fe740>

In [7]:
import pandas as pd

def create_narrative_for_single_patient(csv_file_path):
    """
    Create a narrative prompt for a single patient from a CSV file.

    Parameters:
        csv_file_path (str): Path to the CSV file containing the patient's data.

    Returns:
        str: Narrative paragraph describing the patient's data.
    """
    # Load the patient's data
    patient_data = pd.read_csv(csv_file_path)

    # Extract the patient identifier (assuming there's an 'Identifier' column)
    patient_id = patient_data['Identifier'].iloc[0]

    # Initialize a list to hold the narrative segments
    narrative_segments = []

    # Loop through the rows and create narrative sentences
    for _, row in patient_data.iterrows():
        hour = row['Hour']
        hr = row['HR']
        o2sat = row['O2Sat']
        sbp = row['SBP']
        map_value = row['MAP']
        resp = row['Resp']

        # Create a narrative sentence for the row
        segment = (
            f"At hour {hour}, the patient's vitals were as follows: "
            f"Heart Rate (HR) was {hr}, Oxygen Saturation (O2Sat) was {o2sat}, "
            f"Systolic Blood Pressure (SBP) was {sbp}, Mean Arterial Pressure (MAP) was {map_value}, "
            f"and Respiratory Rate (Resp) was {resp}."
        )
        narrative_segments.append(segment)

    # Combine all segments into a single paragraph
    narrative_paragraph = (
        f"Patient {patient_id} has the following recorded data. " +
        " ".join(narrative_segments)
    )

    return narrative_paragraph



In [12]:
new_pt= r'/content/sample_data/OnePatient.csv'

In [13]:
new_prompt = create_narrative_for_single_patient(new_pt)

In [14]:
# Tokenize the new input
encoded_input = tokenizer(new_prompt, truncation=True, padding=True, max_length=512, return_tensors="tf")
input_ids = encoded_input["input_ids"]
attention_mask = encoded_input["attention_mask"]


In [15]:
# Example improved function with refined prompt
def generate_zero_shot_recommendation(promptt, model, tokenizer):

    # Tokenize and generate
    input_data = tokenizer(promptt, truncation=True, padding=True, max_length=512, return_tensors="tf")
    outputs = model.generate(
        input_ids=input_data["input_ids"],
        attention_mask=input_data["attention_mask"],
        max_new_tokens=100,
        temperature=0.7,  # Slightly more random
        top_k=50,
        top_p=0.95  # Broadens the sampling scope
    )

    # Decode and validate output
    recommendation = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return recommendation

In [16]:
# Test the updated function
recommendation = generate_zero_shot_recommendation(new_prompt, model, tokenizer)
print("Generated Recommendation:\n", recommendation)



Generated Recommendation:
 Patient 8978 has the following recorded data. At hour 72.0, the patient's vitals were as follows: Heart Rate (HR) was 83.0, Oxygen Saturation (O2Sat) was 98.0, Systolic Blood Pressure (SBP) was 121.0, Mean Arterial Pressure (MAP) was 76.0, and Respiratory Rate (Resp) was 20.0. At hour 73.0, the patient's vitals were as follows: Heart Rate (HR) was 85.0, Oxygen Saturation (O2Sat) was 96.0, Systolic Blood Pressure (SBP) was 120.0, Mean Arterial Pressure (MAP) was 79.0, and Respiratory Rate (Resp) was 18.0. At hour 74.0, the patient's vitals were as follows: Heart Rate (HR) was 95.0, Oxygen Saturation (O2Sat) was 97.0, Systolic Blood Pressure (SBP) was 114.0, Mean Arterial Pressure (MAP) was 75.0, and Respiratory Rate (Resp) was 18.0. At hour 75.0, the patient's vitals were as follows: Heart Rate (HR) was 101.0, Oxygen Saturation (O2Sat) was 97.0, Systolic Blood Pressure (SBP) was 113.0, Mean Arterial Pressure (MAP) was 78.0, and Respiratory Rate (Resp) was 18.0