Step 1: Import libraries

In [1]:
import pandas as pd
import numpy as np
import glob
import os
import scipy.stats as ss
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, AutoModelForSequenceClassification
%matplotlib inline

Step 2: Import Data and Create a unique ID and sepsis label table

In [2]:
# Load the cleaned dataset
df_filled = pd.read_csv('/content/drive/MyDrive/smaller_sepsis_data.csv')

# Create a new dataframe with unique patient IDs and their sepsis labels
df_one = df_filled.groupby('Identifier')['SepsisLabel'].max().reset_index()

# Rename the columns for clarity
df_one.columns = ['Identifier', 'SepsisLabel']

Step 3: Create prompts for each patient

In [3]:
def create_narrative_paragraph(patient_id, patient_data):
    """
    Generate a narrative paragraph for a specific patient.
    """
    narrative_segments = []
    for _, row in patient_data.iterrows():
        # Extract required fields from the row
        hour = row['Hour']
        hr = row['HR']
        o2sat = row['O2Sat']
        sbp = row['SBP']
        map_value = row['MAP']
        resp = row['Resp']

        # Create a narrative sentence for the row
        segment = (
            f"At hour {hour}, the patient's vitals were as follows: "
            f"Heart Rate (HR) was {hr}, Oxygen Saturation (O2Sat) was {o2sat}, "
            f"Systolic Blood Pressure (SBP) was {sbp}, Mean Arterial Pressure (MAP) was {map_value}, "
            f"and Respiratory Rate (Resp) was {resp}."
        )
        narrative_segments.append(segment)

    # Combine all segments into a single paragraph
    narrative_paragraph = (
        f"Patient {patient_id} has the following recorded data. " +
        " ".join(narrative_segments)
    )

    return narrative_paragraph

def generate_df_two(df_filled):
    """
    Generate df_two containing narrative_paragraph and PatientID for each patient.
    """
    df_two_data = []

    # Group the dataframe by 'Identifier' (PatientID)
    grouped = df_filled.groupby('Identifier')

    for patient_id, patient_data in grouped:
        # Create a narrative paragraph for each patient
        narrative_paragraph = create_narrative_paragraph(patient_id, patient_data)

        # Append the result to the list
        df_two_data.append({'Identifier': patient_id, 'Prompts': narrative_paragraph})

    # Convert the list of dictionaries to a dataframe
    df_two = pd.DataFrame(df_two_data)

    return df_two

# Use the function to create df_two
df_two = generate_df_two(df_filled)

# Merge the two dataframes on the 'Identifier' column
df_combined = pd.merge(df_one, df_two, left_on='Identifier', right_on='Identifier', how='inner')

# Add dynamic prompts for recommendation generation
df_combined['FullPrompt'] = df_combined.apply(
    lambda row: f"{row['Prompts']} Classification: {'Sepsis' if row['SepsisLabel'] == 1 else 'No Sepsis'}. Recommendation:",
    axis=1
)

In [6]:
df_combined.head()

Unnamed: 0,Identifier,SepsisLabel,Prompts,FullPrompt
0,2.0,0,Patient 2.0 has the following recorded data. A...,Patient 2.0 has the following recorded data. A...
1,5.0,0,Patient 5.0 has the following recorded data. A...,Patient 5.0 has the following recorded data. A...
2,14.0,0,Patient 14.0 has the following recorded data. ...,Patient 14.0 has the following recorded data. ...
3,24.0,0,Patient 24.0 has the following recorded data. ...,Patient 24.0 has the following recorded data. ...
4,41.0,0,Patient 41.0 has the following recorded data. ...,Patient 41.0 has the following recorded data. ...


Step 4: Load Bert Tokenizer and Model for Classfication

In [7]:
# Load tokenizer and model for classification
classification_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
classification_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step 5: Create a dataset Class that handles encoded prompts and labels

In [8]:
import torch
from torch.utils.data import Dataset

# Define a custom dataset class
class SepsisDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

Step 6: Split data for fine-tuning classification model


In [9]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_combined['Prompts'].tolist(), df_combined['SepsisLabel'].tolist(), test_size=0.2, random_state=42
)

Step 7: Tokenize data and create the datasets

In [10]:
# Tokenize data properly
train_encodings = classification_tokenizer(train_texts, truncation=True, padding=True, max_length=512, return_tensors="pt")
val_encodings = classification_tokenizer(val_texts, truncation=True, padding=True, max_length=512, return_tensors="pt")

In [11]:
# Create datasets
train_dataset = SepsisDataset(train_encodings, train_labels)
val_dataset = SepsisDataset(val_encodings, val_labels)

Step 8: Create Training arguments and train model

In [12]:
# Fine-tune the classification model
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    fp16=True
)

trainer = Trainer(
    model=classification_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss
1,No log,0.380076
2,No log,0.379144


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


TrainOutput(global_step=402, training_loss=0.39477952795835275, metrics={'train_runtime': 251.606, 'train_samples_per_second': 25.46, 'train_steps_per_second': 1.598, 'total_flos': 848586155790336.0, 'train_loss': 0.39477952795835275, 'epoch': 2.0})

Step 9: Install transformers library from huggingface

In [23]:
!pip uninstall -y transformers accelerate bitsandbytes timm

Found existing installation: transformers 4.47.1
Uninstalling transformers-4.47.1:
  Successfully uninstalled transformers-4.47.1
Found existing installation: accelerate 1.2.1
Uninstalling accelerate-1.2.1:
  Successfully uninstalled accelerate-1.2.1
Found existing installation: bitsandbytes 0.45.0
Uninstalling bitsandbytes-0.45.0:
  Successfully uninstalled bitsandbytes-0.45.0
Found existing installation: timm 1.0.12
Uninstalling timm-1.0.12:
  Successfully uninstalled timm-1.0.12


In [25]:
!pip install transformers accelerate bitsandbytes timm

Collecting transformers
  Downloading transformers-4.48.0-py3-none-any.whl.metadata (44 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-1.2.1-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Using cached bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Collecting timm
  Downloading timm-1.0.13-py3-none-any.whl.metadata (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.3/53.3 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.48.0-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m77.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading accelerate-1.2.1-py3-none-any.whl (336 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━

Step 10: Load falcon-7b Model

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Set quantization config
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

# Load model and tokenizer
model_name = "tiiuae/falcon-7b"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quantization_config
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors.index.json:   0%|          | 0.00/17.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.48G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/117 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.73M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/281 [00:00<?, ?B/s]

Step 11: Process Prompts

In [3]:
def create_narrative_for_single_patient(csv_file_path):
    """
    Create a narrative prompt for a single patient from a CSV file.

    Parameters:
        csv_file_path (str): Path to the CSV file containing the patient's data.

    Returns:
        str: Narrative paragraph describing the patient's data.
    """
    # Load the patient's data
    patient_data = pd.read_csv(csv_file_path)

    # Extract the patient identifier (assuming there's an 'Identifier' column)
    patient_id = patient_data['Identifier'].iloc[0]

    # Initialize a list to hold the narrative segments
    narrative_segments = []

    # Loop through the rows and create narrative sentences
    for _, row in patient_data.iterrows():
        hour = row['Hour']
        hr = row['HR']
        o2sat = row['O2Sat']
        sbp = row['SBP']
        map_value = row['MAP']
        resp = row['Resp']

        # Create a narrative sentence for the row
        segment = (
            f"At hour {hour}, the patient's vitals were as follows: "
            f"Heart Rate (HR) was {hr}, Oxygen Saturation (O2Sat) was {o2sat}, "
            f"Systolic Blood Pressure (SBP) was {sbp}, Mean Arterial Pressure (MAP) was {map_value}, "
            f"and Respiratory Rate (Resp) was {resp}."
        )
        narrative_segments.append(segment)

    # Combine all segments into a single paragraph
    narrative_paragraph = (
        f"Patient {patient_id} has the following recorded data. " +
        " ".join(narrative_segments)
    )

    return narrative_paragraph

In [29]:
import re

def extract_first_two_sentences(paragraph):
    """
    Extract the first two sentences from a paragraph using a line-by-line approach.

    Args:
        paragraph (str): Input paragraph.

    Returns:
        str: A string containing the first two sentences.
    """
    # Extract the first sentence using regex for the initial line
    first_sentence_match = re.match(r'Patient(?:[^\.]|\.(?=\d))*\.', paragraph)
    if not first_sentence_match:
        return paragraph.strip()  # If no match, return the entire paragraph

    first_sentence = first_sentence_match.group(0).strip()

    # Remove the first sentence from the paragraph
    remaining_paragraph = paragraph[len(first_sentence):].strip()

    # Extract the second sentence from the remaining paragraph
    second_sentence_match = re.match(r'(?:[^\.]|\.(?=\d))*\.', remaining_paragraph)
    if not second_sentence_match:
        return first_sentence  # If no second sentence, return only the first sentence

    second_sentence = second_sentence_match.group(0).strip()

    # Combine the first two sentences
    return f"{first_sentence} {second_sentence}"




In [32]:
# Example Patient
import pandas as pd

example_prompt = create_narrative_for_single_patient(r'/content/sample_data/OnePatient.csv')

processed_prompt = extract_first_two_sentences(example_prompt) + "Patient has sepsis possiblity of 90% . What should be the treatment recommendation given the vitals?"


Step 12: Try the model and let the model suggest output

In [33]:
# Input text
input_text = processed_prompt
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")

# Generate text
outputs = model.generate(**inputs, max_new_tokens=60) #change max_new_o=tokens for different answer
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Response:", response)

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Response: Patient 8978 has the following recorded data. At hour 72.0, the patient's vitals were as follows: Heart Rate (HR) was 83.0, Oxygen Saturation (O2Sat) was 98.0, Systolic Blood Pressure (SBP) was 121.0, Mean Arterial Pressure (MAP) was 76.0, and Respiratory Rate (Resp) was 20.0.Patient has sepsis possiblity of 90% . What should be the treatment recommendation given the vitals?
A. Administer IV fluids and antibiotics
B. Administer IV fluids and antibiotics
C. Administer IV fluids and antibiotics
D. Administer IV fluids and antibiotics
E. Administer IV fluids and antibiotics
F. Administer IV fluids and antibiotics
G. Administer IV
