In [None]:
import pandas as pd
import numpy as np
import glob
import os
import scipy.stats as ss
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, AutoModelForSequenceClassification
%matplotlib inline

# Load the cleaned dataset
df_filled = pd.read_csv('/content/drive/MyDrive/smaller_sepsis_data.csv')

# Create a new dataframe with unique patient IDs and their sepsis labels
df_one = df_filled.groupby('Identifier')['SepsisLabel'].max().reset_index()

# Rename the columns for clarity
df_one.columns = ['Identifier', 'SepsisLabel']

def create_narrative_paragraph(patient_id, patient_data):
    """
    Generate a narrative paragraph for a specific patient.
    """
    narrative_segments = []
    for _, row in patient_data.iterrows():
        # Extract required fields from the row
        hour = row['Hour']
        hr = row['HR']
        o2sat = row['O2Sat']
        sbp = row['SBP']
        map_value = row['MAP']
        resp = row['Resp']

        # Create a narrative sentence for the row
        segment = (
            f"At hour {hour}, the patient's vitals were as follows: "
            f"Heart Rate (HR) was {hr}, Oxygen Saturation (O2Sat) was {o2sat}, "
            f"Systolic Blood Pressure (SBP) was {sbp}, Mean Arterial Pressure (MAP) was {map_value}, "
            f"and Respiratory Rate (Resp) was {resp}."
        )
        narrative_segments.append(segment)

    # Combine all segments into a single paragraph
    narrative_paragraph = (
        f"Patient {patient_id} has the following recorded data. " +
        " ".join(narrative_segments)
    )

    return narrative_paragraph

def generate_df_two(df_filled):
    """
    Generate df_two containing narrative_paragraph and PatientID for each patient.
    """
    df_two_data = []

    # Group the dataframe by 'Identifier' (PatientID)
    grouped = df_filled.groupby('Identifier')

    for patient_id, patient_data in grouped:
        # Create a narrative paragraph for each patient
        narrative_paragraph = create_narrative_paragraph(patient_id, patient_data)

        # Append the result to the list
        df_two_data.append({'Identifier': patient_id, 'Prompts': narrative_paragraph})

    # Convert the list of dictionaries to a dataframe
    df_two = pd.DataFrame(df_two_data)

    return df_two

# Use the function to create df_two
df_two = generate_df_two(df_filled)

# Merge the two dataframes on the 'Identifier' column
df_combined = pd.merge(df_one, df_two, left_on='Identifier', right_on='Identifier', how='inner')

# Add dynamic prompts for recommendation generation
df_combined['FullPrompt'] = df_combined.apply(
    lambda row: f"{row['Prompts']} Classification: {'Sepsis' if row['SepsisLabel'] == 1 else 'No Sepsis'}. Recommendation:",
    axis=1
)

# Split data for fine-tuning classification model
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_combined['Prompts'].tolist(), df_combined['SepsisLabel'].tolist(), test_size=0.2, random_state=42
)

In [None]:
# Load tokenizer and model for classification
classification_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
classification_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import torch
from torch.utils.data import Dataset

# Define a custom dataset class
class SepsisDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Tokenize data properly
train_encodings = classification_tokenizer(train_texts, truncation=True, padding=True, max_length=512, return_tensors="pt")
val_encodings = classification_tokenizer(val_texts, truncation=True, padding=True, max_length=512, return_tensors="pt")

# Create datasets
train_dataset = SepsisDataset(train_encodings, train_labels)
val_dataset = SepsisDataset(val_encodings, val_labels)

# Fine-tune the classification model
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    fp16=True
)

trainer = Trainer(
    model=classification_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch,Training Loss,Validation Loss
1,No log,0.38027
2,No log,0.379349


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


TrainOutput(global_step=402, training_loss=0.39481091855177236, metrics={'train_runtime': 88.5974, 'train_samples_per_second': 72.305, 'train_steps_per_second': 4.537, 'total_flos': 848586155790336.0, 'train_loss': 0.39481091855177236, 'epoch': 2.0})

In [None]:
# Save fine-tuned classification model
classification_model.save_pretrained("./fine_tuned_classification_model")
classification_tokenizer.save_pretrained("./fine_tuned_classification_model")

# Load the GPT-2 tokenizer and model for recommendation
gpt_tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
gpt_model = AutoModelForCausalLM.from_pretrained("distilgpt2")

def predict_classification(input_text):
    """
    Predict classification using the fine-tuned classification model.
    """
    inputs = classification_tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512)
    outputs = classification_model(**inputs)
    predicted_label = torch.argmax(outputs.logits, dim=1).item()
    return "Sepsis" if predicted_label == 1 else "No Sepsis"

def predict_recommendation(input_prompt):
    """
    Generate a recommendation using the pretrained GPT-2 model.
    """
    inputs = gpt_tokenizer(input_prompt, return_tensors="pt", truncation=True, max_length=512)
    outputs = gpt_model.generate(inputs["input_ids"], max_length=100, num_return_sequences=1, temperature=0.7)
    recommendation = gpt_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return recommendation

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [3]:
def create_narrative_for_single_patient(csv_file_path):
    """
    Create a narrative prompt for a single patient from a CSV file.

    Parameters:
        csv_file_path (str): Path to the CSV file containing the patient's data.

    Returns:
        str: Narrative paragraph describing the patient's data.
    """
    # Load the patient's data
    patient_data = pd.read_csv(csv_file_path)

    # Extract the patient identifier (assuming there's an 'Identifier' column)
    patient_id = patient_data['Identifier'].iloc[0]

    # Initialize a list to hold the narrative segments
    narrative_segments = []

    # Loop through the rows and create narrative sentences
    for _, row in patient_data.iterrows():
        hour = row['Hour']
        hr = row['HR']
        o2sat = row['O2Sat']
        sbp = row['SBP']
        map_value = row['MAP']
        resp = row['Resp']

        # Create a narrative sentence for the row
        segment = (
            f"At hour {hour}, the patient's vitals were as follows: "
            f"Heart Rate (HR) was {hr}, Oxygen Saturation (O2Sat) was {o2sat}, "
            f"Systolic Blood Pressure (SBP) was {sbp}, Mean Arterial Pressure (MAP) was {map_value}, "
            f"and Respiratory Rate (Resp) was {resp}."
        )
        narrative_segments.append(segment)

    # Combine all segments into a single paragraph
    narrative_paragraph = (
        f"Patient {patient_id} has the following recorded data. " +
        " ".join(narrative_segments)
    )

    return narrative_paragraph

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
classification_model.to(device)
gpt_model.to(device)


NameError: name 'torch' is not defined

In [None]:
def predict_classification(input_text):
    """
    Predict classification using the fine-tuned classification model.
    """
    inputs = classification_tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512)
    inputs = {key: val.to(device) for key, val in inputs.items()}  # Move inputs to device
    outputs = classification_model(**inputs)
    predicted_label = torch.argmax(outputs.logits, dim=1).item()
    return "Sepsis" if predicted_label == 1 else "No Sepsis"

In [None]:
# Example Usage
example_prompt = create_narrative_for_single_patient(r'/content/OnePatient.csv')
classification = predict_classification(example_prompt)
print("Predicted Classification:", classification)


# Load the GPT-2 tokenizer and model
gpt_tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
gpt_model = AutoModelForCausalLM.from_pretrained("distilgpt2")

# Set pad_token_id to eos_token_id to avoid warnings
gpt_tokenizer.pad_token = gpt_tokenizer.eos_token

def predict_recommendation(input_prompt):
    """
    Generate a recommendation using the pretrained GPT-2 model with improved parameters.
    """
    inputs = gpt_tokenizer(input_prompt, return_tensors="pt", truncation=True, max_length=512, padding=True)
    outputs = gpt_model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],  # Add attention mask
        max_length=10000,  # Limit the maximum length to keep outputs concise
        temperature=0.7,  # Use sampling for more creative responses
        top_k=50,         # Consider the top 50 tokens
        top_p=0.95,       # Use nucleus sampling
        do_sample=True    # Enable sampling
    )
    recommendation = gpt_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return recommendation




Predicted Classification: No Sepsis


In [None]:
# Example Usage
recommendation_prompt = f"{example_prompt} Classification: {classification}. Can i have health recommendation:"
recommendation = predict_recommendation(recommendation_prompt)
print("Generated Recommendation:", recommendation)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Recommendation: Patient 8978 has the following recorded data. At hour 72.0, the patient's vitals were as follows: Heart Rate (HR) was 83.0, Oxygen Saturation (O2Sat) was 98.0, Systolic Blood Pressure (SBP) was 121.0, Mean Arterial Pressure (MAP) was 76.0, and Respiratory Rate (Resp) was 20.0. At hour 73.0, the patient's vitals were as follows: Heart Rate (HR) was 85.0, Oxygen Saturation (O2Sat) was 96.0, Systolic Blood Pressure (SBP) was 120.0, Mean Arterial Pressure (MAP) was 79.0, and Respiratory Rate (Resp) was 18.0. At hour 74.0, the patient's vitals were as follows: Heart Rate (HR) was 95.0, Oxygen Saturation (O2Sat) was 97.0, Systolic Blood Pressure (SBP) was 114.0, Mean Arterial Pressure (MAP) was 75.0, and Respiratory Rate (Resp) was 18.0. At hour 75.0, the patient's vitals were as follows: Heart Rate (HR) was 101.0, Oxygen Saturation (O2Sat) was 97.0, Systolic Blood Pressure (SBP) was 113.0, Mean Arterial Pressure (MAP) was 78.0, and Respiratory Rate (Resp) was 18.0.

In [26]:
!pip install transformers accelerate




In [33]:
!pip install -U bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl (69.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.1/69.1 MB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.0


In [36]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q -U einops

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for peft (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for accelerate (pyproject.toml) ... [?25l[?25hdone


In [37]:
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
import transformers

In [39]:
import bitsandbytes as bnb
print(bnb.__version__)  # Ensure this is the latest version


0.45.0


In [40]:
import transformers
print(transformers.__version__)  # Should be 4.30.0 or higher


4.47.1


In [41]:
!pip install nvidia-pyindex
!pip install nvidia-cuda-runtime-cu11
!pip install nvidia-cublas-cu11


Collecting nvidia-pyindex
  Downloading nvidia-pyindex-1.0.9.tar.gz (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: nvidia-pyindex
  Building wheel for nvidia-pyindex (setup.py) ... [?25l[?25hdone
  Created wheel for nvidia-pyindex: filename=nvidia_pyindex-1.0.9-py3-none-any.whl size=8419 sha256=87f040b01f8aca1ec08d99f935c40700c2f1e3be34f58bae80d93f21f7bdcad0
  Stored in directory: /root/.cache/pip/wheels/2c/af/d0/7a12f82cab69f65d51107f48bcd6179e29b9a69a90546332b3
Successfully built nvidia-pyindex
Installing collected packages: nvidia-pyindex
Successfully installed nvidia-pyindex-1.0.9
Collecting nvidia-cuda-runtime-cu11
  Downloading nvidia_cuda_runtime_cu11-11.8.89-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Downloading nvidia_cuda_runtime_cu11-11.8.89-py3-none-manylinux2014_x86_64.whl (875 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m875.6/875.6 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[?25hIn

In [42]:
!pip install -U bitsandbytes



In [48]:
!pip uninstall -y transformers accelerate bitsandbytes timm


Found existing installation: transformers 4.48.0.dev0
Uninstalling transformers-4.48.0.dev0:
  Successfully uninstalled transformers-4.48.0.dev0
Found existing installation: accelerate 1.2.0.dev0
Uninstalling accelerate-1.2.0.dev0:
  Successfully uninstalled accelerate-1.2.0.dev0
Found existing installation: bitsandbytes 0.45.0
Uninstalling bitsandbytes-0.45.0:
  Successfully uninstalled bitsandbytes-0.45.0
Found existing installation: timm 1.0.12
Uninstalling timm-1.0.12:
  Successfully uninstalled timm-1.0.12


In [1]:
!pip install transformers accelerate bitsandbytes timm




In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Set quantization config
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

# Load model and tokenizer
model_name = "tiiuae/falcon-7b"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quantization_config
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Input text
input_text = "What are the benefits of renewable energy?"
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")

# Generate text
outputs = model.generate(**inputs, max_new_tokens=50)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Response:", response)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Response: What are the benefits of renewable energy?
Renewable energy is a great way to reduce your carbon footprint and save money on your energy bills. It can also help to reduce the impact of climate change.
Renewable energy is a great way to reduce your carbon footprint and save money on


In [17]:
# Example Usage
import pandas as pd

example_prompt = create_narrative_for_single_patient(r'/content/sample_data/OnePatient.csv')

print(example_prompt)

promptu = "Patient 8978 has the following recorded data. At hour 72.0, the patient's vitals were as follows: Heart Rate (HR) was 83.0, Oxygen Saturation (O2Sat) was 98.0, Systolic Blood Pressure (SBP) was 121.0, Mean Arterial Pressure (MAP) was 76.0, and Respiratory Rate (Resp) was 20.0. " + "Patient has sepsis possiblity of 90% . What should be the treatment recommendation given the vitals?"



Patient 8978 has the following recorded data. At hour 72.0, the patient's vitals were as follows: Heart Rate (HR) was 83.0, Oxygen Saturation (O2Sat) was 98.0, Systolic Blood Pressure (SBP) was 121.0, Mean Arterial Pressure (MAP) was 76.0, and Respiratory Rate (Resp) was 20.0. At hour 73.0, the patient's vitals were as follows: Heart Rate (HR) was 85.0, Oxygen Saturation (O2Sat) was 96.0, Systolic Blood Pressure (SBP) was 120.0, Mean Arterial Pressure (MAP) was 79.0, and Respiratory Rate (Resp) was 18.0. At hour 74.0, the patient's vitals were as follows: Heart Rate (HR) was 95.0, Oxygen Saturation (O2Sat) was 97.0, Systolic Blood Pressure (SBP) was 114.0, Mean Arterial Pressure (MAP) was 75.0, and Respiratory Rate (Resp) was 18.0. At hour 75.0, the patient's vitals were as follows: Heart Rate (HR) was 101.0, Oxygen Saturation (O2Sat) was 97.0, Systolic Blood Pressure (SBP) was 113.0, Mean Arterial Pressure (MAP) was 78.0, and Respiratory Rate (Resp) was 18.0. At hour 76.0, the patient

In [19]:
# Input text
input_text = promptu
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")

# Generate text
outputs = model.generate(**inputs, max_new_tokens=60) #change max_new_o=tokens for different answer
response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Response:", response)

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Response: Patient 8978 has the following recorded data. At hour 72.0, the patient's vitals were as follows: Heart Rate (HR) was 83.0, Oxygen Saturation (O2Sat) was 98.0, Systolic Blood Pressure (SBP) was 121.0, Mean Arterial Pressure (MAP) was 76.0, and Respiratory Rate (Resp) was 20.0. Patient has sepsis possiblity of 90% . What should be the treatment recommendation given the vitals?
A. Administer IV fluids and antibiotics
B. Administer IV fluids and antibiotics
C. Administer IV fluids and antibiotics
D. Administer IV fluids and antibiotics
E. Administer IV fluids and antibiotics
F. Administer IV fluids and antibiotics
G. Administer IV
