In [1]:
import re

In [2]:
import numpy as np
import pandas as pd
import os
import time
import sys
import matplotlib.pyplot as plt
from google.colab import auth, drive


In [None]:
# Authenticate the Colab user (presumably the credentials the Vertex AI / GCS calls
# later in this notebook rely on — confirm).
auth.authenticate_user()

# Mount Google Drive; the CSV inputs and outputs used below live under /content/drive
drive.mount('/content/drive')

In [4]:
import vertexai
from vertexai.generative_models import (
    GenerationConfig,
    GenerativeModel,
    HarmBlockThreshold,
    HarmCategory,
    Image,
    Part,
    SafetySetting,
)
from vertexai.batch_prediction import BatchPredictionJob
import json
from google.cloud import storage

# replace with project ID from Google Cloud Platform
PROJECT_ID = "mit-mlhc-v2"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    # Fall back to the environment. The value is deliberately NOT wrapped in str():
    # str(None) would yield the literal project name "None" when the variable is unset.
    # Leaving it as None lets the Google clients infer the project from the environment.
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT")

# Region for Vertex AI; defaults to us-central1 when GOOGLE_CLOUD_REGION is not set.
LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

vertexai.init(project=PROJECT_ID, location=LOCATION)

In [5]:
def extract_text(response):
  """Return the text of the first part of the first candidate in ``response``.

  When the nested structure is not as expected (missing key, empty list, or a
  value that cannot be subscripted) the response is printed and '' is returned.
  """
  try:
    first_candidate = response['candidates'][0]
    first_part = first_candidate['content']['parts'][0]
    text = first_part['text']
  except (KeyError, IndexError, TypeError):
    # Show the offending response so it can be inspected, then fall back to ''
    print(response)
    return ''
  return text

In [6]:
def concatenate_notes(group):
  """Join the rows of ``group`` into a single string.

  Each row contributes its 'hadm_id_long' value, a newline, and its 'summary'
  value; rows are separated by a blank line and kept in the group's row order.
  """
  blocks = []
  for _, row in group.iterrows():
    blocks.append(f"{row['hadm_id_long']}\n{row['summary']}")
  return '\n\n'.join(blocks)


# Summarise Clinical Notes

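- get a summary of each hospital admission from Gemini (first pass)
- merge the first-pass summaries by subject_id, in admission-date order
- get a second, single summary per patient from Gemini (second pass)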

## Regex masked

In [None]:
# Load the regex-masked notes from Drive. Later cells read the 'id' and
# 'concatenated_notes' columns of this frame, plus the admit date columns and 'case_status'.
fn = '/content/drive/MyDrive/HSPH/Courses/MIT6.7930/AI Bias for AD/AI Bias AD/Gemini Prediction Model/12_regex_masked_concat.csv'
df = pd.read_csv(fn)
df

### First pass summaries

In [None]:
def df_to_jsonl_gcs(df, bucket_name, blob_name, id_col='id', text_col='concatenated_notes'):
    """Builds first-pass (per-admission) Gemini batch requests and uploads them to GCS as JSONL.

    One request is written per DataFrame row. The row's text is embedded in a fixed
    neurologist-summary prompt and the row's identifier is stored under "id" so the
    prediction can be matched back to the row later. The bucket is created in the
    'US' location if it does not exist yet.

    Args:
        df: Pandas DataFrame containing the ``id_col`` and ``text_col`` columns.
        bucket_name: Name of your Google Cloud Storage bucket.
        blob_name: Desired name for the JSONL file on GCS.
        id_col: Column whose value is written as the request "id" (default 'id').
        text_col: Column holding the clinical note text inserted into the prompt
            (default 'concatenated_notes'). The text is not truncated.

    Returns:
        The ``gs://`` URI of the uploaded JSONL file.
    """

    # Initialize a GCS client
    storage_client = storage.Client(project=PROJECT_ID)
    bucket = storage_client.bucket(bucket_name)
    if not bucket.exists():
        bucket.create(location='US')
        print(f'Bucket {bucket_name} created.')
    else:
        print(f'Bucket {bucket_name} already exists.')

    blob = bucket.blob(blob_name)

    # Collect one JSON line per row and join once at the end; repeated string
    # concatenation would copy the whole (potentially very large) payload on every row.
    jsonl_lines = []
    for _, row in df.iterrows():
        ### TO DO: Provide Gemini a prompt ##############################
        # Edit the prompt to tell Gemini how to handle your input text
        text = row[text_col]

        prompt = f"""You are a neurologist tasked with providing observations and summaries of clinical notes. Each clinical note corresponds to one hospital admission for a patient. Based on the clinical notes provided, please comment on each of the domains listed below:
        1) For cognitive status, describe memory, attention, language, and executive function findings. Please also describe any current sleep patterns and any notable changes in sleep patterns that are documented.
        2) For neurological findings, describe the circumstances and results of any motor and sensory tests performed including assessment of reflexes, gait, coordination, and imaging results if mentioned.
        3) For functional abilities, describe the patient's level of independence in activities of daily life and any changes from previous status that are noted.
        4) For relevant medical history, list comorbidities and chronic conditions, especially ones that may affect cognitive or functional status. Please also describe current symptoms and any treatments received.
         Please use precise clinical language suitable for a medical professional audience.
        Here are the patient's clinical notes for their hospital stay: {text}"""
        #################################################################

        json_data = {
            "id": row[id_col],
            "request": {
                "contents": [
                    {
                        "role": "user",
                        "parts": [{"text": prompt}]
                    }
                ],
                "generationConfig": {"temperature": 0.4, "maxOutputTokens": 2048},

            }
        }
        jsonl_lines.append(json.dumps(json_data) + '\n')

    # Upload the JSONL data to GCS
    blob.upload_from_string(''.join(jsonl_lines), content_type='application/jsonl')
    gcs_uri = f"gs://{bucket_name}/{blob_name}"
    print(f"JSONL file uploaded to {gcs_uri}")

    return gcs_uri

### TO DO: Change bucket name ##############################

# Upload the first-pass requests built from df and keep the gs:// URI of the JSONL file.
BUCKET_NAME = 'project_summary_regex'
input_uri = df_to_jsonl_gcs(df, BUCKET_NAME, 'gemini_batch_requests.jsonl')

#################################################################

# GCS prefix passed to the batch job as its output location
output_uri = f"gs://{BUCKET_NAME}/batch-prediction/"

# Submit a batch prediction job with Gemini model
batch_prediction_job = BatchPredictionJob.submit(
    source_model="gemini-1.5-flash-001",
    input_dataset=input_uri,
    output_uri_prefix=output_uri,
)

# Check job status
print(f"Job resource name: {batch_prediction_job.resource_name}")
print(f"Model resource name with the job: {batch_prediction_job.model_name}")
print(f"Job state: {batch_prediction_job.state.name}")

# Refresh the job until complete (polls every 5 seconds; there is no timeout)
while not batch_prediction_job.has_ended:
    time.sleep(5)
    batch_prediction_job.refresh()

# Check if the job succeeds
# NOTE(review): a failed job is only reported here; the cell does not raise, so later
# cells will still run.
if batch_prediction_job.has_succeeded:
    print("Job succeeded!")
else:
    print(f"Job failed: {batch_prediction_job.error}")

# Check the location of the output
print(f"Job output location: {batch_prediction_job.output_location}")

# Example response:
#  Job output location: gs://your-bucket/gen-ai-batch-prediction/prediction-model-year-month-day-hour:minute:second.12345

In [None]:
# Load the JSONL file into a DataFrame
# once you've made your predictions, they should be
# stored at your google cloud storage bucket specified by the path
# and you should be able to download it from this path
# NOTE(review): `path` is hard-coded to the output folder of one specific run; it is not
# taken from batch_prediction_job.output_location, so it must be edited after a new job.
path = 'gs://project_summary_regex/batch-prediction/prediction-model-2025-05-01T16:54:53.974046Z'
output_path = path + '/predictions.jsonl'

# One JSON object per line -> one DataFrame row per line.
summaries_df = pd.read_json(output_path, lines=True)
# Flatten the "candidates" list of every response and join the resulting columns by row index.
# NOTE(review): the index alignment presumably relies on exactly one candidate per response,
# and json_normalize may fail on a response without a "candidates" key — confirm.
# The following cell keeps only 'id' and 'summary'.
summaries_df = summaries_df.join(pd.json_normalize(summaries_df["response"], "candidates"))
print(summaries_df.shape)

# Note some inputs may not generate predictions due to SAFETY constraints
# extract_text returns '' when no text can be read from a response; those rows are removed.
summaries_df['summary'] = summaries_df['response'].apply(extract_text)
summaries_df = summaries_df[summaries_df['summary'] != '']
summaries_df.shape

In [None]:
# Keep only the request id and the summary text; the id becomes the 'id_gemini' merge key.
summaries_df = summaries_df[['id', 'summary']].rename(columns={'id': 'id_gemini'})
summaries_df

In [None]:
# Build an integer merge key from the 'id' string by removing '_' and '-' and casting to int.
# presumably needed because the ids in the loaded predictions are numeric — TODO confirm
df['id_gemini'] = df['id'].str.replace(r'[_-]', '', regex=True).astype(int)
# Inner merge: input rows without a retained (non-empty) summary are dropped.
df_summs = pd.merge(df, summaries_df, on='id_gemini', how='inner')
# Split 'id' at the first '_' into subject_id and hadm_id (both stay strings).
df_summs[['subject_id', 'hadm_id']] = df_summs['id'].str.split('_', n=1, expand=True)
# The raw notes and the merge keys are no longer needed once the summary is attached.
df_summs = df_summs.drop(['id', 'id_gemini', 'concatenated_notes'], axis=1)
# Order each subject's admissions by admit year, month, day
# (assumes these columns sort chronologically, i.e. are numeric — TODO confirm).
df_summs = df_summs.sort_values(by=['subject_id', 'admityear', 'admitmonth', 'admitday'])
# Label that concatenate_notes places above each admission's summary.
df_summs['hadm_id_long'] = 'Hospital Admittance ID: ' + df_summs['hadm_id']
# One row per subject: that subject's labelled summaries joined in the sorted order.
df_concat = df_summs.groupby('subject_id').apply(concatenate_notes).reset_index(name='concatenated_notes')
df_concat

In [None]:
# Reduce the per-admission frame to one row per subject: remove the admission-level
# columns, drop fully identical rows, then keep the last remaining row of each subject.
df_clean = (
    df_summs
    .drop(columns=['admityear', 'admitmonth', 'admitday', 'summary', 'hadm_id', 'hadm_id_long'])
    .drop_duplicates()
    .drop_duplicates(subset='subject_id', keep='last')
)
df_clean.shape

In [None]:
# Attach each subject's concatenated first-pass summaries to the one-row-per-subject frame.
df_regex1 = pd.merge(df_clean, df_concat, on='subject_id', how='inner')
print(df_regex1.shape)
# Counts per 'case_status' value among the subjects that remain.
df_regex1['case_status'].value_counts()

### Second pass summaries

In [None]:
def df_to_jsonl_gcs(df, bucket_name, blob_name, id_col='subject_id', text_col='concatenated_notes'):
    """Builds second-pass (per-patient) Gemini batch requests and uploads them to GCS as JSONL.

    One request is written per DataFrame row. The row's text (the per-admission
    summaries of one patient) is embedded in a fixed prompt asking for a single
    summary across visits, and the row's identifier is stored under "id" so the
    prediction can be matched back to the row later. The bucket is created in the
    'US' location if it does not exist yet.

    Args:
        df: Pandas DataFrame containing the ``id_col`` and ``text_col`` columns.
        bucket_name: Name of your Google Cloud Storage bucket.
        blob_name: Desired name for the JSONL file on GCS.
        id_col: Column whose value is written as the request "id" (default 'subject_id').
        text_col: Column holding the text inserted into the prompt
            (default 'concatenated_notes'). The text is not truncated.

    Returns:
        The ``gs://`` URI of the uploaded JSONL file.
    """

    # Initialize a GCS client
    storage_client = storage.Client(project=PROJECT_ID)
    bucket = storage_client.bucket(bucket_name)
    if not bucket.exists():
        bucket.create(location='US')
        print(f'Bucket {bucket_name} created.')
    else:
        print(f'Bucket {bucket_name} already exists.')

    blob = bucket.blob(blob_name)

    # Collect one JSON line per row and join once at the end; repeated string
    # concatenation would copy the whole (potentially very large) payload on every row.
    jsonl_lines = []
    for _, row in df.iterrows():
        ### TO DO: Provide Gemini a prompt ##############################
        # Edit the prompt to tell Gemini how to handle your input text
        text = row[text_col]

        prompt = f"""You are a neurologist tasked with providing a single summary of clinical notes. The clinical notes provided have been summarized for each hospital stay, but we want you to summarize them further so that we have a single summary for each patient. Based on the clinical notes provided, please summarise each of the domains listed below across the visits:
        1) For cognitive status, describe memory, attention, language, and executive function findings. Please also describe any current sleep patterns and any notable changes in sleep patterns that are documented.
        2) For neurological findings, describe the circumstances and results of any motor and sensory tests performed including assessment of reflexes, gait, coordination, and imaging results if mentioned.
        3) For functional abilities, describe the patient's level of independence in activities of daily life and any changes from previous status that are noted.
        4) For relevant medical history, list comorbidities and chronic conditions, especially ones that may affect cognitive or functional status. Please also describe current symptoms and any treatments received.
         Please use precise clinical language suitable for a medical professional audience.
        Here are the patient's clinical notes for all their hospital stays: {text}"""
        #################################################################

        json_data = {
            "id": row[id_col],
            "request": {
                "contents": [
                    {
                        "role": "user",
                        "parts": [{"text": prompt}]
                    }
                ],
                "generationConfig": {"temperature": 0.4, "maxOutputTokens": 2048},

            }
        }
        jsonl_lines.append(json.dumps(json_data) + '\n')

    # Upload the JSONL data to GCS
    blob.upload_from_string(''.join(jsonl_lines), content_type='application/jsonl')
    gcs_uri = f"gs://{bucket_name}/{blob_name}"
    print(f"JSONL file uploaded to {gcs_uri}")

    return gcs_uri

### TO DO: Change bucket name ##############################

# Upload the second-pass requests (one per subject in df_regex1) and keep the gs:// URI.
BUCKET_NAME = 'project_summary_regex'
input_uri = df_to_jsonl_gcs(df_regex1, BUCKET_NAME, 'gemini_batch_requests.jsonl')

#################################################################

# GCS prefix passed to the batch job as its output location
output_uri = f"gs://{BUCKET_NAME}/batch-prediction/"

# Submit a batch prediction job with Gemini model
batch_prediction_job = BatchPredictionJob.submit(
    source_model="gemini-1.5-flash-001",
    input_dataset=input_uri,
    output_uri_prefix=output_uri,
)

# Check job status
print(f"Job resource name: {batch_prediction_job.resource_name}")
print(f"Model resource name with the job: {batch_prediction_job.model_name}")
print(f"Job state: {batch_prediction_job.state.name}")

# Refresh the job until complete (polls every 5 seconds), as the other submissions in
# this notebook do; without this wait the next cell can run before the output exists.
while not batch_prediction_job.has_ended:
    time.sleep(5)
    batch_prediction_job.refresh()

# Check if the job succeeds
if batch_prediction_job.has_succeeded:
    print("Job succeeded!")
else:
    print(f"Job failed: {batch_prediction_job.error}")

# Check the location of the output
print(f"Job output location: {batch_prediction_job.output_location}")

In [None]:
# Load the JSONL file into a DataFrame
# once you've made your predictions, they should be
# stored at your google cloud storage bucket specified by the path
# and you should be able to download it from this path
# NOTE(review): `path` is hard-coded to the output folder of one specific run; it is not
# taken from batch_prediction_job.output_location, so it must be edited after a new job.
path = 'gs://project_summary_regex/batch-prediction/prediction-model-2025-05-01T17:52:06.906636Z'
output_path = path + '/predictions.jsonl'

# One JSON object per line -> one DataFrame row per line.
summaries_df = pd.read_json(output_path, lines=True)
# Flatten the "candidates" list of every response and join the resulting columns by row index.
# NOTE(review): the index alignment presumably relies on exactly one candidate per response,
# and json_normalize may fail on a response without a "candidates" key — confirm.
summaries_df = summaries_df.join(pd.json_normalize(summaries_df["response"], "candidates"))
print(summaries_df.shape)

# Note some inputs may not generate predictions due to SAFETY constraints
# extract_text returns '' when no text can be read from a response; those rows are removed.
summaries_df['summary'] = summaries_df['response'].apply(extract_text)
summaries_df = summaries_df[summaries_df['summary'] != '']
print(summaries_df.shape)

# Keep the request id (the subject_id sent with each request) and the patient-level summary.
summaries_df = summaries_df[['id', 'summary']]
summaries_df.columns = ['subject_id', 'gemini_summary']
summaries_df

In [None]:
# subject_id was produced by str.split earlier and is a string; cast it to int for the merge
# (assumes the ids in summaries_df were loaded as integers — TODO confirm).
df_regex1['subject_id'] = df_regex1['subject_id'].astype(int)
# Inner merge: subjects without a retained second-pass summary are dropped.
df_regex2 = pd.merge(df_regex1, summaries_df, on='subject_id', how='inner')
print(df_regex2.shape)
# Persist the regex-masked result (patient attributes, concatenated notes, 'gemini_summary').
fn = '/content/drive/MyDrive/HSPH/Courses/MIT6.7930/AI Bias for AD/AI Bias AD/Gemini Prediction Model/13_regex_summaries.csv'
df_regex2.to_csv(fn, index=False, header=True)
df_regex2['case_status'].value_counts()

## Gemini masked

In [None]:
# Load the Gemini-masked notes from Drive. This rebinds `df`, replacing the regex-masked
# frame loaded earlier; the cells below repeat the same two-pass pipeline on this input.
fn = '/content/drive/MyDrive/HSPH/Courses/MIT6.7930/AI Bias for AD/AI Bias AD/Gemini Prediction Model/12_gemini_masked_concat.csv'
df = pd.read_csv(fn)
df

### First pass summaries

In [None]:
def df_to_jsonl_gcs(df, bucket_name, blob_name, id_col='id', text_col='concatenated_notes'):
    """Builds first-pass (per-admission) Gemini batch requests and uploads them to GCS as JSONL.

    One request is written per DataFrame row. The row's text is embedded in a fixed
    neurologist-summary prompt and the row's identifier is stored under "id" so the
    prediction can be matched back to the row later. The bucket is created in the
    'US' location if it does not exist yet.

    Args:
        df: Pandas DataFrame containing the ``id_col`` and ``text_col`` columns.
        bucket_name: Name of your Google Cloud Storage bucket.
        blob_name: Desired name for the JSONL file on GCS.
        id_col: Column whose value is written as the request "id" (default 'id').
        text_col: Column holding the clinical note text inserted into the prompt
            (default 'concatenated_notes'). The text is not truncated.

    Returns:
        The ``gs://`` URI of the uploaded JSONL file.
    """

    # Initialize a GCS client
    storage_client = storage.Client(project=PROJECT_ID)
    bucket = storage_client.bucket(bucket_name)
    if not bucket.exists():
        bucket.create(location='US')
        print(f'Bucket {bucket_name} created.')
    else:
        print(f'Bucket {bucket_name} already exists.')

    blob = bucket.blob(blob_name)

    # Collect one JSON line per row and join once at the end; repeated string
    # concatenation would copy the whole (potentially very large) payload on every row.
    jsonl_lines = []
    for _, row in df.iterrows():
        ### TO DO: Provide Gemini a prompt ##############################
        # Edit the prompt to tell Gemini how to handle your input text
        text = row[text_col]

        prompt = f"""You are a neurologist tasked with providing observations and summaries of clinical notes. Each clinical note corresponds to one hospital admission for a patient. Based on the clinical notes provided, please comment on each of the domains listed below:
        1) For cognitive status, describe memory, attention, language, and executive function findings. Please also describe any current sleep patterns and any notable changes in sleep patterns that are documented.
        2) For neurological findings, describe the circumstances and results of any motor and sensory tests performed including assessment of reflexes, gait, coordination, and imaging results if mentioned.
        3) For functional abilities, describe the patient's level of independence in activities of daily life and any changes from previous status that are noted.
        4) For relevant medical history, list comorbidities and chronic conditions, especially ones that may affect cognitive or functional status. Please also describe current symptoms and any treatments received.
         Please use precise clinical language suitable for a medical professional audience.
        Here are the patient's clinical notes for their hospital stay: {text}"""
        #################################################################

        json_data = {
            "id": row[id_col],
            "request": {
                "contents": [
                    {
                        "role": "user",
                        "parts": [{"text": prompt}]
                    }
                ],
                "generationConfig": {"temperature": 0.4, "maxOutputTokens": 2048},

            }
        }
        jsonl_lines.append(json.dumps(json_data) + '\n')

    # Upload the JSONL data to GCS
    blob.upload_from_string(''.join(jsonl_lines), content_type='application/jsonl')
    gcs_uri = f"gs://{bucket_name}/{blob_name}"
    print(f"JSONL file uploaded to {gcs_uri}")

    return gcs_uri

### TO DO: Change bucket name ##############################

# Upload the first-pass requests built from df and keep the gs:// URI of the JSONL file.
BUCKET_NAME = 'project_summary_gemini'
input_uri = df_to_jsonl_gcs(df, BUCKET_NAME, 'gemini_batch_requests.jsonl')

#################################################################

# GCS prefix passed to the batch job as its output location
output_uri = f"gs://{BUCKET_NAME}/batch-prediction/"

# Submit a batch prediction job with Gemini model
batch_prediction_job = BatchPredictionJob.submit(
    source_model="gemini-1.5-flash-001",
    input_dataset=input_uri,
    output_uri_prefix=output_uri,
)

# Check job status
print(f"Job resource name: {batch_prediction_job.resource_name}")
print(f"Model resource name with the job: {batch_prediction_job.model_name}")
print(f"Job state: {batch_prediction_job.state.name}")

# Refresh the job until complete (polls every 5 seconds; there is no timeout)
while not batch_prediction_job.has_ended:
    time.sleep(5)
    batch_prediction_job.refresh()

# Check if the job succeeds
# NOTE(review): a failed job is only reported here; the cell does not raise, so later
# cells will still run.
if batch_prediction_job.has_succeeded:
    print("Job succeeded!")
else:
    print(f"Job failed: {batch_prediction_job.error}")

# Check the location of the output
print(f"Job output location: {batch_prediction_job.output_location}")

# Example response:
#  Job output location: gs://your-bucket/gen-ai-batch-prediction/prediction-model-year-month-day-hour:minute:second.12345

In [None]:
# Load the JSONL file into a DataFrame
# once you've made your predictions, they should be
# stored at your google cloud storage bucket specified by the path
# and you should be able to download it from this path
# NOTE(review): `path` is hard-coded to the output folder of one specific run; it is not
# taken from batch_prediction_job.output_location, so it must be edited after a new job.
path = 'gs://project_summary_gemini/batch-prediction/prediction-model-2025-05-02T01:00:51.356206Z'
output_path = path + '/predictions.jsonl'

# One JSON object per line -> one DataFrame row per line.
summaries_df = pd.read_json(output_path, lines=True)
# Flatten the "candidates" list of every response and join the resulting columns by row index.
# NOTE(review): the index alignment presumably relies on exactly one candidate per response,
# and json_normalize may fail on a response without a "candidates" key — confirm.
# The following cell keeps only 'id' and 'summary'.
summaries_df = summaries_df.join(pd.json_normalize(summaries_df["response"], "candidates"))
print(summaries_df.shape)

# Note some inputs may not generate predictions due to SAFETY constraints
# extract_text returns '' when no text can be read from a response; those rows are removed.
summaries_df['summary'] = summaries_df['response'].apply(extract_text)
summaries_df = summaries_df[summaries_df['summary'] != '']
summaries_df.shape

In [None]:
# Keep only the request id and the summary text; the id becomes the 'id_gemini' merge key.
summaries_df = summaries_df[['id', 'summary']].rename(columns={'id': 'id_gemini'})
summaries_df

In [None]:
# Build an integer merge key from the 'id' string by removing '_' and '-' and casting to int.
# presumably needed because the ids in the loaded predictions are numeric — TODO confirm
df['id_gemini'] = df['id'].str.replace(r'[_-]', '', regex=True).astype(int)
# Inner merge: input rows without a retained (non-empty) summary are dropped.
df_summs = pd.merge(df, summaries_df, on='id_gemini', how='inner')
# Split 'id' at the first '_' into subject_id and hadm_id (both stay strings).
df_summs[['subject_id', 'hadm_id']] = df_summs['id'].str.split('_', n=1, expand=True)
# The raw notes and the merge keys are no longer needed once the summary is attached.
df_summs = df_summs.drop(['id', 'id_gemini', 'concatenated_notes'], axis=1)
# Order each subject's admissions by admit year, month, day
# (assumes these columns sort chronologically, i.e. are numeric — TODO confirm).
df_summs = df_summs.sort_values(by=['subject_id', 'admityear', 'admitmonth', 'admitday'])
# Label that concatenate_notes places above each admission's summary.
df_summs['hadm_id_long'] = 'Hospital Admittance ID: ' + df_summs['hadm_id']
# One row per subject: that subject's labelled summaries joined in the sorted order.
df_concat = df_summs.groupby('subject_id').apply(concatenate_notes).reset_index(name='concatenated_notes')
df_concat

In [None]:
# Reduce the per-admission frame to one row per subject: remove the admission-level
# columns, drop fully identical rows, then keep the last remaining row of each subject.
df_clean = (
    df_summs
    .drop(columns=['admityear', 'admitmonth', 'admitday', 'summary', 'hadm_id', 'hadm_id_long'])
    .drop_duplicates()
    .drop_duplicates(subset='subject_id', keep='last')
)
df_clean.shape

In [None]:
# Attach each subject's concatenated first-pass summaries to the one-row-per-subject frame.
df_gemini1 = pd.merge(df_clean, df_concat, on='subject_id', how='inner')
print(df_gemini1.shape)
# Counts per 'case_status' value among the subjects that remain.
df_gemini1['case_status'].value_counts()

### Second pass summaries

In [None]:
def df_to_jsonl_gcs(df, bucket_name, blob_name, id_col='subject_id', text_col='concatenated_notes'):
    """Builds second-pass (per-patient) Gemini batch requests and uploads them to GCS as JSONL.

    One request is written per DataFrame row. The row's text (the per-admission
    summaries of one patient) is embedded in a fixed prompt asking for a single
    summary across visits, and the row's identifier is stored under "id" so the
    prediction can be matched back to the row later. The bucket is created in the
    'US' location if it does not exist yet.

    Args:
        df: Pandas DataFrame containing the ``id_col`` and ``text_col`` columns.
        bucket_name: Name of your Google Cloud Storage bucket.
        blob_name: Desired name for the JSONL file on GCS.
        id_col: Column whose value is written as the request "id" (default 'subject_id').
        text_col: Column holding the text inserted into the prompt
            (default 'concatenated_notes'). The text is not truncated.

    Returns:
        The ``gs://`` URI of the uploaded JSONL file.
    """

    # Initialize a GCS client
    storage_client = storage.Client(project=PROJECT_ID)
    bucket = storage_client.bucket(bucket_name)
    if not bucket.exists():
        bucket.create(location='US')
        print(f'Bucket {bucket_name} created.')
    else:
        print(f'Bucket {bucket_name} already exists.')

    blob = bucket.blob(blob_name)

    # Collect one JSON line per row and join once at the end; repeated string
    # concatenation would copy the whole (potentially very large) payload on every row.
    jsonl_lines = []
    for _, row in df.iterrows():
        ### TO DO: Provide Gemini a prompt ##############################
        # Edit the prompt to tell Gemini how to handle your input text
        text = row[text_col]

        prompt = f"""You are a neurologist tasked with providing a single summary of clinical notes. The clinical notes provided have been summarized for each hospital stay, but we want you to summarize them further so that we have a single summary for each patient. Based on the clinical notes provided, please summarise each of the domains listed below across the visits:
        1) For cognitive status, describe memory, attention, language, and executive function findings. Please also describe any current sleep patterns and any notable changes in sleep patterns that are documented.
        2) For neurological findings, describe the circumstances and results of any motor and sensory tests performed including assessment of reflexes, gait, coordination, and imaging results if mentioned.
        3) For functional abilities, describe the patient's level of independence in activities of daily life and any changes from previous status that are noted.
        4) For relevant medical history, list comorbidities and chronic conditions, especially ones that may affect cognitive or functional status. Please also describe current symptoms and any treatments received.
         Please use precise clinical language suitable for a medical professional audience.
        Here are the patient's clinical notes for all their hospital stays: {text}"""
        #################################################################

        json_data = {
            "id": row[id_col],
            "request": {
                "contents": [
                    {
                        "role": "user",
                        "parts": [{"text": prompt}]
                    }
                ],
                "generationConfig": {"temperature": 0.4, "maxOutputTokens": 2048},

            }
        }
        jsonl_lines.append(json.dumps(json_data) + '\n')

    # Upload the JSONL data to GCS
    blob.upload_from_string(''.join(jsonl_lines), content_type='application/jsonl')
    gcs_uri = f"gs://{bucket_name}/{blob_name}"
    print(f"JSONL file uploaded to {gcs_uri}")

    return gcs_uri

### TO DO: Change bucket name ##############################

# Upload the second-pass requests (one per subject in df_gemini1) and keep the gs:// URI.
# NOTE(review): the blob name is the same one used for the first pass, so that file is overwritten.
BUCKET_NAME = 'project_summary_gemini'
input_uri = df_to_jsonl_gcs(df_gemini1, BUCKET_NAME, 'gemini_batch_requests.jsonl')

#################################################################

# GCS prefix passed to the batch job as its output location
output_uri = f"gs://{BUCKET_NAME}/batch-prediction/"

# Submit a batch prediction job with Gemini model
batch_prediction_job = BatchPredictionJob.submit(
    source_model="gemini-1.5-flash-001",
    input_dataset=input_uri,
    output_uri_prefix=output_uri,
)

# Check job status (this cell does not wait for completion; the next cell polls the job)
print(f"Job resource name: {batch_prediction_job.resource_name}")
print(f"Model resource name with the job: {batch_prediction_job.model_name}")
print(f"Job state: {batch_prediction_job.state.name}")

In [None]:
# Refresh the job until complete (polls every 5 seconds; there is no timeout)
while not batch_prediction_job.has_ended:
    time.sleep(5)
    batch_prediction_job.refresh()

# Check if the job succeeds
# NOTE(review): a failed job is only reported here; the cell does not raise, so later
# cells will still run.
if batch_prediction_job.has_succeeded:
    print("Job succeeded!")
else:
    print(f"Job failed: {batch_prediction_job.error}")

# Check the location of the output
print(f"Job output location: {batch_prediction_job.output_location}")

# Example response:
#  Job output location: gs://your-bucket/gen-ai-batch-prediction/prediction-model-year-month-day-hour:minute:second.12345

In [None]:
# Load the JSONL file into a DataFrame
# once you've made your predictions, they should be
# stored at your google cloud storage bucket specified by the path
# and you should be able to download it from this path
# NOTE(review): `path` is hard-coded to the output folder of one specific run; it is not
# taken from batch_prediction_job.output_location, so it must be edited after a new job.
path = 'gs://project_summary_gemini/batch-prediction/prediction-model-2025-05-02T01:12:04.668842Z'
output_path = path + '/predictions.jsonl'

# One JSON object per line -> one DataFrame row per line.
summaries_df = pd.read_json(output_path, lines=True)
# Flatten the "candidates" list of every response and join the resulting columns by row index.
# NOTE(review): the index alignment presumably relies on exactly one candidate per response,
# and json_normalize may fail on a response without a "candidates" key — confirm.
summaries_df = summaries_df.join(pd.json_normalize(summaries_df["response"], "candidates"))
print(summaries_df.shape)

# Note some inputs may not generate predictions due to SAFETY constraints
# extract_text returns '' when no text can be read from a response; those rows are removed.
summaries_df['summary'] = summaries_df['response'].apply(extract_text)
summaries_df = summaries_df[summaries_df['summary'] != '']
print(summaries_df.shape)

# Keep the request id (the subject_id sent with each request) and the patient-level summary.
# NOTE(review): the regex-masked pipeline names this column 'gemini_summary'; here it stays
# 'summary', so the two saved CSVs differ in that column name — confirm this is intended.
summaries_df = summaries_df[['id', 'summary']]
summaries_df.columns = ['subject_id', 'summary']
summaries_df

In [None]:
# subject_id was produced by str.split earlier and is a string; cast it to int for the merge
# (assumes the ids in summaries_df were loaded as integers — TODO confirm).
df_gemini1['subject_id'] = df_gemini1['subject_id'].astype(int)
# Inner merge: subjects without a retained second-pass summary are dropped.
df_gemini2 = pd.merge(df_gemini1, summaries_df, on='subject_id', how='inner')
print(df_gemini2.shape)
# Persist the Gemini-masked result (patient attributes, concatenated notes, 'summary').
fn = '/content/drive/MyDrive/HSPH/Courses/MIT6.7930/AI Bias for AD/AI Bias AD/Gemini Prediction Model/13_gemini_summaries.csv'
df_gemini2.to_csv(fn, index=False, header=True)
df_gemini2['case_status'].value_counts()