In [None]:
import re
from sklearn.metrics import confusion_matrix

In [None]:
import numpy as np
import pandas as pd
import os
import time
import sys
import matplotlib.pyplot as plt
from google.colab import auth, drive

In [None]:
# Authenticate this Colab session with Google (interactive prompt).
auth.authenticate_user()

# Mount Google Drive so the CSV files under /content/drive can be read and written.
drive.mount('/content/drive')

In [None]:
import vertexai
from vertexai.generative_models import (
    GenerationConfig,
    GenerativeModel,
    HarmBlockThreshold,
    HarmCategory,
    Image,
    Part,
    SafetySetting,
)
from vertexai.batch_prediction import BatchPredictionJob
import json
from google.cloud import storage

# Google Cloud project used for Vertex AI (and for the storage client created later).
# Replace with your own project ID from Google Cloud Platform.
PROJECT_ID = "mit-mlhc-v2"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
# Fall back to the GOOGLE_CLOUD_PROJECT environment variable when the value above is
# empty or still the template placeholder.
# NOTE(review): str(None) is the literal string "None", so an unset variable leaves
# PROJECT_ID == "None" instead of failing here.
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    PROJECT_ID = str(os.environ.get("GOOGLE_CLOUD_PROJECT"))

# Vertex AI region; "us-central1" is used when GOOGLE_CLOUD_REGION is not set.
LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

vertexai.init(project=PROJECT_ID, location=LOCATION)

In [None]:
def extract_text(response):
  """Return the text of the first part of the first candidate in ``response``.

  Args:
    response: Parsed response object, indexed as
      ``response['candidates'][0]['content']['parts'][0]['text']``.

  Returns:
    The text found at that path, or ``''`` when any step of the lookup fails
    (missing key, empty list, or a non-indexable value). In the failure case
    the offending response is printed so it can be inspected.
  """
  try:
    first_candidate = response['candidates'][0]
    first_part = first_candidate['content']['parts'][0]
    text = first_part['text']
  except (KeyError, IndexError, TypeError):
    # Show the response that had no usable text, then fall back to an empty string.
    print(response)
    return ''
  return text

In [None]:
# Project folder on the mounted Drive.
# NOTE(review): the name `dir` shadows Python's built-in dir() for the rest of the notebook.
dir = '/content/drive/MyDrive/HSPH/Courses/MIT6.7930/AI Bias for AD/AI Bias AD/Gemini Prediction Model'
# Tag for this experiment variant; it becomes part of the output file name.
suffix = 'gemini_summaries_tabular'
# Input CSV that is loaded into `df`.
infn = dir + '/summaries_predictions/13_gemini_summaries.csv'
# Output CSV for the subgroup metrics table.
outfn = dir + '/14_gemini_predictions-' + suffix + '.csv'


In [None]:
# Load the input table and show its size and the distribution of the case_status label.
df = pd.read_csv(infn)
print(df.shape)
df['case_status'].value_counts()

In [None]:
# Display the row labelled 0 to see which columns are available.
df.loc[0]

In [None]:
def get_demo_info(row):
  """Render a row's demographic and comorbidity fields as a short text preamble.

  Args:
    row: Mapping-like record (e.g. a DataFrame row) providing ``age``,
      ``race_group2``, ``gender``, ``marital_status``, ``admission_type``,
      ``insurance_group``, ``language_group`` and the six 0/1 history flags
      listed in ``history_flags`` below.

  Returns:
    A three-line string (followed by a newline and two spaces) describing the
    patient. Conditions whose flag equals 1 are listed comma-separated; when no
    flag equals 1 nothing follows "disease history: ".
  """
  age = row['age']
  race = row['race_group2']
  # Anything other than 'F' is rendered as 'male'.
  if row['gender'] == 'F':
    sex = 'female'
  else:
    sex = 'male'
  marital = row['marital_status']

  admitted_as = row['admission_type']
  payer = row['insurance_group']
  spoken = row['language_group']

  # (column name, wording used in the sentence), in output order.
  history_flags = (
      ('Stroke_History', 'stroke'),
      ('Myocardial_Infarction', 'myocardial infarction'),
      ('Peripheral_Vascular_Disease', 'peripheral vascular disease'),
      ('Cerebrovascular_Disease', 'cerebrovascular disease'),
      ('Diabetes_Mellitus', 'diabetes mellitus'),
      ('Cancer', 'cancer'),
  )
  present = [wording for column, wording in history_flags if row[column] == 1]
  history = ', '.join(present)

  return f"""The following is the clinical notes summary for a {age} year old {race} {sex} who is {marital}.
They were admitted to the hospital as {admitted_as}, have {payer} insurance, and are {spoken} speaking.
They have the following disease history: {history}
  """

# Preview the generated preamble for the row labelled 0.
print(get_demo_info(df.loc[0]))

In [None]:
# Regex with one capture group matching any of the three labels the prompt asks Gemini to
# answer with; used with Series.str.extract on the response text.
pattern = r"(LIKELY_AD|POSSIBLE_AD|UNLIKELY_AD)"

def calculate_metrics(data, pred, verbose=True):
  """Compute TPR, FPR and precision of a binary prediction column against ``case_status``.

  Args:
    data: DataFrame holding the true label in ``case_status`` and the
      prediction in the column named by ``pred`` (labels 0 and 1).
    pred: Name of the prediction column.
    verbose: When true, print the three rates and the sample size.

  Returns:
    ``(tpr, fpr, precision, sample_size)``. A rate whose denominator is zero
    is reported as 0.0; ``sample_size`` is ``len(data)``.
  """
  actual_labels = data['case_status']
  predicted_labels = data[pred]

  try:
      cells = confusion_matrix(actual_labels, predicted_labels, labels=[0, 1]).ravel()
      tn, fp, fn, tp = cells
  except ValueError:
      # confusion_matrix rejected the input: tally the four cells by hand,
      # counting only pairs in which both values are 0 or 1.
      tn = fp = fn = tp = 0
      for truth, guess in zip(actual_labels, predicted_labels):
          if truth == 1 and guess == 1:
              tp += 1
          elif truth == 1 and guess == 0:
              fn += 1
          elif truth == 0 and guess == 1:
              fp += 1
          elif truth == 0 and guess == 0:
              tn += 1

  positives = tp + fn
  negatives = fp + tn
  flagged = tp + fp

  # Fairness-related rates; 0.0 stands in for an undefined ratio.
  tpr = tp / positives if positives > 0 else 0.0      # Sensitivity / Equal Opportunity
  fpr = fp / negatives if negatives > 0 else 0.0      # False Positive Rate / Equalized Odds
  precision = tp / flagged if flagged > 0 else 0.0    # Precision / Predictive Parity
  sample_size = len(data)

  if verbose:
    print("📊 Overall Model Performance")
    print("-" * 40)
    print(f"TPR:       {tpr:.4f}")
    print(f"FPR:       {fpr:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Sample Size:  {sample_size}")
  return (tpr, fpr, precision, sample_size)

def bin_age(df):
    """Add an ``age_group`` column to ``df`` by bucketing ``age`` and return ``df``.

    The frame is modified in place. Buckets are left-closed:
    [0, 70) -> '<70', [70, 80) -> '70-80', [80, 90) -> '80-90', [90, inf) -> '>90'.
    """
    age_edges = [0, 70, 80, 90, float('inf')]
    group_names = ['<70', '70-80', '80-90', '>90']
    # right=False puts an age equal to an edge into the bucket that starts at that edge.
    age_groups = pd.cut(df['age'], bins=age_edges, labels=group_names, right=False)
    df['age_group'] = age_groups
    return df

def bootstrap_metrics(y_true, y_pred, n_iterations=1000, random_state=None):
    """Percentile-bootstrap 95% intervals for TPR, FPR and precision.

    Rows are resampled with replacement ``n_iterations`` times; for each
    resample the three rates are computed from the 2x2 confusion matrix over
    labels 0 and 1. A rate whose denominator is zero in a resample is recorded
    as NaN and ignored when the percentiles are taken.

    Args:
        y_true: True labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        n_iterations: Number of bootstrap resamples.
        random_state: ``None`` (default) draws from NumPy's global random
            state, as before. An int seed or a ``numpy.random.RandomState``
            makes the intervals reproducible.

    Returns:
        Dict with keys ``'TPR'``, ``'FPR'`` and ``'Precision'``, each mapping
        to a ``(2.5th percentile, 97.5th percentile)`` tuple. A metric that
        was never defined in any resample gets ``(nan, nan)``.
    """
    stats = {'TPR': [], 'FPR': [], 'Precision': []}
    data = pd.DataFrame({'y_true': y_true, 'y_pred': y_pred})

    # One generator shared by all iterations: passing the same int seed to every
    # .sample() call would produce the identical resample each time.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    for _ in range(n_iterations):
        sample = data.sample(frac=1.0, replace=True, random_state=rng)
        try:
            tn, fp, fn, tp = confusion_matrix(sample['y_true'], sample['y_pred'], labels=[0, 1]).ravel()
        except ValueError:
            # Only skip resamples that confusion_matrix rejects; any other
            # error (including KeyboardInterrupt) now propagates.
            continue

        stats['TPR'].append(tp / (tp + fn) if (tp + fn) > 0 else np.nan)
        stats['FPR'].append(fp / (fp + tn) if (fp + tn) > 0 else np.nan)
        stats['Precision'].append(tp / (tp + fp) if (tp + fp) > 0 else np.nan)

    def _interval(values):
        # Return the NaN-ignoring 2.5/97.5 percentiles, or (nan, nan) if nothing is defined.
        arr = np.asarray(values, dtype=float)
        if arr.size == 0 or np.isnan(arr).all():
            return (np.nan, np.nan)
        return (np.nanpercentile(arr, 2.5), np.nanpercentile(arr, 97.5))

    return {name: _interval(values) for name, values in stats.items()}

# Demographic columns over which the subgroup metrics tables are computed.
# 'age_group' is not in the input file; bin_age() adds it to the merged frame.
demo_fts = ['age_group', 'gender', 'insurance_group', 'language_group', 'race_group2', 'admission_type', 'marital_status']

## Using full summaries, with tabular data

In [None]:
# Paths for this variant (summary text plus the tabular preamble), then load the input table.
# NOTE(review): the name `dir` shadows Python's built-in dir().
dir = '/content/drive/MyDrive/HSPH/Courses/MIT6.7930/AI Bias for AD/AI Bias AD/Gemini Prediction Model'
suffix = 'gemini_summaries_tabular'
infn = dir + '/summaries_predictions/13_gemini_summaries.csv'
# Destination of the subgroup metrics CSV.
outfn = dir + '/14_gemini_predictions-' + suffix + '.csv'

df = pd.read_csv(infn)
print(df.shape)
df['case_status'].value_counts()

In [None]:
def df_to_jsonl_gcs(df, bucket_name, blob_name):
    """Build one Gemini batch request per row of ``df`` and upload them to GCS as JSONL.

    Each request is keyed by the row's ``subject_id`` and carries a prompt made
    of fixed instructions, the preamble from ``get_demo_info(row)`` and the
    row's ``summary`` text.

    Args:
        df: DataFrame with ``subject_id`` and ``summary`` columns plus every
            column that ``get_demo_info`` reads.
        bucket_name: Name of the Cloud Storage bucket; it is created in the
            ``US`` location if it does not exist yet.
        blob_name: Object name for the JSONL file inside the bucket.

    Returns:
        The ``gs://`` URI of the uploaded file.
    """

    # Initialize a GCS client and make sure the bucket exists.
    storage_client = storage.Client(project=PROJECT_ID)
    bucket = storage_client.bucket(bucket_name)
    if not bucket.exists():
        bucket.create(location='US')
        print(f'Bucket {bucket_name} created.')
    else:
        print(f'Bucket {bucket_name} already exists.')

    blob = bucket.blob(blob_name)

    # Collect one JSON line per row and join once at the end instead of
    # growing a single string with += inside the loop.
    jsonl_lines = []
    for _, row in df.iterrows():
        ### TO DO: Provide Gemini a prompt ##############################
        # Edit the prompt to tell Gemini how to handle your input text
        text = row['summary']#[:50000]

        demo = get_demo_info(row)

        prompt = f"""You are a neurologist tasked with diagnosing alzheimer's disease based on a summary of a patient's clinical notes.
        Please read the following clinical notes summaries and assess whether there is documented evidence suggestive of Alzheimer's Disease (AD).
        Consider cognitive symptoms such as progressive memory loss, disorientation, language difficulties, and behavioral changes
        as well as test results such as cognitive assessments or neuroimaging findings indicative of brain atrophy or AD markers.
        Do not include temporary confusion from infection, medication, or unrelated acute illness.

        Respond with one of the following labels:
        - LIKELY_AD: clear evidence consistent with Alzheimer's Disease
        - POSSIBLE_AD: some suggestive signs, but incomplete documentation
        - UNLIKELY_AD: No indication of Alzheimer's Disease or dementia

        Also provide a 1-2 sentence justification summarizing the key evidence from the note.

        {demo}
        Clinical Notes: {text}"""
        #################################################################

        json_data = {
            "id": row['subject_id'],
            "request": {
                "contents": [
                    {
                        "role": "user",
                        "parts": [{"text": prompt}]
                    }
                ],
                "generationConfig": {"temperature": 0.4, "maxOutputTokens": 2048},
            }
        }
        jsonl_lines.append(json.dumps(json_data) + '\n')

    # Upload the JSONL data to GCS
    blob.upload_from_string(''.join(jsonl_lines), content_type='application/jsonl')
    print(f"JSONL file uploaded to gs://{bucket_name}/{blob_name}")

    return f"gs://{bucket_name}/{blob_name}"

### TODO: change the bucket name to one you own ###################

BUCKET_NAME = 'project_gemini_predictions'
# Build the request file from `df` and upload it; the returned gs:// URI is the job input.
input_uri = df_to_jsonl_gcs(df, BUCKET_NAME, 'gemini_batch_requests.jsonl')

#################################################################

# Prefix under which the batch job is asked to write its output.
output_uri = f"gs://{BUCKET_NAME}/batch-prediction/"

# Submit a batch prediction job with Gemini model
batch_prediction_job = BatchPredictionJob.submit(
    source_model="gemini-1.5-flash-001",
    input_dataset=input_uri,
    output_uri_prefix=output_uri,
)

# Check job status
print(f"Job resource name: {batch_prediction_job.resource_name}")
print(f"Model resource name with the job: {batch_prediction_job.model_name}")
print(f"Job state: {batch_prediction_job.state.name}")

# Poll every 5 seconds until the job reports that it has ended.
while not batch_prediction_job.has_ended:
    time.sleep(5)
    batch_prediction_job.refresh()

# Report the outcome; a failure is only printed, so later cells still run.
if batch_prediction_job.has_succeeded:
    print("Job succeeded!")
else:
    print(f"Job failed: {batch_prediction_job.error}")

# Check the location of the output
print(f"Job output location: {batch_prediction_job.output_location}")

# Example response:
#  Job output location: gs://your-bucket/gen-ai-batch-prediction/prediction-model-year-month-day-hour:minute:second.12345

In [None]:
# Load the batch job's JSONL output into a DataFrame.
# `path` is a hard-coded output folder from one specific run; replace it with the
# "Job output location" printed by the cell above when re-running.
# NOTE(review): reading a gs:// URL with pandas presumably needs gcsfs installed — confirm.
path = 'gs://project_gemini_predictions/batch-prediction/prediction-model-2025-05-02T04:19:59.439135Z'
output_path = path + '/predictions.jsonl'

results = pd.read_json(output_path, lines=True)
# Flatten the `candidates` records of each response and join them onto `results` by row index.
results = results.join(pd.json_normalize(results["response"], "candidates"))
print(results.shape)

# Note some inputs may not generate predictions due to SAFETY constraints.
# extract_text returns '' for those, and such rows are dropped here.
# The 'summary' column created below holds Gemini's response text.
results['summary'] = results['response'].apply(extract_text)
results = results[results['summary'] != '']
print(results.shape)

In [None]:
# Turn Gemini's free-text answers into a label column and a binary prediction.
res_df = results[['id', 'summary']].copy()
res_df.columns = ['subject_id', 'gemini_text']
# expand=False returns a Series (one capture group) instead of a one-column DataFrame.
res_df['gemini_pred'] = res_df['gemini_text'].str.extract(pattern, expand=False)

# A response containing none of the three labels yields NaN. Previously such rows
# fell into the `else 1` branch and were silently counted as positive predictions;
# they are now reported and excluded.
n_unparsed = int(res_df['gemini_pred'].isna().sum())
if n_unparsed:
    print(f'Dropping {n_unparsed} response(s) with no recognizable label')
    res_df = res_df.dropna(subset=['gemini_pred']).copy()

# pred: POSSIBLE_AD and LIKELY_AD both count as positive.
res_df['pred'] = (res_df['gemini_pred'] != 'UNLIKELY_AD').astype(int)
# Stricter alternative (only LIKELY_AD positive), currently disabled:
# res_df['pred_v2'] = [1 if x == 'LIKELY_AD' else 0 for x in res_df['gemini_pred']]
print(res_df.shape)
print(res_df['gemini_pred'].value_counts())
print(res_df['pred'].value_counts())
# print(res_df['pred_v2'].value_counts())

In [None]:
# Attach the patient table to the predictions (only subjects present in both)
# and add the age_group column used by the subgroup analysis.
final = pd.merge(res_df, df, on='subject_id', how='inner')
final = bin_age(final)

print(final.shape)
print('\nPredictions V1:')
tpr, fpr, precision, n = calculate_metrics(final, 'pred', True)
# The "Predictions V2" header used to be printed here with nothing under it,
# because this section does not build a pred_v2 column. Re-enable both lines
# together if pred_v2 is restored:
# print('\nPredictions V2:')
# tpr,fpr, precision, n = calculate_metrics(final, 'pred_v2', True)

In [None]:
# Score only the rows whose label is not POSSIBLE_AD.
is_possible = final['gemini_pred'] == 'POSSIBLE_AD'
likely = final[~is_possible].copy()
# 0 for UNLIKELY_AD, 1 for everything else.
likely['pred'] = likely['gemini_pred'].ne('UNLIKELY_AD').astype(int)
print(likely['gemini_pred'].value_counts())
print(likely['pred'].value_counts())
tpr, fpr, precision, n = calculate_metrics(likely, 'pred', verbose=True)
# Accuracy = share of rows where the prediction equals case_status.
true_pred = (likely['pred'] == likely['case_status']).sum()
acc = true_pred / len(likely)
print('Accuracy: %s' % round(acc,4))

In [None]:
# Score only the rows whose label is not LIKELY_AD.
is_likely = final['gemini_pred'] == 'LIKELY_AD'
possible = final[~is_likely].copy()
print(possible['gemini_pred'].value_counts())
# 0 for UNLIKELY_AD, 1 for everything else.
possible['pred'] = possible['gemini_pred'].ne('UNLIKELY_AD').astype(int)
print(possible['gemini_pred'].value_counts())
print(possible['pred'].value_counts())
tpr, fpr, precision, n = calculate_metrics(possible, 'pred', verbose=True)
# Accuracy = share of rows where the prediction equals case_status.
true_pred = (possible['pred'] == possible['case_status']).sum()
acc = true_pred / len(possible)
print('Accuracy: %s' % round(acc,4))

In [None]:
# Inspect one row with ad == 0: Gemini's response, then the summary text from the input table.
# NOTE(review): .loc[2] selects by index *label*, so this raises KeyError unless the row
# labelled 2 is among the ad == 0 rows; .iloc would select by position instead.
# NOTE(review): this filters on 'ad' while the metrics use 'case_status' — confirm the column exists.
ctrl = final[final['ad'] == 0].loc[2]
print(ctrl['gemini_text'])
print('###')
print(ctrl['summary'])


In [None]:
# Display every field of the row selected in the previous cell.
ctrl

In [None]:
# Save the merged per-patient frame (predictions plus input columns) to Drive.
outfn2 = dir + '/summaries_predictions/15_gemini_predictions-' + suffix + '.csv'

final.to_csv(outfn2, index=False, header=True)


In [None]:
# Fairness metrics per demographic subgroup, computed on `likely`
# (the rows whose label is not POSSIBLE_AD).
records = []
for group in demo_fts:
  for value in likely[group].dropna().unique():
    # `value` is taken from the column itself, so the subset always has at least
    # one row; the former zero-row fallback branch could never run and was removed.
    subset = likely[likely[group] == value]
    y_true = subset['case_status']
    y_pred = subset['pred']

    tpr, fpr, precision, sample_size = calculate_metrics(subset, 'pred', verbose=False)
    ci = bootstrap_metrics(y_true, y_pred)
    records.append({'Group': group,
                    'Value': value,
                    'Equal Opportunity (TPR)': f"{tpr:.3f} ({ci['TPR'][0]:.3f}–{ci['TPR'][1]:.3f})",
                    'FPR': f"{fpr:.3f} ({ci['FPR'][0]:.3f}–{ci['FPR'][1]:.3f})",
                    # The header promises |FPR-TPR|, so report the absolute gap
                    # (the signed fpr-tpr was printed before).
                    # NOTE(review): the bracketed range is the gap between the two metrics'
                    # interval endpoints, not a bootstrap interval of the gap itself.
                    'Equalized Odds (|FPR-TPR|)': f"{abs(fpr - tpr):.3f} ({abs(ci['FPR'][0] - ci['TPR'][0]):.3f}–{abs(ci['FPR'][1] - ci['TPR'][1]):.3f})",
                    'Precision (Predictive Parity)': f"{precision:.3f} ({ci['Precision'][0]:.3f}–{ci['Precision'][1]:.3f})",
                    'Sample Size': len(subset)})

df_metrics = pd.DataFrame(records).set_index(['Group', 'Value'])
# The next cell writes its all-rows table to `outfn`; give this subset table its
# own file so it is not overwritten.
outfn_likely = outfn.replace('.csv', '-likely_unlikely_only.csv')
df_metrics.to_csv(outfn_likely, index=True, header=True)
df_metrics


In [None]:
# Fairness metrics per demographic subgroup, computed on every merged row.
records = []
for group in demo_fts:
  for value in final[group].dropna().unique():
    # `value` is taken from the column itself, so the subset always has at least
    # one row; the former zero-row fallback branch could never run and was removed.
    subset = final[final[group] == value]
    y_true = subset['case_status']
    # This section stores its binary prediction in `pred`. The `pred_v1` column
    # referenced before is only created in later sections, so this cell raised
    # KeyError when the notebook was run top to bottom.
    y_pred = subset['pred']

    tpr, fpr, precision, sample_size = calculate_metrics(subset, 'pred', verbose=False)
    ci = bootstrap_metrics(y_true, y_pred)
    records.append({'Group': group,
                    'Value': value,
                    'Equal Opportunity (TPR)': f"{tpr:.3f} ({ci['TPR'][0]:.3f}–{ci['TPR'][1]:.3f})",
                    'FPR': f"{fpr:.3f} ({ci['FPR'][0]:.3f}–{ci['FPR'][1]:.3f})",
                    # The header promises |FPR-TPR|, so report the absolute gap
                    # (the signed fpr-tpr was printed before).
                    # NOTE(review): the bracketed range is the gap between the two metrics'
                    # interval endpoints, not a bootstrap interval of the gap itself.
                    'Equalized Odds (|FPR-TPR|)': f"{abs(fpr - tpr):.3f} ({abs(ci['FPR'][0] - ci['TPR'][0]):.3f}–{abs(ci['FPR'][1] - ci['TPR'][1]):.3f})",
                    'Precision (Predictive Parity)': f"{precision:.3f} ({ci['Precision'][0]:.3f}–{ci['Precision'][1]:.3f})",
                    'Sample Size': len(subset)})

df_metrics = pd.DataFrame(records).set_index(['Group', 'Value'])
df_metrics.to_csv(outfn, index=True, header=True)
df_metrics


## Using concat summaries, with tabular data

In [None]:
# Paths for this variant (concatenated notes plus the tabular preamble), then load the input table.
# NOTE(review): the name `dir` shadows Python's built-in dir().
dir = '/content/drive/MyDrive/HSPH/Courses/MIT6.7930/AI Bias for AD/AI Bias AD/Gemini Prediction Model'
suffix = 'gemini_concat_tabular'
# NOTE(review): the first section reads this file from the 'summaries_predictions/'
# subfolder; here it is read from the project folder itself — confirm which location is current.
infn = dir + '/13_gemini_summaries.csv'
# Destination of the subgroup metrics CSV.
outfn = dir + '/14_gemini_predictions-' + suffix + '.csv'

df = pd.read_csv(infn)
print(df.shape)
df['case_status'].value_counts()

In [None]:
def df_to_jsonl_gcs(df, bucket_name, blob_name):
    """Build one Gemini batch request per row of ``df`` and upload them to GCS as JSONL.

    Each request is keyed by the row's ``subject_id`` and carries a prompt made
    of fixed instructions, the preamble from ``get_demo_info(row)`` and the
    row's ``concatenated_notes`` text.

    Args:
        df: DataFrame with ``subject_id`` and ``concatenated_notes`` columns
            plus every column that ``get_demo_info`` reads.
        bucket_name: Name of the Cloud Storage bucket; it is created in the
            ``US`` location if it does not exist yet.
        blob_name: Object name for the JSONL file inside the bucket.

    Returns:
        The ``gs://`` URI of the uploaded file.
    """

    # Initialize a GCS client and make sure the bucket exists.
    storage_client = storage.Client(project=PROJECT_ID)
    bucket = storage_client.bucket(bucket_name)
    if not bucket.exists():
        bucket.create(location='US')
        print(f'Bucket {bucket_name} created.')
    else:
        print(f'Bucket {bucket_name} already exists.')

    blob = bucket.blob(blob_name)

    # Collect one JSON line per row and join once at the end instead of
    # growing a single string with += inside the loop.
    jsonl_lines = []
    for _, row in df.iterrows():
        ### TO DO: Provide Gemini a prompt ##############################
        # Edit the prompt to tell Gemini how to handle your input text
        text = row['concatenated_notes']#[:50000]

        demo = get_demo_info(row)

        prompt = f"""You are a neurologist tasked with diagnosing alzheimer's disease based on a summary of a patient's clinical notes.
        Please read the following clinical notes summaries and assess whether there is documented evidence suggestive of Alzheimer's Disease (AD).
        Consider cognitive symptoms such as progressive memory loss, disorientation, language difficulties, and behavioral changes
        as well as test results such as cognitive assessments or neuroimaging findings indicative of brain atrophy or AD markers.
        Do not include temporary confusion from infection, medication, or unrelated acute illness.

        Respond with one of the following labels:
        - LIKELY_AD: clear evidence consistent with Alzheimer's Disease
        - POSSIBLE_AD: some suggestive signs, but incomplete documentation
        - UNLIKELY_AD: No indication of Alzheimer's Disease or dementia

        Also provide a 1-2 sentence justification summarizing the key evidence from the note.

        {demo}
        Clinical Notes: {text}"""
        #################################################################

        json_data = {
            "id": row['subject_id'],
            "request": {
                "contents": [
                    {
                        "role": "user",
                        "parts": [{"text": prompt}]
                    }
                ],
                "generationConfig": {"temperature": 0.4, "maxOutputTokens": 2048},
            }
        }
        jsonl_lines.append(json.dumps(json_data) + '\n')

    # Upload the JSONL data to GCS
    blob.upload_from_string(''.join(jsonl_lines), content_type='application/jsonl')
    print(f"JSONL file uploaded to gs://{bucket_name}/{blob_name}")

    return f"gs://{bucket_name}/{blob_name}"

### TODO: change the bucket name to one you own ###################

BUCKET_NAME = 'project_gemini_predictions'
# Build the request file from `df` and upload it; the returned gs:// URI is the job input.
input_uri = df_to_jsonl_gcs(df, BUCKET_NAME, 'gemini_batch_requests.jsonl')

#################################################################

# Prefix under which the batch job is asked to write its output.
output_uri = f"gs://{BUCKET_NAME}/batch-prediction/"

# Submit a batch prediction job with Gemini model
batch_prediction_job = BatchPredictionJob.submit(
    source_model="gemini-1.5-flash-001",
    input_dataset=input_uri,
    output_uri_prefix=output_uri,
)

# Check job status
print(f"Job resource name: {batch_prediction_job.resource_name}")
print(f"Model resource name with the job: {batch_prediction_job.model_name}")
print(f"Job state: {batch_prediction_job.state.name}")

# Poll every 5 seconds until the job reports that it has ended.
while not batch_prediction_job.has_ended:
    time.sleep(5)
    batch_prediction_job.refresh()

# Report the outcome; a failure is only printed, so later cells still run.
if batch_prediction_job.has_succeeded:
    print("Job succeeded!")
else:
    print(f"Job failed: {batch_prediction_job.error}")

# Check the location of the output
print(f"Job output location: {batch_prediction_job.output_location}")

# Example response:
#  Job output location: gs://your-bucket/gen-ai-batch-prediction/prediction-model-year-month-day-hour:minute:second.12345

In [None]:
# Load the batch job's JSONL output into a DataFrame.
# `path` is a hard-coded output folder from one specific run; replace it with the
# "Job output location" printed by the cell above when re-running.
# NOTE(review): reading a gs:// URL with pandas presumably needs gcsfs installed — confirm.
path = 'gs://project_gemini_predictions/batch-prediction/prediction-model-2025-05-02T04:30:23.225257Z'
output_path = path + '/predictions.jsonl'

results = pd.read_json(output_path, lines=True)
# Flatten the `candidates` records of each response and join them onto `results` by row index.
results = results.join(pd.json_normalize(results["response"], "candidates"))
print(results.shape)

# Note some inputs may not generate predictions due to SAFETY constraints.
# extract_text returns '' for those, and such rows are dropped here.
# The 'summary' column created below holds Gemini's response text.
results['summary'] = results['response'].apply(extract_text)
results = results[results['summary'] != '']
print(results.shape)

In [None]:
# Turn Gemini's free-text answers into a label column and two binary predictions.
res_df = results[['id', 'summary']].copy()
res_df.columns = ['subject_id', 'gemini_text']
# expand=False returns a Series (one capture group) instead of a one-column DataFrame.
res_df['gemini_pred'] = res_df['gemini_text'].str.extract(pattern, expand=False)

# A response containing none of the three labels yields NaN. Previously such rows
# were counted as positive by pred_v1 and as negative by pred_v2; they are now
# reported and excluded from both.
n_unparsed = int(res_df['gemini_pred'].isna().sum())
if n_unparsed:
    print(f'Dropping {n_unparsed} response(s) with no recognizable label')
    res_df = res_df.dropna(subset=['gemini_pred']).copy()

# pred_v1: POSSIBLE_AD and LIKELY_AD are positive. pred_v2: only LIKELY_AD is positive.
res_df['pred_v1'] = (res_df['gemini_pred'] != 'UNLIKELY_AD').astype(int)
res_df['pred_v2'] = (res_df['gemini_pred'] == 'LIKELY_AD').astype(int)
print(res_df.shape)
print(res_df['gemini_pred'].value_counts())
print(res_df['pred_v1'].value_counts())
print(res_df['pred_v2'].value_counts())

In [None]:
# Attach the patient table to the predictions (only subjects present in both),
# add the age_group column, then print overall metrics for both prediction rules.
final = bin_age(res_df.merge(df, on='subject_id', how='inner'))

print(final.shape)
print('\nPredictions V1:')
tpr, fpr, precision, n = calculate_metrics(final, 'pred_v1', verbose=True)
print('\nPredictions V2:')
tpr, fpr, precision, n = calculate_metrics(final, 'pred_v2', verbose=True)

In [None]:
# Fairness metrics per demographic subgroup, using the pred_v1 rule.
records = []
for group in demo_fts:
  for value in final[group].dropna().unique():
    # `value` is taken from the column itself, so the subset always has at least
    # one row; the former zero-row fallback branch could never run and was removed.
    subset = final[final[group] == value]
    y_true = subset['case_status']
    y_pred = subset['pred_v1']

    tpr, fpr, precision, sample_size = calculate_metrics(subset, 'pred_v1', verbose=False)
    ci = bootstrap_metrics(y_true, y_pred)
    records.append({'Group': group,
                    'Value': value,
                    'Equal Opportunity (TPR)': f"{tpr:.3f} ({ci['TPR'][0]:.3f}–{ci['TPR'][1]:.3f})",
                    'FPR': f"{fpr:.3f} ({ci['FPR'][0]:.3f}–{ci['FPR'][1]:.3f})",
                    # The header promises |FPR-TPR|, so report the absolute gap
                    # (the signed fpr-tpr was printed before).
                    # NOTE(review): the bracketed range is the gap between the two metrics'
                    # interval endpoints, not a bootstrap interval of the gap itself.
                    'Equalized Odds (|FPR-TPR|)': f"{abs(fpr - tpr):.3f} ({abs(ci['FPR'][0] - ci['TPR'][0]):.3f}–{abs(ci['FPR'][1] - ci['TPR'][1]):.3f})",
                    'Precision (Predictive Parity)': f"{precision:.3f} ({ci['Precision'][0]:.3f}–{ci['Precision'][1]:.3f})",
                    'Sample Size': len(subset)})

df_metrics = pd.DataFrame(records).set_index(['Group', 'Value'])
# Group and Value live in the index, so the index must be written; with
# index=False the saved CSV had no subgroup labels at all.
df_metrics.to_csv(outfn, index=True, header=True)
df_metrics


## Using full summaries, without tabular data

In [None]:
# Paths for this variant (summary text only, no tabular preamble), then load the input table.
# NOTE(review): the name `dir` shadows Python's built-in dir().
dir = '/content/drive/MyDrive/HSPH/Courses/MIT6.7930/AI Bias for AD/AI Bias AD/Gemini Prediction Model'
suffix = 'gemini_summaries'
# NOTE(review): the first section reads this file from the 'summaries_predictions/'
# subfolder; here it is read from the project folder itself — confirm which location is current.
infn = dir + '/13_gemini_summaries.csv'
# Destination of the subgroup metrics CSV.
outfn = dir + '/14_gemini_predictions-' + suffix + '.csv'

df = pd.read_csv(infn)
print(df.shape)
df['case_status'].value_counts()

In [None]:
def df_to_jsonl_gcs(df, bucket_name, blob_name):
    """Build one Gemini batch request per row of ``df`` and upload them to GCS as JSONL.

    Each request is keyed by the row's ``subject_id`` and carries a prompt made
    of fixed instructions followed by the row's ``summary`` text. No
    demographic preamble is included in this variant.

    Args:
        df: DataFrame with ``subject_id`` and ``summary`` columns.
        bucket_name: Name of the Cloud Storage bucket; it is created in the
            ``US`` location if it does not exist yet.
        blob_name: Object name for the JSONL file inside the bucket.

    Returns:
        The ``gs://`` URI of the uploaded file.
    """

    # Initialize a GCS client and make sure the bucket exists.
    storage_client = storage.Client(project=PROJECT_ID)
    bucket = storage_client.bucket(bucket_name)
    if not bucket.exists():
        bucket.create(location='US')
        print(f'Bucket {bucket_name} created.')
    else:
        print(f'Bucket {bucket_name} already exists.')

    blob = bucket.blob(blob_name)

    # Collect one JSON line per row and join once at the end instead of
    # growing a single string with += inside the loop.
    jsonl_lines = []
    for _, row in df.iterrows():
        ### TO DO: Provide Gemini a prompt ##############################
        # Edit the prompt to tell Gemini how to handle your input text
        text = row['summary']#[:50000]

        prompt = f"""You are a neurologist tasked with diagnosing alzheimer's disease based on a summary of a patient's clinical notes.
        Please read the following clinical notes summaries and assess whether there is documented evidence suggestive of Alzheimer's Disease (AD).
        Consider cognitive symptoms such as progressive memory loss, disorientation, language difficulties, and behavioral changes
        as well as test results such as cognitive assessments or neuroimaging findings indicative of brain atrophy or AD markers.
        Do not include temporary confusion from infection, medication, or unrelated acute illness.

        Respond with one of the following labels:
        - LIKELY_AD: clear evidence consistent with Alzheimer's Disease
        - POSSIBLE_AD: some suggestive signs, but incomplete documentation
        - UNLIKELY_AD: No indication of Alzheimer's Disease or dementia

        Also provide a 1-2 sentence justification summarizing the key evidence from the note.

        Clinical Notes: {text}"""
        #################################################################

        json_data = {
            "id": row['subject_id'],
            "request": {
                "contents": [
                    {
                        "role": "user",
                        "parts": [{"text": prompt}]
                    }
                ],
                "generationConfig": {"temperature": 0.4, "maxOutputTokens": 2048},
            }
        }
        jsonl_lines.append(json.dumps(json_data) + '\n')

    # Upload the JSONL data to GCS
    blob.upload_from_string(''.join(jsonl_lines), content_type='application/jsonl')
    print(f"JSONL file uploaded to gs://{bucket_name}/{blob_name}")

    return f"gs://{bucket_name}/{blob_name}"

### TODO: change the bucket name to one you own ###################

BUCKET_NAME = 'project_gemini_predictions'
# Build the request file from `df` and upload it; the returned gs:// URI is the job input.
input_uri = df_to_jsonl_gcs(df, BUCKET_NAME, 'gemini_batch_requests.jsonl')

#################################################################

# Prefix under which the batch job is asked to write its output.
output_uri = f"gs://{BUCKET_NAME}/batch-prediction/"

# Submit a batch prediction job with Gemini model
batch_prediction_job = BatchPredictionJob.submit(
    source_model="gemini-1.5-flash-001",
    input_dataset=input_uri,
    output_uri_prefix=output_uri,
)

# Check job status
print(f"Job resource name: {batch_prediction_job.resource_name}")
print(f"Model resource name with the job: {batch_prediction_job.model_name}")
print(f"Job state: {batch_prediction_job.state.name}")

# Poll every 5 seconds until the job reports that it has ended.
while not batch_prediction_job.has_ended:
    time.sleep(5)
    batch_prediction_job.refresh()

# Report the outcome; a failure is only printed, so later cells still run.
if batch_prediction_job.has_succeeded:
    print("Job succeeded!")
else:
    print(f"Job failed: {batch_prediction_job.error}")

# Check the location of the output
print(f"Job output location: {batch_prediction_job.output_location}")

# Example response:
#  Job output location: gs://your-bucket/gen-ai-batch-prediction/prediction-model-year-month-day-hour:minute:second.12345

In [None]:
# Load the batch job's JSONL output into a DataFrame.
# `path` is a hard-coded output folder from one specific run; replace it with the
# "Job output location" printed by the cell above when re-running.
# NOTE(review): reading a gs:// URL with pandas presumably needs gcsfs installed — confirm.
path = 'gs://project_gemini_predictions/batch-prediction/prediction-model-2025-05-02T04:36:29.076758Z'
output_path = path + '/predictions.jsonl'

results = pd.read_json(output_path, lines=True)
# Flatten the `candidates` records of each response and join them onto `results` by row index.
results = results.join(pd.json_normalize(results["response"], "candidates"))
print(results.shape)

# Note some inputs may not generate predictions due to SAFETY constraints.
# extract_text returns '' for those, and such rows are dropped here.
# The 'summary' column created below holds Gemini's response text.
results['summary'] = results['response'].apply(extract_text)
results = results[results['summary'] != '']
print(results.shape)

In [None]:
# Turn Gemini's free-text answers into a label column and two binary predictions.
res_df = results[['id', 'summary']].copy()
res_df.columns = ['subject_id', 'gemini_text']
# expand=False returns a Series (one capture group) instead of a one-column DataFrame.
res_df['gemini_pred'] = res_df['gemini_text'].str.extract(pattern, expand=False)

# A response containing none of the three labels yields NaN. Previously such rows
# were counted as positive by pred_v1 and as negative by pred_v2; they are now
# reported and excluded from both.
n_unparsed = int(res_df['gemini_pred'].isna().sum())
if n_unparsed:
    print(f'Dropping {n_unparsed} response(s) with no recognizable label')
    res_df = res_df.dropna(subset=['gemini_pred']).copy()

# pred_v1: POSSIBLE_AD and LIKELY_AD are positive. pred_v2: only LIKELY_AD is positive.
res_df['pred_v1'] = (res_df['gemini_pred'] != 'UNLIKELY_AD').astype(int)
res_df['pred_v2'] = (res_df['gemini_pred'] == 'LIKELY_AD').astype(int)
print(res_df.shape)
print(res_df['gemini_pred'].value_counts())
print(res_df['pred_v1'].value_counts())
print(res_df['pred_v2'].value_counts())

In [None]:
# Attach the patient table to the predictions (only subjects present in both),
# add the age_group column, then print overall metrics for both prediction rules.
final = bin_age(res_df.merge(df, on='subject_id', how='inner'))

print(final.shape)
print('\nPredictions V1:')
tpr, fpr, precision, n = calculate_metrics(final, 'pred_v1', verbose=True)
print('\nPredictions V2:')
tpr, fpr, precision, n = calculate_metrics(final, 'pred_v2', verbose=True)

In [None]:
# Fairness metrics per demographic subgroup, using the pred_v1 rule.
records = []
for group in demo_fts:
  for value in final[group].dropna().unique():
    # `value` is taken from the column itself, so the subset always has at least
    # one row; the former zero-row fallback branch could never run and was removed.
    subset = final[final[group] == value]
    y_true = subset['case_status']
    y_pred = subset['pred_v1']

    tpr, fpr, precision, sample_size = calculate_metrics(subset, 'pred_v1', verbose=False)
    ci = bootstrap_metrics(y_true, y_pred)
    records.append({'Group': group,
                    'Value': value,
                    'Equal Opportunity (TPR)': f"{tpr:.3f} ({ci['TPR'][0]:.3f}–{ci['TPR'][1]:.3f})",
                    'FPR': f"{fpr:.3f} ({ci['FPR'][0]:.3f}–{ci['FPR'][1]:.3f})",
                    # The header promises |FPR-TPR|, so report the absolute gap
                    # (the signed fpr-tpr was printed before).
                    # NOTE(review): the bracketed range is the gap between the two metrics'
                    # interval endpoints, not a bootstrap interval of the gap itself.
                    'Equalized Odds (|FPR-TPR|)': f"{abs(fpr - tpr):.3f} ({abs(ci['FPR'][0] - ci['TPR'][0]):.3f}–{abs(ci['FPR'][1] - ci['TPR'][1]):.3f})",
                    'Precision (Predictive Parity)': f"{precision:.3f} ({ci['Precision'][0]:.3f}–{ci['Precision'][1]:.3f})",
                    'Sample Size': len(subset)})

df_metrics = pd.DataFrame(records).set_index(['Group', 'Value'])
# Group and Value live in the index, so the index must be written; with
# index=False the saved CSV had no subgroup labels at all.
df_metrics.to_csv(outfn, index=True, header=True)
df_metrics


## Using full notes, with tabular data

In [None]:
# Paths for this variant (full notes plus the tabular preamble), then load the input table.
# NOTE(review): the name `dir` shadows Python's built-in dir().
dir = '/content/drive/MyDrive/HSPH/Courses/MIT6.7930/AI Bias for AD/AI Bias AD/Gemini Prediction Model'
suffix = 'gemini_all_tabular'
# This variant reads a different input file from the other sections.
infn = dir + '/13_gemini_all.csv'
# Destination of the subgroup metrics CSV.
outfn = dir + '/14_gemini_predictions-' + suffix + '.csv'

df = pd.read_csv(infn)
print(df.shape)
df['case_status'].value_counts()

In [None]:
def df_to_jsonl_gcs(df, bucket_name, blob_name, text_col='all_notes'):
    """Builds one Gemini batch request per row and uploads them to GCS as JSONL.

    Every row of ``df`` becomes one JSON line of the form
    ``{"id": <subject_id>, "request": {...}}``. The request prompt is the fixed
    instruction text followed by the output of ``get_demo_info(row)`` and the
    row's note text. The bucket is created (location 'US') if it does not
    exist yet.

    Args:
        df: Pandas DataFrame with a 'subject_id' column, the column named by
            ``text_col``, and whatever columns ``get_demo_info`` reads.
        bucket_name: Name of your Google Cloud Storage bucket.
        blob_name: Desired name for the JSONL file on GCS.
        text_col: Column whose text is placed after "Clinical Notes:" in the
            prompt. Defaults to 'all_notes'.

    Returns:
        The ``gs://`` URI of the uploaded JSONL file.
    """

    # Initialize a GCS client and make sure the target bucket exists
    storage_client = storage.Client(project=PROJECT_ID)
    bucket = storage_client.bucket(bucket_name)
    if not bucket.exists():
        bucket.create(location='US')
        print(f'Bucket {bucket_name} created.')
    else:
        print(f'Bucket {bucket_name} already exists.')

    blob = bucket.blob(blob_name)

    # Collect the serialized requests in a list and join once at the end;
    # the notes can be large, so repeated string concatenation is avoided.
    request_lines = []
    for _, row in df.iterrows():
        # Edit the prompt below to tell Gemini how to handle your input text.
        # The text is sent untruncated; slice it here if it needs capping.
        text = row[text_col]

        demo = get_demo_info(row)

        # NOTE(review): the wording below speaks of a "summary" of the notes,
        # while the text inserted is whatever `text_col` holds -- confirm the
        # wording is intended for this run before changing it.
        prompt = f"""You are a neurologist tasked with diagnosing alzheimer's disease based on a summary of a patient's clinical notes.
        Please read the following clinical notes summaries and assess whether there is documented evidence suggestive of Alzheimer's Disease (AD).
        Consider cognitive symptoms such as progressive memory loss, disorientation, language difficulties, and behavioral changes
        as well as test results such as cognitive assessments or neuroimaging findings indicative of brain atrophy or AD markers.
        Do not include temporary confusion from infection, medication, or unrelated acute illness.

        Respond with one of the following labels:
        - LIKELY_AD: clear evidence consistent with Alzheimer's Disease
        - POSSIBLE_AD: some suggestive signs, but incomplete documentation
        - UNLIKELY_AD: No indication of Alzheimer's Disease or dementia

        Also provide a 1-2 sentence justification summarizing the key evidence from the note.

        {demo}

        Clinical Notes: {text}"""

        json_data = {
            "id": row['subject_id'],
            "request": {
                "contents": [
                    {
                        "role": "user",
                        "parts": [{"text": prompt}]
                    }
                ],
                "generationConfig": {"temperature": 0.4, "maxOutputTokens": 2048},

            }
        }
        request_lines.append(json.dumps(json_data))

    # One request per line, each terminated by a newline
    jsonl_data = ''.join(line + '\n' for line in request_lines)

    # Upload the JSONL data to GCS
    blob.upload_from_string(jsonl_data, content_type='application/jsonl')
    print(f"JSONL file uploaded to gs://{bucket_name}/{blob_name}")

    return f"gs://{bucket_name}/{blob_name}"

### TO DO: Change bucket name ##############################

BUCKET_NAME = 'project_gemini_predictions'

#################################################################

# Batch results are written under this prefix of the same bucket.
output_uri = f"gs://{BUCKET_NAME}/batch-prediction/"

# Serialize the requests for every row of `df` and upload them as the job input.
input_uri = df_to_jsonl_gcs(df, BUCKET_NAME, 'gemini_batch_requests.jsonl')

# Launch the Gemini batch prediction job
batch_prediction_job = BatchPredictionJob.submit(
    source_model="gemini-1.5-flash-001",
    input_dataset=input_uri,
    output_uri_prefix=output_uri,
)

# Report what was submitted
print(f"Job resource name: {batch_prediction_job.resource_name}")
print(f"Model resource name with the job: {batch_prediction_job.model_name}")
print(f"Job state: {batch_prediction_job.state.name}")

# Poll every 5 seconds until the job reaches a terminal state
while True:
    if batch_prediction_job.has_ended:
        break
    time.sleep(5)
    batch_prediction_job.refresh()

# Report the outcome
print(
    "Job succeeded!"
    if batch_prediction_job.has_succeeded
    else f"Job failed: {batch_prediction_job.error}"
)

# Where the predictions were written, e.g.
#  Job output location: gs://your-bucket/gen-ai-batch-prediction/prediction-model-year-month-day-hour:minute:second.12345
print(f"Job output location: {batch_prediction_job.output_location}")

In [None]:
# Read the batch predictions back from Google Cloud Storage.
# `path` is the output folder of a finished batch job (the value printed as
# "Job output location" when the job was submitted); the predictions are the
# predictions.jsonl file inside it.
path = 'gs://project_gemini_predictions/batch-prediction/prediction-model-2025-05-02T04:42:25.287375Z'
output_path = f'{path}/predictions.jsonl'

results = pd.read_json(output_path, lines=True)
# Flatten the "candidates" records of each response and attach them by index.
candidate_cols = pd.json_normalize(results["response"], "candidates")
results = results.join(candidate_cols)
print(results.shape)

# Some inputs yield no text (e.g. blocked by SAFETY constraints); extract_text
# returns '' for those, and they are dropped here.
results['summary'] = results['response'].apply(extract_text)
has_text = results['summary'].ne('')
results = results.loc[has_text]
print(results.shape)

In [None]:
# Pull the label Gemini chose out of each free-text answer (using `pattern`,
# defined earlier in the notebook) and derive two binary codings from it.
res_df = results[['id', 'summary']].copy()
res_df.columns = ['subject_id', 'gemini_text']
res_df['gemini_pred'] = res_df['gemini_text'].str.extract(pattern)

# pred_v1: everything except UNLIKELY_AD is positive.
# pred_v2: only LIKELY_AD is positive.
res_df['pred_v1'] = (res_df['gemini_pred'] != 'UNLIKELY_AD').astype(int)
res_df['pred_v2'] = (res_df['gemini_pred'] == 'LIKELY_AD').astype(int)

# Answers in which no label was found are NaN in gemini_pred. They end up as
# pred_v1 = 1 and pred_v2 = 0, so make them visible instead of letting
# value_counts() drop them silently.
# NOTE(review): decide whether such rows should be excluded from the metrics.
n_unparsed = int(res_df['gemini_pred'].isna().sum())
if n_unparsed:
  print(f'WARNING: no label could be extracted for {n_unparsed} response(s); '
        'they are coded as pred_v1=1 / pred_v2=0')

print(res_df.shape)
print(res_df['gemini_pred'].value_counts(dropna=False))
print(res_df['pred_v1'].value_counts())
print(res_df['pred_v2'].value_counts())

In [None]:
# Attach the source columns to each parsed prediction (rows present in both
# frames only), then pass the result through bin_age (defined earlier).
final = bin_age(res_df.merge(df, on='subject_id', how='inner'))
print(final.shape)

# Score both binary codings of the Gemini label against the reference labels.
# The positional True is presumably the `verbose` flag -- TODO confirm.
for header, pred_col in (('\nPredictions V1:', 'pred_v1'),
                         ('\nPredictions V2:', 'pred_v2')):
  print(header)
  tpr,fpr, precision, n = calculate_metrics(final, pred_col, True)

In [None]:
# Build a per-subgroup fairness table for the V1 predictions: one row per
# (demographic feature, observed value) with a point estimate from
# calculate_metrics and an interval from bootstrap_metrics for each metric.
metric_columns = ['Group',
                  'Value',
                  'Equal Opportunity (TPR)',
                  'FPR',
                  'Equalized Odds (|FPR-TPR|)',
                  'Precision (Predictive Parity)',
                  'Sample Size']

records = []
for group in demo_fts:
  # unique() over the non-null entries only yields values that occur in
  # `final`, so every subset below contains at least one row.
  for value in final[group].dropna().unique():
    subset = final[final[group] == value]
    y_true = subset['case_status']
    y_pred = subset['pred_v1']

    tpr, fpr, precision, sample_size = calculate_metrics(subset, 'pred_v1', verbose=False)
    # ci[metric] is indexed as [0] / [1]; presumably (lower, upper) bounds.
    ci = bootstrap_metrics(y_true, y_pred)

    # NOTE(review): this range is derived from the endpoints of the separate
    # TPR and FPR intervals, not from bootstrapping the gap itself, so treat
    # it as a rough indication only. The two ends are ordered for display.
    gap_lo, gap_hi = sorted((abs(ci['FPR'][0] - ci['TPR'][0]),
                             abs(ci['FPR'][1] - ci['TPR'][1])))

    records.append({'Group': group,
                    'Value': value,
                    'Equal Opportunity (TPR)': f"{tpr:.3f} ({ci['TPR'][0]:.3f}–{ci['TPR'][1]:.3f})",
                    'FPR': f"{fpr:.3f} ({ci['FPR'][0]:.3f}–{ci['FPR'][1]:.3f})",
                    # The column is labelled |FPR-TPR|, so report the absolute gap.
                    'Equalized Odds (|FPR-TPR|)': f"{abs(fpr - tpr):.3f} ({gap_lo:.3f}–{gap_hi:.3f})",
                    'Precision (Predictive Parity)': f"{precision:.3f} ({ci['Precision'][0]:.3f}–{ci['Precision'][1]:.3f})",
                    'Sample Size': len(subset)})

# Passing the column list keeps the frame well-formed even if no rows were collected.
df_metrics = pd.DataFrame(records, columns=metric_columns).set_index(['Group', 'Value'])
# Group and Value live in the index, so the index has to be written out;
# otherwise the saved rows cannot be matched to their subgroup.
df_metrics.to_csv(outfn, index=True, header=True)
df_metrics
