In [2]:
import openai
import json
import os
import re
import pandas as pd
from datetime import datetime
from typing import List, Dict, Any, Tuple
from langchain_community.document_loaders import PyPDFLoader
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import time

In [3]:
# Read the report table and keep only the columns the prompts need
# (identifier, modality, exam description, patient age, findings text).
df = pd.read_csv('data/report_findings.csv')[
    ['Patient Counter', 'Modality', 'Exam Description', 'ages', 'Findings']
]
df.head()

Unnamed: 0,Patient Counter,Modality,Exam Description,ages,Findings
0,21,MRI,MR ABDOMEN WITH AND WITHOUT CONTRAST,76,Liver: Hepatic steatosis. Minimal nodular con...
1,22,CT,CT ABDOMEN WITH CONTRAST,68,Visualized lung bases: Unremarkable Liver: H...
2,23,MRI,MR MRCP WITH AND WITHOUT CONTRAST,71,Liver: Unremarkable Gallbladder: Surgically ...
3,24,MRI,MR ABDOMEN WITH AND WITHOUT CONTRAST,66,Liver: Cirrhotic liver morphology. Untreated ...
4,25,CT,CT ABDOMEN/PELVIS WITH CONTRAST,89,Visualized lung bases: Bibasilar atelectasis ...


In [15]:
# Prompting strategies toggled for the batch run:
#   option_1 - plain model prompt (guideline referenced by title/DOI only)
#   option_2 - prompt augmented with text chunks retrieved from the guideline PDF
#   option_3 - prompt with the flowchart JSON embedded in full
#   option_4 - prompt with flowchart entries retrieved through the flowchart RAG
option_1 = True
option_2 = True
option_3 = True
option_4 = False

In [4]:
from openai import AzureOpenAI

# Choose the chat/embedding backend: the Azure-hosted deployment when
# azure_client is True, otherwise the public OpenAI API.
client = None
azure_client = True

if azure_client:
  client = AzureOpenAI(
      # Read the key from the environment so it never has to be pasted into
      # (and saved with) the notebook; falls back to the empty placeholder.
      api_key=os.environ.get('AZURE_OPENAI_API_KEY', ''),
      api_version='2024-02-01',
      azure_endpoint='https://unified-api.ucsf.edu/general',
  )
  # The trailing index picks one deployment out of the documented candidates.
  EMBEDDING_MODEL = ['text-embedding-3-small-1', 'text-embedding-3-large-1', 'text-embedding-ada-002'][0]
  MODEL = ['gpt-35-turbo', 'gpt-35-turbo-0301', 'gpt-4', 'gpt-35-turbo-16K', 'gpt-4-32K', 'gpt-4-turbo-128k', 'gpt-4o-2024-05-13'][-1]
  print("---> work on versa!")
else:
  # Same approach for the public API: key comes from the environment.
  client = openai.Client(api_key=os.environ.get('OPENAI_API_KEY', ''))
  EMBEDDING_MODEL = "text-embedding-3-small"
  MODEL = "gpt-4o"
  print("---> work on openai!")

print("--->", EMBEDDING_MODEL, MODEL)

---> work on versa!
---> text-embedding-3-small-1 gpt-4o-2024-05-13


## Archived: Test code for Flowchart RAG, not for this study

In [32]:
from langchain_community.document_loaders import JSONLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import AzureOpenAIEmbeddings
import json
import shutil
import os

# Directory in which the Chroma vector store is persisted.
CHROMA_PATH = "chroma"

# Embedding backend used both to build and to query the Chroma store.
embedding_function = AzureOpenAIEmbeddings(
    model = 'text-embedding-3-small-1',
    # Take the key from the environment instead of hard-coding it in the
    # notebook; falls back to the original empty placeholder when unset.
    api_key=os.environ.get('AZURE_OPENAI_API_KEY', ''),
    openai_api_version='2024-02-01',
    azure_endpoint='https://unified-api.ucsf.edu/general',
)

In [1]:
# Build the persisted Chroma vector store for the flowchart JSON (archived experiment).
# NOTE(review): this path has no "utils/" prefix, while the batch cell loads
# "utils/data2-3-table_included.json" — confirm both refer to the same file.
loader = JSONLoader(file_path="data2-3-table_included.json", jq_schema=".[]", text_content=False)
documents = loader.load()

# Remove any store left over from a previous run so it is rebuilt from scratch.
if os.path.exists(CHROMA_PATH):
    shutil.rmtree(CHROMA_PATH)
# Embed the loaded documents with embedding_function and persist them under CHROMA_PATH.
db = Chroma.from_documents(documents, embedding_function, persist_directory=CHROMA_PATH)
print(f"Save to {CHROMA_PATH}.")


Save to chroma.


In [33]:
def query_flowchart_rag(query, top_n=4):
    """Return the flowchart entries most relevant to ``query``.

    Args:
        query: Free-text question used for the similarity search.
        top_n: Number of entries to return.

    Returns:
        A list with up to ``top_n`` parsed JSON objects (one per stored
        document), ordered from most to least similar.
    """
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

    # Request exactly top_n hits; the search otherwise uses its own default k,
    # so a larger top_n was silently capped.
    docs_and_scores = db.similarity_search_with_score(query, k=top_n)
    # The score returned by Chroma is a distance: lower means more similar.
    # Sorting ascending keeps the best matches first (the previous descending
    # sort put the least relevant hits at the front).
    ranked = sorted(docs_and_scores, key=lambda pair: pair[1])
    return [json.loads(doc.page_content) for doc, _score in ranked[:top_n]]

In [12]:
# Manual smoke test of the flowchart retrieval with a hand-written cyst summary.
age = 81
summary = '**Condition of the Largest Pancreatic Cyst:**- Size: 1.4 cm- Position: Pancreatic body- MPD Communication: None - High-risk features/stigmata: None'
# Same query wording as the option-4 prompt helper, with the summary in place of the raw findings.
query = f'Given the patient age {age} at the presentation of incidentalomas with the findings from radiology report: {summary}.  Generate the most important follow-up recommendation for the largest incidental pancreatic cyst.'
# res holds the parsed flowchart entries returned by the retrieval helper.
res = query_flowchart_rag(query)

0.4999521080739323
0.49671225212563813
{'paper': 'Management of Incidental Pancreatic Cysts: A White Paper of the ACR Incidental Findings Committee', 'type': 'Fig', 'location': 'Fig 1', 'year': 2017, 'title': 'Flowchart (Chart 1) specifying the management of incidental pancreatic cysts <1.5 cm. EUS ¼ endoscopic ultrasound; FNA ¼ ﬁne needle aspiration; MPD ¼ main pancreatic duct.', 'description': '< 65 years at presentation\n\n<1.5 cm Incidental pancreatic cyst\n\n- Reimage every 1 year for 5 times¹,²,⁸\n  - Stable over initial 5 years¹\n    - Reimage every 2 years for 2 times¹,²,⁸\n      - STOP if stable over MINIMUM 9 years⁵\n  - Interval growth³**\n    - Cyst is still <1.5 cm⁴\n      - Reimage every 1 year or EUS / FNA²,³,⁴,⁸\n        - STOP if cyst <1.5 cm over MINIMUM 10 years⁵\n    - Cyst is ≥1.5 cm⁴\n      - Move to Figure 2 or EUS / FNA⁴**\n\n65-79 years at presentation\n\n<1.5 cm Incidental pancreatic cyst\n\n- Reimage every 2 years for 5 times¹,²,⁸\n  - STOP if stable over 10 

## Guideline RAG Prep

In [5]:
# PDF RAG
# Length, in characters, of each text chunk sliced from the extracted PDF text.
CHUNK_SIZE = 2500
# JSON cache that stores the chunk -> embedding pairs plus per-file metadata.
JSON_FILE_PATH = "utils/embeddings_pancreatic.json"

def load_json(file_path: str) -> Dict[str, Any]:
    """Load a JSON document from ``file_path``.

    Args:
        file_path: Location of the JSON file.

    Returns:
        The parsed JSON content, or an empty dict when the file does not exist.
    """
    if not os.path.exists(file_path):
        return {}
    # Decode explicitly as UTF-8: the platform default encoding is not UTF-8
    # everywhere, and the flowchart JSON contains non-ASCII characters.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def save_json(data: Dict[str, Any], file_path: str) -> None:
    """Write ``data`` to ``file_path`` as indented JSON.

    Args:
        data: JSON-serialisable mapping to persist.
        file_path: Destination file; missing parent directories are created.
    """
    # Create the target folder first so a missing directory cannot make the
    # write fail after the (expensive) data has already been computed.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4)

def normalize_text(text: str) -> str:
    """Lower-case ``text``, collapse whitespace runs, then drop punctuation.

    Every run of whitespace (spaces, tabs, newlines) becomes a single space
    before any character that is neither a word character nor whitespace is
    removed. Note that this also removes decimal points and comparison signs,
    and that removing a symbol standing between two spaces leaves both spaces.

    Returns:
        The normalised string without leading or trailing whitespace.
    """
    collapsed = re.sub(r'\s+', ' ', text.lower())
    without_punctuation = re.sub(r'[^\w\s]', '', collapsed)
    return without_punctuation.strip()

def get_text_chunks(text: str, chunk_size: int, step_size: int = 1500) -> List[str]:
    """Slice ``text`` into overlapping character windows.

    Args:
        text: Text to split.
        chunk_size: Maximum number of characters per chunk.
        step_size: Distance in characters between the starts of consecutive
            chunks (defaults to the previously hard-coded 1500).

    Returns:
        The chunks in document order; an empty list for empty text.

    Raises:
        ValueError: If ``chunk_size`` or ``step_size`` is not positive.
    """
    if chunk_size <= 0 or step_size <= 0:
        raise ValueError("chunk_size and step_size must be positive integers")
    # Never step further than one chunk length, otherwise the characters
    # between two consecutive windows would be silently dropped.
    stride = min(step_size, chunk_size)
    return [text[start:start + chunk_size] for start in range(0, len(text), stride)]

def get_embedding(text: str, model=EMBEDDING_MODEL, **kwargs) -> List[float]:
    """Embed a single piece of text with the configured client.

    Args:
        text: Text to embed.
        model: Embedding model/deployment name.
        **kwargs: Extra arguments forwarded to ``client.embeddings.create``.

    Returns:
        The embedding of the first (and only) input item.
    """
    # Newlines can negatively affect performance, so flatten them to spaces.
    single_line = text.replace("\n", " ")
    result = client.embeddings.create(input=[single_line], model=model, **kwargs)
    first_item = result.data[0]
    return first_item.embedding

def get_embeddings(text_chunks: List[str], model: str = EMBEDDING_MODEL) -> List[List[float]]:
    """Embed every chunk after normalising it.

    Args:
        text_chunks: Texts to embed, processed one request at a time.
        model: Embedding model/deployment name.

    Returns:
        One embedding per input chunk, in the same order.
    """
    vectors = []
    for chunk in text_chunks:
        cleaned = normalize_text(chunk)
        vectors.append(get_embedding(cleaned, model=model))
    return vectors

def extract_text_from_pdf(file_path: str) -> str:
    """Extract the text of a PDF as one newline-joined string.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        The text of every loaded document piece, whitespace-tidied and joined
        with newlines.
    """
    loader = PyPDFLoader(file_path)
    pages = loader.load_and_split()
    # Only tidy whitespace here. This text is later returned verbatim as the
    # retrieval context, and full normalisation strips decimal points and
    # comparison signs (e.g. "<1.5 cm" becomes "15 cm"), corrupting guideline
    # thresholds. Normalisation for the embedding input itself still happens
    # in get_embeddings.
    return "\n".join(re.sub(r'\s+', ' ', page.page_content).strip() for page in pages)

def prepare_rag(file_paths: List[str]):
    """Embed the given PDFs and cache the results in ``JSON_FILE_PATH``.

    Files whose base name is already recorded in the cache metadata are
    skipped. For every other file the text is extracted, chunked and
    embedded; each chunk is stored as a key mapping to its embedding, and the
    file size plus a timestamp are recorded under the file's base name. The
    cache is written once, after all files were handled.

    Args:
        file_paths: Paths of the PDF files to index.
    """
    store = load_json(JSON_FILE_PATH)
    pairs = store.setdefault("embedding_pairs", {})

    for pdf_path in file_paths:
        pdf_name = os.path.basename(pdf_path)
        already_indexed = pdf_name in store.get("metadata", {})
        if already_indexed:
            print(f"Embeddings for {pdf_name} already exist. Skipping.")
            continue

        size_in_bytes = os.path.getsize(pdf_path)
        pdf_chunks = get_text_chunks(extract_text_from_pdf(pdf_path), CHUNK_SIZE)
        vectors = get_embeddings(pdf_chunks, model=EMBEDDING_MODEL)
        # Chunk text is the key; a repeated chunk keeps its latest embedding.
        pairs.update(zip(pdf_chunks, vectors))

        store.setdefault("metadata", {})[pdf_name] = {
            "file_size": size_in_bytes,
            "date": datetime.now().isoformat(),
        }

    save_json(store, JSON_FILE_PATH)

def query_rag(query: str, threshold: float = 0.5, top_k: int = 5) -> str:
    """Retrieve the cached guideline chunks most similar to ``query``.

    Args:
        query: Free-text question; it is normalised and embedded the same way
            as the stored chunks.
        threshold: Minimum cosine similarity a chunk needs to be considered.
        top_k: Maximum number of chunks to return.

    Returns:
        The selected chunks joined by newlines, most similar first. This is an
        empty string when no chunk reaches ``threshold``, and a fixed notice
        when the cache holds no embeddings at all.
    """
    data = load_json(JSON_FILE_PATH)
    embedding_pairs = data.get("embedding_pairs")
    # A present-but-empty cache is treated like a missing one: passing an
    # empty list to cosine_similarity raises instead of returning no matches.
    if not embedding_pairs:
        return "No embeddings found. Please prepare the RAG system first."

    chunks = list(embedding_pairs.keys())
    embeddings = list(embedding_pairs.values())

    query_embedding = get_embeddings([query])[0]
    similarities = cosine_similarity([query_embedding], embeddings)[0]

    # Keep only chunks at or above the similarity threshold, best first.
    ranked = sorted(
        (i for i, score in enumerate(similarities) if score >= threshold),
        key=lambda i: similarities[i],
        reverse=True,
    )[:top_k]

    return "\n".join(chunks[i] for i in ranked)


## Prompt

In [None]:
# Prepare the RAG system with the provided PDFs
# NOTE(review): the entry below has neither a directory nor a ".pdf" extension —
# confirm it matches the file name on disk, since the path is opened as given.
file_paths = ['Management of Incidental Pancreatic Cysts A White Paper of the ACR Incidental Findings Committee']
# Files already listed in the cache metadata are skipped by prepare_rag.
prepare_rag(file_paths)

In [6]:
# Option 1 helper: plain prompt (patient age + imaging high-risk features + concise answer)
print("additional age + high risk + brief2")

def generate_prompt_simple(row):
    """Build the option-1 prompt, which cites the guideline by title and DOI only.

    Args:
        row: Mapping-like record providing "ages" and "Findings".

    Returns:
        The full prompt string for the chat model.
    """
    patient_age = int(row["ages"])
    report_findings = row["Findings"]

    # Each entry is one sentence group of the prompt; they are concatenated as-is.
    sections = [
        f'Given the patient age {patient_age} with the findings from the radiology report at the presentation of incidentalomas: {report_findings}. ',
        'Generate the most important follow-up recommendation for pancreatic cysts based on the guideline paper: Management of Incidental Pancreatic Cysts: A White Paper of the ACR Incidental Findings Committee (https://doi.org/10.1016/j.jacr.2017.03.010). ',
        'Here is some guidance for response: 1. Start with **Existence of Pancreatic Cyst:** ; Identify whether a pancreatic cyst exists/is visualized. If not, only respond: No pancreatic cyst is visualized. ',
        '2. Start with **Condition of the Largest Pancreatic Cyst:** - Size: - Position: - MPD Communication: -  High-risk features/stigmata: ; Summarize the condition of the pancreatic cyst, including the size, the position, establishment of main pancreatic duct (MPD) communication, and high-risk features/stigmata of the cyst. ',
        'If the findings have multiple dimensions for one cyst, use its longest axis diameter. For example, if the cyst has the dimensions of 1.1 x 1.5 x 1.4 cm or 1.4 x 1.5 cm, only response 1.5 cm in summarization. ',
        'Only consider the largest cyst for recommendation. ',
        f'3. Start with **Follow-up Recommendation:** ; Generate the most relevant follow-up recommendation based on the cyst condition and the patient age {patient_age}. Only mention follow-up method (and frequency) to keep the recommendation brief. ',
    ]
    return ''.join(sections)

additional age + high risk + brief2


In [7]:
# Option 2 helper: prompt with retrieved guideline text (patient age + imaging high-risk features + concise answer)
print("additional age + high risk + brief2")

def generate_prompt_RAG(row):
    """Build the option-2 prompt, grounded in chunks retrieved from the guideline PDF.

    Args:
        row: Mapping-like record providing "ages" and "Findings".

    Returns:
        The full prompt string, with the retrieved context embedded.
    """
    patient_age = int(row["ages"])
    report_findings = row["Findings"]

    # Retrieval query: the clinical situation plus the task description.
    retrieval_query = f'Given the patient age {patient_age} with the findings from the radiology report at the presentation of incidentalomas: {report_findings}. Generate the most important follow-up recommendation for the largest incidental pancreatic cyst.'
    retrieved_context = query_rag(retrieval_query)

    # Each entry is one sentence group of the prompt; they are concatenated as-is.
    sections = [
        f'Given the patient age {patient_age} with the findings from the radiology report at the presentation of incidentalomas: {report_findings}. ',
        f'Generate the most important follow-up recommendation for pancreatic cysts based on the context: {retrieved_context} from the guideline paper. ',
        'Here is some guidance for response: 1. Start with **Existence of Pancreatic Cyst:** ; Identify whether a pancreatic cyst exists/is visualized. If not, only respond: No pancreatic cyst is visualized. ',
        '2. Start with **Condition of the Largest Pancreatic Cyst:** - Size: - Position: - MPD Communication: -  High-risk features/stigmata: ; Summarize the condition of the pancreatic cyst, including the size, the position, establishment of main pancreatic duct (MPD) communication, and high-risk features/stigmata of the cyst. ',
        'If the findings have multiple dimensions for one cyst, use its longest axis diameter. For example, if the cyst has the dimensions of 1.1 x 1.5 x 1.4 cm or 1.4 x 1.5 cm, only response 1.5 cm in summarization. ',
        'Only consider the largest cyst for recommendation. ',
        f'3. Start with **Follow-up Recommendation:** ; Generate the most relevant follow-up recommendation based on the cyst condition and the patient age {patient_age}. Only mention follow-up method (and frequency) to keep the recommendation brief. ',
    ]
    return ''.join(sections)

additional age + high risk + brief2


In [22]:
# Option 3 helper: prompt with the full flowchart data (patient age + imaging high-risk features + concise answer)
print("additional age + high risk + brief2")

def generate_prompt_flowchart(row, flowchart_data):
    """Build the option-3 prompt, embedding the whole flowchart data.

    Args:
        row: Mapping-like record providing "ages" and "Findings".
        flowchart_data: Parsed flowchart content; it is interpolated into the
            prompt through its string representation.

    Returns:
        The full prompt string, ending with the request for a reference location.
    """
    patient_age = int(row["ages"])
    report_findings = row["Findings"]

    # Each entry is one sentence group of the prompt; they are concatenated as-is.
    sections = [
        f'Given the patient age {patient_age} with the findings from the radiology report at the presentation of incidentalomas: {report_findings}. ',
        f'Generate the most important follow-up recommendation for pancreatic cysts based on JSON flowchart data: {flowchart_data} from the guideline paper. ',
        'Here is some guidance for response: 1. Start with **Existence of Pancreatic Cyst:** ; Identify whether a pancreatic cyst exists/is visualized. If not, only respond: No pancreatic cyst is visualized. ',
        '2. Start with **Condition of the Largest Pancreatic Cyst:** - Size: - Position: - MPD Communication: -  High-risk features/stigmata: ; Summarize the condition of the pancreatic cyst, including the size, the position, establishment of main pancreatic duct (MPD) communication, and high-risk features/stigmata of the cyst. ',
        'If the findings have multiple dimensions for one cyst, use its longest axis diameter. For example, if the cyst has the dimensions of 1.1 x 1.5 x 1.4 cm or 1.4 x 1.5 cm, only response 1.5 cm in summarization. ',
        'Only consider the largest cyst for recommendation. ',
        f'3. Start with **Follow-up Recommendation:** ; Generate the most relevant follow-up recommendation based on the cyst condition and the patient age {patient_age}. Only mention the follow-up method (and frequency) to keep the recommendation brief. ',
        '4. Start with **Reference:** ; Include the reference location. Only respond e.g. Fig 1, Fig 2A, etc.',
    ]
    return ''.join(sections)

additional age + high risk + brief2


In [35]:
# Option 4 helper: prompt with retrieved flowchart entries (patient age + imaging high-risk features + concise answer)
print("additional age + high risk + brief2")

def generate_prompt_flowchartRAG(row, flowchart_data):
    """Build the option-4 prompt from flowchart entries retrieved for this case.

    Args:
        row: Mapping-like record providing "ages" and "Findings".
        flowchart_data: Accepted for signature parity with the option-3
            helper; the prompt uses the retrieved entries instead.

    Returns:
        The full prompt string, ending with the request for a reference location.
    """
    patient_age = int(row["ages"])
    report_findings = row["Findings"]

    # Retrieval query: the clinical situation plus the task description.
    retrieval_query = f'Given the patient age {patient_age} at the presentation of incidentalomas with the findings from radiology report: {report_findings}. Generate the most important follow-up recommendation for the largest incidental pancreatic cyst.'
    retrieved_entries = query_flowchart_rag(retrieval_query)

    # Each entry is one sentence group of the prompt; they are concatenated as-is.
    sections = [
        f'Given the patient age {patient_age} with the findings from the radiology report at the presentation of incidentalomas: {report_findings}. ',
        f'Generate the most important follow-up recommendation for pancreatic cysts based on the relevant JSON flowchart data: {retrieved_entries} from the guideline paper. ',
        'Here is some guidance for response: 1. Start with **Existence of Pancreatic Cyst:** ; Identify whether a pancreatic cyst exists/is visualized. If not, only respond: No pancreatic cyst is visualized. ',
        '2. Start with **Condition of the Largest Pancreatic Cyst:** - Size: - Position: - MPD Communication: -  High-risk features/stigmata: ; Summarize the condition of the pancreatic cyst, including the size, the position, establishment of main pancreatic duct (MPD) communication, and high-risk features/stigmata of the cyst. ',
        'If the findings have multiple dimensions for one cyst, use its longest axis diameter. For example, if the cyst has the dimensions of 1.1 x 1.5 x 1.4 cm or 1.4 x 1.5 cm, only response 1.5 cm in summarization. ',
        'Only consider the largest cyst for recommendation. ',
        f'3. Start with **Follow-up Recommendation:** ; Generate the most relevant follow-up recommendation based on the cyst condition and the patient age {patient_age}. Only mention the follow-up method (and frequency) to keep the recommendation brief. ',
        '4. Start with **Reference:** ; Include the reference location. Only respond e.g. Fig 1, Fig 2A, etc.',
    ]
    return ''.join(sections)

additional age + high risk + brief2


In [11]:
def get_openai_response(prompt):
    """Send ``prompt`` to the configured chat model and return its reply text.

    Args:
        prompt: User message content.

    Returns:
        The message content of the first returned choice.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    # temperature=0.0 asks for the least random sampling.
    completion = client.chat.completions.create(
        model=MODEL,
        messages=conversation,
        temperature=0.0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

## Batch

In [13]:
# Add one empty result column per enabled prompting option and report which ones are active.
for _enabled, _column, _label in (
    (option_1, 'option_1_raw_openai', "option1"),
    (option_2, 'option_2_pdf_rag', "option2"),
    (option_3, 'option_3_flowchart', "option3"),
    (option_4, 'option_4_flowchartRAG', "option4"),
):
    if _enabled:
        df[_column] = ""
        print(_label)

option3


In [21]:
# Run every enabled prompting option for each report row and store the raw replies.
debug_test = False  # set True to print each model reply while iterating
flowchart_data = load_json("utils/data2-3-table_included.json")
# load_json returns {} when the file is missing; without this guard every
# option-3 prompt would silently embed empty flowchart data and the whole
# batch would run (and be billed) on meaningless prompts.
if option_3 and not flowchart_data:
    raise ValueError('Flowchart data "utils/data2-3-table_included.json" is missing or empty; option 3 cannot run.')
count = 0

for count, (i, row) in enumerate(df.iterrows(), start=1):
    # Lightweight progress report every 10 rows.
    if count % 10 == 0:
        print("--> process: ", count)

    if option_1:
        prompt = generate_prompt_simple(row)
        result = get_openai_response(prompt)
        df.at[i, 'option_1_raw_openai'] = result
        if debug_test:
            print("---> Option 1:")
            print(result)

    if option_2:
        prompt = generate_prompt_RAG(row)
        result = get_openai_response(prompt)
        df.at[i, 'option_2_pdf_rag'] = result
        if debug_test:
            print("---> Option 2:")
            print(result)

    if option_3:
        prompt = generate_prompt_flowchart(row, flowchart_data)
        result = get_openai_response(prompt)
        df.at[i, 'option_3_flowchart'] = result
        if debug_test:
            print("---> Option 3:")
            print(result)

    if option_4:
        prompt = generate_prompt_flowchartRAG(row, flowchart_data)
        result = get_openai_response(prompt)
        df.at[i, 'option_4_flowchartRAG'] = result
        if debug_test:
            print("---> Option 4:")
            print(result)

    # NOTE(review): throttling only triggers when options 1, 2 and 4 are all
    # enabled (1 s), plus another second when option 3 is enabled as well;
    # with option 4 off there is no pause at all — confirm this is intended.
    if option_1 and option_2 and option_4:
        time.sleep(1)
    if option_1 and option_2 and option_3 and option_4:
        time.sleep(1)

# NOTE(review): results are written only after the full loop, so an API error
# part-way through leaves no file behind.
df.to_csv('results.csv', index=False)

--> process:  10
