In [1]:
# Importing/installing libraries
!pip install -U langchain-community faiss-cpu langchain-openai tiktoken langchain jq unstructured

Collecting langchain-community
  Downloading langchain_community-0.3.5-py3-none-any.whl.metadata (2.9 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting langchain-openai
  Downloading langchain_openai-0.2.5-py3-none-any.whl.metadata (2.6 kB)
Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting langchain
  Downloading langchain-0.3.7-py3-none-any.whl.metadata (7.1 kB)
Collecting jq
  Downloading jq-1.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting unstructured
  Downloading unstructured-0.16.4-py3-none-any.whl.metadata (24 kB)
Collecting SQLAlchemy<2.0.36,>=1.4 (from langchain-community)
  Downloading SQLAlchemy-2.0.35-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Download

In [2]:
# Extra dependency installed separately from the LangChain stack.
# NOTE(review): presumably needed by the HuggingFaceEmbeddings model used later — confirm.
!pip install sentence-transformers



In [3]:
import os

# Setting API key
# Keep a key that is already configured in the environment; otherwise ask for
# it interactively so that no secret (or placeholder) is stored in the notebook.
if not os.environ.get('OPENAI_API_KEY'):
    from getpass import getpass

    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

In [4]:
# The StatPearls data is stored on Google Drive, so Drive has to be mounted
# under /content/drive before the files can be read from Colab
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Testing on one article

In [5]:
# Imports
from langchain.document_loaders import JSONLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import CharacterTextSplitter
from langchain_core.runnables import RunnablePassthrough
from langchain.llms import OpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import (
    DirectoryLoader,
    UnstructuredMarkdownLoader,
    UnstructuredFileLoader
)
from langchain_community.vectorstores import StarRocks
from langchain_community.vectorstores.starrocks import StarRocksSettings
from langchain_openai import OpenAI, OpenAIEmbeddings
from langchain_text_splitters import TokenTextSplitter

from langchain_community.document_loaders.directory import DirectoryLoader

from langchain.document_loaders.json_loader import JSONLoader

# Loader for a single article file, used as a smoke test before indexing the whole corpus
loader = JSONLoader(
    file_path='/content/drive/MyDrive/statspearls_processed/statspearls_processedarticle-100024.jsonl',  # Change to your path
    jq_schema='.',           # Adjust based on your JSON structure
    content_key='content',   # The key under which the document content is stored
    json_lines=True,         # Set to True if your file is in JSON Lines format
)

# Embedding model used to vectorise the chunks
embeddings = OpenAIEmbeddings()

# Load -> split into overlapping chunks -> embed and index in FAISS
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
docs = text_splitter.split_documents(documents)
db = FAISS.from_documents(docs, embeddings)

# Number of vectors stored in the index
print(db.index.ntotal)

26


In [6]:
# Creating db straight from the loaded documents, i.e. before any splitting.
# This rebinds `db`, replacing the index that was built from the split chunks.
db = FAISS.from_documents(documents, embeddings)

In [7]:
# Expose the FAISS index (`db`, built in the previous step) as a retriever
retriever = db.as_retriever()

# LLM that writes the answer
model = ChatOpenAI(model="gpt-4", temperature=0)

# Prompt that restricts the answer to the retrieved context
template = """Answer the question based only on the following context, which can include text and tables:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# RAG pipeline: the incoming question is sent to the retriever to fetch the
# context and is also passed through unchanged as the question itself
rag_inputs = {"context": retriever, "question": RunnablePassthrough()}
chain = rag_inputs | prompt | model | StrOutputParser()

In [8]:
# Testing the pipeline: the string below is used both as the retrieval query
# and as the question inserted into the prompt
chain.invoke("What is this pubmed article about?")

'The article is about the pathogenesis of coronary artery disease and its progression to CTO (Chronic Total Occlusion) lesions. It discusses the contributing factors to the disease, such as the upregulation of immunologic and inflammatory markers, endothelial dysfunction, and cholesterol accumulation. The article also mentions patient education focusing on risk factor reduction and lifestyle modifications. Additionally, it provides data from a multicenter registry on success rates, complication rates, and health status benefits of CTO PCI (Percutaneous Coronary Intervention).'

In [9]:
# Path to the StatPearls folder on the mounted Drive
DRIVE_FOLDER = '/content/drive/MyDrive/statspearls_processed'

## Loading the StatPearls (PubMed) JSONL articles

In [10]:
from langchain.document_loaders import DirectoryLoader, TextLoader

# Recursively collect every .jsonl file under the data folder; each file is
# read with TextLoader, so it is loaded as one text document
loader = DirectoryLoader(
    DRIVE_FOLDER,
    glob='**/*.jsonl',
    loader_cls=TextLoader,
    show_progress=True,
)
documents = loader.load()

# Report how many documents were loaded and preview the first one
# (None when nothing matched the glob)
document_total = len(documents)
first_document = documents[0] if document_total > 0 else None
print(f'document count: {document_total}')
print(first_document)

100%|██████████| 2332/2332 [00:49<00:00, 47.17it/s] 

document count: 2332
page_content='{"id": "D:\\statpearls_NBK430685\\article-26222_0", "title": "Opioid Withdrawal -- Continuing Education Activity", "content": "Opioid withdrawal syndrome is a life-threatening condition resulting from opioid dependence. Opioids are a group of drugs used to manage severe pain and include morphine, heroin, oxycontin, codeine, methadone, and hydromorphone. Opioids are sometimes misused, as they can assist with mental relaxation and pain relief and can produce a sense of euphoria. Chronic opioid use can lead to the development of potentially incapacitating dependence. This activity describes the evaluation and management of opioid withdrawal and highlights the interprofessional team's role in improving care for affected patients.", "contents": "Opioid Withdrawal -- Continuing Education Activity. Opioid withdrawal syndrome is a life-threatening condition resulting from opioid dependence. Opioids are a group of drugs used to manage severe pain and include m




In [11]:
from langchain_community.embeddings import HuggingFaceEmbeddings

# Creating the vector db for the whole corpus with the Hugging Face
# "all-MiniLM-L6-v2" embedding model (the OpenAI embeddings are left as a
# commented-out alternative).
# NOTE(review): no text splitter is applied in this cell, so every loaded
# document becomes a single vector. Long documents may exceed the embedding
# model's input limit — confirm this is intended.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2") # OpenAIEmbeddings()

# This step takes a bit longer
db = FAISS.from_documents(documents, embeddings)
print(db.index.ntotal)

  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2") # OpenAIEmbeddings()
  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2332


In [12]:
# Retrieve the 5 most similar documents for each query
retriever = db.as_retriever(search_kwargs={"k": 5})

# LLM
model = ChatOpenAI(model="gpt-4-0125-preview", temperature=0)

# Prompt template
template = """Answer the question based only on the following context, which can include text and tables:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# RAG pipeline: fetch context for the question, fill the prompt, call the
# model and return the reply as a plain string
chain = {"context": retriever, "question": RunnablePassthrough()} | prompt | model | StrOutputParser()

In [13]:
# Testing query against the chain built on the full-corpus index (top-5 retrieval)
chain.invoke("What are the guidelines for treating coronary artery disease?")

'The guidelines for treating coronary artery disease (CAD) include a combination of lifestyle modifications, medical management, and in some cases, surgical interventions such as percutaneous coronary intervention (PCI) or coronary artery bypass grafting (CABG). The specific treatment approach depends on the severity and extent of the disease, the presence of symptoms, and the patient\'s overall health status.\n\n1. **Lifestyle Modifications**: Patients are advised to adopt a heart-healthy lifestyle that includes a balanced diet, regular physical activity, smoking cessation, and weight management. These changes can help slow the progression of CAD and reduce the risk of complications.\n\n2. **Medical Management**: Medications play a crucial role in managing CAD. Commonly prescribed medications include antiplatelet agents (e.g., aspirin) to prevent blood clots, statins to lower cholesterol levels, beta-blockers and ACE inhibitors to manage blood pressure and reduce the heart\'s workload

## Setting the patient's profile

In [14]:
# Example patient profile that is inserted into the prompts further down.
# It is a block of newline/comma separated feature strings: entries prefixed
# `icd_` describe a diagnosis and end in its code, entries prefixed
# `medication_` give a drug name followed by strength and unit.
patient_recent_profile = """
icd_type_2_diabetes_mellitus_without_complications_E119
icd_morbid_(severe)_obesity_due_to_excess_calories_E6601
icd_mixed_hyperlipidemia_E782, icd_essential_(primary)_hypertension_I10
icd_athscl_heart_disease_of_native_coronary_artery_w/o_ang_pctrs_I2510
icd_chronic_obstructive_pulmonary_disease,_unspecified_J449
icd_gastro-esophageal_reflux_disease_without_esophagitis_K219
icd_other_chest_pain_R0789, icd_chest_pain,_unspecified_R079
icd_long_term_(current)_use_of_insulin_Z794
icd_long_term_(current)_use_of_aspirin_Z7982
medication_Albuterol Sulfate_90_ug/1
medication_Citalopram Hydrobromide_20_mg/1
medication_Clonidine Hydrochloride_.1_mg/1
medication_Insulin Glargine_100_[iU]/mL
medication_Levothyroxine Sodium_.05_mg/1
medication_Levothyroxine Sodium_100_ug/1
medication_Levothyroxine Sodium_125_ug/1
medication_Morphine Sulfate_4_mg/mL
medication_Novolog_100_[iU]/mL
medication_Ondansetron_2_mg/mL
medication_Pantoprazole Sodium_40_mg/1
medication_Rosuvastatin Calcium_20_mg/1
medication_Spikevax_50_ug/.5mL
medication_Sucralfate_1_g/1
medication_Sucralfate_1_g/10mL
"""

## Generating questions for improving patient care for a given case, which will be used in RAG

In [15]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

# Prompt that asks the model for up to 4 numbered clinical questions about the
# patient profile. The template is a raw string because it embeds a regex:
# in a normal string `\d`, `\.`, `\s` and `\?` are invalid escape sequences
# (a warning today, an error in future Python versions). The resulting text is
# unchanged.
prompt = ChatPromptTemplate.from_template(r"""
## Role: As a primary care physician specializing in the integration of predictive analytics into clinical practice, you possess extensive expertise in interpreting complex data regarding disease progression, medication effects, and laboratory results. Your role involves utilizing advanced AI tools to predict patient health trajectories and formulate preemptive clinical strategies. This includes generating relevant clinical questions based on predictive data to guide the adaptation of patient care plans, aiming to prevent the worsening of conditions and optimize overall patient outcomes. Your expertise extends to understanding the nuances of how changes in health indicators can inform the adjustment of therapies and interventions in a personalized manner.

## Task: Generate up to 4 clinical questions designed to address the current health risk status of a patient, using recent clinical information.

## Guidelines:
**Data Input:
**Patient's Recent Medical Profile:** Use the provided data from the patient's most recent clinical information over the last 12 weeks, including diagnoses, medications (with specific dosages), and any abnormal labs. This comprehensive profile is encapsulated under the variable recent_medical_profile below.
**Patient Risk:** Consider the information indicating that the patient is at risk of transitioning to a cluster with an increased level of hospital admissions.
**Objective:** The clinical questions should focus on exploring therapeutic optimization opportunities based on the patient's current medical status and the associated risks. The questions should be formulated to fetch the latest clinical guidelines and recommendations that are most relevant to the patient’s specific conditions and treatments.
**Purpose:** These questions are intended to facilitate targeted therapeutic adjustments. The aim is to effectively manage and potentially reduce the patient's risk of worsening health outcomes and increasing hospital admissions.

## Expected Output: A structured list of up to 4 clinical questions. Each question should be deeply rooted in the patient's current medical information and aimed at identifying actionable steps for therapeutic optimization. The questions should be specific enough to guide the retrieval of targeted clinical guidelines to refine and enhance the patient's treatment plan. The questions should be in a simple format that can be caputred with this regex pattern:  (r'\d+\.\s+(.*?\?)') !!!

## Patient's Most Recent Medical Profile:
{patient_recent_profile}
""")

# Convert the chat model's reply into a plain string
output_parser = StrOutputParser()

# Question-generation pipeline; `model` is the chat model created in an earlier cell
chain = prompt | model | output_parser

# Raw model reply containing the numbered questions
input_text = chain.invoke({'patient_recent_profile': patient_recent_profile})

In [16]:
# Show the raw model reply (the numbered list of generated questions)
print(input_text)

1. Given the patient's diagnosis of type 2 diabetes mellitus without complications and current use of Insulin Glargine and Novolog, how can we optimize the insulin regimen to better control glycemic levels and prevent potential hospital admissions related to hyperglycemia or hypoglycemia?
2. Considering the patient's severe obesity and mixed hyperlipidemia alongside the current use of Rosuvastatin Calcium, is there a need to adjust the lipid-lowering therapy to further reduce the risk of cardiovascular events, which could lead to hospital admissions?
3. In light of the patient's chronic obstructive pulmonary disease and the prescription of Albuterol Sulfate, should we consider the addition of a long-acting bronchodilator or corticosteroid to decrease the frequency of exacerbations and subsequent hospital visits?
4. With the patient's history of essential hypertension and the use of Clonidine Hydrochloride, is there evidence to support reevaluating the antihypertensive therapy to includ

## Compiling questions in a dictionary

In [17]:
import re

# A question is "<digits>. " followed by the shortest text that ends in "?";
# only the question text itself is captured
pattern = re.compile(r'\d+\.\s+(.*?\?)')

# Pull every question out of the model's reply
questions = pattern.findall(input_text)

# Number the questions from 1 and trim surrounding whitespace
questions_dict = {number: text.strip() for number, text in enumerate(questions, start=1)}

# Print the questions dictionary to verify
for number in questions_dict:
    print(f"Question {number}: {questions_dict[number]}")

Question 1: Given the patient's diagnosis of type 2 diabetes mellitus without complications and current use of Insulin Glargine and Novolog, how can we optimize the insulin regimen to better control glycemic levels and prevent potential hospital admissions related to hyperglycemia or hypoglycemia?
Question 2: Considering the patient's severe obesity and mixed hyperlipidemia alongside the current use of Rosuvastatin Calcium, is there a need to adjust the lipid-lowering therapy to further reduce the risk of cardiovascular events, which could lead to hospital admissions?
Question 3: In light of the patient's chronic obstructive pulmonary disease and the prescription of Albuterol Sulfate, should we consider the addition of a long-acting bronchodilator or corticosteroid to decrease the frequency of exacerbations and subsequent hospital visits?
Question 4: With the patient's history of essential hypertension and the use of Clonidine Hydrochloride, is there evidence to support reevaluating th

## Running RAG on questions

For each question we will run retrieval and generate a response

In [18]:
# Prompt for the final RAG step. It takes three inputs: the patient profile,
# one therapeutic optimisation question and the retrieved guidelines.
# Each input is interpolated exactly once, in its dedicated section at the end
# of the prompt. The "Data Inputs" bullets only point to those sections;
# placing the placeholders there as well would paste the profile, question and
# retrieved documents into the prompt twice.
prompt = ChatPromptTemplate.from_template("""
## Role: World-Class Primary Care Physician

## Task: Generate Therapeutic Optimization Tasks for a PCP to review based on the patients profile, therapeutic optimisation questions, and current best practive guidelines.

## Guidelines:
**Data Inputs:
***Patient's Current Clinical Profile:*** Utilize the detailed information from the patient's most recent clinical profile over the last 12 weeks, which includes diagnoses, medications, and any abnormal labs. This data is provided in the section "Patient's Current Clinical Profile" below.
***Clinical Questions:*** Consider the clinical questions generated from the previous interaction, which are intended to identify potential areas for therapeutic intervention. These are provided in the section "Therapeutic Optimization Question to Consider" below.
***Clinical Guidelines:*** Refer to the latest clinical guidelines returned from a semantic search. These guidelines are crucial for informing the creation of therapeutic optimization tasks and are provided in the section "Up to Date Clinical Guidelines to Inform Therapeutic Optimization Tasks for the Patient" below.
***Objective:*** Craft detailed therapeutic optimization tasks based on the patient's current medical status and the insights gained from the clinical questions and guidelines. The tasks should specifically cater to the patient's current health needs and leverage the latest medical standards and practices to enhance treatment effectiveness and patient care.

**Details to Include:**
***Medications:*** Specify any recommended medications, including changes to existing prescriptions, with exact dosages and administration details based on the clinical guidelines. Include precise dosages (e.g., 40 mg twice daily), specific medication names (e.g., Atorvastatin instead of just stating 'statins'), and the duration for each prescribed medication (e.g., for 30 days).
***Therapeutic Interventions:*** Outline any suggested modifications or additions to the patient's treatment regimen, considering their current condition and the clinical insights provided. Detail the mode of administration (e.g., oral, intravenous), frequency (e.g., every 8 hours), and duration (e.g., 7 days). Include any adjunct therapies such as physical therapy sessions twice a week for six weeks.
***Monitoring Parameters:*** Define essential monitoring parameters to track the effectiveness of the new or adjusted treatments, specifying how often and which metrics should be monitored. Include specifics like blood pressure to be checked bi-weekly, and liver function tests to be done every month.

## Expected Output: A comprehensive list of therapeutic optimization tasks that are detailed and actionable. Each task should clearly address specific aspects of the patient's current health status, using the provided clinical questions and guidelines as a foundation to ensure tailored and effective patient care.
## Citations and Referencing: At the end of generated answer, always provide citations to accompany your claims. You can extract these from the "Title" in the retrieved context (guidelines). Provide the citations in this format: "Title"-StatsPearls.

## Patient's Current Clinical Profile:
{patient_recent_profile}

## Therapeutic Optimization Question to Consider:
{therapeutic_optimisation_question}

## Up to Date Clinical Guidelines to Inform Therapeutic Optimization Tasks for the Patient:
{guidelines}

**Focus that citations are extracted from retrieved context specifically from "Title" (found in the guidelines). If there is no "Title" than just write "No citation". Use only citations provided in guidelines (retrieved context)!**
""")

# Initializing retrieval
retriever = db.as_retriever(search_kwargs={"k": 1}) # Setting top k similarity

# LLM
model = ChatOpenAI(temperature=0, model="gpt-4-0125-preview")



## Generating final response based on retrieved context

In [19]:
# Dictionary to store outputs, keyed by question number
outputs = {}

# The pipeline does not depend on the question, so it is built once instead of
# on every iteration
chain = prompt | model | StrOutputParser()

# Looping through questions to perform RAG QA
for key, question in questions_dict.items():

    # Retrieve once and reuse the result for both the prompt and the stored
    # record, so what is saved is exactly what the model was shown
    retrieved_docs = retriever.invoke(question)

    # Pass the values themselves: wrapping a name in braces here builds a
    # Python set, and its repr ({'...'}) would end up inside the prompt
    response = chain.invoke({
        "guidelines": retrieved_docs,
        "patient_recent_profile": patient_recent_profile,
        "therapeutic_optimisation_question": question,
    })

    # Storing retrieved text + llm response in a dictionary
    outputs[key] = {
        "retrieval": retrieved_docs,
        "answer": response
    }

In [21]:
# Combine every stored "answer" into one string, in dictionary order, with a
# newline after each answer as a separator
all_answers = "".join(entry["answer"] + "\n" for entry in outputs.values())

# Print the concatenated answers
print(all_answers)


### Therapeutic Optimization Tasks for the Patient

#### Medications

1. **Optimize Insulin Regimen:**
   - Continue with Insulin Glargine as the basal insulin. Given the patient's type 2 diabetes mellitus without complications, adjust the dose based on current blood glucose readings and trends. Aim for a fasting plasma glucose target of 80-130 mg/dL. Adjust the dose by 2-4 units every 3 days until target levels are achieved.
   - For Novolog (insulin aspart), adjust the bolus insulin dose based on carbohydrate intake, pre-meal blood glucose levels, and anticipated physical activity. Utilize a carbohydrate ratio starting at 1 unit of insulin per 15 grams of carbohydrates, adjusting based on postprandial glucose measurements.

2. **Add Metformin:**
   - Initiate Metformin to improve insulin sensitivity and lower hepatic glucose production. Start with 500 mg orally twice a day with meals, with the intention to increase to 1000 mg twice a day as tolerated. Monitor for gastrointestinal sid