## Importing Necessary Libraries

In [None]:
# !pip install biopython

* If not installed, uncomment the line and install the library

In [None]:
# These libraries needs to reinstalled every time when running in colab environment
# Run this cell and restart & run all when running on colab
%%capture --no-stderr
%pip install numpy==1.26.4 langchain-community langchain-openai langchain-chroma gradio-client gradio==3.38.0 language_tool_python xmltodict
!apt-get update -qq
!apt-get install -qq openjdk-17-jdk-headless

In [None]:
# Library to ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Web scraping and HTML parsing
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import requests

# File and system operations
import os

# Time-related functions
import time

# LangChain libraries for natural language processing
from langchain_community.document_loaders import WebBaseLoader  # Loading documents from the web
from langchain_text_splitters import RecursiveCharacterTextSplitter  # Text splitting methods
from langchain_openai import ChatOpenAI, OpenAIEmbeddings  # Integrating with OpenAI for LLM and embeddings
from langchain_chroma.vectorstores import Chroma  # Vector store operations
from langchain.memory import ConversationBufferMemory  # Memory management for conversational agents
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder  # Templates and placeholders for prompts
from langchain_core.messages import HumanMessage, AIMessage # Designating Human Message and AI Message
from langchain_community.retrievers import PubMedRetriever # A pre-built retriever based on The National Center for Biotechnology Information and National Library of Medicine
from langchain.chains.combine_documents import create_stuff_documents_chain  # Document processing chain creation
from langchain.chains import create_history_aware_retriever, create_retrieval_chain  # Creating retrieval chains
from langchain.agents import Tool, AgentType, initialize_agent  # Agent and tool initialization

# Spelling/Grammar checker tool
import language_tool_python # Grammar & typo checker

# Gradio for creating UI components
import gradio as gr

# Google Drive integration (Colab only): mount the user's Drive at /content/drive so the
# notebook can later chdir into the project folder that holds constant.py
from google.colab import drive
drive.mount('/content/drive')




Mounted at /content/drive


## Working Environment Set Up

In [None]:
# Set the base path of the notebook and make it the working directory,
# so that `constant.py` (imported in the next cell) can be found
base_path = '/content/drive/MyDrive/Project/Project/DEMO_APP_Medicine_info/notebooks/' # <- change this to your own base path where you keep the notebook and constant.py
os.chdir(base_path)

In [None]:
# Load the API keys and tokens kept in constant.py (found via the working directory set above)
from constant import (OPENAI_API_KEY, LANGSMITH_API_KEY, HF_TOKEN)

# Export every credential / setting the downstream libraries read from the environment
os.environ.update({
    'OPENAI_API_KEY': OPENAI_API_KEY,        # OpenAI API key
    'LANGSMITH_TRACING': 'true',             # Enable tracing for Langsmith
    'LANGSMITH_API_KEY': LANGSMITH_API_KEY,  # Langsmith API key
    'HF_TOKEN': HF_TOKEN,                    # Hugging Face token
    'USER_AGENT': 'Mozilla/5.0',             # Custom user agent for web requests
})

# Sanity check: show only the first 5 characters of each key, never the full secret
for _api_key in (OPENAI_API_KEY, LANGSMITH_API_KEY):
    print(_api_key[:5])

sk-pr
lsv2_


## Extract the Documents from Cancer.gov

In [None]:
# Function to get sitemap from the url

def get_sitemap(url, timeout = 30):
    """
    Download a sitemap and parse it as XML.

    Params:

    url (str): Address of the sitemap to download
    timeout (float): Seconds to wait for the server before giving up (default 30)

    Expected Output:

    A BeautifulSoup object built with the "xml" parser, or None if anything goes wrong
    (the error is printed, not raised, so callers must handle None).
    """
    try:
        # Build the request with a custom User-Agent header
        req = Request(
            url = url,
            headers = {'User-Agent': 'Mozilla/5.0'}
        )

        # The with-block closes the HTTP response (and its socket) even if
        # reading or decoding fails; the timeout keeps the cell from hanging
        with urlopen(req, timeout = timeout) as response:
            # Character encoding from the response headers, utf-8 if not specified
            encoding = response.headers.get_content_charset('utf-8')

            # Read the response data and decode it using the detected encoding
            xml_data = response.read().decode(encoding)

        # Parse the XML data to make it easier to work with
        return BeautifulSoup(xml_data, "xml")

    except Exception as e:
        # Best-effort: report the problem and signal failure with None
        print(f"Error fetching sitemap: {e}")
        return None

In [None]:
# Function to get xmls related to breast cancer

def get_urls(xml):
  """
  Collect the breast-cancer page URLs from a parsed sitemap.

  Params:

  xml: Parsed sitemap exposing find_all('url'), where each <url> entry exposes find('loc');
       may be None when the sitemap could not be downloaded

  Expected Output:

  List of <loc> URLs (surrounding whitespace removed) that contain 'breast' and do not
  contain 'video'. An empty list when xml is None.
  """
  # The sitemap download signals failure with None; there is nothing to scan then
  if xml is None:
    return []

  # Empty list to store urls
  urls = []

  # Iterate through each url tag in XML
  for url in xml.find_all('url'):
    # Look the <loc> child up once and reuse it for both the check and the value
    loc_tag = url.find('loc')
    if loc_tag is None:
      continue

    # Sitemaps may pad the address with whitespace/newlines, which would break
    # the later extension checks
    loc = loc_tag.text.strip()

    # Keep the url if it contains the word breast and is not a video page
    if 'breast' in loc and 'video' not in loc:
      urls.append(loc)

  # Return urls
  return urls

In [None]:
# Download the cancer.gov sitemap and keep only the URLs related to breast cancer
# NOTE: get_sitemap() prints the error and returns None when the download fails,
# so check the printed output if the next line raises
url = "https://www.cancer.gov/sitemaps/pageinstructions.xml"
xml = get_sitemap(url)
urls = get_urls(xml)

In [None]:
# Categorize urls based on file type

# One bucket per known file extension, plus "html" for everything else
categorized_urls = {"pdf": [], "epub": [], "mobi": [], "html": []}

for url in urls:
    # First extension whose ".<ext>" suffix matches wins; no match means a regular web page
    bucket = next(
        (fmt for fmt in ("pdf", "epub", "mobi") if url.endswith("." + fmt)),
        "html",
    )
    categorized_urls[bucket].append(url)

In [None]:
# Find the PDF files hidden among the "html" urls (pages without a .pdf extension
# whose Content-Type header says they are PDFs)
pdf_urls = []

for url in categorized_urls['html']:
    try:
        # stream = True defers downloading the body: only the headers are needed here,
        # so the (potentially large) PDF itself is never transferred.
        # The with-block releases the connection even though the body is left unread.
        with requests.get(url, timeout = 10, stream = True) as response:
            content_type = response.headers.get("Content-Type", "")

        if "pdf" in content_type.lower():
            pdf_urls.append(url)
            print(f"Found PDF: {url}")

        # Pause between requests to avoid hammering the server
        time.sleep(1)

    except Exception as e:
        # Best-effort scan: report the failing url and move on to the next one
        print(f" Error fetching {url}: {e}")

Found PDF: https://www.cancer.gov/about-nci/organization/ccct/steering-committees/nctn/breast-cancer/nciboldradiationcdesrev
Found PDF: https://www.cancer.gov/grants-training/training/resources-trainees/courses-fellowships/2021-traco-breast-cancer
Found PDF: https://www.cancer.gov/grants-training/training/resources-trainees/courses-fellowships/traco-breast-cancer
Found PDF: https://www.cancer.gov/grants-training/training/resources-trainees/courses-fellowships/traco-breast-cancer-2023


In [None]:
# Remove those findings from HTML and add them to PDF
# A set makes the membership test below constant-time
pdf_url_set = set(pdf_urls)
categorized_urls["html"] = [url for url in categorized_urls["html"] if url not in pdf_url_set]

# Add only the urls that are not listed yet, so re-running this cell
# does not duplicate entries in the pdf bucket
for url in pdf_urls:
    if url not in categorized_urls["pdf"]:
        categorized_urls["pdf"].append(url)

## Retriever Set Up: Langchain

LangChain is a framework for developing applications powered by large language models (LLMs).

In [None]:
# Load the documents from the extracted urls

# Initialize an empty list to store web documents
docs = []

# Load every html page and report progress after each 10th document.
# Counting from 1 makes the printed number equal to the documents actually loaded.
for loaded_count, url in enumerate(categorized_urls['html'], start = 1):
  # .load() returns the loaded documents for this url
  page_docs = WebBaseLoader(url).load()
  docs.extend(page_docs)
  if loaded_count % 10 == 0:
    print(f'{loaded_count} html documents are loaded')

print('All html documents are loaded')

10 html documents are loaded
20 html documents are loaded
30 html documents are loaded
40 html documents are loaded
50 html documents are loaded
60 html documents are loaded
70 html documents are loaded
80 html documents are loaded
All html documents are loaded


* All documents are successfully loaded!

In [None]:
# Initialize the text splitter to break the documents into smaller chunks
# (chunk_overlap = 200 lets neighbouring chunks share some text)
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200)

# Split the documents into manageable chunks for efficient processing
documents = text_splitter.split_documents(docs)

# Initialize OpenAI's embedding model (no model name is passed, so the library default is used)
embedding = OpenAIEmbeddings()

# Create a vector database with documents and embeddings
# NOTE(review): no persist directory is given, so the store presumably lives only
# for this session and is rebuilt on every run — confirm
vector_db = Chroma.from_documents(documents = documents, embedding = embedding)

**Vector Database Workflow**

**1. Load Source Data**
* Source data can be various formats including documents (text, HTML, CSV) or media files.
* The data is loaded, transformed, and embedded.
* Embedding is the process of converting human-language text into numerical vectors (embeddings) that the machine can work with.


**2. Query the Vector Store**
* After the source data is processed and stored, the user submits a query, which is typically text describing what the user is looking for.
* The query is embedded and the resulting vector is used to search for the most similar vectors from the vector store.


**3. Retrieve the most similar result**
* Techniques like cosine similarity or Euclidean distance are used to retrieve the vectors that are most similar to the query vector.

In [None]:
# A simple sanity check: search the vector store for the 3 chunks (k = 3)
# most similar to the query text
vector_db.similarity_search('risk factor', k = 3)

[Document(id='6284dac6-03c4-4fb7-9a09-15b3c9b237fe', metadata={'description': 'Risk factors for breast cancer are female sex and advancing age, inherited risk, breast density, obesity, alcohol consumption, and exposure to ionizing radiation. Interventions to prevent breast cancer include chemoprevention (e.g. SERMs, AIs), risk-reducing surgery (e.g. mastectomy, oophorectomy). Review the evidence on risk factors and interventions to prevent breast cancer in this expert-reviewed summary.', 'language': 'en', 'source': 'https://www.cancer.gov/types/breast/hp/breast-prevention-pdq', 'title': 'Breast Cancer Prevention (PDQ®) - NCI'}, page_content='Inherited Risk'),
 Document(id='1138adf9-4491-49bb-94a7-d6a4ad0b77dc', metadata={'description': 'Breast cancer screening is performed using mammogram, clinical breast exam (CBE), and MRI (magnetic resonance imaging) tests. Learn about these and other tests that have been studied to detect or screen for breast cancer in this expert-reviewed and evid

In [None]:
# Convert the vector database into a retriever for querying
# (no arguments are passed, so the library's default search settings apply)
retriever = vector_db.as_retriever()

In [None]:
# Create Question-Answer Chain Pipeline

# Prompt for the system
# - {context} is a template variable; presumably filled with the retrieved documents
#   by the stuff-documents chain built below (confirm against the LangChain docs)
# - The fallback phrase 'No response available' is also compared against in
#   two_agent_pipeline, so keep both in sync when rewording it
system_prompt = (
    """
    You are an assistant in question-answering tasks.
    Provide answers using the retrieved context.
    If there is no relevent context for the question,
    Simply state, 'No response available'.
    Be brief and stick to the key points, and try to
    use the ordered numeric format if you can.
    The number of items does not matter.
    Ask follow-up questions if the question is incomplete or not clear
    \n\n
    {context}
    """
)

# Prompt Template: system instructions, then the running conversation
# ('chat_history'), then the latest user message ('input')
prompt = ChatPromptTemplate.from_messages(
    [
        ('system', system_prompt),
        MessagesPlaceholder('chat_history'),
        ('human', '{input}')
    ]
)

# LLM - a temperature of 0 keeps the model from being creative so that it sticks to the provided content as closely as possible
llm = ChatOpenAI(model = 'gpt-4o-mini', temperature = 0.0)

# QA Chain: combines the llm with the prompt above
qa_chain = create_stuff_documents_chain(llm, prompt)

* This is a question-answering chain that can answer user inputs. On its own, however, it is not connected to any of our documents, so we build a RAG system that uses our internal data for reference.

## RAG

**RAG (Retrieval Augmented Generation)** is one of the core techniques in Gen AI. It enhances the capability of LLMs by combining them with external knowledge bases (documents from cancer.gov in our case).

However, a plain RAG chain does not keep track of the chat history on its own, so we need to build a retriever that is history-aware.

For example, if the user first asks the RAG chain "What are the top 10 risk factors for breast cancer" and then asks a follow-up question, the chain will not understand what the follow-up refers to.

Therefore, it is critical to build a history aware retriever for smooth conversation-like AI workflow.

In [None]:
# Create Retriever Chain Pipeline with Chat History

# History System Prompt
# This prompt only rewrites the latest user message into a standalone question.
# It deliberately has no context placeholder: no documents have been retrieved at
# this stage, and an escaped '{{context}}' would be sent to the model as the
# literal text '{context}'.
history_system_prompt = (
    """
    Given chat history and the latest user input, formulate a
    standalone question that can be understood without referencing
    chat history.

    This is NOT telling you to generate answers, but to reformulate
    the questions or return them as they are.
    """
)

# History Prompt Template: instructions, then the running conversation
# ('chat_history'), then the latest user message ('input')
history_prompt = ChatPromptTemplate.from_messages(
    [
        ('system', history_system_prompt),
        MessagesPlaceholder('chat_history'),
        ('human', '{input}')
    ]
)

# History Aware Retriever: built from the llm, the base retriever and the history prompt
history_aware_retriever = create_history_aware_retriever(llm, retriever, history_prompt)

# RAG Chain incorporated with the history-aware retriever (HAR)
rag_chain = create_retrieval_chain(history_aware_retriever, qa_chain)

In [None]:
# Test if RAG works

# Temporary list to store chat history
# Module-level on purpose: rag_test() appends to it, so the history survives
# across calls until this cell is run again
test_history = []

def rag_test():
  """
  Interactive console loop to try out the RAG chain.

  Reads questions with input() until the user enters 'q' or 'quit', sends each
  question to rag_chain together with the accumulated test_history, prints the
  answer, and appends the question/answer pair to test_history.
  Blank input is ignored and the user is simply prompted again.

  Expected Output:

  None (answers are printed)
  """
  while True:
    # Ask whatever question you want to ask regarding breast cancer
    question = input('Enter your question: ').strip()

    # Enter q or quit to get out of the loop
    if question.lower() in ['q', 'quit']:
      print('Exiting Chat, Goodbye!')
      break

    # Nothing was typed: prompt again instead of spending an LLM call on an
    # empty query and storing an empty turn in the history
    if not question:
      continue

    # LLM's response to the user question
    response = rag_chain.invoke({'input': question, 'chat_history': test_history})

    # Extract the answer once; fall back to 'No response available' if the key is missing
    answer = response.get('answer', 'No response available')

    # Add this turn to the chat history so follow-up questions can refer back to it
    test_history.extend(
        [
            HumanMessage(content = question),
            AIMessage(content = answer)
        ]
    )

    print('\nAnswer:\n')
    print(answer)
    print('-' * 70)
    print('\n')

# Start the interactive loop (enter q or quit to stop)
rag_test()

Enter your question: What are the risk factors of breast cancer?

Answer:

The risk factors for breast cancer include:

1. **Personal History**: 
   - Invasive breast cancer
   - Ductal carcinoma in situ (DCIS)
   - Lobular carcinoma in situ (LCIS)
   - Benign breast disease

2. **Family History**: 
   - Breast cancer in a first-degree relative (mother, daughter, sister)

3. **Genetic Factors**: 
   - Inherited changes in BRCA1 or BRCA2 genes or other genes that increase risk

4. **Breast Density**: 
   - Dense breast tissue on a mammogram

5. **Hormonal Exposure**: 
   - Early menarche
   - Older age at first birth or nulliparity (never having given birth)
   - Late menopause
   - Use of combination estrogen-progesterone hormones after menopause

6. **Radiation Exposure**: 
   - Especially during puberty or young adulthood

7. **Lifestyle Factors**: 
   - Alcohol consumption
   - Smoking
   - Physical inactivity

8. **Other Factors**: 
   - Treatment with radiation
   - Hormone replac

As the test shows, the user and the AI can hold a conversation-like exchange with the help of the history-aware retriever.

## Tools

Tools are a crucial element in Generative AI: they enable AI agents to perform specific tasks with higher accuracy and efficiency.

Tools can be custom-made with clear instructions from the user, or they can be pre-built tools designed for particular tasks.

In [None]:
# Create function to use rag chain as agent's tool
def domain_info(user_input: str, memory:ConversationBufferMemory) -> str:
  """
  Answer a question with the internal RAG chain, using and updating the chat memory.

  Params:

  user_input (str): User question related to breast cancer
  memory (ConversationBufferMemory): Conversation memory; its 'chat_history' entry is
                                     passed to the chain and the new turn is saved back

  Expected Output:

  The chain's answer to the user input (the system prompt instructs the model to reply
  'No response available' when there is no relevant context). The answer is also printed.
  """
  # Conversation so far, as stored under the memory's 'chat_history' key
  history = memory.load_memory_variables({})['chat_history']

  # Run the RAG chain and keep only the answer text, dropping the rest of the result
  rag_output = rag_chain.invoke({'input': user_input, 'chat_history': history})
  answer = rag_output['answer']

  # Record this turn so that later questions can build on it
  # NOTE(review): the agents are constructed with the same memory object, so a turn
  # may end up recorded more than once — confirm
  memory.save_context({'input': user_input}, {'output': answer})

  # Echo the answer, then hand it back to the caller
  print(answer)
  return answer

In [None]:
# Set up the global memory that every tool and agent uses as chat history reference
global_memory = ConversationBufferMemory(
    memory_key = 'chat_history', # key under which the history is returned by load_memory_variables
    return_messages = True, # return the history as message objects rather than one string
    input_key = 'input', # key of the user message in save_context
    output_key = 'output' # key of the AI message in save_context
)

  global_memory = ConversationBufferMemory(


In [None]:
# Initialize PubMed Retriever for extracting external data
# NOTE(review): `search_term` does not look like a documented PubMedRetriever option and
# may be silently ignored — confirm; the actual query text is supplied on every call
pubmed_retriever = PubMedRetriever(search_term = "breast cancer")

def pubmed_tool_fn(context: str) -> str:
    """
    Look up PubMed documents for the given text and return their contents.

    Params:
    context (str): Text used as the PubMed query; either the response from the
                   1st agent or the original user question.

    Expected Output:
    The page contents of up to 5 retrieved PubMed documents joined by blank lines
    (an empty string if nothing is found). The query/result pair is also saved
    to global_memory.
    """

    # Query PubMed through the standard `invoke` entry point (`get_relevant_documents`
    # is deprecated) and keep at most 5 documents; the retriever's own result limit
    # may return fewer
    docs = pubmed_retriever.invoke(context)[:5]

    # Return their content
    answer = "\n\n".join(d.page_content for d in docs)

    # Save the chat history to the memory
    global_memory.save_context(
        {"input": context},
        {"output": answer}
    )

    return answer

In [None]:
# Create a RAG tool for Agent with RAG Task
domain_rag = Tool(
    name = 'DomainRAGInfo', # name of the tool
    # execution function: the lambda binds the shared global_memory, so the tool
    # itself only takes the single text argument
    func = lambda user_input: domain_info(user_input, global_memory), # execution function
    description = # description of the task (the text below is passed as the tool's description)
    """
    Use this tool for questions about breast cancer from the internal knowledge base that may
    rely on previous conversation.
    """
)

# Set up tool for PubMed retrieval in agent
pubmed_tool = Tool(
    name = "PubMedRetriever", # name of the tool
    func = pubmed_tool_fn, # execution function
    description = # description of the task (the text below is passed as the tool's description)
    """
    Provide more information from PubMed documents to make the answers more profound and enriched.
    If the context you get is 'No response available', create a standalone answer.
    """
)

## Agents

* Agents are highly capable & crucial hands helping the LLMs to generate response for the user query using the pre-defined tools.

In [None]:
# Create RAG Agent
domain_rag_agent = initialize_agent(
    tools = [domain_rag], # tool for usage
    llm = llm, # LLM (gpt-4o-mini with 0 temperature)
    agent = AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION,  # Agent type for chat with memory and ReAct tool selection
    memory = global_memory, # ConversationBufferMemory instance to store chat history
    verbose = True # Enable verbose logging of internal thought/action steps
)

# Create Pubmed Agent
# NOTE(review): both agents are given the same global_memory object that the tools
# also write to — confirm that turns are not recorded more than once
pubmed_agent = initialize_agent(
    tools = [pubmed_tool], # tool for usage
    llm = llm, # LLM (gpt-4o-mini with 0 temperature)
    agent = AgentType.ZERO_SHOT_REACT_DESCRIPTION,  # Zero-shot ReAct agent type for single-turn tool invocation
    memory = global_memory,# ConversationBufferMemory instance to store chat history
    verbose = True # Enable verbose logging of internal thought/action steps
)

  domain_rag_agent = initialize_agent(


In [None]:
# Two Agent WorkFlow Pipeline
def two_agent_pipeline(user_input: str) -> str:
    """
    Run the two agents one after the other.

    Params:
    user_input (str): User question related to breast cancer

    Expected Output:
    The answer of the PubMed agent. It receives the RAG agent's answer as context,
    or the original user input when the RAG agent had nothing to say.
    """
    # Answer generated by 1st Agent
    rag_answer = domain_rag_agent.run(user_input)

    # Normalize before comparing: the agent may add quotes, a trailing period,
    # surrounding whitespace or different casing to the fallback phrase, and an
    # exact equality check would then treat the fallback as a real answer
    normalized = rag_answer.strip().strip('\'".!').strip().lower() if isinstance(rag_answer, str) else ''

    if not normalized or normalized == 'no response available':
        context = user_input  # No usable RAG answer: pass the original user input as context
    else:
        context = rag_answer  # Use the RAG answer as context

    # Pass the context to the PubMed agent
    final_answer = pubmed_agent.run(context)

    return final_answer

In [None]:
# Initialize the grammar & spelling checker, set language to English (US)
# The captured output below shows that LanguageTool is downloaded on first use
tool = language_tool_python.LanguageTool('en-US')

# A function to fix grammar or typos in applciation input
def correct_input(user_input: str) -> str:
    """
    Return user_input with the grammar/spelling fixes suggested by the checker applied.

    The text is checked with the module-level LanguageTool instance (`tool`) and the
    resulting matches are applied through language_tool_python.utils.correct.
    """
    return language_tool_python.utils.correct(user_input, tool.check(user_input))

Downloading LanguageTool latest: 100%|██████████| 252M/252M [00:11<00:00, 21.8MB/s]
INFO:language_tool_python.download_lt:Unzipping /tmp/tmpcjd8twsj.zip to /root/.cache/language_tool_python.
INFO:language_tool_python.download_lt:Downloaded https://internal1.languagetool.org/snapshots/LanguageTool-latest-snapshot.zip to /root/.cache/language_tool_python.


In [None]:
# Combine the ultimate function for our application
def breast_cancer_fn(user_input, chat_history):
    """
    Handle one chat turn of the application.

    Params:

    user_input (str): User question related to breast cancer
    chat_history (list): Visible chat history as a list of (user message, bot response)
                         pairs; None is treated as an empty history

    Expected Output:

    A pair (chat_history, ""):
    - chat_history: the history with the new (corrected question, response) pair appended,
                    where the response comes from the internal rag, enriched or newly
                    written with PubMed documents
    - "": clears the text box content once the user presses enter

    Blank input is ignored: the history is returned unchanged and no agent is called.
    """
    # A missing history starts a fresh conversation
    history = chat_history if chat_history is not None else []

    # Nothing to ask: do not spend grammar checking and two agent runs on blank input
    if not user_input or not user_input.strip():
        return history, ""

    # Pre-correct typos/grammar (returns the text unchanged when there is nothing to fix)
    user_input = correct_input(user_input)

    # Run the two agent pipeline with the corrected input
    response = two_agent_pipeline(user_input)
    history.append((user_input, response))

    return history, ""

In [None]:
# Gradio Demo Breast Cancer-QA chatbot Interface

with gr.Blocks() as demo:
    # Title
    gr.Markdown('Cancer-QA BOT')

    # Chat window
    # No `type` argument: breast_cancer_fn builds the history as (user, bot) tuples,
    # which is the pair format; asking for 'messages' would contradict that
    chatbot = gr.Chatbot()

    # Input box for questions
    user_box = gr.Textbox(
        placeholder='Ask any question about breast cancer',
        show_label=False
    )

    # Button to clear chat
    clear_btn = gr.Button('Clear Chat')

    # On Enter: run the main function, update chat, clear input
    user_box.submit(
        fn = breast_cancer_fn,
        inputs = [user_box, chatbot],
        outputs = [chatbot, user_box]
    )

    # Function to clear memory and chat history
    def clear_memory():
        """Forget the stored conversation and return an empty history for the chat window."""
        global_memory.clear()
        return []  # empty history

    # Link clear button to clear function
    clear_btn.click(
        clear_memory,
        inputs = [],
        outputs = [chatbot],
        queue = False
    )

# Start the app (debug = True keeps the cell running so errors and logs stay visible)
demo.launch(debug = True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

IMPORTANT: You are using gradio version 3.38.0, however version 4.44.1 is available, please upgrade.
--------


  rag_answer = domain_rag_agent.run(user_input)




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "DomainRAGInfo",
}
```[0m1. Breast lump or firm feeling (mass) in or near the breast or under the arm.
2. Nipple changes or discharge (fluid that is not breast milk).
3. Skin changes on the breast(s) such as redness, swelling, dimples, or puckers.
4. Rapid onset of erythema (redness) and edema (swelling) with abnormal breast warmth.
5. Changes in the size or shape of the breast.

If you notice any of these changes, consult your doctor or nurse.

Observation: [36;1m[1;3m1. Breast lump or firm feeling (mass) in or near the breast or under the arm.
2. Nipple changes or discharge (fluid that is not breast milk).
3. Skin changes on the breast(s) such as redness, swelling, dimples, or puckers.
4. Rapid onset of erythema (redness) and edema (swelling) with abnormal breast warmth.
5. Changes in the size or shape of the breast.

If you notice any of these changes, consult your doctor or nurse.[0m
Thought:[

  docs = pubmed_retriever.get_relevant_documents(context)[:5]


Too Many Requests, waiting for 0.20 seconds...
Too Many Requests, waiting for 0.40 seconds...

Observation: [36;1m[1;3mAIM: The aim of this study was to evaluate the early diagnostic potential of various serum biomarkers and ultrasound characteristics in girls diagnosed with early central precocious puberty (CPP).
METHODS: A cohort of 125 girls presenting breast development was assessed between May 2020 and January 2023. Following a six-month follow-up and GnRH agonist stimulation test, 78 girls were classified into the CPP group and 47 into the premature thelarche (PT) group. Serum biomarkers, including insulin-like growth factor-binding protein 3 (IGFBP-3), insulin-like growth factor 1 (IGF-1), and dehydroepiandrosterone sulfate (DHEAs), as well as bone age index (BAI) and ultrasound features, were compared between the groups.
RESULTS: Chronological age did not significantly differ between the groups, but bone age and BAI were notably higher in the CPP group. Most serum levels, exc