In [1]:
import os
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv

In [2]:
load_dotenv()
key = os.getenv("GOOGLE_API_KEY")
pdf_path = "data/nihms-1901028.pdf"

In [3]:
with open(pdf_path, "rb") as file:
    reader = PdfReader(file)
    text = "".join(page.extract_text() for page in reader.pages if page.extract_text())
text

'High and normal protein diets improve body composition and \nglucose control in adults with type 2 diabetes: A randomized \ntrial\nJulianne G. Clina1, R. Drew Sayer1,3, Zhaoxing Pan2, Caroline W. Cohen3, Michael T. \nMcDermott4, Victoria A. Catenacci4, Holly R. Wyatt1,5, James O. Hill1\n1Department of Nutrition Sciences, University of Alabama at Birmingham\n2Department of Pediatrics, University of Colorado Anschutz Medical Campus\n3Department of Family and Community Medicine, University of Alabama at Birmingham\n4Division of Endocrinology, Metabolism and Diabetes, University of Colorado School of Medicine, \nAurora, Colorado\n5Anschutz Health and Wellness Center, University of Colorado Anschutz Medical Campus\nAbstract\nObjective:\xa0 Weight loss of ≥10% improves glucose control and may remit type 2 diabetes \n(T2D). High protein (HP) diets are commonly used for weight loss, but whether protein \nsources, especially red meat, impact weight loss-induced T2D management is unknown. This 

In [8]:
splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
chunks = splitter.split_text(text)
chunks

['High and normal protein diets improve body composition and \nglucose control in adults with type 2 diabetes: A randomized \ntrial\nJulianne G. Clina1, R. Drew Sayer1,3, Zhaoxing Pan2, Caroline W. Cohen3, Michael T. \nMcDermott4, Victoria A. Catenacci4, Holly R. Wyatt1,5, James O. Hill1\n1Department of Nutrition Sciences, University of Alabama at Birmingham\n2Department of Pediatrics, University of Colorado Anschutz Medical Campus\n3Department of Family and Community Medicine, University of Alabama at Birmingham\n4Division of Endocrinology, Metabolism and Diabetes, University of Colorado School of Medicine, \nAurora, Colorado\n5Anschutz Health and Wellness Center, University of Colorado Anschutz Medical Campus\nAbstract\nObjective:\xa0 Weight loss of ≥10% improves glucose control and may remit type 2 diabetes \n(T2D). High protein (HP) diets are commonly used for weight loss, but whether protein \nsources, especially red meat, impact weight loss-induced T2D management is unknown. This

In [6]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.vectorstores import FAISS

In [9]:
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
vector_store = FAISS.from_texts(chunks, embedding=embeddings)
vector_store.save_local("faiss_index")


In [47]:
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
import nest_asyncio
import weave
nest_asyncio.apply()


In [11]:
weave.init("medical-data-chatbot")

weave version 0.51.22 is available!  To upgrade, please run:
 $ pip install weave --upgrade
Logged in as Weights & Biases user: j-r-heierli.
View Weave data at https://wandb.ai/j-r-heierli-zhaw/medical-data-chatbot/weave


<weave.trace.weave_client.WeaveClient at 0x12fc8ae40>

In [12]:
retriever = vector_store.as_retriever(k=4)

In [20]:
docs = retriever.invoke("What is the difference between high and medium protein-based diets?")
docs

[Document(metadata={}, page_content='High and normal protein diets improve body composition and \nglucose control in adults with type 2 diabetes: A randomized \ntrial\nJulianne G. Clina1, R. Drew Sayer1,3, Zhaoxing Pan2, Caroline W. Cohen3, Michael T. \nMcDermott4, Victoria A. Catenacci4, Holly R. Wyatt1,5, James O. Hill1\n1Department of Nutrition Sciences, University of Alabama at Birmingham\n2Department of Pediatrics, University of Colorado Anschutz Medical Campus\n3Department of Family and Community Medicine, University of Alabama at Birmingham\n4Division of Endocrinology, Metabolism and Diabetes, University of Colorado School of Medicine, \nAurora, Colorado\n5Anschutz Health and Wellness Center, University of Colorado Anschutz Medical Campus\nAbstract\nObjective:\xa0 Weight loss of ≥10% improves glucose control and may remit type 2 diabetes \n(T2D). High protein (HP) diets are commonly used for weight loss, but whether protein \nsources, especially red meat, impact weight loss-indu

In [38]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

system_template = """
Answer the users question based on the below context:
<context> {context} </context>
"""
question_answering_prompt = PromptTemplate(template=system_template, input_variables=["context"])

model = ChatGoogleGenerativeAI(model="gemini-1.5-pro-latest", temperature=0.5)
document_chain = create_stuff_documents_chain(llm=model, prompt=question_answering_prompt)


In [24]:
from langchain_core.messages import HumanMessage

document_chain.invoke(
    {
        "context": docs,
        "messages": [
            HumanMessage(content="What is the difference between high and medium protein-based diets?")
        ],
    }
)

'**Question:** What is the effect of a high protein diet including red meat and a normal protein diet excluding red meat on weight loss and improved cardiometabolic health?\n\n**Answer:** The lack of observed effects of dietary protein and red meat consumption on weight loss and improved cardiometabolic health suggest that achieved weight loss – rather than diet composition – should be the principal target of dietary interventions for T2D management.'

In [26]:
from typing import Dict
from langchain_core.runnables import RunnablePassthrough


def parse_retriever_input(params: Dict):
    return params["messages"][-1].content


retrieval_chain = RunnablePassthrough.assign(
    context=parse_retriever_input | retriever,
).assign(
    answer=document_chain,
)

In [27]:
retrieval_chain.invoke(
    {
        "messages": [
            HumanMessage(content="What is the difference between high and medium protein-based diets?")
        ],
    }
)

{'messages': [HumanMessage(content='What is the difference between high and medium protein-based diets?', additional_kwargs={}, response_metadata={})],
 'context': [Document(metadata={}, page_content='High and normal protein diets improve body composition and \nglucose control in adults with type 2 diabetes: A randomized \ntrial\nJulianne G. Clina1, R. Drew Sayer1,3, Zhaoxing Pan2, Caroline W. Cohen3, Michael T. \nMcDermott4, Victoria A. Catenacci4, Holly R. Wyatt1,5, James O. Hill1\n1Department of Nutrition Sciences, University of Alabama at Birmingham\n2Department of Pediatrics, University of Colorado Anschutz Medical Campus\n3Department of Family and Community Medicine, University of Alabama at Birmingham\n4Division of Endocrinology, Metabolism and Diabetes, University of Colorado School of Medicine, \nAurora, Colorado\n5Anschutz Health and Wellness Center, University of Colorado Anschutz Medical Campus\nAbstract\nObjective:\xa0 Weight loss of ≥10% improves glucose control and may r

In [None]:
retrieval_chain.invoke(
    {
        "messages": [
            HumanMessage(content="What is the difference between high and medium protein-based diets?")
        ],
    }
)

In [28]:
retriever.invoke("Tell me more!")

[Document(metadata={}, page_content='through medical records or doctor reports, blood biomarkers were confirmed via a blood \ntest at the screening visit, and all other criteria were confirmed by self-report. The study \nprotocol was reviewed and approved the Institutional Review Boards at the University of \nAlabama at Birmingham and University of Colorado Anschutz Medical Campus. The study \nwas registered on clinicaltrials.gov  as NCT03832933 .\nExperimental Design\nAll participants followed the State of Slim (SOS) weight management program for the first \n16 weeks of the program, which consisted of weekly group classes led by a trained coach. \nParticipants received copies of the SOS book, copies of the course materials, and access \nto the online community. After the first 16 weeks, participants participated in the SOS \nNext Steps program which consists of 18 bi-weekly group classes for the remainder of the Clina et al. Page 3\nObesity (Silver Spring) . Author manuscript; availab

In [29]:
from langchain_core.messages import AIMessage, HumanMessage

query_transform_prompt = ChatPromptTemplate.from_messages(
    [
        MessagesPlaceholder(variable_name="messages"),
        (
            "user",
            "Given the above conversation, generate a search query to look up in order to get information relevant to the conversation. Only respond with the query, nothing else.",
        ),
    ]
)

In [30]:
query_transformation_chain = query_transform_prompt | model

In [32]:
query_transformation_chain.invoke(
    {
        "messages": [
            HumanMessage(content="What is the difference between high and medium protein-based diets?"),
            AIMessage(
                content="he study found that both high and normal protein diets improved body composition and glucose control in adults with type 2 diabetes. The lack of observed effects of dietary protein and red meat consumption on weight loss and improved cardiometabolic health suggest that achieved weight loss – rather than diet composition – should be the principal target of dietary interventions for T2D management."
            ),
            HumanMessage(content="Tell me more!"),
        ],
    }
)

AIMessage(content='Dietary protein and red meat consumption in type 2 diabetes management', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'safety_ratings': [{'category': 'HARM_CATEGORY_SEXUALLY_EXPLICIT', 'probability': 'NEGLIGIBLE', 'blocked': False}, {'category': 'HARM_CATEGORY_HATE_SPEECH', 'probability': 'NEGLIGIBLE', 'blocked': False}, {'category': 'HARM_CATEGORY_HARASSMENT', 'probability': 'NEGLIGIBLE', 'blocked': False}, {'category': 'HARM_CATEGORY_DANGEROUS_CONTENT', 'probability': 'NEGLIGIBLE', 'blocked': False}]}, id='run-fb6c22ac-7a13-44fd-be95-9681dab57b9d-0', usage_metadata={'input_tokens': 122, 'output_tokens': 12, 'total_tokens': 134, 'input_token_details': {'cache_read': 0}})

In [33]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableBranch

query_transforming_retriever_chain = RunnableBranch(
    (
        lambda x: len(x.get("messages", [])) == 1,
        # If only one message, then we just pass that message's content to retriever
        (lambda x: x["messages"][-1].content) | retriever,
    ),
    # If messages, then we pass inputs to LLM chain to transform the query, then pass to retriever
    query_transform_prompt | model | StrOutputParser() | retriever,
).with_config(run_name="chat_retriever_chain")

In [40]:
SYSTEM_TEMPLATE = """
Answer the user's questions based on the below context. 
If the context doesn't contain any relevant information to the question, don't make something up and just say "I don't know":

<context>
{context}
</context>
"""

question_answering_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            SYSTEM_TEMPLATE,
        ),
        MessagesPlaceholder(variable_name="messages"),
    ]
)

document_chain = create_stuff_documents_chain(model, question_answering_prompt)

conversational_retrieval_chain = RunnablePassthrough.assign(
    context=query_transforming_retriever_chain,
).assign(
    answer=document_chain,
)

In [41]:
conversational_retrieval_chain.invoke(
    {
        "messages": [
            HumanMessage(content="Can LangSmith help test my LLM applications?"),
        ]
    }
)

{'messages': [HumanMessage(content='Can LangSmith help test my LLM applications?', additional_kwargs={}, response_metadata={})],
 'context': [Document(metadata={}, page_content='575–583. [PubMed: 21070685] \n22. Barnard N, Levin S, and Trapp C, Correction: Barnard N, et al. Meat Consumption as a Risk Factor \nfor Type 2 Diabetes. Nutrients 2014, 6, 897–910. Nutrients, 2014. 6(10): p. 4317–4319. [PubMed: \n24566443] \n23. Pan A, et al. , Changes in red meat consumption and subsequent risk of type 2 diabetes mellitus: \nthree cohorts of US men and women. JAMA internal medicine, 2013. 173(14): p. 1328–1335. \n[PubMed: 23779232] \n24. Sayer RD, et al. , Dietary Approaches to Stop Hypertension diet retains effectiveness to reduce \nblood pressure when lean pork is substituted for chicken and fish as the predominant source \nof protein. The American journal of clinical nutrition, 2015. 102(2): p. 302–308. [PubMed: \n26063693] \n25. Sayer R, et al. , Equivalent reductions in body weight durin

In [42]:
conversational_retrieval_chain.invoke(
    {
        "messages": [
            HumanMessage(content="What is the difference between high and medium protein-based diets?"),
            AIMessage(
                content="he study found that both high and normal protein diets improved body composition and glucose control in adults with type 2 diabetes. The lack of observed effects of dietary protein and red meat consumption on weight loss and improved cardiometabolic health suggest that achieved weight loss – rather than diet composition – should be the principal target of dietary interventions for T2D management."
            ),
            HumanMessage(content="Tell me more!"),
        ],
    }
    )

{'messages': [HumanMessage(content='What is the difference between high and medium protein-based diets?', additional_kwargs={}, response_metadata={}),
  AIMessage(content='he study found that both high and normal protein diets improved body composition and glucose control in adults with type 2 diabetes. The lack of observed effects of dietary protein and red meat consumption on weight loss and improved cardiometabolic health suggest that achieved weight loss – rather than diet composition – should be the principal target of dietary interventions for T2D management.', additional_kwargs={}, response_metadata={}),
  HumanMessage(content='Tell me more!', additional_kwargs={}, response_metadata={})],
 'context': [Document(metadata={}, page_content='575–583. [PubMed: 21070685] \n22. Barnard N, Levin S, and Trapp C, Correction: Barnard N, et al. Meat Consumption as a Risk Factor \nfor Type 2 Diabetes. Nutrients 2014, 6, 897–910. Nutrients, 2014. 6(10): p. 4317–4319. [PubMed: \n24566443] \n23.

In [50]:
@weave.op()
async def get_answer(question: str, messages: dict):
    messages["messages"].append(HumanMessage(content=question))
    answer = conversational_retrieval_chain.invoke(messages)
    messages["messages"].append(AIMessage(content=answer["answer"]))
    return answer["answer"]

In [52]:
import asyncio
from langchain.globals import set_debug

set_debug(True)
messages = {"messages": []} 
print(asyncio.get_event_loop().run_until_complete(get_answer("What is the difference between high and medium protein-based diets?", messages))) 

[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableAssign<context>] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableAssign<context> > chain:RunnableParallel<context>] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableAssign<context> > chain:RunnableParallel<context> > chain:chat_retriever_chain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableAssign<context> > chain:RunnableParallel<context> > chain:chat_retriever_chain > chain:RunnableLambda] Entering Chain run with input:
[0m[inputs]
[36;1m[1;3m[chain/end][0m [1m[chain:RunnableSequence > chain:RunnableAssign<context> > chain:RunnableParallel<context> > chain:chat_retriever_chain > chain:RunnableL