In [1]:
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.memory import ChatMessageHistory
from langchain_chroma import Chroma
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter



# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies OPENAI_API_KEY for the OpenAI
# embedding/chat clients created in later cells — confirm.
load_dotenv()



True

In [2]:
# Split points, listed from coarsest to finest: the "==== Front" marker that
# signals a new article, any other "====" marker, then blank lines.
SPLIT_SEPARATORS = ["==== Front", "====", "\n\n"]

# Sizes are counted with len(), i.e. in characters.
splitter_options = dict(
    chunk_size=2000,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)

text_splitter = RecursiveCharacterTextSplitter(
    separators=SPLIT_SEPARATORS,
    **splitter_options,
)


In [3]:
# Read the whole corpus into memory as a single string.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale; the text contains non-ASCII characters (e.g. "©").
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open("data/HPP_all_data.txt", encoding="utf-8") as f:
    hpp_data = f.read()

In [4]:
## TODO pre-process to remove non-useful chunks, like references
## TODO add metadata based on chunk similarity

In [5]:
# Split the raw corpus string into a list of Document chunks using the
# splitter configured above.
texts = text_splitter.create_documents([hpp_data])


In [6]:
# NOTE(review): `[:0]` is always an empty slice, so this cell can only ever
# display `[]`. Possibly `texts[0]` or `len(texts)` was intended — confirm.
texts[:0]

[]

In [7]:
# Preview the first ten chunks to eyeball where the splitter cut the text.
texts[:10]

[Document(page_content='==== Front\nBr J Radiol\nBr J Radiol\nbjr\nThe British Journal of Radiology\n0007-1285\n1748-880X\nThe British Institute of Radiology.\n\n33684312\nBJR-D-20-01457\n10.1259/bjr.20201457\nReview Article\nbjrBJRint-vascMSK/Soft tissuesAtraumatic fractures of the femur\nAtraumatic fractures of the femur\nHedge et al\nHedge Ganesh 1doc.ganeshhegde@gmail.com\n\nThaker Siddharth 2siddharthnthaker@gmail.com\n\nBotchu Rajesh 3drbrajesh@yahoo.com\n\nFawcett Richard 2Richard.fawcett1@nhs.net\n\nhttps://orcid.org/0000-0002-8292-9096\nGupta Harun 2harun.gupta@nhs.net\n\n1 Department of Radiology, Royal Lancaster Infirmary, Lancaster, UK\n2 Department of Musculoskeletal Radiology, Leeds Teaching Hospitals, Leeds, UK\n3 Department of Musculoskeletal Radiology, Royal Orthopaedic Hospital, Birmingham, UK\nAddress correspondence to: Dr Harun Gupta. E-mail: harun.gupta@nhs.net\n01 5 2021\n18 3 2021\n18 3 2021\n94 1121 2020145719 12 2020\n24 2 2021\n01 3 2021\n© 2021 The Authors. P

In [8]:
# Only embed the first 100 chunks for now.
docs_to_embed = texts[:100]
vectorstore = Chroma.from_documents(
    documents=docs_to_embed,
    embedding=OpenAIEmbeddings(),
)



In [9]:
# Show the store's repr to confirm the object was created.
vectorstore

<langchain_chroma.vectorstores.Chroma at 0x15c4b5ad0>

In [10]:
# Wrap the vector store as a retriever that returns the 4 closest chunks.
# The result count must be passed through `search_kwargs`; a bare `k=4`
# keyword is not a search parameter of `as_retriever`.
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

In [11]:
# Retrieve the chunks the retriever ranks closest to this query.
docs = retriever.invoke("What is hypophosphatasia?")


In [12]:
# Inspect the retrieved chunks.
docs

[Document(page_content='Deficient expression and/or activity of TNAP enzyme in humans causes a rare, heritable disorder called hypophosphatasia (HPP). Hypophosphatasia in humans has a broad range of severity that corresponds with timing of onset. Symptoms range from lethal and/or severe with perinatal or infantile onset (29), to modest and milder forms with childhood and adult onset (30, 31). Through a series of preclinical studies conducted on the Alpl−/− mouse model of infantile hypophosphatasia (TNAP−/− mice), a mineral-targeted recombinant form of TNAP was developed (32, 33). This recombinant enzyme was initially tested in patients with life threatening disease and was shown to improve respiratory function, skeletal mineralization and survival in patients with severe perinatal and infantile hypophosphatasia (34, 35). More recent results confirm safety and longer-term efficacy of this enzyme replacement therapy. In a 7 year follow up to the original trial, skeletal healing was susta

In [15]:
# Prompt layout: a system message that carries the "{context}" slot, followed
# by the conversation so far, injected through the "messages" placeholder.
system_message = (
    "system",
    "Answer the user's questions based on the below context:\n\n{context}",
)
conversation_slot = MessagesPlaceholder(variable_name="messages")
question_answering_prompt = ChatPromptTemplate.from_messages(
    [system_message, conversation_slot]
)

chat = ChatOpenAI(model="gpt-3.5-turbo-1106")

# Chain that takes a list of documents as "context" and sends the filled-in
# prompt to the chat model.
document_chain = create_stuff_documents_chain(chat, question_answering_prompt)

In [16]:
# Ask a single question through the document chain.
# The context is retrieved for this exact question; reusing `docs` from the
# earlier "What is hypophosphatasia?" query would ground the answer in chunks
# selected for a different question.
question = "What are the limitations of animal models of hypophosphatasia?"

# Conversation history (this demo holds only the one user turn).
demo_ephemeral_chat_history = ChatMessageHistory()
demo_ephemeral_chat_history.add_user_message(question)

question_docs = retriever.invoke(question)

document_chain.invoke(
    {
        "messages": demo_ephemeral_chat_history.messages,
        "context": question_docs,
    }
)

'The limitations of animal models of hypophosphatasia include the challenge of directly translating findings to human patients. While animal models, such as the TNAP null mice, can provide valuable insights into the disease, differences in physiology, genetics, and response to treatments between species can limit the direct applicability of findings to human patients. Additionally, the complexity of the disease in humans, with its broad range of severity and symptoms, may not be fully captured by animal models. Therefore, while animal models are important for studying disease mechanisms and testing potential treatments, the findings need to be carefully validated in human clinical studies.'