In [1]:
import pandas as pd
import numpy as np 
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder

from pypdf import PdfReader
import os

import ollama
import chromadb

load_dotenv()



True

In [2]:
reader = PdfReader("pdf_data/AFS_p1.pdf")
number_of_pages = len(reader.pages)
page = reader.pages[1]
text = page.extract_text()

In [3]:
directory = 'pdf_data'
texts = []

for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    # checking if it is a file
    if os.path.isfile(f):
        text = ''
        reader = PdfReader(f)
        number_of_pages = len(reader.pages)
        for i in range(number_of_pages):
            page = reader.pages[i]
            text += page.extract_text()
        texts.append(text)



In [4]:
texts

["Original Full Length Article\nOverexpression of tissue-nonspeci ﬁc alkaline phosphatase increases the\nexpression of neurogenic differentiation markers in the human SH-SY5Y\nneuroblastoma cell line\nStephanie Grasera,B i r g i tM e n t r u pa, Doris Schneidera, Ludger Klein-Hitpassb,\nFranz Jakoba,1, Christine Hofmannc,⁎,1\naOrthopedic Department, Orthopedic Center for Musculoskeletal Research, University of Wuerzburg, Germany\nbInstitute of Cell Biology, Faculty of Medicine, University of Duisburg-Essen, Germany\ncChildren's Hospital, Section of Pediatric Rheumatology and Osteology, University of Wuerzburg, Germany\nabstract article info\nArticle history:\nReceived 22 January 2015Revised 24 April 2015\nAccepted 23 May 2015\nAvailable online 29 May 2015\nEdited by: Nuria Guanabens\nKeywords:\nHypophosphatasia\nCellular processes\nOverexpressionTransgenicMAP2Patients suffering from the rare hereditary disease hypophosphatasia (HPP), which is based on mutations in the\nALPL gene, tend 

In [5]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    length_function=len,
)


In [6]:
text_pdfs = text_splitter.create_documents(texts)


In [7]:
text_pdfs[:10]

[Document(page_content="Original Full Length Article\nOverexpression of tissue-nonspeci ﬁc alkaline phosphatase increases the\nexpression of neurogenic differentiation markers in the human SH-SY5Y\nneuroblastoma cell line\nStephanie Grasera,B i r g i tM e n t r u pa, Doris Schneidera, Ludger Klein-Hitpassb,\nFranz Jakoba,1, Christine Hofmannc,⁎,1\naOrthopedic Department, Orthopedic Center for Musculoskeletal Research, University of Wuerzburg, Germany\nbInstitute of Cell Biology, Faculty of Medicine, University of Duisburg-Essen, Germany\ncChildren's Hospital, Section of Pediatric Rheumatology and Osteology, University of Wuerzburg, Germany\nabstract article info\nArticle history:\nReceived 22 January 2015Revised 24 April 2015\nAccepted 23 May 2015\nAvailable online 29 May 2015\nEdited by: Nuria Guanabens\nKeywords:\nHypophosphatasia\nCellular processes\nOverexpressionTransgenicMAP2Patients suffering from the rare hereditary disease hypophosphatasia (HPP), which is based on mutations in

In [8]:
client = chromadb.Client()
collection = client.create_collection(name="docs")

# store each document in a vector embedding database
for i, d in enumerate(text_pdfs):
  response = ollama.embeddings(model="mxbai-embed-large", prompt=d.page_content)
  embedding = response["embedding"]
  collection.add(
    ids=[str(i)],
    embeddings=[embedding],
    documents=[d.page_content]
  )

In [9]:
# an example prompt
prompt = "What is hypophosphatasia?"

# generate an embedding for the prompt and retrieve the most relevant doc
response = ollama.embeddings(
  prompt=prompt,
  model="mxbai-embed-large"
)
results = collection.query(
  query_embeddings=[response["embedding"]],
  n_results=1
)
data = results['documents'][0][0]

In [11]:
output = ollama.generate(
  model="llama3",
  prompt=f"Using this data: {data}. Respond to this prompt: {prompt}"
)

print(output['response'])

According to the National Organization for Rare Disorders (NORD), Familial Hypophosphatemia is a rare genetic disorder characterized by deficient activity of the enzyme alkaline phosphatase, which plays a crucial role in maintaining proper levels of phosphorus in the body. 

Hypophosphatasia, also known as "Pseudohypoparathyroidism", is a condition where the production and secretion of parathyroid hormone (PTH) are normal, but there is a deficiency of alkaline phosphatase activity, leading to abnormally low levels of inorganic phosphate (Pi) in the blood. This disorder can manifest with symptoms such as osteosclerosis, bone resorption, and rickets-like conditions.

In summary, hypophosphatasia refers to a rare genetic disorder characterized by deficient alkaline phosphatase enzyme activity, resulting in abnormally low levels of inorganic phosphate in the blood, which can lead to various skeletal manifestations.


In [12]:
data

'Additional Background Sourcing on Familial Hypophosphatemia  \n \nFamilial Hypophosphatemia - NORD (National Organization for Rare Disorders). NORD (National \nOrganization for Rare Disorders) . 2017. Available at: https://rarediseases.org/rare-diseases/familial-\nhypophosphatemia/. Accessed March 24, 2017. \nhttps://rarediseases.org/rare-diseases/familial-hypophosphatemia/  \n \nAdditional Background Sourcing on Hereditary Hyperphosphatasia \n \nHereditary Hyperphosphatasia - NORD (National Organization for Rare Disorders). NORD (National \nOrganization for Rare Disorders) . 2017. Available at: https://rarediseases.org/rare-\ndiseases/hereditary-hyperphosphatasia/. Accessed July 5, 2017. \nhttps://rarediseases.org/rare-diseases/hereditary-hyperphosphatasia/'