# Load necessary libraries

In [6]:
# PDF loader used by the parsing cell further down
from langchain.document_loaders import PyMuPDFLoader
import pprint # pretty printing
import os
from dotenv import load_dotenv
# Read variables from a .env file into the environment; the OpenAI key is
# expected to be defined there. The `True` shown as this cell's output is the
# return value of this call.
load_dotenv() # load the OpenAI key

True

# Parse multiple files using PyMuPDFLoader

In [13]:
# PDFs to ingest, given as paths relative to this notebook
pdf_files = [
    '../Data/Raw/vitDandCaonbirthweightMRvsRCT.pdf',
    '../Data/Raw/IVanalysisViandaStel.pdf',
]

# Parse each PDF and gather everything the loader returns in one flat list.
# The list is rebuilt from empty on every run, so re-running this cell does
# not accumulate duplicates.
documents = []
for pdf_path in pdf_files:
    documents.extend(PyMuPDFLoader(pdf_path).load())

# View the content of the multiple files
for parsed in documents:
    pprint.pp(parsed.page_content)

 # Split text into chunks and set overlaps

In [17]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters: target size of each chunk and how much neighbouring
# chunks overlap (units are whatever the splitter's length function measures —
# presumably characters; TODO confirm).
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# Split the parsed PDF documents into overlapping chunks for embedding
docs = text_splitter.split_documents(documents)

# Create embeddings and store in a vector DB

In [22]:
from langchain_openai import OpenAIEmbeddings  # creation of embeddings
# NOTE(review): newer LangChain releases may expect this import from
# langchain_community.vectorstores — confirm against the installed version.
from langchain.vectorstores import FAISS  # vector store

# Embedding client; presumably picks up the OpenAI key loaded via load_dotenv()
embeddings = OpenAIEmbeddings()

# Embed every chunk and index the vectors in a FAISS store
vectorstore = FAISS.from_documents(documents=docs, embedding=embeddings)

# Create a retriever and LLM chain

In [27]:
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI

# Chat model that generates the answers
llm = ChatOpenAI(model_name='gpt-4-turbo')

# Retriever backed by the FAISS store; k=7 is passed through as a search
# argument (presumably the number of chunks returned per query — TODO confirm)
retriever = vectorstore.as_retriever(search_kwargs=dict(k=7))

# Question-answering chain combining the retriever with the chat model
qa_chain = RetrievalQA.from_chain_type(
    chain_type='stuff',
    retriever=retriever,
    llm=llm,
)

# Query the PDFs


In [28]:
# Question put to the retrieval-QA chain; the retriever cell below reuses it
query = "What is the aim of this study"
# Run the chain; the printed value below is a dict with 'query' and 'result' keys
result = qa_chain.invoke(query)
print(result)

{'query': 'What is the aim of this study', 'result': 'The aim of the study is to explore whether there are causal effects of maternal circulating 25(OH)D (vitamin D) and calcium on birth weight (BW), and to quantify what the magnitude of these effects are if present. The study utilizes data from recent systematic reviews, meta-analyses, and includes new data from the UK Biobank. It employs multiple analytical methods including two-sample summary data Mendelian randomization (MR) and instrumental variables applied to randomized controlled trials (RCTs), aiming to triangulate the results from these different methods to compare and validate the findings. This approach is designed to address the potential biases specific to each method and ensure a robust examination of the causative impact of 25(OH)D and calcium on birth weight.'}


In [31]:
# Ask the retriever directly for the chunks it returns for the same query,
# then list each chunk's source file with a short preview of its text.
retrieved_docs = retriever.invoke(query)
print("\nRetrieved Documents:")
for doc in retrieved_docs:
    origin = doc.metadata.get('source', 'Unknown')  # fall back when no source is recorded
    snippet = doc.page_content[:200]  # first 200 characters only
    pprint.pp(f"Source: {origin}, Content: {snippet}...")


Retrieved Documents:
('Source: ../Data/Raw/vitDandCaonbirthweightMRvsRCT.pdf, Content: here we used '
 'RCTs of randomisation to vitamin D supplements to quantify the effect of '
 'cir-\n'
 'culating 25(OH)D on BW. This differs from the original aim and analyses of '
 'these RCTs,\n'
 'which was to determine...')
('Source: ../Data/Raw/vitDandCaonbirthweightMRvsRCT.pdf, Content: use MR to '
 'explore whether there are causal effects of maternal circulating 25(OH)D and '
 'cal-\n'
 'cium on BW and, if so, what the magnitude those effects are. With the '
 'release of new UK Bio-\n'
 'bank (UKB) data...')
('Source: ../Data/Raw/IVanalysisViandaStel.pdf, Content: ogy when '
 'investigating the effect of therapy on the outcome\n'
 '[6, 16–22]. In addition, the method can be used for other\n'
 'ORIGINAL ARTICLE\n'
 'V.S. Stel et al.\n'
 '1696\n'
 'Downloaded from https://academic.oup.com/nd...')
('Source: ../Data/Raw/vitDandCaonbirthweightMRvsRCT.pdf, Content: main results '
 'and all other l