In [1]:
# Query the PDF for full paper screening and potential data extraction

In [2]:
# Install the required libraries (uncomment and run once per environment)

# !pip install langchain
# !pip install openai
# !pip install PyPDF2  # read PDF files
# !pip install faiss-cpu  # store vector embeddings
# !pip install tiktoken  # split text into tokens
# !pip install -U langchain-community
# !pip install -U langchain-openai
# !pip install pymupdf

# PyMuPDFLoader
This is a document loader utility from the LangChain library, designed to work with PDF files using the PyMuPDF backend. It reads and loads the text content of PDF documents so that it can be processed by other applications, e.g. LLMs.

**Use cases:** Chat with PDFs, document search, summarization

**Steps**   
1. PyMuPDF parses the PDF file.
2. The text content is loaded page by page.
3. Optionally, the content is split by page or section.
4. The document content is returned in a structured format that LangChain can work with.

In [28]:
# Load the necessary libraries and environment variables up front
import os
import pprint  # standard-library pretty printer: shows data structures in a readable layout

from dotenv import load_dotenv
# PyMuPDFLoader is imported from langchain_community: the older
# `langchain.document_loaders` path is a deprecated re-export that newer
# LangChain releases no longer provide.
from langchain_community.document_loaders import PyMuPDFLoader

# Read variables from the .env file (expected to hold the OpenAI key) into the environment
load_dotenv()  # This works as long as the .env file is in the current working directory

True

# 1. Load PDF using PyMuPDFLoader

In [38]:
# Point the loader at the raw PDF and read it into a list of LangChain documents
loader = PyMuPDFLoader('../Data/Raw/vitDandCaonbirthweightMRvsRCT.pdf')
documents = loader.load()

# Every loaded document carries two pieces of information: its text
# (page_content) and its metadata. Print both for each one.
for doc in documents:
    pprint.pp(
        f'This is the document content{doc.page_content} and this is the metadata{doc.metadata}'
    )

('This is the document contentRESEARCH ARTICLE\n'
 'Association of maternal circulating 25(OH)D\n'
 'and calcium with birth weight: A mendelian\n'
 'randomisation analysis\n'
 'William D. ThompsonID1,2, Jessica TyrrellID1, Maria-Carolina BorgesID2,3, '
 'Robin\n'
 'N. BeaumontID1, Bridget A. KnightID1, Andrew R. Wood1, Susan M. RingID2,3,4, '
 'Andrew\n'
 'T. HattersleyID1, Rachel M. Freathy1,2‡, Debbie A. LawlorID2,3,5‡*\n'
 '1 Institute of Biomedical and Clinical Science, College of Medicine and '
 'Health, University of Exeter, Exeter,\n'
 'United Kingdom, 2 MRC Integrative Epidemiology Unit at the University of '
 'Bristol, Bristol, United Kingdom,\n'
 '3 Population Health Science, Bristol Medical School, University of Bristol, '
 'Bristol, United Kingdom, 4 Avon\n'
 'Longitudinal Study of Parents and Children, NIHR Bristol Biomedical Research '
 'Centre, University of Bristol,\n'
 'Bristol, United Kingdom, 5 Bristol NIHR Biomedical Research Centre, Bristol, '
 'United Kingdom\n'
 

# 2. Split text into chunks

In [30]:
# Break the loaded documents into overlapping chunks ahead of embedding
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunks are capped at a size of 1000 and neighbouring chunks share an overlap of 200
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
docs = text_splitter.split_documents(documents)

# 3. Create embeddings and store in vector DB

In [31]:
# FAISS is imported from langchain_community: the older `langchain.vectorstores`
# path is a deprecated re-export that newer LangChain releases no longer provide.
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

# No key is passed here, so the embeddings client relies on OPENAI_API_KEY
# being present in the environment.
embeddings = OpenAIEmbeddings()
# Embed every chunk in `docs` and index the resulting vectors in a FAISS store
vectorstore = FAISS.from_documents(docs, embeddings)

# 4. Create a Retriever and LLM chain

In [32]:
import os

from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI

# Read the key from the environment (populated from .env by load_dotenv()) so
# this cell does not depend on an `api_KEY` variable left over from an earlier
# kernel session; no visible cell assigns it. Raises KeyError if the variable is missing.
api_KEY = os.environ['OPENAI_API_KEY']

# Chat model that writes the final answer
llm = ChatOpenAI(model_name='gpt-4-turbo', openai_api_key=api_KEY)
# Expose the FAISS index as a retriever that fetches the chunks relevant to a question
retriever = vectorstore.as_retriever()
# chain_type='stuff' places all retrieved chunks into a single prompt for the LLM
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type='stuff',
)

# 5. Query the PDF

In [39]:
# Question to ask about the loaded paper
query = "What is the aim of this study"
# Run the retrieval QA chain; the returned dict holds the question under
# 'query' and the generated answer under 'result'
result = qa_chain.invoke(query)
pprint.pp(result)

{'query': 'What is the aim of this study',
 'result': 'The aim of the study is to explore whether there are causal '
           'effects of maternal circulating 25-hydroxyvitamin D (25(OH)D) and '
           'calcium on birth weight (BW) and to determine the magnitude of '
           'those effects. The study employs Mendelian Randomization (MR) and '
           'triangulates these results with findings from instrumental '
           'variable analyses applied to randomized controlled trials (RCTs) '
           'of vitamin D or calcium supplementation. This approach aims to '
           'provide a more accurate assessment of the causal relationships by '
           'comparing results from different methodological approaches that '
           'have unique potential sources of bias.'}
