# Embed multiple PDFs and query with OpenAI API

This Jupyter Notebook walks through, step by step, how to load, parse, encode, and embed PDF files, and then how to query them as a corpus using the OpenAI APIs.

In [1]:
# Run these one at a time to install any library needed by the import cell below that you don't already have.
# Just remove the leading "#" from a line and execute the cell.

#pip install PyPDF4
#pip install langchain
#pip install python-dotenv
#pip install pdfplumber

In [74]:
import pdfplumber
import PyPDF4
import re
import os
import sys
from typing import Callable, List, Tuple, Dict

from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.chains import ConversationalRetrievalChain
from langchain.schema import HumanMessage, AIMessage

from dotenv import load_dotenv

In [75]:
# Never paste an API key into the notebook itself: it ends up in saved files and shared copies.
# Instead, put a line of the form OPENAI_API_KEY=<your key> in a local .env file
# (keys are created at platform.openai.com) or export the variable before starting Jupyter.
load_dotenv()  # copies variables from a .env file into os.environ
if not os.environ.get("OPENAI_API_KEY"):
    # Fail here with a clear message rather than later inside an OpenAI call.
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to a .env file or export it before running this notebook."
    )

## Steps:
1. Get file paths of the PDFs
2. Parse the PDFs into pages with page numbers 
3. Clean up the text
4. Create text chunks from each page, preserving page number and metadata as well as chunk number
5. Generate embeddings from the chunks
6. Store the embeddings so they can be accessed by an AI API
7. Enter into Q&A with the AI API

### Step 1. Get file paths of the PDFs

In [76]:
def get_file_paths_and_names(directory):
    """
    Finds all the file names and paths within a directory

    Only regular files directly inside ``directory`` are reported; subdirectories
    are skipped and are not searched.

    :param directory: Path of the directory to scan.
    :return: A tuple ``(file_paths, file_names)`` of two parallel lists. Each path is
        ``directory`` joined with the entry name; each name is the entry name without
        its final extension. Entries appear in the order ``os.listdir`` yields them.
    """
    file_paths = []
    file_names = []
    # Scan the directory once so both lists are built from the same listing
    # and always line up index-for-index.
    for entry in os.listdir(directory):
        full_path = os.path.join(directory, entry)
        if os.path.isfile(full_path):
            file_paths.append(full_path)
            file_names.append(os.path.splitext(entry)[0])
    return file_paths, file_names

In [77]:
# Step 1: Get the paths of every file in the data directory (non-PDF files are filtered out later)
# NOTE: this is an absolute path on the author's machine -- change it to your own data folder.
data_directory = "/Users/esme/Downloads/Project/src/data"
file_paths, file_names = get_file_paths_and_names(data_directory)

# Depending on your IDE you may need to give data_directory as a full absolute path.
# In VSCode a relative "data/" is enough.
file_paths

['/Users/esme/Downloads/Project/src/data/marijuana.pdf',
 '/Users/esme/Downloads/Project/src/data/fake_helicopter_incidents.csv']

For Steps 2-4, we'll first work on just one of the PDFs so we can output some of the intermediate results as we go and see how things look. Once we have a feel for what we're doing, we'll run a script that performs the same steps on every PDF in our directory at once.

### Step 2. Parse the PDFs into pages with page numbers 

In [78]:
def parse_pdf(file_path: str) -> Tuple[List[Tuple[int, str]], Dict[str, str]]:
    """
    Reads one PDF and returns its page texts together with its metadata.

    :param file_path: The path to the PDF file.
    :return: A tuple ``(pages, metadata)``: the result of extract_pages_from_pdf
        followed by the result of extract_metadata_from_pdf for this file.
    :raises FileNotFoundError: If file_path does not point to an existing file.
    """
    if os.path.isfile(file_path):
        # Metadata is read first, then the page text; each has its own helper.
        document_info = extract_metadata_from_pdf(file_path)
        page_texts = extract_pages_from_pdf(file_path)
        return page_texts, document_info

    raise FileNotFoundError(f"File not found: {file_path}")

In [79]:
def extract_metadata_from_pdf(file_path: str) -> dict:
    '''
    Extracts the metadata from a PDF.

    :param file_path: The path to the PDF file.
    :return: A dictionary with the keys "title", "author" and "creation_date". Each value is
        the stripped text of the matching PDF info entry, or "" when that entry is missing.
    '''
    with open(file_path, "rb") as pdf_file:
        reader = PyPDF4.PdfFileReader(pdf_file)
        # getDocumentInfo() returns None for a PDF that has no info dictionary.
        # Fall back to an empty mapping so such files give empty metadata instead of crashing.
        info = reader.getDocumentInfo() or {}

        def _text(key: str) -> str:
            # Normalise one info entry to a plain, stripped str.
            value = info.get(key)
            if value is None:
                return ""
            if isinstance(value, bytes):
                # Byte-string entries are decoded so every metadata value is text.
                value = value.decode("utf-8", errors="replace")
            return str(value).strip()

        return {
            "title": _text("/Title"),
            "author": _text("/Author"),
            "creation_date": _text("/CreationDate"),
        }

In [80]:
def extract_pages_from_pdf(file_path: str) -> List[Tuple[int, str]]:
    """
    Extracts the text from each page of the PDF.

    Pages whose extracted text is empty or whitespace-only are left out, so the
    returned page numbers may have gaps.

    :param file_path: The path to the PDF file.
    :return: A list of tuples containing the 1-based page number and the extracted text.
    :raises FileNotFoundError: If file_path does not point to an existing file.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    pages = []
    with pdfplumber.open(file_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            # extract_text() can return None for a page with no text (e.g. a scanned
            # image) in some pdfplumber versions; treat that the same as an empty page.
            text = page.extract_text() or ""
            if text.strip():
                pages.append((page_num, text))
    return pages

In [81]:
# parse_pdf itself calls two other functions:
#  extract_metadata_from_pdf and extract_pages_from_pdf
# NOTE: file_paths[0] must be a PDF. The order comes from os.listdir, so check the
# file_paths output above if this cell fails on a non-PDF file.
raw_pages, metadata = parse_pdf(file_paths[0])  # only the first document for now
# print(raw_pages)
print(metadata)

{'title': 'Use of Marijuana: Effect on Brain Health: A Scientific Statement From the American Heart Association', 'author': '', 'creation_date': "D:20220321130727+05'30'"}


In [82]:
# Checking on the output
type(raw_pages)

list

In [83]:
# We can see how many pages the first file has
len(raw_pages)

12

In [84]:
# We can output these pages to see what information they each contain
for i in raw_pages:
    print(i)

(1, 'Stroke\nAHA SCIENTIFIC STATEMENT\nUse of Marijuana: Effect on Brain Health:\nA Scientific Statement From the American Heart\nAssociation\nThe American Academy of Neurology affirms the value of this statement as an educational tool for neurologists.\nFernando D. Testai, MD, PhD, Chair; Philip B. Gorelick, MD, MPH, Vice Chair; Hugo J. Aparicio, MD, MPH; Francesca M. Filbey, PhD;\nRaul Gonzalez, PhD; Rebecca F. Gottesman, MD, PhD; Miriam Melis, PhD; Mariann R. Piano, RN, PhD; Tiziana Rubino, PhD;\nSarah Y. Song, MD; on behalf of the American Heart Association Stroke Brain Health Science Subcommittee of the Stroke Council;\nCouncil on Arteriosclerosis, Thrombosis and Vascular Biology; Council on Cardiovascular and Stroke Nursing; Council on Lifestyle\nand Cardiometabolic Health; and Council on Peripheral Vascular Disease\nABSTRACT: Marijuana is perceived as a harmless drug, and its recreational use has gained popularity among young individuals.\nThe concentration of active ingredients

### Step 3. Clean up the text

In [85]:
def clean_text(pages: List[Tuple[int, str]], cleaning_functions: List[Callable[[str], str]]) -> List[Tuple[int, str]]:
    """
    Runs every cleaning function over the text of every page.

    :param pages: The list of pages, each a (page number, text) pair.
    :param cleaning_functions: Functions taking and returning a string; they are applied
        to each page's text in list order, each one receiving the previous one's output.
    :return: A new list of (page number, cleaned text) pairs in the same page order.
    """
    def _run_pipeline(raw: str) -> str:
        # Thread the text through the cleaning functions one after another.
        result = raw
        for step in cleaning_functions:
            result = step(result)
        return result

    return [(number, _run_pipeline(body)) for number, body in pages]

In [86]:
# Three simple regex cleaning functions

def merge_hyphenated_words(text: str) -> str:
    """Rejoins a word that was split by a hyphen followed directly by a line break."""
    hyphen_at_line_break = r"(\w)-\n(\w)"
    # Keep the two surrounding word characters; drop the hyphen and the newline.
    return re.sub(hyphen_at_line_break, r"\1\2", text)

def fix_newlines(text: str) -> str:
    """Replaces each isolated newline with a space; runs of two or more newlines are kept."""
    lone_newline = r"(?<!\n)\n(?!\n)"
    return re.sub(lone_newline, " ", text)

def remove_multiple_newlines(text: str) -> str:
    """Collapses every run of two or more consecutive newlines into a single newline."""
    newline_run = r"\n{2,}"
    return re.sub(newline_run, "\n", text)

In [87]:
# The three simple cleaning functions were defined in the cell above. Here, we're just putting them into a list
cleaning_functions = [
    merge_hyphenated_words,
    fix_newlines,
    remove_multiple_newlines,
    ]

# clean_text (defined earlier) applies each cleaning function in the list, in order, to every page's text
cleaned_text_pdf = clean_text(raw_pages, cleaning_functions)

In [88]:
type(cleaned_text_pdf)

list

In [89]:
len(cleaned_text_pdf)

12

In [90]:
for i in cleaned_text_pdf:
    print(i)

(1, 'Stroke AHA SCIENTIFIC STATEMENT Use of Marijuana: Effect on Brain Health: A Scientific Statement From the American Heart Association The American Academy of Neurology affirms the value of this statement as an educational tool for neurologists. Fernando D. Testai, MD, PhD, Chair; Philip B. Gorelick, MD, MPH, Vice Chair; Hugo J. Aparicio, MD, MPH; Francesca M. Filbey, PhD; Raul Gonzalez, PhD; Rebecca F. Gottesman, MD, PhD; Miriam Melis, PhD; Mariann R. Piano, RN, PhD; Tiziana Rubino, PhD; Sarah Y. Song, MD; on behalf of the American Heart Association Stroke Brain Health Science Subcommittee of the Stroke Council; Council on Arteriosclerosis, Thrombosis and Vascular Biology; Council on Cardiovascular and Stroke Nursing; Council on Lifestyle and Cardiometabolic Health; and Council on Peripheral Vascular Disease ABSTRACT: Marijuana is perceived as a harmless drug, and its recreational use has gained popularity among young individuals. The concentration of active ingredients in recreati

### Step 4. Create text chunks from each page, preserving page number and metadata as well as chunk number

In [91]:
def text_to_docs(
    text: List[Tuple[int, str]],
    metadata: Dict[str, str],
    chunk_size: int = 1000,
    chunk_overlap: int = 200,
) -> List[Document]:
    """
    Splits each page into overlapping chunks and wraps every chunk in a Document.

    :param text: A list of (page number, page text) pairs.
    :param metadata: A dictionary of metadata copied onto every chunk. Its entries are
        merged in last, so a key named "page_number", "chunk" or "source" would replace
        the per-chunk value of the same name.
    :param chunk_size: Maximum chunk size passed to the text splitter.
    :param chunk_overlap: Overlap between consecutive chunks passed to the text splitter.
    :return: A flat list of Documents in page order, then chunk order. Each carries
        "page_number", "chunk" (0-based index within the page) and "source"
        ("p<page>-<chunk>") in its metadata, plus everything from ``metadata``.
    """
    # The splitter only holds configuration, so build it once instead of once per page.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
        chunk_overlap=chunk_overlap,
    )

    doc_chunks = []
    for page_num, page in text:
        for i, chunk in enumerate(text_splitter.split_text(page)):
            doc_chunks.append(
                Document(
                    page_content=chunk,
                    metadata={
                        "page_number": page_num,
                        "chunk": i,
                        "source": f"p{page_num}-{i}",
                        **metadata,
                    },
                )
            )

    return doc_chunks

In [92]:
# We split the pages into overlapping chunks using the text_to_docs function
document_chunks = text_to_docs(cleaned_text_pdf, metadata)

In [93]:
type(document_chunks)

list

In [94]:
# We have more chunks than we did pages
len(document_chunks)

94

In [95]:
# Notice that the metadata is still attached to each chunk: page_number, chunk, source, title, author, creation_date
for i in document_chunks:
    print(i)
    print("")

page_content='Stroke AHA SCIENTIFIC STATEMENT Use of Marijuana: Effect on Brain Health: A Scientific Statement From the American Heart Association The American Academy of Neurology affirms the value of this statement as an educational tool for neurologists. Fernando D. Testai, MD, PhD, Chair; Philip B. Gorelick, MD, MPH, Vice Chair; Hugo J. Aparicio, MD, MPH; Francesca M. Filbey, PhD; Raul Gonzalez, PhD; Rebecca F. Gottesman, MD, PhD; Miriam Melis, PhD; Mariann R. Piano, RN, PhD; Tiziana Rubino, PhD; Sarah Y. Song, MD; on behalf of the American Heart Association Stroke Brain Health Science Subcommittee of the Stroke Council; Council on Arteriosclerosis, Thrombosis and Vascular Biology; Council on Cardiovascular and Stroke Nursing; Council on Lifestyle and Cardiometabolic Health; and Council on Peripheral Vascular Disease ABSTRACT: Marijuana is perceived as a harmless drug, and its recreational use has gained popularity among young individuals. The concentration of active ingredients in

We've seen how steps 2-4 work on a single document. Let's now do it for all the documents in the corpus at once. So far, we haven't called any of OpenAI's APIs, so redoing the first document as part of the larger loop won't incur any API cost.

In [96]:
# Only work on .pdf files. The extension is matched case-insensitively so "report.PDF" is not skipped.
pdf_file_paths = [file for file in file_paths if file.lower().endswith('.pdf')]

# The cleaning pipeline is the same for every file, so define it once, outside the loop
cleaning_functions = [
    merge_hyphenated_words,
    fix_newlines,
    remove_multiple_newlines,
]

document_chunks = []
for file in pdf_file_paths:
    print("working on ", file)
    # Step 2: Parse the PDF into pages with page numbers
    raw_pages, metadata = parse_pdf(file)
    # Step 3: Clean up the text
    cleaned_text_pdf = clean_text(raw_pages, cleaning_functions)
    # Step 4: Create text chunks from each page, preserving info, and add them to the corpus-wide list
    document_chunks.extend(text_to_docs(cleaned_text_pdf, metadata))
print("Done")
print("")
print("The number of total chunks is: ", len(document_chunks))

working on  /Users/esme/Downloads/Project/src/data/marijuana.pdf
Done

The number of total chunks is:  94


### Steps 5&6: Generate embeddings from the chunks, Store the embeddings so they can be accessed by an AI API

In [97]:
# We specify that we want OpenAI to handle the embeddings, because we'll be referencing them with OpenAI models
embeddings = OpenAIEmbeddings()

# You'll need to change the below folder to match where you want the embeddings to be stored.
# NOTE: this is an absolute path on the author's machine.
embedding_directory = "/Users/esme/Downloads/Project/src/data/chroma"

# Chroma does the hard work for us: it builds the collection named "collection" from our chunks,
# using `embeddings`, with persist_directory as its storage folder.
# NOTE(review): re-running this cell presumably adds the same chunks to the stored collection
#   again -- confirm, and clear the folder before a re-run if so.
# NOTE(review): depending on the installed Chroma version, an explicit vector_store.persist()
#   may be needed before the next cell re-opens this directory -- TODO confirm.
vector_store = Chroma.from_documents(
    document_chunks,
    embeddings,
    collection_name = "collection",
    persist_directory = embedding_directory,
)
print(vector_store)

<langchain.vectorstores.chroma.Chroma object at 0x7fa410a8f490>


### Step 7: Enter into Q&A with the AI API

In [98]:
# Specify what model and hyperparameters we want to use
model = ChatOpenAI(
    model_name = "gpt-3.5-turbo",
    temperature = 0,  # a number, not the string "0"
    verbose = True
)

# Re-open the stored collection: same collection name, embedding function and directory as in the previous step
vector_store = Chroma(
    collection_name = "collection",
    embedding_function = embeddings,
    persist_directory = embedding_directory,
)

# Create the "chain" variable that constitutes our actual chat.
# return_source_documents = True makes each response include the retrieved chunks
# under the "source_documents" key alongside the answer.
chain = ConversationalRetrievalChain.from_llm(
    model,
    retriever = vector_store.as_retriever(),
    return_source_documents = True,
    verbose = True,
)

In [100]:
# NOTE: chat_history is reset to an empty list every time this cell runs, so re-running the
# cell starts a fresh conversation rather than continuing the previous one.
chat_history = []
question = input("Question: ")
# Generate answer: the chain receives the question plus the conversation so far
response = chain({"question": question, "chat_history": chat_history})
# Retrieve the answer text and the source documents from the response
answer = response["answer"]
source = response["source_documents"]
print(f'answer: {answer}')
# Record this exchange in the history
chat_history.append(HumanMessage(content=question))
chat_history.append(AIMessage(content=answer))

# Optional: uncomment the lines below to list the source chunks returned with the answer
# print("\n\nSources:\n")
# for document in source:
#     print(f"Title: {document.metadata['title']}")
#     print(f"Page: {document.metadata['page_number']}")
#     print(f"Text chunk: {document.page_content[:200]}...\n")
# print(f"Answer: {answer}")

Question:  what does your source say about THC?




[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the users question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
oncentramarijuana in 20141 and that 22 million met criteria for can- tion of the primary psychoactive constituent of marijuana, nabis use disorder in 2016.2 In addition, according to the Δ9-tetrahydrocannabinol (THC), has gradually increased 2002 to 2019 National Survey on Drug Use and Health, from ≈4% in 1995 to 15% in 2018.5 the proportion of the US population >12 years of age who Cannabinoid receptors are expressed in high density used marijuana in the past year increased gradually from in areas of the brain involved in executive function and The content is solely the responsibility of the authors and does not necessarily represent the official views of the National Ins