In [2]:
# Load the libraries that are needed
from langchain.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

import os
import random

In [3]:
# Load the document you want to parse; change the path below to wherever your PDF lives.
# The data folder in the repo contains some example PDFs you can use.

# Option 1: a single PDF file
loader = PyPDFLoader("/mnt/code/data/Northwind_Health_Plus_Benefits_Details.pdf")
# Option 2: every PDF in a folder
# loader = PyPDFDirectoryLoader("/mnt/data/RAG/")

# Split the loaded document with a chunk_size of 1000 and no overlap between chunks
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = loader.load_and_split(splitter)

In [4]:
# Report how many entries `load_and_split` produced.
# NOTE(review): `texts` is the splitter's output, so this is presumably a chunk count
# rather than a PDF page count — confirm before relying on the wording.
print("There are {} pages in the document".format(len(texts)))

There are 744 pages in the document


In [5]:
# Pick a random sample entry from the split document and show it.
# random.choice is used instead of indexing with random.randint(0, len(texts)):
# randint includes its upper bound, so it could return len(texts) and raise IndexError.
print(random.choice(texts))

page_content='oral hygiene is also important, as TMJ can be caused by dental problems such as \nmisalignment of the teeth.  \nFinally, employees should be aware that TMJ is a chronic condition, which means that it may \nrequire ongoing management. Regular visits to the he althcare provider for diagnosis and \ntreatment may be necessary in order to manage the condition. Employees should also be \naware that the Northwind Standard plan does not cover all TMJ treatments, so it is \nimportant to understand their financial responsibil ity for any treatment that is not covered.  \nIn conclusion, the Northwind Standard plan does provide coverage for TMJ care, but there \nmay be certain treatments and services that are not covered. It is important for employees \nto understand their financial res ponsibility prior to receiving treatment, and to discuss their \ncondition and treatment options with their healthcare provider. Additionally, it is important' metadata={'source': '/mnt/data/RAG/North

In [6]:
# Imports for creating embeddings of your documents and running semantic search

from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
import pinecone
import pickle


  from tqdm.autonotebook import tqdm


In [8]:
# Fetch the OpenAI API key from the environment (None when the variable is unset)
# and hand it to the embeddings client.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [10]:
# Build a FAISS index over the text of every split entry, then persist it
# locally as a pickle file so it can be reloaded without re-embedding.
chunk_texts = [doc.page_content for doc in texts]
store = FAISS.from_texts(chunk_texts, embeddings)

with open("healthcareplandetails.pkl", "wb") as pickle_file:
    pickle.dump(store, pickle_file)


In [9]:
# Load the embeddings from the pickle file; change the location if needed.
# The load is skipped when `store` already exists (and is not None) in this session.
# SECURITY NOTE: pickle.load can execute arbitrary code while unpickling — only load
# a pickle file that you produced yourself and trust.
if 'store' not in locals() or store is None:
    with open("healthcareplandetails.pkl", "rb") as f:
        store = pickle.load(f)

# Question-answering chain: the vector store acts as the retriever and the chat model
# answers from the retrieved documents (temperature=0 keeps sampling randomness to a minimum).
qa = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0, openai_api_key=OPENAI_API_KEY),
    chain_type="stuff",
    retriever=store.as_retriever(),
)

query = "Which does my plan cover?"

# Direct similarity search, kept so the matching documents can be inspected.
# The chain below does not consume `docs`; it retrieves through its own retriever.
docs = store.similarity_search(query)

# RetrievalQA reads only the "query" key, so the previously passed "input_documents"
# entry had no effect and has been removed. Last expression: the answer dict is displayed.
qa({"query": query}, return_only_outputs=True)

{'result': "Based on the provided information, the Northwind Standard plan covers mental health services, physical therapy, and inpatient care for newborns. However, it does not cover home health care, long-term care, alternative therapies (such as acupuncture, massage, and chiropractic care), or cosmetic treatments. It is important to review your specific plan to understand what is and isn't covered in detail."}