In [17]:
import os

from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [84]:
# Path of the PDF to index. Point this at another file (for example
# "data/field-guide-to-data-science.pdf") to run the notebook on a different corpus.
PDF_PATH = "data/new-testament.pdf"

# Loader object for that PDF; nothing is read until .load() is called on it.
loader = UnstructuredPDFLoader(PDF_PATH)

In [85]:
# Run the loader and keep its result. The next cell takes len() of it and reads
# .page_content from its first element; in the recorded run it held one document.
# The recorded output also shows a warning that detectron2 is not installed, so
# partitioning fell back to the "fast" strategy.
data = loader.load()

detectron2 is not installed. Cannot use the hi_res partitioning strategy. Falling back to partitioning with the fast strategy.


In [86]:
# Sanity check on the load: how many documents came back, and how long the
# text of the first one is.
print(f'You have {len(data)} document(s) in your data')
print(f'There are {len(data[0].page_content)} characters in your document')

You have 1 document(s) in your data
There are 3489245 characters in your document


In [87]:
# Configure the splitter (chunk size 1000, no overlap between chunks) ...
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=1000,
)

# ... and apply it to the loaded document(s) to get the chunks used downstream.
texts = text_splitter.split_documents(data)

In [88]:
# Report how many chunks the splitter produced.
print(f'Now you have {len(texts)} documents')

Now you have 3624 documents


In [89]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

In [90]:
def secret_from_env(name, placeholder='...'):
    """Return the credential held in the environment variable ``name``.

    Args:
        name: Name of the environment variable to read.
        placeholder: Value returned when the variable is unset or empty.

    Returns:
        The variable's value as a string, or ``placeholder`` if it is missing
        or blank.
    """
    # `or` (rather than a .get default) also treats an empty string as missing.
    return os.environ.get(name) or placeholder


# Credentials come from the environment so real keys never have to be pasted
# into, and saved with, the notebook. Export OPENAI_API_KEY, PINECONE_API_KEY
# and PINECONE_API_ENV before starting the kernel. When a variable is not set,
# the constant keeps the '...' placeholder value.
OPENAI_API_KEY = secret_from_env('OPENAI_API_KEY')
PINECONE_API_KEY = secret_from_env('PINECONE_API_KEY')
PINECONE_API_ENV = secret_from_env('PINECONE_API_ENV')

In [91]:
# Embeddings client constructed with the OpenAI key; it is passed to
# Pinecone.from_texts when the vector store is built.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [92]:
# Name of the Pinecone index that the vector store is built against.
index_name = "gpt-4-chatbot-langchain"

# Initialise the pinecone client.
#   api_key:     find at app.pinecone.io
#   environment: shown next to the api key in the console
pinecone.init(environment=PINECONE_API_ENV, api_key=PINECONE_API_KEY)

In [93]:
# Only the page_content string of each chunk is handed over; nothing else
# from the chunk objects is forwarded.
chunk_texts = [chunk.page_content for chunk in texts]

# Build the Pinecone vector store from those strings with the OpenAI embeddings.
docsearch = Pinecone.from_texts(
    chunk_texts,
    embeddings,
    index_name=index_name,
)

In [94]:
# Question to look up in the indexed text.
query = "What was the name of the angel of the Lord who appeared unto Zacharias?"

# Fetch the stored chunks most similar to the question.
docs = docsearch.similarity_search(
    query,
    include_metadata=True,
)

In [95]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [96]:
# Completion model, with temperature set to 0.
llm = OpenAI(
    openai_api_key=OPENAI_API_KEY,
    temperature=0,
)

# Question-answering chain of type "stuff" built on that model.
chain = load_qa_chain(llm, chain_type="stuff")

In [97]:
# Same question as the earlier retrieval cell, set again here so the
# question-answering run below has both `query` and `docs` freshly defined.
query = "What was the name of the angel of the Lord who appeared unto Zacharias?"

# Fetch the stored chunks most similar to the question.
docs = docsearch.similarity_search(
    query,
    include_metadata=True,
)

In [98]:
# Answer the question from the retrieved chunks; the returned string is the cell's output.
chain.run(question=query, input_documents=docs)

' The angel of the Lord who appeared unto Zacharias was named Gabriel.'

In [99]:
# Next question, followed by a fresh retrieval of the chunks most similar to it.
query = "Who is Jesus's mom?"
docs = docsearch.similarity_search(
    query,
    include_metadata=True,
)

In [100]:
# Run the QA chain on the current question and its retrieved chunks.
chain.run(question=query, input_documents=docs)

' Mary'

In [101]:
# Final question, followed by a fresh retrieval of the chunks most similar to it.
query = "Who is Jesus's favourite disciple?"
docs = docsearch.similarity_search(
    query,
    include_metadata=True,
)

In [102]:
# Run the QA chain on the current question and its retrieved chunks.
chain.run(question=query, input_documents=docs)

' The Beloved Disciple is believed to be the Apostle John, one of the sons of Zebedee.'