In [1]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

Load Data

In [2]:
# Location of the PDF to index. Set the PDF_PATH environment variable to point
# at your own file; the fallback keeps the original local path so the notebook
# behaves exactly as before when the variable is unset.
PDF_PATH = os.environ.get(
    "PDF_PATH",
    "\\Users\\wolfe\\OneDrive\\Desktop\\BOOKS\\calculus-early-transcendentals-8th-ed.pdf",
)
loader = PyPDFLoader(PDF_PATH)

In [3]:
# Read the PDF through the loader. The result is a sequence of document
# objects: later cells take len(data) and read `.page_content` on its items.
# NOTE(review): presumably one entry per PDF page — confirm against the loader.
data = loader.load()

In [4]:
# Report how many documents were loaded and the size of one sample document.
# The sample index is clamped to the last document so inputs with fewer than
# 31 entries no longer raise IndexError; an empty load skips the sample line.
SAMPLE_INDEX = 30
print(f'You have {len(data)} document(s) in your data')
if data:
    sample_doc = data[min(SAMPLE_INDEX, len(data) - 1)]
    print(f'There are {len(sample_doc.page_content)} characters in your document')

You have 1404 document(s) in your data
There are 1838 characters in your document


Split the data into smaller chunks

In [5]:
# With PyPDFLoader the input has already been split once, so this is a second
# splitting pass. It is optional — try it on your own data and compare.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=2000,
)
texts = text_splitter.split_documents(data)

In [6]:
# Show how many chunks the splitter produced.
print(f'Now you have {len(texts)} documents')

Now you have 2661 documents


Create embeddings of your documents to get ready for semantic search

In [7]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from tqdm.autonotebook import tqdm
import pinecone

  from tqdm.autonotebook import tqdm


In [8]:
# Credentials are read from the environment ONLY. Secrets must never be written
# into the notebook (not even as fallback defaults): notebooks get shared and
# committed, and any key that has ever appeared in one should be treated as
# compromised and rotated with the provider.
def _require_env(name):
    """Return the value of environment variable `name`.

    Raises:
        RuntimeError: if the variable is unset or empty, so the notebook stops
            here with a clear message instead of failing later inside an API
            client.
    """
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name} is not set. Export it before starting "
            "the kernel; API keys must not be hardcoded in the notebook."
        )
    return value


OPENAI_API_KEY = _require_env('OPENAI_API_KEY')

PINECONE_API_KEY = _require_env('PINECONE_API_KEY')
# The environment name is not a secret, so a default is acceptable here.
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', 'gcp-starter')

In [10]:
# Embedding client, authenticated with the OpenAI key from the credentials
# cell. It is handed to the vector store when the chunks are indexed.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)


In [11]:
# Name of the Pinecone index that the chunks are written to and searched in.
index_name = "wolfe-test"

# Configure the Pinecone client. Both values are shown in the console at
# app.pinecone.io (the environment is listed next to the API key).
pinecone.init(environment=PINECONE_API_ENV, api_key=PINECONE_API_KEY)

In [12]:
# Build the searchable vector store from the plain text of every chunk.
# NOTE(review): re-running this cell presumably re-embeds and re-uploads every
# chunk to the same index — confirm before re-running against paid APIs.
docsearch = Pinecone.from_texts(
    [chunk.page_content for chunk in texts],
    embeddings,
    index_name=index_name,
)

In [21]:
# Ask the vector store for the chunks that best match a natural-language
# question. `docs` is indexed and read via `.page_content` in the next cell.
query = "How do I find the acceleration of my car?"
docs = docsearch.similarity_search(query)

In [25]:
# Preview the first 500 characters of the first document the search returned.
print(docs[0].page_content[:500])

what does that information indicate to us? We know that if the velocity remains constant, 
then after an hour we will have traveled 48 mi. But if the velocity of the car varies, what 
does it mean to say that the velocity at a given instant is 48 miyh?
In order to analyze this question, let’s examine the motion of a car that travels along a 
straight road and assume that we can measure the distance traveled by the car (in feet) at  
l-second intervals as in the following chart:t−Time elapsed ssd


Query those docs to get your answer back

In [15]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [16]:
# Language model configured with temperature 0, authenticated with the same
# OpenAI key as the embeddings.
llm = OpenAI(openai_api_key=OPENAI_API_KEY, temperature=0)

# Question-answering chain built on that model, using the "stuff" chain type.
chain = load_qa_chain(llm, chain_type="stuff")

In [19]:
# Retrieve the chunks matching the question that will be answered by the QA
# chain; both `query` and `docs` are passed to `chain.run` in the next cell.
query = "How do I find the velocity of my car?"
docs = docsearch.similarity_search(query)

In [20]:
# Run the QA chain on the question with the retrieved documents as input.
# The call is the cell's last expression, so its result is shown as output.
chain.run(input_documents=docs, question=query)

' The velocity of your car can be found by measuring the distance traveled by the car (in feet) at 1-second intervals and then calculating the average velocity over a given time interval. You can also graph the motion of the car by plotting the distance traveled as a function of time. The velocity at a given instant is then the slope of the tangent line to the curve at that point.'