In [1]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

Load Data

In [5]:
# Location of the PDF to index. Set the PDF_PATH environment variable to point the
# notebook at a different file; the original author's local path stays as the default
# so existing runs behave exactly as before.
PDF_PATH = os.environ.get(
    "PDF_PATH",
    "\\Users\\wolfe\\OneDrive\\Desktop\\BOOKS\\calculus-early-transcendentals-8th-ed.pdf",
)
loader = PyPDFLoader(PDF_PATH)

In [6]:
# Read the PDF through the loader. The result is a sequence of documents, each exposing
# a `page_content` string (presumably one document per PDF page — TODO confirm).
data = loader.load()

In [7]:
# Report how many documents the loader produced.
print(f'You have {len(data)} document(s) in your data')

# Show the size of one sample document. Index 30 is the sample this notebook has always
# inspected; it is clamped to the last available document so that shorter PDFs do not
# raise IndexError, and the line is skipped entirely when nothing was loaded.
if data:
    sample_index = min(30, len(data) - 1)
    print(f'There are {len(data[sample_index].page_content)} characters in your document')

You have 1404 document(s) in your data
There are 1838 characters in your document


Chunk data into smaller chunks

In [8]:
# With PyPDFLoader the data has already been split once by the loader, so this is a
# second, optional pass — try it on your own data and keep it only if it helps.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=2000,
)

# Break the loaded documents into chunks using the splitter configured above.
texts = text_splitter.split_documents(documents=data)

In [9]:
# Report how many chunks the splitting step produced.
print(f'Now you have {len(texts)} documents')

Now you have 2661 documents


Create embeddings of your documents to get ready for semantic search

In [10]:
from langchain.vectorstores import Chroma, Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone

  from tqdm.autonotebook import tqdm


In [38]:
def _load_secret(env_var, prompt):
    """Return the secret named by `env_var` without ever storing it in the notebook.

    The value is read from the environment first. When the variable is unset or empty
    the user is asked for it through a non-echoing prompt instead.

    Raises:
        RuntimeError: if neither the environment nor the prompt supplies a value.
    """
    from getpass import getpass  # local import keeps this cell self-contained

    value = os.environ.get(env_var) or getpass(prompt)
    if not value:
        raise RuntimeError(f'{env_var} is not set; export it before running this notebook.')
    return value


# Secrets must never be hardcoded as fallback defaults: a notebook is routinely shared
# and committed, which publishes the keys with it.
# NOTE: any key that was previously written into this cell should be revoked and rotated.
OPENAI_API_KEY = _load_secret('OPENAI_API_KEY', 'OpenAI API key: ')

PINECONE_API_KEY = _load_secret('PINECONE_API_KEY', 'Pinecone API key: ')
# The environment name is not a secret, so it keeps its original default.
PINECONE_API_ENV = os.environ.get('PINECONE_API_ENV', 'gcp-starter')

In [39]:
# Embedding client used for both indexing and querying; the key is passed explicitly.
embeddings = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
)


In [40]:
# Initialise the Pinecone client. The API key is listed at app.pinecone.io and the
# environment name is shown next to it in the console.
pinecone.init(environment=PINECONE_API_ENV, api_key=PINECONE_API_KEY)

# Name of the Pinecone index used by the cells that follow.
index_name = "wolfe-test"

In [None]:
# Only the plain text of each chunk is handed to the vector store; the chunks' other
# attributes are not forwarded.
chunk_texts = [chunk.page_content for chunk in texts]

# Build the Pinecone-backed vector store from those texts using the embedding client.
docsearch = Pinecone.from_texts(chunk_texts, embeddings, index_name=index_name)

In [None]:
# NOTE(review): the PDF loaded earlier appears to be a calculus textbook (per its file
# name), so this data-science question looks left over from another dataset — confirm.
query = "What are examples of good data science teams?"
# Ask the vector store for the stored chunks it considers most similar to the query.
docs = docsearch.similarity_search(query)

In [None]:
# Preview the first 450 characters of the first document that was returned.
# The search can come back empty, in which case docs[0] would raise IndexError,
# so say so explicitly instead of crashing the cell.
if docs:
    print(docs[0].page_content[:450])
else:
    print("No documents were returned for this query.")

Query those docs to get your answer back

In [None]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [None]:
# Completion model used to answer questions; temperature is fixed at 0 and the key is
# passed explicitly.
llm = OpenAI(
    openai_api_key=OPENAI_API_KEY,
    temperature=0,
)

# Question-answering chain built on that model, using the "stuff" chain type.
chain = load_qa_chain(
    llm,
    chain_type="stuff",
)

In [None]:
# Rebinds `query` and `docs`, replacing the values set by the earlier search cell.
# NOTE(review): the PDF loaded earlier appears to be a calculus textbook (per its file
# name), so this data-maturity question looks left over from another dataset — confirm.
query = "What is the collect stage of data maturity?"
# Retrieve the chunks that will be handed to the question-answering chain.
docs = docsearch.similarity_search(query)

In [None]:
# Run the chain on the retrieved chunks and the question; as the last expression in the
# cell, its return value is displayed as the cell output.
chain.run(
    question=query,
    input_documents=docs,
)