In [6]:
# Load the libraries that are needed
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

import os
import random

In [7]:
# Read the PDF and break it into chunks of at most 1000 characters with no overlap.
# Point the path below at wherever your PDF lives; the repo's data folder
# contains some example PDFs you can use.
loader = PyPDFLoader("/mnt/code/Select_Global_Value_Fund.pdf")
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = loader.load_and_split(text_splitter=splitter)

In [8]:
# `texts` is what load_and_split returned, so this is the number of split chunks
# (which may differ from the PDF's page count).
summary = f"There are {len(texts)} pages in the document"
print(summary)

There are 71 pages in the document


In [10]:
# Pick a sample chunk at random.
# random.choice always returns an existing element, whereas indexing with
# random.randint(0, len(texts)) can yield len(texts) (randint includes its
# upper bound) and raise IndexError.
print(random.choice(texts))

page_content='controller (2019–2022) and director of Audit Services(2017–2019) at Vanguard. Senior manager\n(2015–2017) at PriceWaterhouseCoopers (audit and\nassurance, consulting, and tax services).\nPeter Mahoney\nBorn in 197 4. Principal occupation(s) during the past\nfive years and other experience: principal of Vanguard.\nController (2015–present) of each of the investment\ncompanies served by Vanguard. Head of International\nFund Services (2008–2014) at Vanguard.\nAnne E. Robinson\nBorn in 1970. Principal occupation(s) during the past\nfive years and other experience: general counsel\n(2016–present) of Vanguard. Secretary (2016–present)\nof Vanguard and of each of the investment companies\nserved by Vanguard. Managing director\n(2016–present) of Vanguard. Managing director and\ngeneral counsel of Global Cards and ConsumerServices (2014–2016) at Citigroup. Counsel\n(2003–2014) at American Express. Nonexecutive\ndirector of the board of National Grid (energy).Michael Rollings' meta

In [None]:
#Create embeddings of your documents to get ready for semantic search

from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
import pinecone
import pickle


In [12]:
# Credentials come from environment variables so they never appear in the notebook.
# Each value is None when the corresponding variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
PINECONE_API_ENV = os.environ.get("PINECONE_API_ENV")

In [16]:
# OpenAI embedding client, authenticated with the key read from the environment
embeddings = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
)

In [17]:
# Name of the Pinecone index used by this notebook
index_name = "vanguard-etf"

# Configure the Pinecone client with the credentials read from the environment
pinecone.init(
    environment=PINECONE_API_ENV,  # next to api key in console
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
)

In [19]:
# Generate the embeddings and store them in Pinecone.
#
# Please ensure that the index exists in Pinecone and the dimensions match those of OpenAI embeddings (1536).
#
# The Document objects are passed as-is (from_documents) so that each chunk's
# metadata is stored alongside its vector, rather than only the raw page_content.
# NOTE(review): re-running this cell presumably writes the chunks to the index
# again — confirm whether duplicate vectors are a concern.
docsearch = Pinecone.from_documents(texts, embeddings, index_name=index_name)

In [20]:
# Query the docs to get your answer back
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [21]:
# temperature=0 asks for the least random sampling, which keeps answers as
# repeatable as possible; it does not by itself guarantee factual answers.
llm = OpenAI(openai_api_key=OPENAI_API_KEY, temperature=0)

# Question-answering chain of type "stuff", driven by the LLM above
chain = load_qa_chain(llm, chain_type="stuff")

In [23]:
# The question to answer from the document
query = "Who are the trustees of the fund?"

# Retrieve the stored chunks most similar to the question; they become the
# context handed to the LLM
docs = docsearch.similarity_search(query=query)

In [None]:
# Display the chunks returned by the similarity search for inspection
docs

In [25]:
# Answer the question, using the chunks found by the semantic search as context.
# The returned answer is the cell's displayed output.
chain.run(question=query, input_documents=docs)

' Tara Bunch, Emerson U. Fullwood, and Mortimer J. Buckley.'

In [27]:
# Compare the result with gpt-3.5-turbo, the default above is davinci.
# gpt-3.5-turbo is a chat model, so it is created through the ChatOpenAI
# wrapper rather than the completion-style OpenAI class.
llm = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    model_name='gpt-3.5-turbo',
    temperature=0.0
)

# Rebuild the question-answering chain around the chat model
chain = load_qa_chain(llm, chain_type="stuff")

# Same retrieved context and question as before, so only the model differs
chain.run(input_documents=docs, question=query)

'The trustees of the fund are Tara Bunch, Emerson U. Fullwood, and Mortimer J. Buckley.'