<a href="https://colab.research.google.com/github/tractorjuice/Building_BoK/blob/main/Building_Wardley_Mapping_Body_of_Knowledge_Part_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Building a Body of Knowledge using Pinecone, Langchain and OpenAI
## Part 5, query the PDF book using ChatGPT

This example shows how to create and query an internal knowledge base using ChatGPT.

This does not require a GPU runtime.

## Set Up


In [None]:
# Attach Google Drive to this Colab runtime; everything below reads from
# and writes to paths underneath this mount point.
from google.colab import drive

drive.mount("/content/gdrive")

In [None]:
import os

# Folder layout on the mounted Google Drive.
# NOTE: the Colab mount exposes the user's drive as "MyDrive" (capital D) and
# the path is case-sensitive, so "Mydrive" would not point at the real drive.
DOCS_FOLDER = "/content/gdrive/MyDrive/WardleyKB"  # Root folder of the knowledge base
BOOK_FOLDER = os.path.join(DOCS_FOLDER, "book")  # Sub-directory for the book
PAGES_FOLDER = os.path.join(BOOK_FOLDER, "pages")  # Sub-directory scanned for documents below

# Create the whole tree in one call. makedirs builds the intermediate
# DOCS_FOLDER and BOOK_FOLDER levels too, and exist_ok=True makes the cell
# safe to re-run when the folders are already there.
os.makedirs(PAGES_FOLDER, exist_ok=True)


## Build the datastore

### Load documents and split them into chunks for conversion to embeddings

In [None]:
!pip install -q pypdf
!pip install -q langchain
!pip install -q openai

### Scan and find all documents

In [None]:
# Recursively collect every PDF below PAGES_FOLDER.
# - Non-PDF files (e.g. hidden system files) are skipped because each path is
#   later handed to PyPDFLoader.
# - os.walk yields entries in arbitrary order, so the list is sorted to make
#   runs reproducible.
documents = sorted(
    os.path.join(root, name)
    for root, _dirs, files in os.walk(PAGES_FOLDER)
    for name in files
    if name.lower().endswith(".pdf")
)

print(documents)

### Set up Pinecone Vector Store

In [None]:
!pip install -q pinecone-client
!pip install tiktoken
from langchain.vectorstores import Pinecone
from tqdm.auto import tqdm
import pinecone

# initialize pinecone
pinecone.init(
    api_key="",  # find at app.pinecone.io
    environment=""  # next to api key in console
    )

index_name = ""
name_space = ""

### Set up OPENAI_API_KEY and necessary variables

In [None]:
from getpass import getpass

# Prompt for the key rather than pasting it into the notebook, so the secret
# is never stored in the saved .ipynb. A key that is already present in the
# environment is left untouched instead of being overwritten.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

### Upsert data into vector database

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import PyPDFLoader

# Embedding model applied to every chunk before it is written to Pinecone.
embeddings = OpenAIEmbeddings()

# Load, split and upsert one PDF at a time, so only a single file's chunks are
# held in `pages` at any moment. tqdm (imported in the Pinecone setup cell)
# shows progress across files.
for pdf_path in tqdm(documents, desc="Upserting PDFs"):
    pages = PyPDFLoader(pdf_path).load_and_split()

    # Print a one-line summary instead of dumping every chunk's full text,
    # which floods the cell output; tqdm.write keeps the progress bar intact.
    tqdm.write(f"{pdf_path}: {len(pages)} chunks")

    if not pages:
        # Nothing was extracted from this file, so there is nothing to embed.
        continue

    vector_store = Pinecone.from_documents(pages, embeddings, namespace=name_space, index_name=index_name)

## Query using the vector store with ChatGPT integration
### Set up the chat model and specific prompt

In [None]:
# Reconnect to the existing Pinecone index. The import and the embeddings
# object are repeated here so this query section can be run without
# re-running the upsert cell above.
from langchain.embeddings.openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()
vector_store = Pinecone.from_existing_index(
    index_name,
    embeddings,
    namespace=name_space,
)

In [None]:
from langchain.prompts.chat import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
)

# System instructions: answer from the supplied context only, always list the
# sources under a "SOURCES:" label, and admit when the answer is unknown.
# The {summaries} placeholder is presumably filled by the chain with the
# retrieved text — verify against the chain type in use.
system_template="""Use the following pieces of context to answer the users question.
Take note of the sources and include them in the answer in the format: "SOURCES: source1 source2", use "SOURCES" in capital letters regardless of the number of sources.
If you don't know the answer, just say that "I don't know", don't try to make up an answer.
----------------
{summaries}"""

# Two-message chat prompt: the system instructions followed by the user's
# question passed through verbatim.
system_message = SystemMessagePromptTemplate.from_template(system_template)
human_message = HumanMessagePromptTemplate.from_template("{question}")
messages = [system_message, human_message]

prompt = ChatPromptTemplate.from_messages(messages)

In [None]:
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chat_models import ChatOpenAI

# Chat model: temperature 0, answers capped at 256 tokens.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0, max_tokens=256)  # Modify model_name if you have access to GPT-4

# Hand the custom chat prompt to the underlying "stuff" chain.
chain_type_kwargs = dict(prompt=prompt)

# Retrieval + QA chain over the Pinecone vector store. The retrieved documents
# are returned with the answer so they can be inspected in the cells below.
retriever = vector_store.as_retriever()
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs,
)

#### Use the chain to query

In [None]:
# Ask the knowledge base a question. The chain's single input key is
# "question" (the same key that is read back from the result below).
query = "what is inertia?"
result = chain({"question": query})

In [None]:
# Show the individual fields first, then the raw result mapping for inspection.
for field in ("question", "answer", "sources"):
    print(result[field])
print(result)

In [None]:
# List each retrieved document with its text and the file it came from,
# numbered from 1.
source_documents = result['source_documents']
for position, doc in enumerate(source_documents, start=1):
    print(f"Source {position}:\n")
    print(f"Page Content: {doc.page_content}\n")
    print(f"Source: {doc.metadata['source']}\n")