# Question Answering on Private Documents

In [1]:
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

True

In [2]:
pip install -qU langchain_community pypdf docx2txt wikipedia langchain-text-splitters tiktoken langchain-openai

Note: you may need to restart the kernel to use updated packages.


In [6]:
def load_document(file):
    """Load a PDF or Word document with the matching LangChain loader.

    Args:
        file: Path to a ``.pdf`` or ``.docx`` file. The extension is matched
            case-insensitively, so ``REPORT.PDF`` is treated like ``report.pdf``.

    Returns:
        The list returned by the loader's ``load()`` (for PDFs, one document
        per page), or ``None`` when the extension is not supported.
    """
    import os

    # Only the extension decides which loader is used; normalise its case so
    # upper- or mixed-case extensions are not rejected as unsupported.
    extension = os.path.splitext(file)[1].lower()

    if extension == '.pdf':
        # for pdf
        from langchain_community.document_loaders import PyPDFLoader
        loader_cls = PyPDFLoader
    elif extension == '.docx':
        # for word doc
        from langchain_community.document_loaders import Docx2txtLoader
        loader_cls = Docx2txtLoader
    else:
        print('Document format is not supported')
        return None

    print(f'Loading {file}')
    return loader_cls(file).load()

def load_from_wikipedia(query, lang='en', load_max_docs=2):
    """Fetch Wikipedia articles matching ``query`` as LangChain documents.

    Args:
        query: Search text used to find articles on Wikipedia.
        lang: Language passed to the loader (default ``'en'``).
        load_max_docs: Upper limit on the number of downloaded documents.

    Returns:
        The list returned by the loader's ``load()``.
    """
    from langchain_community.document_loaders import WikipediaLoader

    wiki_loader = WikipediaLoader(
        query=query,
        lang=lang,
        load_max_docs=load_max_docs,
    )
    return wiki_loader.load()

- chunk_size: The maximum size of a chunk, where size is determined by the length_function.
- chunk_overlap: Target overlap between chunks. Overlapping chunks help to mitigate loss of information when context is divided between chunks.

In [11]:
def chunk_data(data, chunk_size=256, chunk_overlap=0):
    """Split documents into chunks with a recursive character splitter.

    Args:
        data: Documents to split; passed straight to ``split_documents``.
        chunk_size: Maximum chunk size handed to the splitter (default 256).
        chunk_overlap: Overlap between chunks handed to the splitter.
            Defaults to 0, the value that used to be hard-coded.

    Returns:
        The list of chunks produced by ``split_documents``.
    """
    from langchain_text_splitters import RecursiveCharacterTextSplitter

    # No separators are passed, so the splitter's defaults apply
    # (noted in the original as ["\n\n", "\n", " ", ""]).
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(data)

# Calculating the Cost of OpenAI Embeddings

In [15]:
def print_embedding_cost(texts):
    """Print the token count of ``texts`` and the resulting embedding cost.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.
    """
    import tiktoken

    encoder = tiktoken.encoding_for_model('text-embedding-3-small')

    # Accumulate the encoded length of every document's text.
    total_tokens = 0
    for doc in texts:
        total_tokens += len(encoder.encode(doc.page_content))

    print(f'Total Tokens: {total_tokens}')
    # The rate is applied per 1,000 tokens.
    print(f'Embedding Cost in USD: {total_tokens / 1000 * 0.00002:.6f}')

# Embedding and Uploading to Vector Database (Pinecone)

In [None]:
def insert_or_fetch_embeddings(index_name, chunks):
    """Return a Pinecone-backed LangChain vector store for ``index_name``.

    If the index already exists it is opened as-is and ``chunks`` is not used.
    Otherwise a serverless index is created, and ``chunks`` are embedded and
    inserted into it.

    Args:
        index_name: Name of the Pinecone index to load or create.
        chunks: Documents to embed when the index has to be created.

    Returns:
        The vector store object bound to the index.
    """
    # Both the LangChain vector store and the Pinecone client are exported as
    # `Pinecone`. Alias the vector store so the client import below cannot
    # shadow it (otherwise from_existing_index/from_documents are looked up on
    # the client class and fail).
    from langchain_community.vectorstores import Pinecone as PineconeVectorStore
    from langchain_openai import OpenAIEmbeddings
    from pinecone import Pinecone, ServerlessSpec

    pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

    # A single value drives both the embedding size and the index dimension so
    # the two cannot drift apart.
    dimensions = 1536  # 512 works as well
    embeddings = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=dimensions)

    # loading from existing index
    if pc.has_index(index_name):
        print(f'Index {index_name} already exists. Loading embeddings ... ', end='')
        vector_store = PineconeVectorStore.from_existing_index(index_name, embeddings)
        print('Ok')
    else:
        # creating the index and embedding the chunks into the index
        print(f'Creating index {index_name} and embeddings ...', end='')

        # creating a new index
        pc.create_index(
            name=index_name,
            dimension=dimensions,
            metric='cosine',
            spec=ServerlessSpec(
                cloud="aws",
                region="us-east-1"
            )
        )

        # Embed the chunks with the `OpenAIEmbeddings` instance, insert them
        # into the new index and get back the vector store object.
        vector_store = PineconeVectorStore.from_documents(chunks, embeddings, index_name=index_name)
        print('Ok')

    return vector_store

In [None]:
def delete_pinecone_index(index_name='all'):
    """Delete one Pinecone index, or every index when ``index_name`` is ``'all'``.

    Args:
        index_name: Name of the index to delete. The default ``'all'`` deletes
            every index returned by the client's ``list_indexes().names()``.
    """
    from pinecone import Pinecone
    client = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

    # Single-index case first; the delete-everything path follows.
    if index_name != 'all':
        print(f'Deleting index {index_name} ...', end='')
        client.delete_index(index_name)
        print('Ok')
        return

    all_names = client.list_indexes().names()
    print('Deleting all indexes ... ')
    for name in all_names:
        client.delete_index(name)
    print('Ok')

# Asking and Getting Answers

In [None]:
def ask_and_get_answer(vector_store, q, k=3):
    """Answer question ``q`` from the chunks stored in ``vector_store``.

    The ``k`` most similar chunks are retrieved from the vector store and fed
    to the chat model through a "stuff" RetrievalQA chain.

    Args:
        vector_store: Source of knowledge; must provide ``as_retriever``.
        q: The question to answer.
        k: Number of chunks to retrieve (default 3).

    Returns:
        Whatever the chain's ``invoke`` returns for ``q``.
    """
    from langchain.chains import RetrievalQA
    from langchain_openai import ChatOpenAI

    chat_model = ChatOpenAI(model='gpt-4o-mini', temperature=1)

    # Similarity search limited to the top-k chunks.
    search_options = {'k': k}
    chunk_retriever = vector_store.as_retriever(
        search_type='similarity',
        search_kwargs=search_options,
    )

    qa_chain = RetrievalQA.from_chain_type(
        llm=chat_model,
        chain_type="stuff",
        retriever=chunk_retriever,
    )
    return qa_chain.invoke(q)

## Running Code

#### Loading PDF

In [12]:
# Load the PDF; `data` holds the list of documents returned by load_document.
data = load_document("files/us_constitution.pdf")
# Optional inspection of the document at index 1 and of the document count:
# print(data[1].page_content[:100])
# print(data[1].metadata)
# print(f'You have {len(data)} pages in your data')
# Report the character count of the document at index 20.
print(f'There are {len(data[20].page_content)} characters')

Loading files/us_constitution.pdf
There are 1173 characters


#### Loading a Word document

In [4]:
# Load the Word document and preview the first 100 characters of its first document.
# NOTE: this rebinds `data`, replacing whatever an earlier loading cell stored in it.
data = load_document("files/the_great_gatsby.docx")
print(data[0].page_content[:100])

Loading files/the_great_gatsby.docx
The Project Gutenberg eBook of The Great Gatsby, by F. Scott Fitzgerald



This eBook is for the use


#### Loading Wikipedia info

In [8]:
# Fetch Wikipedia documents for the query (rebinds `data`) and display the first result's metadata.
data = load_from_wikipedia(query="HUNTER X HUNTER")
data[0].metadata

{'title': 'Hunter × Hunter',
 'summary': 'Hunter × Hunter (pronounced "hunter hunter") is a Japanese manga series written and illustrated by Yoshihiro Togashi. It has been serialized in Shueisha\'s shōnen manga magazine Weekly Shōnen Jump since March 1998, although the manga has frequently gone on extended hiatuses since 2006. Its chapters have been collected in 38 tankōbon volumes as of September 2024. The story focuses on a young boy named Gon Freecss who discovers that his father, who left him at a young age, is actually a world-renowned Hunter, a licensed professional who specializes in fantastical pursuits such as locating rare or unidentified animal species, treasure hunting, surveying unexplored enclaves, or hunting down lawless individuals. Gon departs on a journey to become a Hunter and eventually find his father. Along the way, Gon meets various other Hunters and encounters the paranormal.\nHunter × Hunter was adapted into a 62-episode anime television series by Nippon Animat

In [9]:
# Display the first 400 characters of the first document currently held in `data`.
data[0].page_content[:400]

'Hunter × Hunter (pronounced "hunter hunter") is a Japanese manga series written and illustrated by Yoshihiro Togashi. It has been serialized in Shueisha\'s shōnen manga magazine Weekly Shōnen Jump since March 1998, although the manga has frequently gone on extended hiatuses since 2006. Its chapters have been collected in 38 tankōbon volumes as of September 2024. The story focuses on a young boy nam'

In [None]:
# Chunk the document (embed with as little noise as possible while keeping chunks semantically relevant)

In [14]:
# Split the documents in `data` into chunks, then print the chunk count and the second chunk.
# NOTE(review): `data` is rebound by every loading cell; the recorded output is Constitution text,
# so this cell was presumably run right after the PDF cell -- confirm the intended input before Run All.
chunks = chunk_data(data)
print(len(chunks))
print(chunks[1].page_content)

224
Blessings of Liberty to ourselves and our Posterity , do ordain and 
 establish this Constitution for the United States of America. 
The Constitutional Con v ention 
 Article I 
 Section 1: Congress


In [18]:
# Print the total token count and the estimated embedding cost for `chunks`.
print_embedding_cost(chunks)

Total Tokens: 9842
Embedding Cost in USD: 0.000197
