# Project: Question-Answering on Private Documents

In [1]:
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

True

In [2]:
# pip install pypdf -q

In [3]:
# pip install docx2txt -q

In [4]:
pip install wikipedia -q

Note: you may need to restart the kernel to use updated packages.


### Loading Documents

In [5]:
def load_document(file):
    """Load a PDF or DOCX file through the matching LangChain loader.

    The loader is chosen from the file extension, which is compared
    case-insensitively so that names such as ``REPORT.PDF`` are accepted too.

    Args:
        file: Path of the document to load.

    Returns:
        The result of the selected loader's ``load()`` call, or ``None``
        (after printing a message) when the extension is neither ``.pdf``
        nor ``.docx``.
    """
    import os

    # Only the extension decides which loader is used; normalise its case once.
    extension = os.path.splitext(file)[1].lower()

    if extension == '.pdf':
        from langchain.document_loaders import PyPDFLoader
        print(f'Loading {file}')
        loader = PyPDFLoader(file)
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader
        print(f'Loading {file}')
        loader = Docx2txtLoader(file)
    else:
        print('Document format is not supported!')
        return None

    return loader.load()


# wikipedia
def load_from_wikipedia(query, lang='en', load_max_docs=2):
    """Load Wikipedia content for ``query`` through LangChain's WikipediaLoader.

    Args:
        query: Search text handed to the loader.
        lang: Language code handed to the loader (default ``'en'``).
        load_max_docs: Value forwarded as the loader's ``load_max_docs``
            argument (default 2).

    Returns:
        The result of the loader's ``load()`` call.
    """
    from langchain.document_loaders import WikipediaLoader

    loader_options = {'query': query, 'lang': lang, 'load_max_docs': load_max_docs}
    return WikipediaLoader(**loader_options).load()
  

### Chunking Data

In [6]:
def chunk_data(data, chunk_size=256, chunk_overlap=0):
    """Split documents into chunks with a RecursiveCharacterTextSplitter.

    Args:
        data: Documents to split, passed to ``split_documents``.
        chunk_size: Value forwarded as the splitter's ``chunk_size``
            (default 256).
        chunk_overlap: Value forwarded as the splitter's ``chunk_overlap``.
            Defaults to 0, which reproduces the previous hard-coded behaviour.

    Returns:
        The result of the splitter's ``split_documents(data)`` call.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(data)
    

### Calculating Cost

In [7]:
def print_embedding_cost(texts):
    """Print the total token count of ``texts`` and an embedding cost estimate.

    Tokens are counted with the tiktoken encoding selected for
    'text-embedding-ada-002'. The estimate charges 0.0004 USD per 1000 tokens.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.
    """
    import tiktoken

    encoder = tiktoken.encoding_for_model('text-embedding-ada-002')

    total_tokens = 0
    for doc in texts:
        total_tokens += len(encoder.encode(doc.page_content))

    cost_usd = total_tokens / 1000 * 0.0004
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {cost_usd:.6f}')

### Embedding and Uploading to a Vector Database (Pinecone)

In [8]:
def insert_or_fetch_embeddings(index_name, documents=None):
    """Return a Pinecone vector store for ``index_name``, creating it if needed.

    If the index already exists it is opened as-is and nothing new is
    embedded. Otherwise the index is created and filled with the embeddings
    of the supplied documents.

    Args:
        index_name: Name of the Pinecone index to open or create.
        documents: Documents to embed when the index has to be created. When
            omitted, the notebook-level ``chunks`` variable is used, so
            existing ``insert_or_fetch_embeddings(index_name)`` calls keep
            working.

    Returns:
        The vector store returned by ``Pinecone.from_existing_index`` or
        ``Pinecone.from_documents``.

    Raises:
        NameError: If the index has to be created, ``documents`` was not
            given and no global ``chunks`` exists. This is raised before the
            index is created.
    """
    import os  # local import so the function does not depend on an earlier cell
    import pinecone
    from langchain.vectorstores import Pinecone
    from langchain.embeddings.openai import OpenAIEmbeddings

    embeddings = OpenAIEmbeddings()

    pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment=os.environ.get('PINECONE_ENV'))

    if index_name in pinecone.list_indexes():
        print(f'Index {index_name} already exists. Loading embeddings ... ', end='')
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print('Ok')
    else:
        # Resolve the documents BEFORE creating the index: failing afterwards
        # would leave an empty index behind that later runs load silently.
        docs = documents if documents is not None else chunks
        print(f'Creating index {index_name} and embeddings ...', end='')
        # NOTE(review): dimension=1536 presumably matches the vector size of
        # the default OpenAIEmbeddings model -- confirm if the model changes.
        pinecone.create_index(index_name, dimension=1536, metric='cosine')
        vector_store = Pinecone.from_documents(docs, embeddings, index_name=index_name)
        print('Ok')

    return vector_store
    

In [9]:
def delete_pinecone_index(index_name='all'):
    """Delete one Pinecone index, or every listed index when ``index_name`` is 'all'.

    Args:
        index_name: Name of the index to delete. The default ``'all'``
            deletes each index returned by ``pinecone.list_indexes()``.
    """
    import pinecone

    api_key = os.environ.get('PINECONE_API_KEY')
    environment = os.environ.get('PINECONE_ENV')
    pinecone.init(api_key=api_key, environment=environment)

    # Single-index case first; everything below handles 'all'.
    if index_name != 'all':
        print(f'Deleting index {index_name} ...', end='')
        pinecone.delete_index(index_name)
        print('Ok')
        return

    existing = pinecone.list_indexes()
    print('Deleting all indexes ... ')
    for current in existing:
        pinecone.delete_index(current)
    print('Ok')
    

### Asking and Getting Answers

In [10]:
def ask_and_get_answer(vector_store, q):
    """Answer ``q`` with a RetrievalQA chain built on ``vector_store``.

    The chain uses ChatOpenAI ('gpt-3.5-turbo', temperature 1), the "stuff"
    chain type and a similarity retriever configured with ``k=3``.

    Args:
        vector_store: Object providing ``as_retriever``.
        q: The question passed to the chain.

    Returns:
        The value returned by the chain's ``run(q)`` call.
    """
    from langchain.chains import RetrievalQA
    from langchain.chat_models import ChatOpenAI

    qa_chain = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(model='gpt-3.5-turbo', temperature=1),
        chain_type="stuff",
        retriever=vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 3}),
    )
    return qa_chain.run(q)
    
    
def ask_with_memory(vector_store, question, chat_history=None):
    """Ask ``question`` through a ConversationalRetrievalChain, keeping history.

    Args:
        vector_store: Object providing ``as_retriever``.
        question: The question to ask.
        chat_history: List of ``(question, answer)`` tuples from earlier
            turns. A list passed in is extended in place. When omitted, a
            new empty history is started for this call.

    Returns:
        A ``(result, chat_history)`` tuple: the chain's result (its
        ``'answer'`` entry is what gets recorded) and the history including
        the new turn.
    """
    from langchain.chains import ConversationalRetrievalChain
    from langchain.chat_models import ChatOpenAI

    # A `[]` default would be created once and shared by every call that omits
    # the argument, leaking turns between unrelated conversations.
    if chat_history is None:
        chat_history = []

    llm = ChatOpenAI(temperature=1)
    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 3})

    crc = ConversationalRetrievalChain.from_llm(llm, retriever)
    result = crc({'question': question, 'chat_history': chat_history})
    chat_history.append((question, result['answer']))

    return result, chat_history
    

### Running Code

In [11]:
# Load the sample PDF; `data` is the input of the chunking cell further down.
data = load_document('files/us_constitution.pdf')
# print(data[1].page_content)
# print(data[10].metadata)

# Sanity check of what was loaded. Index 20 is hard-coded, so the second
# print needs at least 21 entries in `data`.
print(f'You have {len(data)} pages in your data')
print(f'There are {len(data[20].page_content)} characters in the page')

Loading files/us_constitution.pdf
You have 41 pages in your data
There are 1137 characters in the page


In [12]:
# data = load_document('files/the_great_gatsby.docx')
# print(data[0].page_content)

In [13]:
# data = load_from_wikipedia('GPT-4', 'de')
# print(data[0].page_content)

In [14]:
# Split the loaded documents using chunk_data's defaults. `chunks` is read by
# the cost estimate below and, as a global, by insert_or_fetch_embeddings.
chunks = chunk_data(data)
print(len(chunks))
# print(chunks[10].page_content)

190


In [15]:
# Print the token count and the estimated embedding cost for the chunks.
print_embedding_cost(chunks)

Total Tokens: 16711
Embedding Cost in USD: 0.006684


In [None]:
# WARNING: without an argument index_name defaults to 'all', so this deletes
# every index returned by pinecone.list_indexes().
delete_pinecone_index()

Deleting all indexes ... 


In [23]:
# Open the index if it already exists; otherwise it is created and filled
# from the current `chunks`.
index_name = 'askadocument'
vector_store = insert_or_fetch_embeddings(index_name)

Index askadocument already exists. Loading embeddings ... Ok


In [24]:
# One-off question against the vector store created above.
q = 'What is the whole document about?'
answer = ask_and_get_answer(vector_store, q)
print(answer)

I'm sorry, but I don't have access to the document you are referring to. Without more information, I am unable to determine what the whole document is about.


In [None]:
# Interactive question loop: keeps asking until the user types Quit or Exit.
import time

i = 1
print('Write Quit or Exit to quit.')
while True:
    # Strip surrounding whitespace so inputs such as ' quit ' are recognised.
    q = input(f'Question #{i}: ').strip()
    if q.lower() in ['quit', 'exit']:
        print('Quitting ... bye bye!')
        time.sleep(2)
        break
    if not q:
        # Nothing was typed: ask again instead of sending an empty question
        # to the model. The question number is not advanced.
        continue
    i = i + 1

    answer = ask_and_get_answer(vector_store, q)
    print(f'\nAnswer: {answer}')
    print(f'\n {"-" * 50} \n')

    

In [None]:
# WARNING: without an argument index_name defaults to 'all', so this deletes
# every index returned by pinecone.list_indexes().
delete_pinecone_index()

In [20]:
# Second example: load 'ChatGPT' from Wikipedia with lang='ro', chunk it and
# put it into its own index. This overwrites `data`, `chunks`, `index_name`
# and `vector_store` from the PDF example.
# NOTE(review): the recorded run failed with a pod-quota error on index
# creation -- presumably another index still existed; confirm the delete cell
# was run first.
data = load_from_wikipedia('ChatGPT', 'ro')
chunks = chunk_data(data)
index_name = 'chatgpt'
vector_store = insert_or_fetch_embeddings(index_name)

Creating index chatgpt and embeddings ...

ApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=UTF-8', 'date': 'Sat, 22 Jul 2023 22:00:31 GMT', 'x-envoy-upstream-service-time': '1383', 'content-length': '131', 'server': 'envoy'})
HTTP response body: The index exceeds the project quota of 1 pods by 1 pods. Upgrade your account or change the project settings to increase the quota.


In [None]:
# Ask a question (in Romanian) against the current `vector_store`.
# q = "Ce este ChatGPT?"
q = 'Ce este InstructGPT?'
answer = ask_and_get_answer(vector_store, q)
print(answer)

In [None]:
# Asking with memory: start with an empty history; ask_with_memory appends
# the (question, answer) pair and returns the updated list.
# NOTE(review): `vector_store` is whatever was assigned last. If the ChatGPT
# cell above ran, this question is not asked against the Constitution index --
# confirm the intended store.
chat_history = []
question = 'How many amendments are in the U.S. Constitution?'
result, chat_history = ask_with_memory(vector_store, question, chat_history)
print(result['answer'])
print(chat_history)

In [None]:
# Follow-up turn: "that number" refers to the previous answer, so the
# chat_history from the cell above must be passed in.
question = 'Multiply that number by 2'
result, chat_history = ask_with_memory(vector_store, question, chat_history)
print(result['answer'])
print(chat_history)
