## Project: Question-Answering on Private Documents

In [1]:
import os
from dotenv import load_dotenv, find_dotenv
# Locate a .env file and load its entries into the process environment.
# override=True lets values from the file replace variables that are already set.
# Later cells read PINECONE_API_KEY and PINECONE_ENV from os.environ.
load_dotenv(find_dotenv(), override=True)

True

In [2]:
def load_documents(file):
    """Load a PDF or DOCX file with the matching LangChain document loader.

    Args:
        file: Path to the document. The loader is chosen from the file
            extension, compared case-insensitively ('.pdf' or '.docx').

    Returns:
        Whatever the selected loader's ``load()`` returns, or ``None`` when
        the extension is not supported (a message is printed in that case).
    """
    import os

    _, extension = os.path.splitext(file)
    # Normalise so that 'REPORT.PDF' is treated the same as 'report.pdf'.
    extension = extension.lower()

    if extension == '.pdf':
        from langchain.document_loaders import PyPDFLoader
        loader_cls = PyPDFLoader
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader
        loader_cls = Docx2txtLoader
    else:
        print('Document format is not supported')
        # The original returned the undefined name `none`, which raised
        # NameError instead of signalling "unsupported" to the caller.
        return None

    print(f'Loading {file}')
    return loader_cls(file).load()

def load_from_wikipedia(query, lang='en', load_max_docs=2):
    """Fetch documents for ``query`` through LangChain's WikipediaLoader.

    Args:
        query: Search string handed to the loader.
        lang: Language code forwarded to the loader (defaults to 'en').
        load_max_docs: Upper bound on documents, forwarded to the loader.

    Returns:
        The result of the loader's ``load()`` call.
    """
    from langchain.document_loaders import WikipediaLoader

    wiki_loader = WikipediaLoader(
        query,
        lang=lang,
        load_max_docs=load_max_docs,
    )
    return wiki_loader.load()

In [3]:
def chunk_data(data, chunk_size=256):
    """Split documents into chunks with a RecursiveCharacterTextSplitter.

    Args:
        data: Documents to split; passed straight to ``split_documents``.
        chunk_size: Chunk size given to the splitter (defaults to 256).

    Returns:
        The chunks produced by the splitter. No overlap is requested
        between chunks (``chunk_overlap=0``).
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=0,
    )
    return splitter.split_documents(data)
    

In [4]:
def print_embedding_cost(texts, price_per_1k_tokens=0.0001):
    """Print the token count and estimated embedding cost for ``texts``.

    Tokens are counted with the tiktoken encoding for
    'text-embedding-ada-002'.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.
        price_per_1k_tokens: Price in USD per 1,000 tokens. Pass the rate
            that applies to your account to get an accurate estimate.

    Returns:
        None. The two summary lines are written to stdout.
    """
    import tiktoken

    # NOTE(review): the rate used to be hard-coded as 0.004 USD per 1K tokens,
    # which is well above the rates published for text-embedding-ada-002.
    # The default here is meant to mirror the currently listed rate; confirm
    # it against the provider's pricing page, since prices change over time.
    enc = tiktoken.encoding_for_model('text-embedding-ada-002')
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    cost_usd = total_tokens / 1000 * price_per_1k_tokens
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {cost_usd:.6f}')

### Embedding and Uploading to a Vector Database (Pinecone)

In [5]:
def insert_of_fetch_embeddings(index_name, chunks=None):
    """Return a Pinecone vector store for ``index_name``, creating it if needed.

    If the index already exists it is wrapped and returned as-is. Otherwise
    the index is created and the given chunks are embedded and uploaded.

    Args:
        index_name: Name of the Pinecone index to load or create.
        chunks: Documents to embed when the index has to be created.
            Optional for backward compatibility: when omitted, the
            module-level variable ``chunks`` is used, which is what this
            function previously read implicitly.

    Returns:
        The LangChain ``Pinecone`` vector store bound to ``index_name``.

    Raises:
        ValueError: If the index must be created but no chunks are available.
    """
    import os
    import pinecone
    from langchain.vectorstores import Pinecone
    from langchain.embeddings.openai import OpenAIEmbeddings

    embeddings = OpenAIEmbeddings()

    pinecone.init(
        api_key=os.environ.get('PINECONE_API_KEY'),
        environment=os.environ.get('PINECONE_ENV'),
    )

    if index_name in pinecone.list_indexes():
        print(f'Index {index_name} already exists. Loading embeddings ...', end='')
        # The index name is the first positional parameter. The original call
        # passed the embeddings positionally *and* index_name as a keyword,
        # which binds index_name twice and raises TypeError.
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print('OK')
        return vector_store

    if chunks is None:
        # Backward-compatible fallback to the notebook-level variable.
        chunks = globals().get('chunks')
    if chunks is None:
        # Fail before creating an empty index that nothing would be uploaded to.
        raise ValueError(
            f'Index {index_name} does not exist and no chunks were provided to populate it'
        )

    print(f'Creating index {index_name} and embeddings', end='')
    # NOTE(review): 1536 presumably matches the vector size of the embedding
    # model used by OpenAIEmbeddings() — confirm if the model is changed.
    pinecone.create_index(index_name, dimension=1536, metric='cosine')
    vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)
    print('Ok')
    return vector_store

In [6]:
def delete_pinecone_index(index_name='all'):
    """Delete one Pinecone index, or every index when ``index_name`` is 'all'.

    Args:
        index_name: Name of the index to delete. The default value 'all'
            deletes every index returned by ``pinecone.list_indexes()``.

    Returns:
        None. Progress messages are written to stdout.
    """
    import os
    import pinecone

    pinecone.init(
        api_key=os.environ.get('PINECONE_API_KEY'),
        environment=os.environ.get('PINECONE_ENV'),
    )

    if index_name == 'all':
        indexes = pinecone.list_indexes()
        print('Deleting all indexes')
        for index in indexes:
            pinecone.delete_index(index)
        print('Ok')
    else:
        # The original interpolated the undefined name `inindex_name`, so
        # deleting a single index always failed with NameError.
        print(f'Deleting index {index_name} ...', end='')
        pinecone.delete_index(index_name)
        print('OK')
        
    

In [7]:
def ask_and_get_answer(vector_store, q):
    """Answer a single question using documents retrieved from ``vector_store``.

    Builds a RetrievalQA chain ('stuff' chain type) around a
    'gpt-3.5-turbo' chat model and a similarity retriever that requests
    three results (k=3), then runs the question through it.

    Args:
        vector_store: Object exposing ``as_retriever``.
        q: The question text.

    Returns:
        The value returned by the chain's ``run`` call.
    """
    from langchain.chains import RetrievalQA
    from langchain.chat_models import ChatOpenAI

    qa_chain = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(model='gpt-3.5-turbo', temperature=1),
        chain_type='stuff',
        retriever=vector_store.as_retriever(
            search_type='similarity',
            search_kwargs={'k': 3},
        ),
    )
    return qa_chain.run(q)

def ask_with_memory(vector_store, question, chat_history=None):
    """Ask a question through a ConversationalRetrievalChain, tracking history.

    Args:
        vector_store: Object exposing ``as_retriever``.
        question: The question text.
        chat_history: List of ``(question, answer)`` tuples from earlier
            turns. When omitted, a new empty history is started for this
            call. A list passed in is extended in place with the new turn.

    Returns:
        A ``(result, chat_history)`` tuple: the chain's result mapping
        (the answer is under ``result['answer']``) and the updated history.
    """
    from langchain.chains import ConversationalRetrievalChain
    from langchain.chat_models import ChatOpenAI

    # A mutable default (`chat_history=[]`) is created once and shared by every
    # call that omits the argument, so history leaked between unrelated calls.
    if chat_history is None:
        chat_history = []

    llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)

    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 3})

    crc = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever)

    result = crc({'question': question, 'chat_history': chat_history})
    chat_history.append((question, result['answer']))

    return result, chat_history

### Running code

In [8]:
# Load the source PDF from the notebook's working directory.
# load_documents prints the path it is loading.
data = load_documents('./dvlottery.pdf')
# print(data[1].page_content)
# print(data[1].metadata)


Loading ./dvlottery.pdf


In [9]:
# data = load_from_wikipedia('GPT-4')
# print(data[0].page_content)

In [10]:
# Split the loaded documents using chunk_data's default chunk_size of 256.
# Note: `chunks` is also read later as a notebook-level variable when the
# Pinecone index has to be created.
chunks = chunk_data(data)

In [11]:
# Sanity check: how many chunks the splitter produced.
len(chunks)

280

In [12]:
# WARNING: destructive. Called without arguments, index_name defaults to 'all',
# so every index listed for the configured Pinecone account is deleted.
delete_pinecone_index()

  from tqdm.autonotebook import tqdm


Deleting all indexes
Ok


In [13]:
# Load the 'askdocument' index if it already exists; otherwise the function
# creates it and embeds the notebook-level `chunks` produced earlier.
index_name = 'askdocument'
vector_store = insert_of_fetch_embeddings(index_name)

Creating index askdocument and embeddingsOk


In [14]:
# One-off question: no chat history is involved in this call.
q = 'What is the whole document about?'
answer = ask_and_get_answer(vector_store, q)
print(answer)

The document is about the instructions for the DV-2025 Diversity Immigrant Visa Program. It provides an overview of the program and explains the process and requirements for participation.


In [15]:
# Asking with memory: start a fresh conversation and pass its history
# explicitly, rather than relying on the function's default argument.
chat_history = []
question = 'What is the minimum age'
result, chat_history = ask_with_memory(vector_store, question, chat_history)
print(result['answer'])
print(chat_history)


There is no minimum age to apply for the E-DV Program.
[('What is the minimum age', 'There is no minimum age to apply for the E-DV Program.')]


In [16]:
# Follow-up question: reuse the history from the previous turn so the model
# has the context it needs. Resetting chat_history here would drop that context.
question = 'Minimum age  plus 18'
result, chat_history = ask_with_memory(vector_store, question, chat_history)
print(result['answer'])
print(chat_history)

The age limit for the E-DV Program is not specified in the given context. It only mentions that individuals under the age of 18 will be disqualified.
[('What is the minimum age', 'There is no minimum age to apply for the E-DV Program.'), ('Minimum age  plus 18', 'The age limit for the E-DV Program is not specified in the given context. It only mentions that individuals under the age of 18 will be disqualified.')]
