# Project: Question-answering on Private Documents

In [3]:
import os
from dotenv import load_dotenv, find_dotenv
# Load settings from the .env file located by find_dotenv(); later cells read
# PINECONE_API_KEY and PINECONE_ENV from os.environ. override=True is passed so
# that values from the file take precedence over variables that are already set
# (per python-dotenv's documented behaviour).
load_dotenv(find_dotenv(), override=True)

True

In [4]:
def load_document(path):
    """Load a PDF or DOCX file through the matching LangChain loader.

    The loader is chosen from the file extension, which is compared
    case-insensitively so that ``report.PDF`` is treated like ``report.pdf``.

    Args:
        path: Path of the file to load.

    Returns:
        Whatever the selected loader's ``load()`` returns, or an empty list
        when the extension is neither ``.pdf`` nor ``.docx``.
    """
    # Only the extension matters; normalise its case before comparing.
    extension = os.path.splitext(path)[1].lower()

    if extension == ".pdf":
        # Loaders are imported lazily so each dependency is only required
        # when a file of that type is actually loaded.
        from langchain.document_loaders import PyPDFLoader
        print(f"Loading {path} using PyPDFLoader...")
        loader = PyPDFLoader(path)
    elif extension == ".docx":
        from langchain.document_loaders import Docx2txtLoader
        print(f"Loading {path} using Docx2txtLoader...")
        loader = Docx2txtLoader(path)
    else:
        print("Document format is not supported. Returning empty document")
        return []

    return loader.load()

def load_from_wikipedia(query, lang="en", load_max_docs=2):
    """Load documents from Wikipedia through LangChain's WikipediaLoader.

    Args:
        query: Forwarded to the loader as ``query``.
        lang: Forwarded to the loader as ``lang`` (default ``"en"``).
        load_max_docs: Forwarded to the loader as ``load_max_docs`` (default 2).

    Returns:
        Whatever the loader's ``load()`` returns.
    """
    from langchain.document_loaders import WikipediaLoader

    wiki_loader = WikipediaLoader(
        query=query,
        lang=lang,
        load_max_docs=load_max_docs,
    )
    return wiki_loader.load()

In [5]:
def chunk_data(data, chunk_size=256, chunk_overlap=0):
    """Split loaded documents into chunks with RecursiveCharacterTextSplitter.

    Args:
        data: Documents to split; passed to the splitter's ``split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size`` (default 256).
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``. Defaults
            to 0, which reproduces the previous hard-coded behaviour.

    Returns:
        Whatever the splitter's ``split_documents`` returns.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(data)

In [6]:
def print_embedding_cost(texts):
    """Print the total token count of ``texts`` and an embedding cost estimate.

    Tokens are counted with the tiktoken encoding returned for
    "text-embedding-ada-002". The cost line multiplies the token count by a
    rate of 0.0004 USD per 1000 tokens.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.
    """
    import tiktoken

    encoder = tiktoken.encoding_for_model("text-embedding-ada-002")

    total_tokens = 0
    for doc in texts:
        total_tokens += len(encoder.encode(doc.page_content))

    print(f"Total Tokens: {total_tokens}")
    print(f"Embedding cost in USD: {total_tokens / 1000 * 0.0004:.6f}")

In [7]:
def insert_or_fetch_embeddings(index_name, chunks=None):
    """Return a Pinecone vector store for ``index_name``, creating it if needed.

    If the index already exists it is loaded as-is and ``chunks`` is ignored.
    Otherwise the index is created and populated from ``chunks``.

    Args:
        index_name: Name of the Pinecone index to load or create.
        chunks: Documents to embed when the index has to be created. May be
            omitted only when the index already exists.

    Returns:
        The vector store built by ``Pinecone.from_existing_index`` or
        ``Pinecone.from_documents``.

    Raises:
        ValueError: If the index does not exist and ``chunks`` is None. The
            check runs before ``pinecone.create_index`` so that a failed call
            does not leave an empty index behind.
    """
    import pinecone
    from langchain.vectorstores import Pinecone
    from langchain.embeddings.openai import OpenAIEmbeddings

    embeddings = OpenAIEmbeddings()
    pinecone.init(api_key=os.environ.get("PINECONE_API_KEY"), environment=os.environ.get("PINECONE_ENV"))

    if index_name in pinecone.list_indexes():
        print(f"Index {index_name} exists! Loading embeddings from existing environment... ", end="")
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print("Loading successful")
        return vector_store

    if chunks is None:
        raise ValueError(
            f"Index {index_name!r} does not exist and no chunks were given to create it."
        )

    print(f"Creating index {index_name} and embeddings...", end="")
    # NOTE(review): dimension=1536 presumably matches the output size of the
    # default OpenAIEmbeddings model; confirm if the embedding model changes.
    pinecone.create_index(index_name, dimension=1536, metric="cosine")
    vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)
    print("Created embeddings from chunks")

    return vector_store

def delete_pinecone_index(index_name="all"):
    """Delete one Pinecone index, or every index when ``index_name`` is "all".

    Pinecone is initialised from the PINECONE_API_KEY and PINECONE_ENV
    environment variables before anything is deleted.

    Args:
        index_name: Name of the index to delete. The default value "all"
            deletes every index returned by ``pinecone.list_indexes()``.
    """
    import pinecone

    pinecone.init(
        api_key=os.environ.get("PINECONE_API_KEY"),
        environment=os.environ.get("PINECONE_ENV"),
    )

    # Single-index case first; the bulk case follows as the fall-through path.
    if index_name != "all":
        print(f"Deleting index {index_name} ...", end="")
        pinecone.delete_index(index_name)
        print(f"Deleted index {index_name}.")
        return

    existing = pinecone.list_indexes()
    print("Deleting all indexes ...")
    for name in existing:
        pinecone.delete_index(name)
    print("Deleted all indexes.")

In [8]:
def ask_and_get_answer(vector_store, q):
    """Answer a single question using documents retrieved from ``vector_store``.

    Builds a RetrievalQA chain of type "stuff" over a similarity retriever
    (k=3) with a gpt-3.5-turbo chat model at temperature 1, then runs it on
    the question. No conversation history is involved.

    Args:
        vector_store: Object exposing ``as_retriever``.
        q: The question text.

    Returns:
        Whatever the chain's ``run(q)`` returns.
    """
    from langchain.chains import RetrievalQA
    from langchain.chat_models import ChatOpenAI

    qa_chain = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=1),
        chain_type="stuff",
        retriever=vector_store.as_retriever(
            search_type="similarity",
            search_kwargs={"k": 3},
        ),
    )
    return qa_chain.run(q)

def ask_with_memory(vector_store, q, chat_history=None):
    """Answer a question with a ConversationalRetrievalChain and record the turn.

    Args:
        vector_store: Object exposing ``as_retriever``.
        q: The question text.
        chat_history: List of ``(question, answer)`` tuples from earlier turns.
            When omitted, a fresh list is created for this call. A mutable
            default would be shared by every call and leak history between
            unrelated conversations. A list passed in is appended to in place.

    Returns:
        A tuple ``(answer, chat_history)``: the value stored under the chain
        result's "answer" key, and the history including the new turn.
    """
    from langchain.chains import ConversationalRetrievalChain
    from langchain.chat_models import ChatOpenAI

    if chat_history is None:
        chat_history = []

    llm = ChatOpenAI(temperature=1)

    retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 3})

    crc = ConversationalRetrievalChain.from_llm(llm, retriever)
    result = crc({"question": q, "chat_history": chat_history})

    answer = result["answer"]
    chat_history.append((q, answer))

    return answer, chat_history

## Loading a local PDF

In [9]:
# Relative path, resolved against the notebook's working directory. The ".pdf"
# extension makes load_document take its PyPDFLoader branch.
filepath = "files/kybalion.pdf"
data = load_document(filepath)

Loading files/kybalion.pdf using PyPDFLoader...


In [10]:
# Number of entries in the list returned by load_document.
print(len(data))

58


In [11]:
# Shows the (root, extension) pair that load_document derives from the path;
# the extension is what selects the loader.
os.path.splitext(filepath)

('files/kybalion', '.pdf')

In [12]:
# Split with chunk_size=512 instead of chunk_data's default of 256, then show
# how many chunks were produced and the text of one sample chunk.
chunks = chunk_data(data, chunk_size=512)
print(len(chunks))
print(chunks[11].page_content)

503
since his time. All the fundamental and basic teachings embedded in the esoteric teachings 
of every race may be traced back to Hermes. Even the most ancient teachings of India undoubtedly have their roots in the original Hermetic Teachings. 
From the land of the Ganges many advanced occultists wandered to the land of Egypt, 
and sat at the feet of the Master. From him they obtained the Master-Key which explained 
and reconciled their divergent views, and thus the Secret Doctrine was firmly established.


In [13]:
# Print the token count and cost estimate for the chunks; this only tokenizes
# locally with tiktoken and does not create any embeddings.
print_embedding_cost(chunks)

Total Tokens: 44847
Embedding cost in USD: 0.017939


In [12]:
# WARNING: called without an argument, index_name defaults to "all", so this
# deletes every index returned by pinecone.list_indexes(), not only the one
# used by this notebook. Pass an index name to delete a single index.
delete_pinecone_index()

  from tqdm.autonotebook import tqdm


Deleting all indexes ...
Deleted all indexes.


In [14]:
# If the index already exists it is loaded as-is and `chunks` is not used;
# otherwise the index is created and populated from `chunks`.
index_name = "askadocument"
vector_store = insert_or_fetch_embeddings(index_name=index_name, chunks=chunks)

  from tqdm.autonotebook import tqdm


Index askadocument exists! Loading embeddings from existing environment... Loading successful


In [14]:
# Single-shot question: ask_and_get_answer builds a fresh chain per call and
# takes no chat history, so each question is answered independently.
q = "what is the whole document about?"
answer = ask_and_get_answer(vector_store, q)
print(answer)

The document is about exploring the nature of the Universe and the concept of "THE ALL" - a term used to describe the ultimate truth or reality that is beyond complete understanding or definition. It touches upon the idea that nothing exists outside of THE ALL, but also acknowledges that the Universe is constantly changing and made up of many parts. It suggests that while the essential nature of THE ALL is unknowable, there are certain truths about its existence that the human mind is compelled to accept. The document also mentions the reports and insights of the Illumined, who are believed to have knowledge from higher planes.


In [32]:
import time

# Interactive question loop without memory: every question is answered on its
# own through ask_and_get_answer until the user types "quit" or "exit"
# (case-insensitive).
print("Write Quit or Exit to quit.")
i = 1
while True:
    q = input(f"Question #{i}: ")
    i += 1

    if q.lower() not in ("quit", "exit"):
        answer = ask_and_get_answer(vector_store, q)
        print(f"\nAnswer: {answer}")
        print(f"\n {'-'*50} \n")
        continue

    print("Quitting.")
    time.sleep(2)
    break
    

Write Quit or Exit to quit.


Question #1:  What is the most important messages of the text?



Answer: The most important message of the text is that the essential nature of "THE ALL" is unknowable, but there are certain truths connected with its existence that the human mind must accept. The text also suggests that there is a reconciliation between seemingly contradictory ideas and theories, such as the existence of both "matter" and the ineffable nature of reality.

 -------------------------------------------------- 



Question #2:  Are any of these concepts relatable to modern physics?



Answer: Yes, some of these concepts can be related to modern physics. For example, the principle of cohesion, chemical affinity, and gravitation mentioned in the passage are fundamental concepts in physics. Cohesion refers to the force that holds particles together within a substance, such as the attraction between water molecules. Chemical affinity is the force that attracts atoms to form chemical bonds. Gravitation is the force of attraction between any two objects with mass. These concepts are still studied and explained in modern physics, although the passage suggests that science has not fully explained their nature.

 -------------------------------------------------- 



Question #3:  What is said on the text about electric fields and magnetism?



Answer: According to the text, electricity and magnetism are emitted when the appropriate rate of vibration is attained. Additionally, the text mentions that electricity is regarded as the "Something" into which all other forms of energy seem to melt or dissolve. Therefore, the text suggests that electricity and magnetism are closely related in the field of energy or force.

 -------------------------------------------------- 



Question #4:  quit


Quitting.


In [20]:
# ask_with_memory returns (answer_text, chat_history). The first element is
# already the answer string, so it is printed directly; indexing it with
# ["answer"] would raise TypeError.
chat_history = []
question = "What is the main message of the text?"
result, chat_history = ask_with_memory(vector_store, question, chat_history)
print(result)
print(chat_history)

The main message of the text is to provide a statement of truth that reconciles apparent contradictions in occult knowledge, without promoting any specific philosophy or doctrine.
[('What is the main message of the text?', 'The main message of the text is to provide a statement of truth that reconciles apparent contradictions in occult knowledge, without promoting any specific philosophy or doctrine.')]


In [21]:
# Follow-up turn: the chat_history from the previous turn is passed back in so
# the chain can resolve "these". ask_with_memory returns the answer as a plain
# string, so it is printed directly rather than indexed with ["answer"].
question = "can you give examples of these contractory knowledges?"
result, chat_history = ask_with_memory(vector_store, question, chat_history)
print(result)
print(chat_history)

Examples of contradictory knowledge or paradoxes can include:

1. The paradox of "thesis and antithesis": This refers to the idea that two opposing ideas or concepts can actually be similar in nature but differ only in degree. For example, the concepts of light and darkness can be seen as opposites, but they both exist on a continuum of brightness.

2. The paradox of "opposites are the same, differing only in degree": This paradox suggests that seemingly opposite things or ideas share fundamental similarities but vary in their intensity or magnitude. An example could be the concepts of love and hate, which both involve strong emotions but differ in their expression and intensity.

3. The paradox of "extremes meet": This paradox implies that two seemingly opposite or extreme ideas or conditions can converge or overlap. For instance, the concepts of order and chaos may seem contradictory, but some theories propose that systems can exhibit elements of both simultaneously.

4. The paradox 

In [16]:
import time

# Interactive question loop with memory: chat_history starts empty and the
# list returned by ask_with_memory is fed into the next turn, until the user
# types "quit" or "exit" (case-insensitive).
print("Write Quit or Exit to quit.")
i = 1
chat_history = []
while True:
    q = input(f"Question #{i}: ")
    i += 1

    if q.lower() not in ("quit", "exit"):
        answer, chat_history = ask_with_memory(vector_store, q, chat_history)
        print(f"\nAnswer: {answer}")
        print(f"\n {'-'*50} \n")
        continue

    print("Quitting.")
    time.sleep(2)
    break
    

Write Quit or Exit to quit.


Question #1:  What is are some key words on the text?



Answer: Some key words in the text are: quotation marks, credit, students, benefit, study, pages, Path to Mastery, HERMES TRISMEGISTUS, Master of Masters, Elementary Planes, Mineral, Plant, Animal, Human Mentality, Life, black keys, white keys, music, essential nature, Unknowable, truths, existence, human mind, reports, Illumined, higher planes, Fundamental Truth, Substantial Reality, Wise Men, THE ALL.

 -------------------------------------------------- 



Question #2:  Tell something in the text about the last key word.



Answer: In the text, it is mentioned that "THE ALL" is the name given to the fundamental truth and substantial reality that is beyond true naming. It is also stated that while the essential nature of "THE ALL" is unknowable, there are certain truths connected with its existence that the human mind is compelled to accept. Additionally, it is mentioned that "THE ALL" is not an atom, blind force, or lowly living thing, and that some individuals mistakenly identify themselves as identical to "THE ALL" and claim to be God. Furthermore, it is said that "THE ALL" withdraws its attention from the universe at the end of countless cycles of time, and the spirit of each soul is not annihilated but infinitely expanded, merging the created with the creator.

 -------------------------------------------------- 



Question #3:  quit


Quitting.
