In [9]:
import os
from dotenv import load_dotenv
import openai
from llama_index import download_loader, SimpleDirectoryReader, Document
from llama_index.llms import OpenAI
from trulens_eval import Tru

from sentence_window_retrieval import build_sentence_window_index, get_sentence_window_query_engine
from auto_merging_retrieval import build_automerging_index, get_automerging_query_engine
from trulens_utils import trulens_recorder, run_evals

In [10]:
_ = load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
notion_token = os.getenv('NOTION_INTEGRATION_TOKEN')

In [11]:
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)

In [18]:
documents = SimpleDirectoryReader(
    input_files=["./docs/eBook-How-to-Build-a-Career-in-AI.pdf"]
).load_data()

In [30]:
print(type(documents), "\n")
print(len(documents), "\n")
print(type(documents[0]))
print(documents[0])

<class 'list'> 

41 

<class 'llama_index.schema.Document'>
Doc ID: 795df622-ee93-4e2e-bbac-0364777641ae
Text: PAGE 1Founder, DeepLearning.AICollected Insights from Andrew Ng
How to  Build Your Career in AIA Simple Guide


In [32]:
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_chat_engine()

In [34]:
response = query_engine.chat("what does the author say about how to build a successful career in AI")
print(response)

I'm sorry, but I don't have access to specific authors or their opinions unless you provide me with the name of the author or a specific book or article.


In [36]:
query_engine.reset()

In [None]:
query_engine.chat_repl()

In [13]:
from llama_index.schema import MetadataMode

document = Document(
    text="This is a super-customized document",
    metadata={
        "file_name": "super_secret_document.txt",
        "category": "finance",
        "author": "LlamaIndex",
    },
    excluded_llm_metadata_keys=["file_name"],
    metadata_seperator="::",
    metadata_template="{key}=>{value}",
    text_template="Metadata: {metadata_str}\n-----\nContent: {content}",
)


print(
    "The LLM sees this: \n",
    document.get_content(metadata_mode=MetadataMode.LLM),
)
print(
    "The Embedding model sees this: \n",
    document.get_content(metadata_mode=MetadataMode.EMBED),
)

The LLM sees this: 
 Metadata: category=>finance::author=>LlamaIndex
-----
Content: This is a super-customized document
The Embedding model sees this: 
 Metadata: file_name=>super_secret_document.txt::category=>finance::author=>LlamaIndex
-----
Content: This is a super-customized document


In [None]:
## How to automatically extract metadata?



In [15]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader
from llama_hub.web.simple_web.base import SimpleWebPageReader

reader = SimpleWebPageReader(html_to_text=True)

docs = reader.load_data(urls=["https://eugeneyan.com/writing/llm-patterns/"])