In [1]:
import os
from dotenv import load_dotenv
import openai
from llama_index import download_loader, SimpleDirectoryReader, Document
from llama_index.llms import OpenAI
from trulens_eval import Tru

from sentence_window_retrieval import build_sentence_window_index, get_sentence_window_query_engine
from auto_merging_retrieval import build_automerging_index, get_automerging_query_engine
from trulens_utils import trulens_recorder, run_evals

In [2]:
# Read credentials from the environment (after load_dotenv() has run) so that
# no secret is hard-coded in the notebook. Both lookups yield None when the
# variable is not set.
_ = load_dotenv()
notion_token = os.environ.get("NOTION_INTEGRATION_TOKEN")
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [3]:
# LLM handle passed to the index builder(s) further down in the notebook.
llm = OpenAI(
    model="gpt-3.5-turbo",
    temperature=0.1,
)

# Load Data Source

In [4]:
# Obtain the NotionPageReader loader class, then load the listed Notion pages
# using the integration token read from the environment.
NotionPageReader = download_loader("NotionPageReader")

page_ids = ["491ea0f6b03147bb8dbc78d5ba6d058d"]
notion_reader = NotionPageReader(integration_token=notion_token)
documents = notion_reader.load_data(page_ids=page_ids)

In [9]:
# Load the career e-book PDF from the local docs folder. In the recorded run
# this produced a list of 41 Document objects.
pdf_path = "./docs/eBook-How-to-Build-a-Career-in-AI.pdf"
documents_pdf = SimpleDirectoryReader(input_files=[pdf_path]).load_data()

In [10]:
# Sanity check of the loaded PDF: container type, number of documents, and
# the type and content of the first document.
first_pdf_document = documents_pdf[0]
for container_fact in (type(documents_pdf), len(documents_pdf)):
    print(container_fact, "\n")
print(type(first_pdf_document))
print(first_pdf_document)

<class 'list'> 

41 

<class 'llama_index.schema.Document'>
Doc ID: d251a943-b034-4478-b68d-530810a0f098
Text: PAGE 1Founder, DeepLearning.AICollected Insights from Andrew Ng
How to  Build Your Career in AIA Simple Guide


In [6]:
# Collapse the Notion documents into a single Document, separating the
# original texts with a blank line.
merged_text = "\n\n".join(doc.text for doc in documents)
document = Document(text=merged_text)

# Set Up Evaluation (with TruLens)

In [4]:
def get_eval_questions(path_eval_questions):
    """Read evaluation questions from a text file, one question per line.

    Each line is stripped of surrounding whitespace (including the trailing
    newline). Lines that are empty after stripping are skipped so that no
    empty question is handed to the query engines.

    Args:
        path_eval_questions: Path to a UTF-8 encoded text file containing one
            question per line.

    Returns:
        list[str]: The non-empty questions, in file order.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    # An explicit encoding keeps the result independent of the platform's
    # default locale encoding.
    with open(path_eval_questions, "r", encoding="utf-8") as file:
        stripped_lines = (line.strip() for line in file)
        return [question for question in stripped_lines if question]

In [5]:
# Load the evaluation questions, then reset the TruLens database
# (presumably clearing records from earlier runs -- confirm against the
# TruLens docs).
eval_questions_path = "./eval_questions/generated_questions_01_05.text"
eval_questions = get_eval_questions(eval_questions_path)
Tru().reset_database()

🦑 Tru initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of `Tru` to prevent this.


# Sentence window retrieval

In [7]:
# Sentence-window retrieval: build the index over the merged Notion document,
# then wrap it in a query engine.
sentence_window_size = 3
sentence_index_options = dict(
    llm=llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
    sentence_window_size=sentence_window_size,
    save_dir="./sentence_index_notion",
)
sentence_index = build_sentence_window_index([document], **sentence_index_options)
sentence_window_engine = get_sentence_window_query_engine(
    sentence_index,
    similarity_top_k=6,
)

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/799 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

In [12]:
# Smoke-test the sentence-window engine with a single ad-hoc question.
window_question = "how can i dominate my first week"
window_response = sentence_window_engine.query(window_question)
print(window_response)

To dominate your first week, it is important to be well-prepared. Start by familiarizing yourself with any relevant materials or resources provided by your team or organization. Additionally, make sure to attend any scheduled meetings or orientations to gather important information and ask any questions you may have. Take the time to understand the expectations and goals for your role, and consider reaching out to your team members or supervisor for guidance or clarification. Finally, establish a routine and schedule that works for you, allowing you to be productive and focused during your first week.


In [7]:
# Evaluate the sentence-window engine with TruLens: wrap it in a recorder,
# run every evaluation question through it, then start the dashboard.
sentence_window_app_id = f'sentence window engine {sentence_window_size}'
tru_recorder = trulens_recorder(sentence_window_engine, app_id=sentence_window_app_id)
run_evals(eval_questions, tru_recorder, sentence_window_engine)
# NOTE(review): the leaderboard result is discarded here -- it is neither
# assigned nor the cell's last expression, so nothing is displayed.
Tru().get_leaderboard(app_ids=[])
Tru().run_dashboard()

✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input response will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input source will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .
Starting dashboard ...
Config file already exists. Skipping writing process.
Credentials file already exists. Skipping writing process.
Dashboard already running at path:   Network URL: http://192.168.178.25:8501



<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>

# Auto merging retrieval

In [17]:
# Auto-merging retrieval: build a two-level hierarchical index (2048- and
# 512-sized chunks) and wrap it in a query engine.
#
# The shared `llm` defined near the top of the notebook is reused (instead of
# constructing a second OpenAI client with duplicated settings) so that both
# retrieval pipelines are always evaluated with the same model configuration.
#
# NOTE(review): `documents` holds the Notion pages, but the recorded outputs
# of the cells below quote the PDF e-book ("PAGE 23...", "PAGE 15..."). Confirm
# whether `documents_pdf` was intended here, and whether a previously built
# index under `save_dir` is being picked up instead of the current documents.
automerging_index = build_automerging_index(
    documents,
    llm=llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
    save_dir="merging_index_0",
    chunk_sizes=[2048, 512],
)
automerging_query_engine = get_automerging_query_engine(
    automerging_index,
    similarity_top_k=12,
    rerank_top_n=6,
)

In [18]:
# Smoke-test the auto-merging engine with a single ad-hoc question.
auto_merging_question = "How do I build a portfolio of AI projects?"
auto_merging_response = automerging_query_engine.query(auto_merging_question)
print(auto_merging_response)

> Merging 1 nodes into parent node.
> Parent node id: 0007c30e-b930-42db-b170-18c2d8279ae2.
> Parent node text: PAGE 23Each project is only one step on a longer journey, hopefully one that has a positive impac...

> Merging 1 nodes into parent node.
> Parent node id: 3e570d77-b3ac-46c0-9b37-283792849d3d.
> Parent node text: PAGE 15One of the most important skills of an AI architect is the ability to identify ideas that ...

> Merging 1 nodes into parent node.
> Parent node id: 5917d9b8-323c-45ff-9e0a-82ba8772fafa.
> Parent node text: PAGE 21Building a Portfolio of 
Projects that Shows 
Skill Progression CHAPTER 6
PROJECTS

> Merging 1 nodes into parent node.
> Parent node id: 3f85bd58-3867-41dd-9dae-39e3f4929d16.
> Parent node text: PAGE 18It goes without saying that we should only work on projects that are responsible, ethical,...

> Merging 1 nodes into parent node.
> Parent node id: 5ff7ddd2-4578-48c1-b7c9-ae3d5d89e225.
> Parent node text: PAGE 16Determine milestones. Once you’ve de

In [8]:
# Evaluate the auto-merging engine with TruLens: wrap it in a recorder, run
# every evaluation question through it, then start the dashboard.
#
# The engine built earlier in this notebook is `automerging_query_engine`;
# the previously referenced `auto_merging_engine_0` is never defined here and
# raised a NameError on a fresh kernel (Restart & Run All).
tru_recorder = trulens_recorder(
    automerging_query_engine,
    app_id='auto merging - 2 layers'
)
run_evals(eval_questions, tru_recorder, automerging_query_engine)
# NOTE(review): the leaderboard result is discarded here -- it is neither
# assigned nor the cell's last expression, so nothing is displayed.
Tru().get_leaderboard(app_ids=[])
Tru().run_dashboard()

✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input response will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input source will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .
> Merging 1 nodes into parent node.
> Parent node id: 1e1db56c-8968-48ec-bdee-6f79efe41ffb.
> Parent node text: PAGE 26If you’re considering a role switch, a startup can be an easier place to do it than a big ...

> Merging 1 nodes into parent node.
> Parent node id: 805e22c4-22af-4f1e-8237-33710099f077.
> Parent node text: PAGE 25Finding a job has a few predictable steps that include s

<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>