In [1]:
import os
from dotenv import load_dotenv
import openai
from llama_index import download_loader, SimpleDirectoryReader, Document
from llama_index.llms import OpenAI
from trulens_eval import Tru

from sentence_window_retrieval import build_sentence_window_index, get_sentence_window_query_engine
from auto_merging_retrieval import build_automerging_index, get_automerging_query_engine
from trulens_utils import trulens_recorder, run_evals

In [2]:
_ = load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

In [3]:
documents = SimpleDirectoryReader(
    input_files=["./docs/eBook-How-to-Build-a-Career-in-AI.pdf"]
).load_data()

document = Document(text="\n\n".join([doc.text for doc in documents]))

In [4]:
def get_eval_questions(path_eval_questions):
    eval_questions = []
    with open(path_eval_questions, 'r') as file:
        for line in file:
            # Remove newline character and convert to integer
            item = line.strip()
            eval_questions.append(item)
    return eval_questions

In [5]:
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)

eval_questions = get_eval_questions("./eval_questions/generated_questions_01_05.text")
Tru().reset_database()

🦑 Tru initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of `Tru` to prevent this.


In [15]:
**Sentence window retrieval**

SyntaxError: invalid syntax (76723346.py, line 1)

In [None]:
# Sentence window retrieval
sentence_window_size = 3
sentence_index = build_sentence_window_index(
    [document],
    llm=llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
    sentence_window_size=sentence_window_size,
    save_dir="./sentence_index",
)
sentence_window_engine = get_sentence_window_query_engine(sentence_index, similarity_top_k=6)

In [10]:
window_response = sentence_window_engine.query(
    "how do I get started on a personal project in AI?"
)
print(str(window_response))

To get started on a personal project in AI, it is important to first identify and scope the project. Consider your career goals and choose a project that complements them. Ensure that the project is responsible, ethical, and beneficial to people. As you progress in your career, aim to work on projects that grow in scope, complexity, and impact over time. Building a portfolio of projects that shows skill progression can also be helpful. Additionally, there are resources available in the book that provide guidance on starting your AI job search and finding the right AI job for you.


In [7]:
# Evaluate with Trulens
tru_recorder = trulens_recorder(
    sentence_window_engine,
    app_id=f'sentence window engine {sentence_window_size}'
)
run_evals(eval_questions, tru_recorder, sentence_window_engine)
Tru().get_leaderboard(app_ids=[])
Tru().run_dashboard()

✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input response will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input source will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .
Starting dashboard ...
Config file already exists. Skipping writing process.
Credentials file already exists. Skipping writing process.
Dashboard already running at path:   Network URL: http://192.168.178.25:8501



<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>

In [8]:
# Auto merging retrieval
auto_merging_index_0 = build_automerging_index(
    documents,
    llm=OpenAI(model="gpt-3.5-turbo", temperature=0.1),
    embed_model="local:BAAI/bge-small-en-v1.5",
    save_dir="merging_index_0",
    chunk_sizes=[2048,512], #,128
)
auto_merging_engine_0 = get_automerging_query_engine(
    auto_merging_index_0,
    similarity_top_k=12,
    rerank_top_n=6,
)
tru_recorder = trulens_recorder(
    auto_merging_engine_0,
    app_id ='auto merging - 2 layers'
)
run_evals(eval_questions, tru_recorder, auto_merging_engine_0)
Tru().get_leaderboard(app_ids=[])
Tru().run_dashboard()

✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input response will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input source will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .
> Merging 1 nodes into parent node.
> Parent node id: 1e1db56c-8968-48ec-bdee-6f79efe41ffb.
> Parent node text: PAGE 26If you’re considering a role switch, a startup can be an easier place to do it than a big ...

> Merging 1 nodes into parent node.
> Parent node id: 805e22c4-22af-4f1e-8237-33710099f077.
> Parent node text: PAGE 25Finding a job has a few predictable steps that include s

<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>