In [1]:
import warnings
import os
import openai
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI API key comes from — confirm.
load_dotenv()
print("hello")
# Silence every Python warning for the rest of the kernel session.
warnings.filterwarnings('ignore')

hello


In [3]:
from llama_index.core import SimpleDirectoryReader

# Read the e-book PDF; the loader hands back a plain list of Document objects
# (41 entries for this file, per the recorded output).
pdf_reader = SimpleDirectoryReader(
    input_files=["./eBook-How-to-Build-a-Career-in-AI.pdf"]
)
documents = pdf_reader.load_data()

# Quick sanity checks on what was loaded.
first_document = documents[0]
print(type(documents), "\n")
print(len(documents), "\n")
print(type(first_document))
print(first_document)

<class 'list'> 

41 

<class 'llama_index.core.schema.Document'>
Doc ID: 6d635d3b-0eb7-4605-85b0-66e4dfef1728
Text: PAGE 1Founder, DeepLearning.AICollected Insights from Andrew Ng
How to  Build Your Career in AIA Simple Guide


In [4]:
from llama_index.core import Document

# Merge the text of every loaded Document into a single Document,
# separating the pieces with a blank line.
page_texts = [page.text for page in documents]
document = Document(text="\n\n".join(page_texts))

### Sentence Window Retrieval Setup

In [57]:
from llama_index.core.node_parser import SentenceWindowNodeParser

# Parser that emits one node per sentence. Each node's metadata carries the
# sentence itself under "original_text" and, under "window", the sentence
# together with up to 2 neighbouring sentences on each side (see the sample
# outputs of the cells that follow).
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=2,
    window_metadata_key="window",
    original_text_metadata_key="original_text"
)

In [67]:
# Tiny four-sentence example to see what the parser produces.
text = "Hi. I am Swati. How are you. I am fine"
sample_doc = Document(text=text)
nodes = node_parser.get_nodes_from_documents([sample_doc])
# Bare last expression so the notebook displays the node list.
nodes

[TextNode(id_='23ae489e-302a-409d-9ffa-d052c99f0334', embedding=None, metadata={'window': 'Hi.  I am Swati.  How are you. ', 'original_text': 'Hi. '}, excluded_embed_metadata_keys=['window', 'original_text'], excluded_llm_metadata_keys=['window', 'original_text'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='d699f7c0-995b-4d08-9abc-1a46488b0bad', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='ba36131a63e0244945c9bb949ce7e2d6c6fc9b4ccef7b67048b53c1b78dfbed5'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='1a5e780b-4843-4b85-9f59-8ba1a5c35cef', node_type=<ObjectType.TEXT: '1'>, metadata={'window': 'Hi.  I am Swati.  How are you.  I am fine', 'original_text': 'I am Swati. '}, hash='419e22ea2a2c5df38f7564a7f80360a2e3a055fbfbb940c10013395a817d45ed')}, text='Hi. ', start_char_idx=0, end_char_idx=4, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n'),
 TextNode(id_='1a5e780b-4843-4b85-9f59-8ba1

In [59]:
# One node per sentence: list the text of each.
print([node.text for node in nodes])

['Hi. ', 'I am Swati. ', 'How are you. ', 'I am fine']


In [60]:
# The first node has no preceding sentence, so its window only extends forward.
print(nodes[0].metadata["window"])

Hi.  I am Swati.  How are you. 


In [61]:
# Second toy example; note that this rebinds `nodes`.
text = "hello. foo bar. cat dog. mouse"
sample_doc = Document(text=text)
nodes = node_parser.get_nodes_from_documents([sample_doc])

In [62]:
# One node per sentence: list the text of each.
print([node.text for node in nodes])

['hello. ', 'foo bar. ', 'cat dog. ', 'mouse']


In [66]:
# Window stored on the first node of the current example.
print(nodes[0].metadata["window"])

hello.  foo bar.  cat dog. 


### Building the index

In [22]:
from llama_index.llms.openai import OpenAI

# LLM handle that is passed to the service context in the next cell.
llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)

In [23]:
from llama_index.core import ServiceContext

# Bundle the LLM, the embedding model spec and the sentence-window parser so
# the index is built with all three.
# NOTE(review): ServiceContext is deprecated in llama-index 0.10+ in favour of
# `Settings` — confirm against the installed version before upgrading.
sentence_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
    node_parser=node_parser
)

In [24]:
from llama_index.core import VectorStoreIndex

# Build the vector index over the single merged document, using the parser and
# models configured in `sentence_context`.
sentence_index = VectorStoreIndex.from_documents(
    [document],
    service_context=sentence_context
)

In [25]:
# Write the index to disk under ./sentence_index1.
sentence_index.storage_context.persist(persist_dir="./sentence_index1")

### Building the Postprocessor

In [26]:
from llama_index.core.indices.postprocessor import MetadataReplacementPostProcessor

# Post-processor that swaps each node's text for the value stored under its
# "window" metadata key (demonstrated on the toy nodes in the next cells).
postproc = MetadataReplacementPostProcessor(
    target_metadata_key="window"
)

In [27]:
from llama_index.core.schema import NodeWithScore
from copy import deepcopy

# Give every toy node a dummy score of 1.0 so it can be fed to a
# post-processor, and keep deep copies for a before/after comparison.
# NOTE(review): `nodes` is whatever the last parser cell left in the kernel;
# the recorded outputs below come from the "Swati" example, not the "hello" one.
scored_nodes = [NodeWithScore(node=node, score=1.0) for node in nodes]
nodes_old = [deepcopy(node) for node in nodes]

In [30]:
# Text of the second node before post-processing.
nodes_old[1].text

'I am Swati. '

In [31]:
# Apply the metadata-replacement post-processor to the scored toy nodes.
replaced_nodes = postproc.postprocess_nodes(scored_nodes)

In [32]:
# After post-processing, the second node's text is its full sentence window.
print(replaced_nodes[1].text)

Hi.  I am Swati.  How are you.  I am fine


### Adding a Reranker

In [33]:
from llama_index.core.indices.postprocessor import SentenceTransformerRerank

# Reranker backed by the BAAI/bge-reranker-base model; configured with top_n=2
# (presumably the number of nodes kept after reranking — confirm in the docs).
rerank = SentenceTransformerRerank(
    top_n=2,
    model="BAAI/bge-reranker-base"
)

In [34]:
from llama_index.core import QueryBundle
from llama_index.core.schema import TextNode,NodeWithScore

# Toy reranking input: a query plus two hand-made nodes whose initial scores
# deliberately favour the unrelated sentence.
query = QueryBundle("I am Swati")

candidate_texts_and_scores = [
    ("This is a dog", 0.6),
    ("Swati wants to buy home", 0.4),
]
scored_nodes = [
    NodeWithScore(node=TextNode(text=candidate_text), score=initial_score)
    for candidate_text, initial_score in candidate_texts_and_scores
]

In [35]:
# Re-score the two hand-made nodes against the query; the printed result in
# the next cell shows them reordered with new scores.
reranked_nodes = rerank.postprocess_nodes(
    scored_nodes,query_bundle=query
)

In [36]:
# Show each reranked node's text alongside its new score.
print([(hit.text, hit.score) for hit in reranked_nodes])

[('Swati wants to buy home', 0.1393067), ('This is a dog', 0.009507615)]


### Running the query engine

In [37]:
# Retrieve the 6 most similar sentence nodes, then run the post-processors in
# order: replace each sentence with its surrounding window, then rerank.
# The keyword must be the plural `node_postprocessors` (the spelling used by
# get_sentence_window_query_engine in this notebook); a singular
# `node_postprocessor` is accepted silently as an unknown keyword argument and
# the post-processors never run.
sentence_window_engine = sentence_index.as_query_engine(
    similarity_top_k=6,
    node_postprocessors=[postproc, rerank],
)

In [38]:
# Ask the sentence-window engine a question about the e-book.
career_question = "What are the keys to building a career in AI?"
window_response = sentence_window_engine.query(career_question)

In [39]:
from llama_index.core.response.notebook_utils import display_response

# Render the engine's response in the notebook output.
display_response(window_response)

**`Final Response:`** The keys to building a career in AI are learning foundational technical skills, working on projects, finding a job, and networking.

### Putting it all together

In [40]:
import os
from llama_index.core import ServiceContext, VectorStoreIndex, StorageContext
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.core.indices.postprocessor import MetadataReplacementPostProcessor
from llama_index.core.indices.postprocessor import SentenceTransformerRerank
from llama_index.core import load_index_from_storage


def build_sentence_window_index(
    documents,
    llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
    sentence_window_size=3,
    save_dir="sentence_index",
):
    """Return a sentence-window vector index, reusing ``save_dir`` if present.

    A ``SentenceWindowNodeParser`` (window stored under the ``"window"``
    metadata key, the sentence itself under ``"original_text"``) and a
    ``ServiceContext`` are always created from the arguments.

    * If ``save_dir`` does not exist, the index is built from ``documents``
      and persisted to ``save_dir``.
    * If ``save_dir`` already exists, the index is loaded from it and
      ``documents`` is not read at all.

    Args:
        documents: Documents to index when no persisted index exists.
        llm: LLM handed to the service context.
        embed_model: Embedding model spec handed to the service context.
        sentence_window_size: ``window_size`` for the node parser.
        save_dir: Directory used to persist / reload the index.

    Returns:
        The freshly built or reloaded index.
    """
    window_parser = SentenceWindowNodeParser.from_defaults(
        window_size=sentence_window_size,
        window_metadata_key="window",
        original_text_metadata_key="original_text",
    )
    service_ctx = ServiceContext.from_defaults(
        llm=llm,
        embed_model=embed_model,
        node_parser=window_parser,
    )

    # Fast path: a previously persisted index is reused as-is.
    if os.path.exists(save_dir):
        persisted = StorageContext.from_defaults(persist_dir=save_dir)
        return load_index_from_storage(persisted, service_context=service_ctx)

    # Otherwise build from scratch and persist for the next call.
    index = VectorStoreIndex.from_documents(
        documents, service_context=service_ctx
    )
    index.storage_context.persist(persist_dir=save_dir)
    return index


def get_sentence_window_query_engine(
    sentence_index, similarity_top_k=6, rerank_top_n=2
):
    """Create a query engine over ``sentence_index`` with two post-processors.

    The post-processors are passed in this order: a
    ``MetadataReplacementPostProcessor`` targeting the ``"window"`` metadata
    key, then a ``SentenceTransformerRerank`` using ``BAAI/bge-reranker-base``.

    Args:
        sentence_index: Index exposing ``as_query_engine``.
        similarity_top_k: Forwarded to ``as_query_engine``.
        rerank_top_n: ``top_n`` for the reranker.

    Returns:
        Whatever ``sentence_index.as_query_engine`` returns.
    """
    node_postprocessors = [
        MetadataReplacementPostProcessor(target_metadata_key="window"),
        SentenceTransformerRerank(
            top_n=rerank_top_n, model="BAAI/bge-reranker-base"
        ),
    ]
    return sentence_index.as_query_engine(
        similarity_top_k=similarity_top_k,
        node_postprocessors=node_postprocessors,
    )

In [42]:
from llama_index.llms.openai import OpenAI

# Build (or reload) the index over the merged document with the helper's
# default window size of 3. If ./sentence_index already exists, the helper
# loads it from disk instead of rebuilding.
index = build_sentence_window_index(
    [document],
    llm=OpenAI(model="gpt-3.5-turbo", temperature=0.1),
    save_dir="./sentence_index",
)


In [43]:
# Query engine with window replacement + reranking (rerank_top_n left at its default of 2).
query_engine = get_sentence_window_query_engine(index, similarity_top_k=6)


### TruLens Evaluation

In [44]:
# Load the evaluation questions, one question per line.
eval_questions = []
# Explicit encoding: otherwise the platform default (e.g. cp1252 on Windows)
# decides how any non-ASCII characters in the questions are decoded.
with open('eval_questions.txt', 'r', encoding='utf-8') as file:
    for line in file:
        # Strip the trailing newline and any surrounding whitespace.
        item = line.strip()
        # Skip blank lines so empty strings are never sent as queries.
        if item:
            eval_questions.append(item)

In [45]:
from trulens_eval import Tru

def run_evals(eval_questions, tru_recorder, query_engine):
    """Run every evaluation question through the engine under the recorder.

    The recorder's context is entered once per question, around the single
    ``query_engine.query`` call. The query results are not kept and nothing
    is returned.

    Args:
        eval_questions: Iterable of question strings.
        tru_recorder: Context manager wrapped around each query.
        query_engine: Object exposing ``query(question)``.
    """
    for eval_question in eval_questions:
        with tru_recorder:
            query_engine.query(eval_question)

In [46]:
from utils import get_prebuilt_trulens_recorder

from trulens_eval import Tru

# Reset the TruLens database before recording new evaluations.
# NOTE(review): presumably this discards records from earlier runs in
# default.sqlite — confirm before running against data you want to keep.
Tru().reset_database()

✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input response will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input source will be set to __record__.app.retrieve.rets.collect() .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .
🦑 Tru initialized with db url sqlite:///default.sqlite .
🛑 Secret keys may be written to the database. See the `database_redact_keys` option of Tru` to prevent this.


### Sentence window size = 1

In [47]:
# Index variant with a window of 1 sentence, persisted in its own directory
# so it does not collide with the other persisted indexes.
# NOTE(review): this passes the per-page `documents` list, whereas the index
# built earlier in this notebook used the merged `[document]` — confirm the
# difference is intended.
sentence_index_1 = build_sentence_window_index(
    documents,
    llm=OpenAI(model="gpt-3.5-turbo", temperature=0.1),
    embed_model="local:BAAI/bge-small-en-v1.5",
    sentence_window_size=1,
    save_dir="sentence_index_1",
)

In [48]:
# Query engine over the window-size-1 index, using the helper's defaults
# (similarity_top_k=6, rerank_top_n=2).
sentence_window_engine_1 = get_sentence_window_query_engine(
    sentence_index_1
)

In [49]:
# Wrap the engine in a TruLens recorder via the helper from the local `utils`
# module. The app_id is the label this engine appears under on the leaderboard.
# NOTE(review): the feedback functions the helper attaches are not visible here.
tru_recorder_1 = get_prebuilt_trulens_recorder(
    sentence_window_engine_1,
    app_id='sentence window engine 1'
)

In [None]:
# Send every evaluation question through the engine while the recorder is active.
run_evals(eval_questions, tru_recorder_1, sentence_window_engine_1)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\swati\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Groundedness per statement in source:   0%|          | 0/2 [00:00<?, ?it/s]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\swati\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Groundedness per statement in source:   0%|          | 0/2 [00:00<?, ?it/s]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\swati\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Groundedness per statement in source:   0%|          | 0/3 [00:00<?, ?it/s]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\swati\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Groundedness per statement in source:   0%|          | 0/3 [00:00<?, ?it/s]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\swati\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\swati\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Groundedness per statement in source:   0%|          | 0/3 [00:00<?, ?it/s]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\swati\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\swati\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\swati\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\swati\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Groundedness per statement in source:   0%|          | 0/1 [00:00<?, ?it/s]

In [68]:
from trulens_eval import Tru

# Per-app summary of the recorded evaluations; the output lists Groundedness,
# Answer Relevance, Context Relevance, latency and total cost.
# NOTE(review): an empty app_ids list presumably means "all apps" — confirm.
Tru().get_leaderboard(app_ids=[])

Unnamed: 0_level_0,Groundedness,Answer Relevance,Context Relevance,latency,total_cost
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
sentence window engine 1,0.723333,0.89,0.645,4.7,0.000597


In [51]:
# Launch the TruLens dashboard.
# NOTE(review): the recorded run of this cell ended with
# "RuntimeError: Dashboard failed to start in time" — inspect the dashboard
# logs and clear or fix this cell before sharing the notebook.
Tru().run_dashboard()

Starting dashboard ...
Config file already exists. Skipping writing process.
Credentials file already exists. Skipping writing process.


Accordion(children=(VBox(children=(VBox(children=(Label(value='STDOUT'), Output())), VBox(children=(Label(valu…

RuntimeError: Dashboard failed to start in time. Please inspect dashboard logs for additional information.