In [56]:
import os

import nest_asyncio
import numpy as np
import openai
from dotenv import load_dotenv, find_dotenv

# NOTE: `OpenAI` is exported by both trulens_eval (feedback provider) and
# llama_index.llms (LLM wrapper). The trulens_eval imports are kept BEFORE the
# llama_index ones so the bare name `OpenAI` still ends up bound to
# llama_index.llms.OpenAI, exactly as before. Prefer the explicit aliases
# `TruOpenAI` / `LlamaOpenAI` in new code.
from trulens_eval import (
    Feedback,
    TruLlama,
    OpenAI
)
from trulens_eval import OpenAI as TruOpenAI
from trulens_eval import Tru
from trulens_eval.feedback import Groundedness

from llama_index import SimpleDirectoryReader
from llama_index import Document
from llama_index import (
    ServiceContext,
    StorageContext,
    VectorStoreIndex,
    load_index_from_storage,
)
from llama_index.node_parser import HierarchicalNodeParser
from llama_index.node_parser import get_leaf_nodes
from llama_index import StorageContext, load_index_from_storage
from llama_index.retrievers import AutoMergingRetriever
from llama_index.indices.postprocessor import SentenceTransformerRerank
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.llms import OpenAI
from llama_index.llms import OpenAI as LlamaOpenAI

# Apply the nest_asyncio patch once, right after the imports.
# NOTE(review): presumably needed because the notebook kernel already runs an
# asyncio event loop and the libraries used below start their own — confirm.
nest_asyncio.apply()


def get_openai_api_key():
    """Return the OpenAI API key after loading variables from ``api.env``.

    Calls ``load_dotenv('api.env')`` and then looks up ``OPENAI_API_KEY`` in
    the process environment.

    Returns:
        The value of ``OPENAI_API_KEY``, or ``None`` when the variable is not
        set.
    """
    load_dotenv('api.env')
    api_key = os.getenv("OPENAI_API_KEY")
    return api_key


def get_prebuilt_trulens_recorder(query_engine, app_id):
    """Wrap ``query_engine`` in a ``TruLlama`` recorder with three feedbacks.

    The recorder is configured with:

    * "Answer Relevance"  - ``relevance_with_cot_reasons`` on the record's
      input and output.
    * "Context Relevance" - ``relevance_with_cot_reasons`` on the input and
      the text of each retrieved source node, aggregated with ``np.mean``.
    * "Groundedness"      - ``groundedness_measure_with_cot_reasons`` on the
      source-node texts and the output, aggregated with
      ``grounded_statements_aggregator``.

    Args:
        query_engine: The LlamaIndex query engine to instrument.
        app_id: Identifier under which TruLens stores the records.

    Returns:
        The configured ``TruLlama`` recorder.
    """
    # Import the TruLens provider under an unambiguous alias. The module-level
    # name ``OpenAI`` is imported from both trulens_eval and llama_index.llms,
    # so a bare ``OpenAI()`` here would depend on which import ran last.
    from trulens_eval import OpenAI as TruOpenAI

    # Named ``provider`` (not ``openai``) to avoid shadowing the openai module.
    provider = TruOpenAI()

    qa_relevance = (
        Feedback(provider.relevance_with_cot_reasons, name="Answer Relevance")
        .on_input_output()
    )

    qs_relevance = (
        Feedback(provider.relevance_with_cot_reasons, name="Context Relevance")
        .on_input()
        .on(TruLlama.select_source_nodes().node.text)
        .aggregate(np.mean)
    )

    grounded = Groundedness(groundedness_provider=provider)

    groundedness = (
        Feedback(grounded.groundedness_measure_with_cot_reasons, name="Groundedness")
        .on(TruLlama.select_source_nodes().node.text)
        .on_output()
        .aggregate(grounded.grounded_statements_aggregator)
    )

    feedbacks = [qa_relevance, qs_relevance, groundedness]
    tru_recorder = TruLlama(
        query_engine,
        app_id=app_id,
        feedbacks=feedbacks
    )
    return tru_recorder

In [40]:
# Hand the key loaded from api.env / the environment to the openai module.
# get_openai_api_key() returns None when OPENAI_API_KEY is not set.
openai.api_key = get_openai_api_key()

In [41]:
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
directory = "/home/theTarnished/Desktop/theTarnished/AI/RAG/pdfss"

# os.listdir returns entries in arbitrary order; sort them so the document
# order (and everything derived from it) is reproducible between runs.
files = sorted(os.listdir(directory))

# Pass only regular files to the reader; sub-directories are skipped.
arrayFiles = [
    path
    for path in (os.path.join(directory, name) for name in files)
    if os.path.isfile(path)
]

documents = SimpleDirectoryReader(
    input_files=arrayFiles
).load_data()

In [42]:
# Sanity check: container type, number of loaded documents, then the first one.
for summary in (type(documents), len(documents)):
    print(summary, "\n")

first_document = documents[0]
print(type(first_document))
print(first_document)

<class 'list'> 

38 

<class 'llama_index.schema.Document'>
Doc ID: e369ee84-780b-4406-bb66-33357e3f3b23
Text: trabajo final de grado bennett  para optimizar el proceso de
graduación  la upv ideó una solución informática que  facilita la
comunicación entre el tutor  y el estudiante  esta herramienta se
ejecuta en la web  del sistema académico de la upv  el primer paso es
que el jefe de carrera  designe al tutor del trabajo de grado y  luego
de igual form...


In [43]:
# Merge the text of every loaded document into one Document, with a blank line
# between consecutive texts.
combined_text = "\n\n".join(doc.text for doc in documents)
document = Document(text=combined_text)

In [44]:
def build_automerging_index(
    documents,
    llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
    save_dir="merging_index",
    chunk_sizes=None,
):
    """Build an auto-merging vector index, or reload it from ``save_dir``.

    When ``save_dir`` does not exist, the documents are split with a
    ``HierarchicalNodeParser``, all nodes are stored in the docstore, the leaf
    nodes are indexed in a ``VectorStoreIndex`` and the result is persisted to
    ``save_dir``.

    When ``save_dir`` already exists, the persisted index is loaded instead.
    In that case ``documents`` and ``chunk_sizes`` are NOT used: delete the
    directory (or pass a different ``save_dir``) to rebuild after changing
    either of them.

    Args:
        documents: Documents to parse and index.
        llm: LLM passed to ``ServiceContext.from_defaults``.
        embed_model: Embedding model passed to ``ServiceContext.from_defaults``.
        save_dir: Directory the index is persisted to / loaded from.
        chunk_sizes: Chunk sizes for the hierarchical parser; ``None`` (or any
            falsy value) selects ``[2048, 512, 128]``.

    Returns:
        The newly built or reloaded index.
    """
    merging_context = ServiceContext.from_defaults(
        llm=llm,
        embed_model=embed_model,
    )

    if os.path.exists(save_dir):
        # A persisted index is available: skip parsing entirely, since the
        # parsed nodes would not be used on this path.
        return load_index_from_storage(
            StorageContext.from_defaults(persist_dir=save_dir),
            service_context=merging_context,
        )

    chunk_sizes = chunk_sizes or [2048, 512, 128]
    node_parser = HierarchicalNodeParser.from_defaults(chunk_sizes=chunk_sizes)
    nodes = node_parser.get_nodes_from_documents(documents)
    leaf_nodes = get_leaf_nodes(nodes)

    # The docstore receives every node (parents included); only the leaves are
    # handed to the vector index.
    storage_context = StorageContext.from_defaults()
    storage_context.docstore.add_documents(nodes)

    automerging_index = VectorStoreIndex(
        leaf_nodes, storage_context=storage_context, service_context=merging_context
    )
    automerging_index.storage_context.persist(persist_dir=save_dir)
    return automerging_index


def get_automerging_query_engine(
    automerging_index,
    similarity_top_k=12,
    rerank_top_n=6,
):
    """Create a query engine that retrieves, auto-merges and reranks nodes.

    Args:
        automerging_index: Index providing ``as_retriever`` and
            ``storage_context``.
        similarity_top_k: ``similarity_top_k`` passed to the index retriever.
        rerank_top_n: ``top_n`` passed to ``SentenceTransformerRerank``.

    Returns:
        A ``RetrieverQueryEngine`` built from an ``AutoMergingRetriever`` with
        the reranker as its only node postprocessor.
    """
    leaf_retriever = automerging_index.as_retriever(similarity_top_k=similarity_top_k)
    merging_retriever = AutoMergingRetriever(
        leaf_retriever, automerging_index.storage_context, verbose=True
    )
    reranker = SentenceTransformerRerank(
        top_n=rerank_top_n, model="BAAI/bge-reranker-base"
    )
    return RetrieverQueryEngine.from_args(
        merging_retriever, node_postprocessors=[reranker]
    )

def run_evals(eval_questions, tru_recorder, query_engine):
    """Query the engine once per question, each inside the recorder context.

    Args:
        eval_questions: Iterable of question strings.
        tru_recorder: Context manager entered anew for every question.
        query_engine: Object whose ``query(question)`` method is called.

    Returns:
        None. The query responses are not collected.
    """
    for question in eval_questions:
        # One recording context per question.
        with tru_recorder:
            query_engine.query(question)



In [45]:
# Index over the single merged document.
# The explicitly aliased llama_index LLM class is used because the bare name
# `OpenAI` is also imported from trulens_eval, so its meaning depends on which
# import statement ran last in the kernel.
index = build_automerging_index(
    [document],
    llm=LlamaOpenAI(model="gpt-3.5-turbo", temperature=0.1),
    save_dir="./merging_index",
)

In [46]:
# Query engine over `index`, retrieving 6 candidates per query; rerank_top_n
# keeps its default of 6.
# NOTE(review): no later visible cell uses `query_engine` — the evaluation runs
# on auto_merging_engine_1 instead; confirm whether this engine is still needed.
query_engine = get_automerging_query_engine(index, similarity_top_k=6)

In [47]:
# Reset the TruLens database before recording a new evaluation run.
# NOTE(review): presumably discards previously logged records and feedback
# results — confirm before running against data you want to keep.
Tru().reset_database()

In [48]:
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
questions_path = "/home/theTarnished/Desktop/theTarnished/AI/RAG/questions.text"

# One question per line. The questions contain non-ASCII characters, so the
# encoding is set explicitly instead of relying on the platform default.
with open(questions_path, "r", encoding="utf-8") as file:
    # Strip surrounding whitespace/newlines and skip blank lines so that no
    # empty question is sent to the query engine.
    eval_questions = [line.strip() for line in file if line.strip()]

eval_questions

['¿Qué documentos debo presentar en la oficina de Registros para inscribirme?',
 '¿si soy extranjero que debo hacer para inscribirme?',
 '¿cuales son los pasos para inscribirme?',
 '¿que es y como dar una materia incompleta?',
 '¿Cómo obtengo mi credencial estudiantil?',
 '¿Puedo ir al campus con mi vehiculo?',
 '¿La upb tiene transporte propio?',
 '¿que es y como puedo pasar una materia como oyente?',
 '¿cuanto es el costo para dar un examen diferido?',
 '¿Cuales son las becas que ofrece la UPB?',
 '¿Cómo puedo mantener mi beca?',
 '¿Cual es el proceso por el cual puedo hacer mi practica profesional?']

In [49]:
# Index over the individual documents (not the merged one), persisted separately.
# The explicitly aliased llama_index LLM class is used because the bare name
# `OpenAI` is also imported from trulens_eval, so its meaning depends on which
# import statement ran last in the kernel.
auto_merging_index_1 = build_automerging_index(
    documents,
    llm=LlamaOpenAI(model="gpt-3.5-turbo", temperature=0.1),
    embed_model="local:BAAI/bge-small-en-v1.5",
    save_dir="merging_index_1",
    chunk_sizes=[2048,512,128],
)

In [50]:
# Engine used for the evaluation: 12 retrieved candidates, top 6 kept by the reranker.
auto_merging_engine_1 = get_automerging_query_engine(
    auto_merging_index_1, similarity_top_k=12, rerank_top_n=6
)

In [51]:
# This import REBINDS the global name `OpenAI` to the TruLens feedback provider,
# replacing the llama_index.llms.OpenAI binding made by the import cell.
# get_prebuilt_trulens_recorder looks the name up when it is called, so the
# rebinding has to happen before the call below.
# NOTE(review): after this cell runs, any cell that calls bare `OpenAI(...)`
# expecting the llama_index LLM receives the TruLens provider instead, until
# the import cell is re-run.
from trulens_eval import (
    Feedback,
    TruLlama,
    OpenAI
)

# Recorder for the evaluation engine; records are stored under app id 'app_1'.
tru_recorder = get_prebuilt_trulens_recorder(
    auto_merging_engine_1,
    app_id ='app_1'
)

✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .
✅ In Context Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input response will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input source will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .


In [52]:
# Send every evaluation question through auto_merging_engine_1 inside the
# recorder context (app id 'app_1'). Returns nothing; results are inspected
# through the Tru() calls in later cells.
run_evals(eval_questions, tru_recorder, auto_merging_engine_1)

A new object of type <class 'llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine'> at 0x7f9803f7c050 is calling an instrumented method <function BaseQueryEngine.query at 0x7f97e0fbfec0>. The path of this call may be incorrect.
Guessing path of new object is app based on other object (0x7f9821b934d0) using this function.
A new object of type <class 'llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine'> at 0x7f9803f7c050 is calling an instrumented method <function RetrieverQueryEngine.retrieve at 0x7f97d9151120>. The path of this call may be incorrect.
Guessing path of new object is app based on other object (0x7f9821b934d0) using this function.
A new object of type <class 'llama_index.retrievers.auto_merging_retriever.AutoMergingRetriever'> at 0x7f96e046fb10 is calling an instrumented method <function BaseRetriever.retrieve at 0x7f97e0fbeb60>. The path of this call may be incorrect.
Guessing path of new object is app.retriever based on other object (0x7

> Merging 3 nodes into parent node.
> Parent node id: 6a26c77b-e5a7-478b-b9d6-86fcfc86e178.
> Parent node text: En caso de que tu solicitud 
sea aceptada, deberás pagar en Caja Bs 400, remites al corr eo ya ci...



A new object of type <class 'llama_index.response_synthesizers.compact_and_refine.CompactAndRefine'> at 0x7f9803f81f50 is calling an instrumented method <function BaseSynthesizer.synthesize at 0x7f97e0e1e700>. The path of this call may be incorrect.
Guessing path of new object is app._response_synthesizer based on other object (0x7f9821b91a10) using this function.
A new object of type <class 'llama_index.response_synthesizers.compact_and_refine.CompactAndRefine'> at 0x7f9803f81f50 is calling an instrumented method <function CompactAndRefine.get_response at 0x7f97e0e1ef20>. The path of this call may be incorrect.
Guessing path of new object is app._response_synthesizer based on other object (0x7f9821b91a10) using this function.
A new object of type <class 'llama_index.response_synthesizers.compact_and_refine.CompactAndRefine'> at 0x7f9803f81f50 is calling an instrumented method <function Refine.get_response at 0x7f97e042a840>. The path of this call may be incorrect.
Guessing path of new

> Merging 4 nodes into parent node.
> Parent node id: 7d2e8350-3d97-4089-b482-90fabba627be.
> Parent node text: NOTA. - En caso que el accidente sea de TRÁNSITO prim ero se aplica el SOAT y en exceso el seguro...



A new object of type <class 'llama_index.response_synthesizers.compact_and_refine.CompactAndRefine'> at 0x7f9803f81f50 is calling an instrumented method <function CompactAndRefine.get_response at 0x7f97e0e1ef20>. The path of this call may be incorrect.
Guessing path of new object is app._response_synthesizer based on other object (0x7f9821b91a10) using this function.
A new object of type <class 'llama_index.response_synthesizers.compact_and_refine.CompactAndRefine'> at 0x7f9803f81f50 is calling an instrumented method <function Refine.get_response at 0x7f97e042a840>. The path of this call may be incorrect.
Guessing path of new object is app._response_synthesizer based on other object (0x7f9821b91a10) using this function.
A new object of type <class 'llama_index.query_engine.retriever_query_engine.RetrieverQueryEngine'> at 0x7f9803f7c050 is calling an instrumented method <function RetrieverQueryEngine.retrieve at 0x7f97d9151120>. The path of this call may be incorrect.
Guessing path of n

> Merging 5 nodes into parent node.
> Parent node id: 4b81346f-dace-42cd-b795-dd45d3ad4e7e.
> Parent node text: Hola chicos hoy Les vamos a explicar

cuál es el proceso de la práctica

profesional empresarial ...



A new object of type <class 'llama_index.response_synthesizers.compact_and_refine.CompactAndRefine'> at 0x7f9803f81f50 is calling an instrumented method <function CompactAndRefine.get_response at 0x7f97e0e1ef20>. The path of this call may be incorrect.
Guessing path of new object is app._response_synthesizer based on other object (0x7f9821b91a10) using this function.
A new object of type <class 'llama_index.response_synthesizers.compact_and_refine.CompactAndRefine'> at 0x7f9803f81f50 is calling an instrumented method <function Refine.get_response at 0x7f97e042a840>. The path of this call may be incorrect.
Guessing path of new object is app._response_synthesizer based on other object (0x7f9821b91a10) using this function.


In [53]:
# Per-app summary of feedback scores, latency and cost.
# NOTE(review): the saved output lists Context Relevance and Answer Relevance
# but no Groundedness column, although a "Groundedness" feedback is registered
# on the recorder — check whether it failed or had not finished when this ran.
Tru().get_leaderboard(app_ids=[])

Unnamed: 0_level_0,Context Relevance,Answer Relevance,latency,total_cost
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
app_1,0.55,0.985714,2.333333,0.001544


In [54]:
# Start the TruLens dashboard (a Streamlit process, per the saved output) to
# browse the recorded results.
Tru().run_dashboard()

Starting dashboard ...
Config file already exists. Skipping writing process.
Credentials file already exists. Skipping writing process.
Dashboard already running at path:   Network URL: http://192.168.0.3:8501



<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>

In [59]:
# Environment check: installed pydantic version. The explicit %pip magic does
# not depend on IPython automagic being enabled.
%pip show pydantic

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Name: pydantic
Version: 2.5.3
Summary: Data validation using Python type hints
Home-page: 
Author: 
Author-email: Samuel Colvin <s@muelcolvin.com>, Eric Jolibois <em.jolibois@gmail.com>, Hasan Ramezani <hasan.r67@gmail.com>, Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com>, Terrence Dorsey <terry@pydantic.dev>, David Montague <david@pydantic.dev>, Serge Matveenko <lig@countzero.co>, Marcelo Trylesinski <marcelotryle@gmail.com>, Sydney Runkle <sydneymarierunkle@gmail.com>, David Hewitt <mail@davidhewitt.io>
License: 
Location: /home/theTarnished/anaconda3/envs/AI/lib/python3.11/site-packages
Requires: annotated-types, pydantic-core, typing-extensions
Required-by: langchain, langchain-core, langsmith, openai, trulens-eval
Note: you may need to restart the kernel to use updated packages.


In [60]:
# Environment check: installed trulens_eval version. The explicit %pip magic
# does not depend on IPython automagic being enabled.
%pip show trulens_eval

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Name: trulens-eval
Version: 0.20.0
Summary: Library to systematically track and evaluate LLM based applications.
Home-page: https://www.trulens.org
Author: Truera Inc
Author-email: all@truera.com
License: MIT
Location: /home/theTarnished/anaconda3/envs/AI/lib/python3.11/site-packages
Requires: alembic, dill, frozendict, humanize, langchain, merkle-json, millify, munch, numpy, pydantic, python-dotenv, sqlalchemy, streamlit, streamlit-aggrid, streamlit-extras, typing-extensions, typing-inspect
Required-by: 
Note: you may need to restart the kernel to use updated packages.
