In [13]:
import pandas as pd

from llama_index import VectorStoreIndex, load_index_from_storage, StorageContext
from llama_index.finetuning import EmbeddingQAFinetuneDataset
from llama_index.evaluation import RetrieverEvaluator
from llama_index.evaluation.retrieval.base import RetrievalEvalMode
from llama_index.evaluation import generate_question_context_pairs
from llama_index.node_parser import SentenceSplitter

from RAGIR.utils.util_functions import (
    initialize_embedding,
    initialize_llama_cpp,
    initialize_service_context,
    load_documents
)

In [14]:
# --- Paths and model identifiers -------------------------------------------
model_path = "/Users/torky/Documents/LlamaCPP-2/zephyr-7b-beta.Q4_0.gguf"
documents_path = "/Users/torky/Documents/thesis-torky/docs/research_papers"
embedding_model_name = "BAAI/bge-small-en-v1.5"

# --- Models -----------------------------------------------------------------
embed_model = initialize_embedding(model_name=embedding_model_name)

# The llama.cpp model is deliberately NOT handed to the service context below
# (llm=None there); it is passed to the question-generation call later in the
# notebook instead.
llm = initialize_llama_cpp(
    model_path=model_path, temperature=0, max_new_tokens=1024,
    context_window=3900, model_kwargs={"n_gpu_layers": 2}, verbose=False,
)

service_context = initialize_service_context(embed_model=embed_model, llm=None)

# --- Build the vector index and persist it -----------------------------------
documents = load_documents(documents_path)
node_parser = SentenceSplitter(chunk_size=2048)
nodes = node_parser.get_nodes_from_documents(documents)
vector_index = VectorStoreIndex(nodes, service_context=service_context)

# The index is written to an "index" sub-directory of the documents folder.
vector_index.storage_context.persist(persist_dir=f"{documents_path}/index")

llama_model_loader: loaded meta data with 21 key-value pairs and 291 tensors from /Users/torky/Documents/LlamaCPP-2/zephyr-7b-beta.Q4_0.gguf (version GGUF V3 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_0     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q4_0     [ 14336,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q4_0     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q4_0     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q4_0     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q4_0    

LLM is explicitly disabled. Using MockLLM.


In [2]:
# Disabled alternative to the index-building cell: instead of re-parsing the
# documents, it reloads an index previously persisted under
# documents_path + "/index". Everything below is commented out.
# NOTE(review): documents_path here points at docs/test/, not the
# docs/research_papers folder used by the active cell -- confirm before enabling.

# model_path = "/Users/torky/Documents/LlamaCPP-2/zephyr-7b-beta.Q4_0.gguf"
# documents_path = "/Users/torky/Documents/thesis-torky/docs/test/"
# embedding_model_name = "BAAI/bge-small-en-v1.5"

# embed_model = initialize_embedding(model_name=embedding_model_name)
# llm = initialize_llama_cpp(
#     model_path=model_path,
#     temperature=0,
#     max_new_tokens=1024,
#     context_window=3900,
#     model_kwargs={"n_gpu_layers": 2},
#     verbose=False,
# )

# service_context = initialize_service_context(
#     embed_model=embed_model, llm=None
# )
# storage_context = StorageContext.from_defaults(persist_dir=documents_path+"/index")
# vector_index = load_index_from_storage(storage_context=storage_context, service_context=service_context)

llama_model_loader: loaded meta data with 21 key-value pairs and 291 tensors from /Users/torky/Documents/LlamaCPP-2/zephyr-7b-beta.Q4_0.gguf (version GGUF V3 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_0     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q4_0     [ 14336,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q4_0     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q4_0     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q4_0     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q4_0    

LLM is explicitly disabled. Using MockLLM.


In [15]:
# Retriever over the vector index, configured with similarity_top_k=5
# (the "top-5" setting that the results table at the end is labelled with).
retriever = vector_index.as_retriever(similarity_top_k=5)

In [16]:
# Generate the evaluation dataset from the parsed nodes with the local LLM,
# requesting 5 questions per chunk, then save it as JSON.
# This is expensive: the recorded run below took about 2h19m for 445 items.
# NOTE(review): the file is saved relative to the current working directory,
# while the reload cell further down reads from an absolute path -- confirm
# both refer to the same file.
qa_dataset = generate_question_context_pairs(
    nodes, llm=llm, num_questions_per_chunk=5
)
qa_dataset.save_json("retriever_eval_dataset.json")

100%|██████████| 445/445 [2:19:13<00:00, 18.77s/it]  


In [4]:
# Load a previously saved evaluation dataset from disk; this rebinds
# qa_dataset, replacing whatever the generation cell produced.
qa_dataset = EmbeddingQAFinetuneDataset.from_json("/Users/torky/Documents/thesis-torky/RAGIR/evaluation/retriever_eval_dataset.json")
# Bare expression so the notebook displays the dataset's repr.
qa_dataset

EmbeddingQAFinetuneDataset(queries={'fa1a663f-d86e-478e-94aa-5a4cc63efb5e': 'Based on the given text material, can you summarize the main idea of "Siren’s Song in the AI Ocean : A Survey on Hallucination in Large Language Models"?', '0653f9fd-7826-427f-93a3-5b360f0ef712': 'What is meant by "hallucination" in the context of large language models? Provide an example.', '341f3163-5c4a-435f-b8b1-4a57d912506d': 'Who are the authors of this survey and what are their affiliations?', '1e17e1e0-3a7c-42b4-a219-1421242a8991': 'Can you explain the significance of the term "AI Ocean" used in the title of this survey?', 'ecf3c645-d56b-4ded-9a97-fd1beeaa4699': 'How does the concept of "hallucination" in large language models differ from human hallucinations, if at all? Provide evidence to support your answer.', '88dba055-ee2d-4d2e-9d43-149befea9e26': 'Based on the given text material, can you summarize the main focus of the article "Siren’s Song in the AI Ocean : A Survey on Hallucination in Large La

In [17]:
# Evaluator for the retriever defined above, requesting the "mrr" and
# "hit_rate" metrics; display_results reads exactly these two metric names.
retriever_evaluator = RetrieverEvaluator.from_metric_names(
    ["mrr", "hit_rate"], retriever=retriever
)

In [18]:
# Run the evaluator over qa_dataset. Top-level `await` works here because the
# notebook supplies a running event loop; it is not valid in a plain script.
eval_results = await retriever_evaluator.aevaluate_dataset(qa_dataset)

In [19]:
def display_results(name, eval_results):
    """Summarise retrieval evaluation results as a one-row DataFrame.

    Args:
        name: Label for the evaluated retriever; stored in the
            ``retrievers`` column.
        eval_results: Iterable of result objects, each exposing a
            ``metric_vals_dict`` mapping that contains ``"hit_rate"`` and
            ``"mrr"`` values.

    Returns:
        A ``pandas.DataFrame`` with a single row and the columns
        ``retrievers``, ``hit_rate`` and ``mrr``, where the two metric
        columns hold the mean over all results. If ``eval_results`` is
        empty, both metrics are NaN.

    Raises:
        KeyError: If results are present but none of them report the
            ``hit_rate`` or ``mrr`` metric.
    """
    metric_dicts = [eval_result.metric_vals_dict for eval_result in eval_results]

    if not metric_dicts:
        # An empty list would produce a DataFrame without any columns, and the
        # column lookups below would raise KeyError. Report NaN instead.
        hit_rate = mrr = float("nan")
    else:
        full_df = pd.DataFrame(metric_dicts)
        hit_rate = full_df["hit_rate"].mean()
        mrr = full_df["mrr"].mean()

    metric_df = pd.DataFrame(
        {"retrievers": [name], "hit_rate": [hit_rate], "mrr": [mrr]}
    )

    return metric_df

In [20]:
# Show the mean hit rate and MRR for this run as a one-row table
# labelled "top-5 eval".
display_results("top-5 eval", eval_results)

Unnamed: 0,retrievers,hit_rate,mrr
0,top-5 eval,0.286365,0.209011
