In [1]:
%%time
from huggingface_hub import login
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceInferenceAPI
import os

CPU times: total: 7.75 s
Wall time: 8.31 s









In [2]:
from huggingface_hub import login
from llama_index.core import SimpleDirectoryReader

In [3]:
from llama_index.llms.ollama import Ollama
# LLM client used by the rest of the notebook (assigned to Settings.llm and
# passed to the question generator). Targets the "llama3.2:3b" model via Ollama.
# request_timeout is presumably in seconds -- confirm against the Ollama client docs.
llm = Ollama(
    request_timeout=100.0,
    model="llama3.2:3b",
)

In [4]:
# Location of the PDF that the notebook indexes and evaluates against.
folder_path = './colab_data'
file_name = 'London_Love_Life.pdf'
file_path = os.path.join(folder_path, file_name)

# Report whether the file is present. This is informational only: the reader
# below is still invoked even when the file is missing.
status_message = (
    f"{file_name} found in {file_path}."
    if os.path.exists(file_path)
    else f"{file_name} not found in {folder_path}."
)
print(status_message)

# Parse the single input file and report how many entries the reader produced.
reader = SimpleDirectoryReader(input_files=[file_path])
document = reader.load_data()
chunks_doc = len(document)
print("chunks_doc", chunks_doc)

London_Love_Life.pdf found in ./colab_data\London_Love_Life.pdf.
chunks_doc 20


In [5]:
%%time
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import chromadb
from llama_index.core import Settings
from llama_index.core import StorageContext
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Settings
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Document
from llama_index.core.settings import Settings


# Persistent Chroma collection that backs the vector index on disk.
db = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db.get_or_create_collection("DB_collection")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
# The index only writes to Chroma when it is given a StorageContext that wraps
# this vector store; without it the index uses its default store and the
# collection created above stays empty.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Global defaults. The embedding model is loaded once here and reused below
# instead of instantiating the same HuggingFace model a second time.
Settings.llm = llm
Settings.embed_model = HuggingFaceEmbedding(model_name="intfloat/multilingual-e5-large")

# `documents` is also consumed by later cells (question generation).
reader = SimpleDirectoryReader(input_files=[file_path])
documents = reader.load_data()

if chroma_collection.count() == 0:
    # First run: split, embed and persist the chunks into the Chroma collection.
    index = VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
        transformations=[SentenceSplitter(chunk_size=1024, chunk_overlap=200)],
        embed_model=Settings.embed_model,
    )
else:
    # The collection already holds embeddings from an earlier run; reuse them
    # rather than inserting duplicates. Delete ./chroma_db to force a rebuild
    # (e.g. after changing the source PDF or the embedding model).
    index = VectorStoreIndex.from_vector_store(
        vector_store,
        embed_model=Settings.embed_model,
    )


CPU times: total: 6min 56s
Wall time: 59.8 s


In [6]:
from llama_index.core.evaluation import generate_question_context_pairs, EmbeddingQAFinetuneDataset
# Build the question/context evaluation set from `documents`, using `llm` as
# the generator and requesting 4 questions per entry.
qa_dataset = generate_question_context_pairs(
    documents,
    num_questions_per_chunk=4,
    llm=llm,
)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [00:36<00:00,  1.80s/it]


In [9]:
from llama_index.core.evaluation import generate_question_context_pairs, EmbeddingQAFinetuneDataset
from llama_index.core.schema import TextNode

def evaluate(
    dataset,
    embed_model,
    top_k=5,
    verbose=False,
):
    """Run every query in ``dataset`` through a fresh index and record hits.

    The corpus entries are wrapped in ``TextNode`` objects (keeping their ids),
    indexed with ``embed_model``, and each query is retrieved with
    ``similarity_top_k=top_k``. A query counts as a hit when the first id listed
    for it in ``dataset.relevant_docs`` appears among the retrieved node ids.

    Args:
        dataset: Object exposing ``corpus`` (id -> text), ``queries``
            (query id -> query text) and ``relevant_docs`` (query id -> list of
            ids) mappings.
        embed_model: Embedding model handed to ``VectorStoreIndex``.
        top_k: Number of nodes to retrieve per query.
        verbose: Accepted for interface compatibility; not used by the body.

    Returns:
        A list with one dict per query, holding the keys ``"is_hit"``,
        ``"retrieved"`` (retrieved node ids in retriever order), ``"expected"``
        (the first relevant id) and ``"query"`` (the query id, not its text).
    """
    corpus_texts, query_texts, relevance = (
        dataset.corpus,
        dataset.queries,
        dataset.relevant_docs,
    )

    # Preserve the corpus ids on the nodes so retrieved ids can be compared
    # directly with the ids stored in relevant_docs.
    corpus_nodes = []
    for node_id, node_text in corpus_texts.items():
        corpus_nodes.append(TextNode(id_=node_id, text=node_text))

    corpus_index = VectorStoreIndex(
        corpus_nodes, embed_model=embed_model, show_progress=True
    )
    retriever = corpus_index.as_retriever(similarity_top_k=top_k)

    outcomes = []
    for qid, question in query_texts.items():
        hits = retriever.retrieve(question)
        found_ids = [hit.node.node_id for hit in hits]
        # Only the first relevant id is considered for each query.
        target_id = relevance[qid][0]
        outcomes.append(
            {
                "is_hit": target_id in found_ids,
                "retrieved": found_ids,
                "expected": target_id,
                "query": qid,
            }
        )
    return outcomes

# Evaluate retrieval with the globally configured embedding model, retrieving
# 10 nodes per query, then print the raw per-query records.
eval_results = evaluate(
    qa_dataset,
    embed_model=Settings.embed_model,
    top_k=10,
    verbose=False,
)
print("Evaluation results:", eval_results)

Generating embeddings:   0%|          | 0/20 [00:00<?, ?it/s]

Evaluation results: [{'is_hit': False, 'retrieved': ['bb68c6c5-6a7c-44af-9a2f-b754ac287f4e', 'dbabe5a1-fc94-45e0-8283-419914cbf264', '3ab93ea8-72df-45c4-bbf4-e09f8c4ec7b2', '28b7f569-31b3-459c-be7b-c8e326eaa340', '8fa057e2-24fc-40ae-b32b-fe467fcb7466', '82225575-1bd9-483c-a298-b25c0bc5cac7', '04dd36c9-f349-4b27-ad6d-1967dd5f0e80', 'af52ef5b-dbe0-4826-8b13-fed05ecb5025', '4c4a381e-07ea-4649-bf1f-acf7a2911172', '18dddd0a-53b5-437b-bac5-df911d10684c'], 'expected': '46401432-2095-4a7e-95ea-b75aa41230d7', 'query': '273b3c89-8ae2-4ce1-b5a6-0a262cb72102'}, {'is_hit': True, 'retrieved': ['46401432-2095-4a7e-95ea-b75aa41230d7', 'eb49602b-b494-4b4a-89f1-dc9965a497a9', '82225575-1bd9-483c-a298-b25c0bc5cac7', '795ec3fd-ded7-4858-8315-412525075a0c', 'dbabe5a1-fc94-45e0-8283-419914cbf264', 'bb68c6c5-6a7c-44af-9a2f-b754ac287f4e', '3ab93ea8-72df-45c4-bbf4-e09f8c4ec7b2', 'e62e2d88-e6f1-44f1-b036-d5845b04021b', '66ca8265-b512-414d-8f79-133b69ad9c77', '28b7f569-31b3-459c-be7b-c8e326eaa340'], 'expected': 

In [10]:
def calculate_metrics(eval_results):
    """Aggregate per-query retrieval records into three summary numbers.

    Despite the historical names of the returned values, what is computed is:

    * ``precision`` -- the hit rate: fraction of queries whose expected id was
      retrieved (``is_hit`` is true).
    * ``recall`` -- the mean reciprocal rank (MRR): average over all queries of
      ``1 / rank`` of the expected id in ``retrieved``, counting 0 for misses.
    * ``f1_score`` -- the harmonic mean of the two values above.

    Args:
        eval_results: Iterable-with-length of dicts carrying the keys
            ``"is_hit"``, ``"retrieved"`` (ordered list of ids) and
            ``"expected"``.

    Returns:
        Tuple ``(precision, recall, f1_score)``. All three are ``0.0`` when
        ``eval_results`` is empty, and ``f1_score`` is ``0.0`` when there are
        no hits, instead of raising ``ZeroDivisionError``.
    """
    num_queries = len(eval_results)
    if num_queries == 0:
        # Nothing to average over; avoid dividing by zero.
        return 0.0, 0.0, 0.0

    precision_sum = 0
    recall_sum = 0
    for result in eval_results:
        if result['is_hit']:
            precision_sum += 1
            # 1-based position of the expected id among the retrieved ids.
            rank = result['retrieved'].index(result['expected']) + 1
            recall_sum += 1 / rank

    precision = precision_sum / num_queries
    recall = recall_sum / num_queries

    total = precision + recall
    # With zero hits both terms are 0 and the harmonic mean is defined as 0.
    f1_score = 2 * (precision * recall) / total if total else 0.0

    return precision, recall, f1_score

# calculate_metrics returns, in order: the fraction of queries whose expected
# id was retrieved (hit rate), the mean of 1/rank of the expected id (MRR), and
# the harmonic mean of those two. The variable names are kept for any later
# cells that use them; the printed labels name what is actually computed.
precision, recall, f1_score = calculate_metrics(eval_results)
print("Hit rate:", precision)
print("MRR:", recall)
print("Harmonic mean of hit rate and MRR:", f1_score)

Accuracy: 0.8125
Recall: 0.45709325396825407
F1-score: 0.5850507911701506
