# LlamaIndex: create index

In [1]:
from db_opensearch import opensearch_client
from llama_index.core import VectorStoreIndex, Document, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# import time
# from datetime import datetime
# from pympler import asizeof
# from llama_index.core.node_parser import SimpleNodeParser

  from .autonotebook import tqdm as notebook_tqdm


## helper functions

In [19]:
def fetch_documents_from_opensearch(client, index_name, num_of_docs):
    """Fetch up to ``num_of_docs`` documents from an OpenSearch index.

    Runs a ``match_all`` search that requests only the ``content``,
    ``content_vector`` and ``file_path`` source fields.

    Args:
        client: Object exposing ``search(index=..., body=..., size=...)``.
        index_name: Name of the index to search.
        num_of_docs: Passed as ``size``; caps the number of hits returned.

    Returns:
        A list of dicts, one per hit, with the keys ``content``, ``vector``
        (the stored ``content_vector``) and ``metadata`` (a dict holding
        ``file_path``, or ``""`` when the hit has none).

    Raises:
        KeyError: If a hit lacks ``content`` or ``content_vector``.
    """
    requested_fields = ["content", "content_vector", "file_path"]
    search_body = {
        "query": {"match_all": {}},
        "_source": requested_fields,
    }
    hits = client.search(
        index=index_name, body=search_body, size=num_of_docs
    )['hits']['hits']

    # One flat record per hit; only file_path is treated as optional.
    return [
        {
            "content": hit['_source']["content"],
            "vector": hit['_source']["content_vector"],
            "metadata": {"file_path": hit['_source'].get("file_path", "")},
        }
        for hit in hits
    ]

# def format_bytes(size):
#     for unit in ['bytes', 'KB', 'MB', 'GB', 'TB']:
#         if size < 1024.0:
#             return f"{size:.2f} {unit}"
#         size /= 1024.0
        
# def format_seconds(seconds):
#     days = int(seconds // 86400)
#     seconds %= 86400
#     hours = int(seconds // 3600)
#     seconds %= 3600
#     minutes = int(seconds // 60)
#     seconds %= 60

#     result = []
#     if days > 0:
#         result.append(f"{days} days")
#     if hours > 0:
#         result.append(f"{hours} hours")
#     if minutes > 0:
#         result.append(f"{minutes} minutes")
#     result.append(f"{seconds:.2f} seconds")

#     return ", ".join(result)

## Configure LlamaIndex

In [3]:
# Global LlamaIndex configuration: everything below is set on the shared
# `Settings` object imported from llama_index.core.

# Chunk size applied through Settings.
# NOTE(review): presumably measured in tokens — confirm against LlamaIndex docs.
chunk_size = 4096
Settings.chunk_size = chunk_size

# No LLM is configured; the cell output reports
# "LLM is explicitly disabled. Using MockLLM."
Settings.llm = None

# Disabled alternative embedding model:
# model_name="dunzhang/stella_en_1.5B_v5"
model_name="BAAI/bge-small-en-v1.5"
# Embedding model registered on Settings, loaded through the HuggingFace wrapper.
Settings.embed_model = HuggingFaceEmbedding(
    model_name=model_name
)

# Echo the effective configuration so it is recorded in the cell output.
print(f"Embedding model: {model_name}")
print(f"Chunk size: {chunk_size}")

LLM is explicitly disabled. Using MockLLM.
Embedding model: BAAI/bge-small-en-v1.5
Chunk size: 4096


## fetch documents

In [20]:
# Load the source records from the "documents" index; num_of_docs caps how
# many hits a single search may return.
documents = fetch_documents_from_opensearch(
    client=opensearch_client,
    index_name="documents",
    num_of_docs=10000,
)

print(f"Documents: {len(documents)}")

Documents: 4695


In [5]:
# Convert each fetched record into a LlamaIndex Document, carrying over the
# text content and its metadata dict.
llama_documents = [
    Document(text=record["content"], metadata=record["metadata"])
    for record in documents
]
print(len(llama_documents))

4695


## Generate embeddings

In [6]:
# Build the LlamaIndex vector index from the converted documents.
# NOTE(review): presumably this embeds every document with the embed model
# registered on Settings and is the slow step of the notebook — confirm.
index = VectorStoreIndex.from_documents(llama_documents)

## Test query

In [7]:
# Query engine over the index built above.
# NOTE(review): response_mode="no_text" presumably skips answer synthesis and
# only returns the retrieved source nodes (consistent with Settings.llm = None)
# — confirm against the LlamaIndex response-mode docs.
query_engine = index.as_query_engine(
    response_mode="no_text",
    similarity_top_k=10  # request the 10 most similar nodes per query
    )

In [14]:
def format_response(response):
    """Flatten a query response into a list of plain result dicts.

    Args:
        response: Object with a ``source_nodes`` iterable; each node must
            expose ``metadata`` (a mapping containing ``file_path``),
            ``text`` and ``score``.

    Returns:
        One dict per source node with the keys ``title`` (the part of
        ``file_path`` after the last ``/``), ``file_path``, ``content``
        (the first 200 characters of the node text), ``score`` and
        ``search_type`` (always ``'llamaindex'``).
    """
    formatted = []
    for hit in response.source_nodes:
        path = hit.metadata["file_path"]
        formatted.append({
            "title": path.rsplit('/', 1)[-1],
            "file_path": path,
            "content": hit.text[:200],
            "score": hit.score,
            "search_type": 'llamaindex',
        })
    return formatted

def print_results(query, results):
    """Print the query followed by a numbered list of result titles.

    Args:
        query: The query text, echoed on the first line.
        results: Iterable of dicts that each contain a ``title`` key.

    Only the title is printed per result; each entry is followed by a
    blank line and numbering starts at 1.
    """
    print(f"query: {query}")
    for rank, hit in enumerate(results, 1):
        print(f"Document {rank}: {hit['title']}")
        print()

In [None]:
# Smoke test with an English query: run it through the query engine, flatten
# the returned source nodes, and print the title of each hit.
query = "What is the impact of the GDPR on me?"
response = query_engine.query(query)
results = format_response(response)
print_results(query, results)


## write to file

In [10]:
# Persist the index's storage context to disk. Despite its name,
# `storage_file` is handed to persist() as the target directory.
print("writing index to file")
storage_file = "llamaindex_bge_small"
index.storage_context.persist(persist_dir=storage_file)

writing index to file


In [16]:
# Smoke test with a Dutch query (roughly: "Can companies process judicial data
# to fight corruption?"): run it, flatten the source nodes, print hit titles.
query = "Kunnen bedrijven justitiële gegevens verwerken om corruptie te bestrijden?"
response = query_engine.query(query)
results = format_response(response)
print_results(query, results)

query: Kunnen bedrijven justitiële gegevens verwerken om corruptie te bestrijden?
Document 1: advies_32_1998.pdf

Document 2: aanbeveling_02_2016_0.pdf

Document 3: advies_07_1993.pdf

Document 4: beraadslaging_RR_52_2017_0.pdf

Document 5: AD110-2019.pdf

Document 6: advies_02_1992.pdf

Document 7: AVG (de nieuwe Europese privacywet).pdf

Document 8: wp248_rev.01_nl.pdf

Document 9: 12862010_Privacyrecht_Dumortier_VRGAlumni 2f90.pdf

Document 10: Beslissing_GK_22-2020_NL.pdf

