# LlamaIndex: create index

In [None]:
from db_opensearch import opensearch_client
from llama_index.core import VectorStoreIndex, Document, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
# import time
# from datetime import datetime
# from pympler import asizeof
# from llama_index.core.node_parser import SimpleNodeParser

## helper functions

In [None]:
def fetch_documents_from_opensearch(client, index_name, num_of_docs):
    """Run a ``match_all`` search and return the hits as plain dictionaries.

    Args:
        client: Object exposing ``search(index=..., body=..., size=...)``.
        index_name: Name of the index to query.
        num_of_docs: Passed as ``size``; at most this many hits are requested.

    Returns:
        A list with one dict per hit, holding ``content``, ``vector`` (read
        from the hit's ``content_vector`` field) and ``metadata`` (a dict with
        ``file_path``, defaulting to an empty string when the field is absent).

    Raises:
        KeyError: If a hit's ``_source`` lacks ``content`` or ``content_vector``.
    """
    # Only the three fields used below are requested from the index.
    query = {
        "query": {"match_all": {}},
        "_source": ["content", "content_vector", "file_path"],
    }
    result = client.search(index=index_name, body=query, size=num_of_docs)

    return [
        {
            "content": hit["_source"]["content"],
            "vector": hit["_source"]["content_vector"],
            "metadata": {"file_path": hit["_source"].get("file_path", "")},
        }
        for hit in result["hits"]["hits"]
    ]

# def format_bytes(size):
#     for unit in ['bytes', 'KB', 'MB', 'GB', 'TB']:
#         if size < 1024.0:
#             return f"{size:.2f} {unit}"
#         size /= 1024.0
        
# def format_seconds(seconds):
#     days = int(seconds // 86400)
#     seconds %= 86400
#     hours = int(seconds // 3600)
#     seconds %= 3600
#     minutes = int(seconds // 60)
#     seconds %= 60

#     result = []
#     if days > 0:
#         result.append(f"{days} days")
#     if hours > 0:
#         result.append(f"{hours} hours")
#     if minutes > 0:
#         result.append(f"{minutes} minutes")
#     result.append(f"{seconds:.2f} seconds")

#     return ", ".join(result)

## Configure LlamaIndex

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Notebook shell command: installs the LlamaIndex Instructor embeddings package.
# NOTE(review): nothing visible in this notebook imports the Instructor
# embeddings; the HuggingFaceEmbedding import used here is provided by
# llama-index-embeddings-huggingface — confirm which package is actually needed.
!pip3 install llama-index-embeddings-instructor

In [None]:
# Settings.embed_model expects a LlamaIndex embedding object (or a resolvable
# string), not a raw sentence-transformers model, so LaBSE is wrapped in
# HuggingFaceEmbedding, which exposes the interface LlamaIndex calls.
Settings.embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/LaBSE")

In [None]:
# Chunk size applied through the global Settings object.
# Alternative value kept for reference: 4096
chunk_size = 512
Settings.chunk_size = chunk_size

# No LLM is configured in Settings.
Settings.llm = None

# Other embedding model names kept for reference (inactive):
#   dunzhang/stella_en_1.5B_v5
#   BAAI/bge-small-en-v1.5
#   jinaai/jina-embeddings-v3
#   sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
model_name = "sentence-transformers/LaBSE"
Settings.embed_model = HuggingFaceEmbedding(model_name=model_name)

# Echo the active configuration.
print(f"Embedding model: {model_name}")
print(f"Chunk size: {chunk_size}")

## fetch documents

In [None]:
# Pull up to 10000 hits from the "documents" index and report how many came back.
documents = fetch_documents_from_opensearch(
    client=opensearch_client,
    index_name="documents",
    num_of_docs=10000,
)

print(f"Documents: {len(documents)}")

In [None]:
# Wrap each fetched record in a LlamaIndex Document (text + metadata only).
llama_documents = [
    Document(text=record["content"], metadata=record["metadata"])
    for record in documents
]
print(len(llama_documents))

## Generate embeddings

In [None]:
# Build the LlamaIndex vector index from the Document objects.
# NOTE(review): the Documents carry only text and metadata; the "vector" values
# returned by fetch_documents_from_opensearch are not passed in, so embeddings
# are presumably recomputed with Settings.embed_model — confirm this is intended.
index = VectorStoreIndex.from_documents(llama_documents)

## Test query

In [None]:
# Query engine requesting the 20 most similar nodes, using the "no_text"
# response mode.
query_engine = index.as_query_engine(similarity_top_k=20, response_mode="no_text")

In [None]:
# from llama_index.core.query_engine import RetrieverQueryEngine

In [None]:
# query_engine2 = RetrieverQueryEngine(
#     retriever=index.as_retriever(
#         # response_mode="no_text",
#         similarity_top_k=10
#         )
#     )   

In [None]:
def format_response(response, max_results=10, snippet_length=200):
    """Collapse a query response's source nodes into one entry per file.

    Nodes are grouped by ``metadata["file_path"]``. The first node seen for a
    file supplies the content snippet; later nodes of the same file can only
    raise the stored score. Entries keep first-seen order.

    Args:
        response: Object with a ``source_nodes`` iterable; each node must
            provide ``metadata`` (with a ``file_path`` key), ``text`` and
            ``score``.
        max_results: Maximum number of entries returned (default 10).
        snippet_length: Number of leading characters of ``text`` kept as
            ``content`` (default 200).

    Returns:
        A list of dicts with the keys ``title``, ``file_path``, ``content``,
        ``score`` and ``search_type``.

    Raises:
        KeyError: If a node's metadata has no ``file_path`` entry.
    """
    grouped_results = {}
    for node in response.source_nodes:
        file_path = node.metadata["file_path"]
        entry = grouped_results.get(file_path)
        if entry is None:
            grouped_results[file_path] = {
                "title": file_path.split('/')[-1],
                "file_path": file_path,
                "content": node.text[:snippet_length],
                "score": node.score,
                "search_type": 'llamaindex',
            }
        elif node.score is not None and (
            entry["score"] is None or node.score > entry["score"]
        ):
            # A score may be None; comparing None with a float via max() would
            # raise TypeError, so missing scores are skipped explicitly.
            entry["score"] = node.score
    return list(grouped_results.values())[:max_results]

def print_results(query, results):
    """Print the query followed by a numbered list of result titles.

    Args:
        query: The query text, echoed on the first line.
        results: Iterable of mappings; only the ``title`` key of each entry
            is read.
    """
    print(f"query: {query}")
    for rank, entry in enumerate(results, 1):
        heading = f"Document {rank}: {entry['title']}"
        print(heading)
        # Blank separator line after every entry.
        print()

In [None]:
# Run one sample query through the engine and print the grouped result titles.
query = "corruption"
response = query_engine.query(query)
results = format_response(response)
print_results(query, results)

## write to file

In [None]:
# Persist the index through its storage context.
# NOTE(review): the value is passed as the persist directory, so despite the
# name "storage_file" this presumably creates a folder — confirm.
print("writing index to file")
storage_file = "llamaindex_labse"
index.storage_context.persist(persist_dir=storage_file)