In [2]:
import networkx as nx
import os
import json
import yaml
from llama_index.core import Document
from llama_index.core import Settings, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding

In [None]:
repo_path = "../../data/bluexp-dataset"

EXT_HANDLERS = {
    "json": lambda p: json.dumps(json.load(open(p, encoding="utf-8")), indent=2),
    "yaml": lambda p: yaml.safe_dump(yaml.safe_load(open(p, encoding="utf-8")), sort_keys=False),
    "yml" : lambda p: yaml.safe_dump(yaml.safe_load(open(p, encoding="utf-8")), sort_keys=False),
    "adoc": lambda p: open(p, encoding="utf-8").read(),
}

docs = []
for dirpath, _, filenames in os.walk(repo_path):
    for fname in filenames:
        ext = fname.rsplit(".", 1)[-1].lower()
        if ext in EXT_HANDLERS:
            full_path = os.path.join(dirpath, fname)
            try:
                text = EXT_HANDLERS[ext](full_path)
            except Exception as e:
                print(f"⚠️ failed to load {full_path}: {e}")
                continue
            docs.append(Document(text=text, metadata={"source": full_path}))

print(f"✅ Loaded {len(docs)} documents from {repo_path}")


✅ Loaded 193 documents from ../bluexp-dataset


In [4]:
from dotenv import load_dotenv
import os

load_dotenv(dotenv_path='.env')
print("Loaded OPENAI_API_KEY:", os.getenv("OPENAI_API_KEY") is not None)

Loaded OPENAI_API_KEY: True


In [5]:
# Chunking config
Settings.chunk_size = 1000
Settings.chunk_overlap = 200
splitter = SentenceSplitter(chunk_size=Settings.chunk_size,
                            chunk_overlap=Settings.chunk_overlap)
Settings.text_splitter = splitter

# Build & embed
index = VectorStoreIndex.from_documents(
    documents=docs,
    embed_model=OpenAIEmbedding(model="text-embedding-3-small"),
    show_progress=True
)

storage_ctx = index._storage_context
docstore    = storage_ctx.docstore

ref_info = docstore.get_all_ref_doc_info()  

all_nodes = docstore.docs

# Preview for the first few chunks of each file
for src, info in ref_info.items():
    print(f"📄 Source file: {src}")
    for node_id in info.node_ids[:3]:
        node = all_nodes[node_id]
        print("  • chunk_id  :", node_id)
        print("    metadata  :", node.metadata)
        print("    preview   :", node.get_content()[:100].replace("\n", " "))
    print("-" * 40)


Parsing nodes:   0%|          | 0/193 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/1053 [00:00<?, ?it/s]

📄 Source file: 00b595d8-b58d-4774-a0aa-ecc29ebba350
  • chunk_id  : 56608a54-47d1-4215-aa5b-d556b2db0cd3
    metadata  : {'source': '../bluexp-dataset/_index.yml'}
    preview   : indexpage:   title: BlueXP API documentation   lead: BlueXP enables you to build, protect, and gover
----------------------------------------
📄 Source file: f66fc940-e7c7-4ddd-9931-3b324003aa12
  • chunk_id  : 2d1e8925-2cfd-4779-922a-8ea77c163b04
    metadata  : {'source': '../bluexp-dataset/project.yml'}
    preview   : settings:   name: API   harmony_integration: maestro   platform: cloud   internal:     pdf_enabled: 
----------------------------------------
📄 Source file: a7790d42-c4b2-4bb6-b8d0-c8578c217a27
  • chunk_id  : 84ba6cfc-b5f8-4bd4-87b2-8b56453e7dea
    metadata  : {'source': '../bluexp-dataset/legal-notices.adoc'}
    preview   : --- sidebar: sidebar permalink: legal-notices.html keywords: copyrights, notice, trademarks, patents
----------------------------------------
📄 Source file: a2469dd2-f

In [6]:
G = nx.DiGraph()
repo_path = "../bluexp-dataset"

for dirpath, subdirs, files in os.walk(repo_path):
    dir_node = os.path.normpath(dirpath)
    G.add_node(dir_node, type="dir")
    for sd in subdirs:
        G.add_edge(dir_node,
                   os.path.normpath(os.path.join(dirpath, sd)),
                   type="contains")
    for f in files:
        file_node = os.path.normpath(os.path.join(dirpath, f))
        G.add_node(file_node, type="file")
        G.add_edge(dir_node, file_node, type="contains")


### Hybrid Approach - Utilizing Generated Embeddings in Tandem with a Directed Graph

In [8]:
retriever = index.as_retriever(similarity_top_k=5)

def hybrid_search(query, top_k=5, α=0.5):
    vec_hits = retriever.retrieve(query)
    
    scored = []
    for hit in vec_hits:
        path      = hit.metadata["source"]
        vec_score = hit.score
        
        # boosting files in small folders to give them more weight
        parent     = os.path.dirname(path)
        sib_count  = len(list(G.successors(parent))) or 1
        graph_boost = 1.0 / sib_count
        
        # combine scores
        combined = α * vec_score + (1 - α) * graph_boost
        scored.append((combined, hit))
    
    return [h for _, h in sorted(scored, key=lambda x: -x[0])]

# Testing:
for r in hybrid_search("how do i setup auth for bluexp", top_k=10):
    print(r.metadata["source"], f"(score: {r.score:.3f})")


../bluexp-dataset/platform/register_service.adoc (score: 0.569)
../bluexp-dataset/platform/get_nss_key.adoc (score: 0.528)
../bluexp-dataset/platform/use_rest_apis.adoc (score: 0.524)
../bluexp-dataset/cm/security.adoc (score: 0.535)
../bluexp-dataset/cm/wf_gcp_cloud_create_we_byol.adoc (score: 0.506)
