In [1]:
import os, re, ast
import pandas as pd
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# === CONFIG ===
# Root directory that is walked recursively for *.adoc source files.
ADOC_DIR = "bluexp-automation-main"
# CSV that the generated (query, expected_url) test set is written to and later read back from.
TEST_CSV = "bluexp_test_queries.csv"
# NOTE(review): presumably the destination for the evaluation results — confirm the save cell actually uses it.
RESULT_CSV = "bluexp_query_evaluation_results.csv"

In [3]:
# === STEP 1: Load .adoc Files ===
# Walk ADOC_DIR recursively and keep one record (name, path, raw text) per .adoc file.
adoc_files = []
for dirpath, _subdirs, names in os.walk(ADOC_DIR):
    for name in names:
        if not name.endswith(".adoc"):
            continue
        full_path = os.path.join(dirpath, name)
        # errors='ignore' drops undecodable bytes instead of raising.
        with open(full_path, 'r', encoding='utf-8', errors='ignore') as handle:
            text = handle.read()
        adoc_files.append({"filename": name, "filepath": full_path, "content": text})

In [4]:
# Derive one test query per document: the query is the file stem with "_" / "-"
# turned into spaces, the expected URL comes from the document's permalink line.
testset = []
for entry in adoc_files:
    stem, _extension = os.path.splitext(entry["filename"])
    found = re.search(r"permalink:\s*(.+)", entry["content"])
    if found is None:
        # Documents without a permalink have no expected URL and are skipped.
        continue
    testset.append({
        "query": stem.replace("_", " ").replace("-", " "),
        "expected_url": f"https://docs.netapp.com/us-en/bluexp-automation/{found.group(1).strip()}",
    })


# Persist the generated test queries
testset_df = pd.DataFrame(testset)
testset_df.to_csv(TEST_CSV, index=False)
print(f"✅ Saved test queries to: {TEST_CSV}")

✅ Saved test queries to: bluexp_test_queries.csv


In [6]:
# === STEP 1.5: Add Semantic Tags Using LLM (once only) ===
import openai

# Read the key from the environment instead of hard-coding a secret in the
# notebook; set OPENAI_API_KEY before starting the kernel.
openai.api_key = os.environ.get("OPENAI_API_KEY")

def get_llm_tags(doc_text):
    """Ask the chat model to tag one documentation file.

    Sends the first 1000 characters of ``doc_text`` to ``gpt-4`` and parses the
    reply as a JSON object.

    Args:
        doc_text: Raw text of the documentation file.

    Returns:
        dict: The JSON object returned by the model (the prompt asks for the
        keys ``"action"``, ``"object"`` and ``"platform"``). If the request or
        the parsing fails for any reason, a fallback dict is returned in which
        those three keys are ``"unknown"`` and ``"error"`` holds the message.
    """
    import json  # local import so the function does not rely on another cell

    try:
        prompt = (
            "You are a documentation tagger. Given the content of a technical documentation file, extract:\n"
            "1. Action type (create, delete, get, update, etc.)\n"
            "2. Primary object or resource\n"
            "3. Context or platform (AWS, ONTAP, GCP, etc.)\n"
            "Return your answer strictly as JSON in this format:\n"
            "{\"action\": ..., \"object\": ..., \"platform\": ...}\n\n"
            f"Content:\n{doc_text[:1000]}"
        )

        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            temperature=0
        )

        raw = response["choices"][0]["message"]["content"].strip()

        # Parse the reply as JSON rather than eval()-ing it: eval would execute
        # whatever the model returned and also rejects valid JSON such as null.
        # Slicing from the first "{" to the last "}" tolerates replies wrapped
        # in a Markdown code fence or surrounded by prose.
        start, end = raw.find("{"), raw.rfind("}")
        if start == -1 or end < start:
            raise ValueError(f"no JSON object in model reply: {raw[:80]!r}")
        return json.loads(raw[start:end + 1])
    except Exception as e:
        # Deliberate best-effort: a failed document is tagged "unknown" and the
        # reason is kept in "error" so one bad reply does not abort the run.
        return {"action": "unknown", "object": "unknown", "platform": "unknown", "error": str(e)}

# Tag every loaded document via the LLM and store the three labels on its record
for entry in adoc_files:
    llm_tags = get_llm_tags(entry["content"])
    # Any label missing from the reply falls back to "unknown".
    entry.update({
        label: llm_tags.get(label, "unknown")
        for label in ("action", "object", "platform")
    })

# Optional: dump the enriched records to CSV for manual inspection
pd.DataFrame(adoc_files).to_csv("enriched_adoc_llm_tags.csv", index=False)


In [7]:
# === STEP 2: Enrich Metadata and Embed .adoc Files ===
# The same sentence-transformer backs both keyword extraction and embeddings.
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
kw_model = KeyBERT(model=embed_model)

docs = []
for entry in adoc_files:
    text = entry["content"]

    # Top-5 uni/bi-gram keyphrases; each hit is indexed as (phrase, ...).
    ranked = kw_model.extract_keywords(text, keyphrase_ngram_range=(1, 2), stop_words='english', top_n=5)
    vector = embed_model.encode(text, convert_to_tensor=True)

    # Documents without a permalink line are kept, but with no URL.
    found = re.search(r"permalink:\s*(.+)", text)
    if found:
        link = f"https://docs.netapp.com/us-en/bluexp-automation/{found.group(1).strip()}"
    else:
        link = None

    docs.append({
        "filename": entry["filename"],
        "doc_url": link,
        "topic_tags": ", ".join(hit[0] for hit in ranked),
        "embedding": vector,
    })


In [8]:
# === STEP 3: Semantic Matching ===
def find_best_match(query, docs, top_n=1):
    """Rank documents by cosine similarity to ``query``.

    Args:
        query: Query text; it is embedded with the module-level ``embed_model``.
        docs: Iterable of dicts with the keys ``"filename"``, ``"doc_url"``,
            ``"topic_tags"`` and ``"embedding"``.
        top_n: Number of best-scoring rows to return.

    Returns:
        pandas.DataFrame: Columns ``filename``, ``doc_url``, ``topic_tags`` and
        ``similarity`` (rounded to 4 decimals), sorted by similarity in
        descending order and cut to ``top_n`` rows. Empty, but with the same
        columns, when ``docs`` contains nothing.
    """
    columns = ["filename", "doc_url", "topic_tags", "similarity"]

    # An empty corpus used to produce a column-less frame, on which
    # sort_values(by="similarity") raises KeyError. Return an empty frame with
    # the expected columns so callers can simply check `.empty`.
    if not docs:
        return pd.DataFrame(columns=columns)

    query_embedding = embed_model.encode(query, convert_to_tensor=True)
    results = [
        {
            "filename": doc["filename"],
            "doc_url": doc["doc_url"],
            "topic_tags": doc["topic_tags"],
            "similarity": round(util.cos_sim(query_embedding, doc["embedding"]).item(), 4),
        }
        for doc in docs
    ]
    # Passing `columns` keeps the frame well-formed even if `docs` was a
    # truthy-but-exhausted iterable.
    ranked = pd.DataFrame(results, columns=columns)
    return ranked.sort_values(by="similarity", ascending=False).head(top_n)

In [9]:
# === STEP 4: Load Test Queries ===
# keep_default_na=False: the queries are derived from file names, so tokens such
# as "NA" or "null" must stay strings instead of being parsed as NaN (a NaN
# would make the .split() below fail).
test_query_df = pd.read_csv(TEST_CSV, keep_default_na=False)
# Keyword list per query, and the expected URL wrapped in a one-element list.
test_query_df["keywords"] = test_query_df["query"].apply(lambda x: x.split())
test_query_df["urls"] = test_query_df["expected_url"].apply(lambda x: [x])

In [10]:
# === STEP 5: Evaluate All Test Cases ===
# For each test query, take the single best-matching document and record
# whether its URL is among the expected ones.
evaluation_results = []

for query_terms, expected_urls in zip(test_query_df["keywords"], test_query_df["urls"]):
    query_text = " ".join(query_terms)
    best = find_best_match(query_text, docs, top_n=1)

    if best.empty:
        predicted_url, score = None, 0
    else:
        winner = best.iloc[0]
        predicted_url, score = winner["doc_url"], winner["similarity"]

    evaluation_results.append({
        "query": query_text,
        "expected_urls": expected_urls,
        "predicted_url": predicted_url,
        "match": predicted_url in expected_urls,
        "similarity": score,
    })


In [11]:
# Build a DataFrame from the per-query evaluation records; the bare name on the
# last line makes the notebook display it.
df = pd.DataFrame(evaluation_results)
df

Unnamed: 0,query,expected_urls,predicted_url,match,similarity
0,legal notices,[https://docs.netapp.com/us-en/bluexp-automati...,https://docs.netapp.com/us-en/bluexp-automatio...,True,0.5610
1,blogs,[https://docs.netapp.com/us-en/bluexp-automati...,https://docs.netapp.com/us-en/bluexp-automatio...,False,0.2009
2,overview,[https://docs.netapp.com/us-en/bluexp-automati...,https://docs.netapp.com/us-en/bluexp-automatio...,False,0.2403
3,api reference,[https://docs.netapp.com/us-en/bluexp-automati...,https://docs.netapp.com/us-en/bluexp-automatio...,True,0.5070
4,api ref definitions,[https://docs.netapp.com/us-en/bluexp-automati...,https://docs.netapp.com/us-en/bluexp-automatio...,True,0.4674
...,...,...,...,...,...
153,register service,[https://docs.netapp.com/us-en/bluexp-automati...,https://docs.netapp.com/us-en/bluexp-automatio...,True,0.4110
154,user access tokens,[https://docs.netapp.com/us-en/bluexp-automati...,https://docs.netapp.com/us-en/bluexp-automatio...,True,0.6317
155,use rest apis,[https://docs.netapp.com/us-en/bluexp-automati...,https://docs.netapp.com/us-en/bluexp-automatio...,False,0.4932
156,workflows tasks,[https://docs.netapp.com/us-en/bluexp-automati...,https://docs.netapp.com/us-en/bluexp-automatio...,True,0.4496


In [12]:
# === STEP 6: Save or Print Results ===
results_df = pd.DataFrame(evaluation_results)
print(results_df)
# Write to the path declared in the CONFIG cell instead of a second,
# hard-coded file name, so the output location is configured in one place.
results_df.to_csv(RESULT_CSV, index=False)

                    query                                      expected_urls  \
0           legal notices  [https://docs.netapp.com/us-en/bluexp-automati...   
1                   blogs  [https://docs.netapp.com/us-en/bluexp-automati...   
2                overview  [https://docs.netapp.com/us-en/bluexp-automati...   
3           api reference  [https://docs.netapp.com/us-en/bluexp-automati...   
4     api ref definitions  [https://docs.netapp.com/us-en/bluexp-automati...   
..                    ...                                                ...   
153      register service  [https://docs.netapp.com/us-en/bluexp-automati...   
154    user access tokens  [https://docs.netapp.com/us-en/bluexp-automati...   
155         use rest apis  [https://docs.netapp.com/us-en/bluexp-automati...   
156       workflows tasks  [https://docs.netapp.com/us-en/bluexp-automati...   
157  additional resources  [https://docs.netapp.com/us-en/bluexp-automati...   

                                       

In [None]:
# List the extracted topic tags next to each file name, one line per document.
for entry in docs:
    print(entry["filename"], "→", entry["topic_tags"])

legal-notices.adoc → copyrights notice, legal notices, notices html, notices include, notice trademarks
blogs.adoc → bluexp cloud, bluexp automation, cloud automation, integrating bluexp, terraform provider
overview.adoc → terraform provider, bluexp cloud, terraform, design terraform, terraform io
api_reference.adoc → api reference, api_reference html, cm api_reference, api_reference, rest api
api_ref_definitions.adoc → api_ref_definitions html, api_ref_definitions, cm api_ref_definitions, apis summary, rest api
api_ref_resources.adoc → azureaccountrequest responses, html azureaccountrequest, azureaccountrequest, azureaccountrequest azureaccountrequest, azure resource
cvo_deployment.adoc → ontap deployment, deploy cloud, administer cloud, bluexp deployment, volumes ontap
overview.adoc → ontap resources, ontap management, resources ontap, ontap storage, bluexp rest
prepare.adoc → bluexp rest, bluexp cloud, service bluexp, ontap cloud, bluexp ontap
security.adoc → bluexp rest, bluexp ser

In [None]:
# Export the per-document tags, leaving out the embedding stored alongside them.
tag_columns = ("filename", "doc_url", "topic_tags")
tag_rows = [{column: entry[column] for column in tag_columns} for entry in docs]
tag_df = pd.DataFrame(tag_rows)

print(tag_df.head())  # Preview
tag_df.to_csv("generated_topic_tags.csv", index=False)

                   filename  \
0        legal-notices.adoc   
1                blogs.adoc   
2             overview.adoc   
3        api_reference.adoc   
4  api_ref_definitions.adoc   

                                             doc_url  \
0  https://docs.netapp.com/us-en/bluexp-automatio...   
1  https://docs.netapp.com/us-en/bluexp-automatio...   
2  https://docs.netapp.com/us-en/bluexp-automatio...   
3  https://docs.netapp.com/us-en/bluexp-automatio...   
4  https://docs.netapp.com/us-en/bluexp-automatio...   

                                          topic_tags  
0  copyrights notice, legal notices, notices html...  
1  bluexp cloud, bluexp automation, cloud automat...  
2  terraform provider, bluexp cloud, terraform, d...  
3  api reference, api_reference html, cm api_refe...  
4  api_ref_definitions html, api_ref_definitions,...  
