In [1]:
import os, re, ast
import pandas as pd
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#  CONFIG 
ADOC_DIR = "bluexp-automation-main"
TEST_CSV = "bluexp_test_queries.csv"
RESULT_CSV = "bluexp_query_evaluation_results.csv"

In [3]:
#  STEP 1: Load .adoc Files 
adoc_files = []
for root, dirs, files in os.walk(ADOC_DIR):
    for file in files:
        if file.endswith(".adoc"):
            path = os.path.join(root, file)
            with open(path, 'r', encoding='utf-8', errors='ignore') as f:
                adoc_files.append({
                    "filename": file,
                    "filepath": path,
                    "content": f.read()
                })

In [4]:
#Test set
testset = []
for file in adoc_files:
    query = os.path.splitext(file["filename"])[0].replace("_", " ").replace("-", " ")
    permalink_match = re.search(r"permalink:\s*(.+)", file["content"])
    if permalink_match:
        url = f"https://docs.netapp.com/us-en/bluexp-automation/{permalink_match.group(1).strip()}"
        testset.append({"query": query, "expected_url": url})


# Save test queries
testset_df = pd.DataFrame(testset)
testset_df.to_csv(TEST_CSV, index=False)
print(f"✅ Saved test queries to: {TEST_CSV}")

✅ Saved test queries to: bluexp_test_queries.csv


In [5]:
#  STEP 2: Enrich Metadata Embed .adoc Files 
embed_model = SentenceTransformer("all-MiniLM-L6-v2")
kw_model = KeyBERT(model=embed_model)

docs = []
for file in adoc_files:
    content = file["content"]
    keywords = kw_model.extract_keywords(content, keyphrase_ngram_range=(1, 2), stop_words='english', top_n=5)
    topic_tags = [kw[0] for kw in keywords]
    embedding = embed_model.encode(content, convert_to_tensor=True)
    permalink_match = re.search(r"permalink:\s*(.+)", content)
    doc_url = f"https://docs.netapp.com/us-en/bluexp-automation/{permalink_match.group(1).strip()}" if permalink_match else None

    docs.append({
        "filename": file["filename"],
        "doc_url": doc_url,
        "topic_tags": ", ".join(topic_tags),
        "embedding": embedding
    })


In [6]:
#  STEP 3: Semantic Matching 
def find_best_match(query, docs, top_n=1):
    query_embedding = embed_model.encode(query, convert_to_tensor=True)
    results = []
    for doc in docs:
        score = util.cos_sim(query_embedding, doc["embedding"]).item()
        results.append({
            "filename": doc["filename"],
            "doc_url": doc["doc_url"],
            "topic_tags": doc["topic_tags"],
            "similarity": round(score, 4)
        })
    return pd.DataFrame(results).sort_values(by="similarity", ascending=False).head(top_n)

In [12]:
test_query_df = pd.read_csv(TEST_CSV)
test_query_df["keywords"] = test_query_df["query"].apply(lambda x: x.split())
test_query_df["urls"] = test_query_df["expected_url"].apply(lambda x: [x])

In [8]:
#  STEP 5: Evaluate All Test Cases 
evaluation_results = []

for _, row in test_query_df.iterrows():
    query_text = " ".join(row["keywords"])
    expected_urls = row["urls"]

    top_match = find_best_match(query_text, docs, top_n=1)
    predicted_url = top_match.iloc[0]["doc_url"] if not top_match.empty else None
    match = predicted_url in expected_urls

    evaluation_results.append({
        "query": query_text,
        "expected_urls": expected_urls,
        "predicted_url": predicted_url,
        "match": match,
        "similarity": top_match.iloc[0]["similarity"] if not top_match.empty else 0
    })


In [9]:
#  STEP 6: Save or Print Results 
results_df = pd.DataFrame(evaluation_results)
print(results_df)
results_df.to_csv("query_evaluation_results.csv", index=False)

                    query                                      expected_urls  \
0           legal notices  [https://docs.netapp.com/us-en/bluexp-automati...   
1                   blogs  [https://docs.netapp.com/us-en/bluexp-automati...   
2                overview  [https://docs.netapp.com/us-en/bluexp-automati...   
3           api reference  [https://docs.netapp.com/us-en/bluexp-automati...   
4     api ref definitions  [https://docs.netapp.com/us-en/bluexp-automati...   
..                    ...                                                ...   
153      register service  [https://docs.netapp.com/us-en/bluexp-automati...   
154    user access tokens  [https://docs.netapp.com/us-en/bluexp-automati...   
155         use rest apis  [https://docs.netapp.com/us-en/bluexp-automati...   
156       workflows tasks  [https://docs.netapp.com/us-en/bluexp-automati...   
157  additional resources  [https://docs.netapp.com/us-en/bluexp-automati...   

                                       

In [11]:
# Step 7: Enrich Queries with Semantic Tags and Recalculate Accuracy

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import re

# Load the CSV file (update path if needed)
results_df = pd.read_csv("query_evaluation_results.csv")

# Load embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Simulated LLM-style tag extractor
def generate_semantic_tags(query):
    keywords = re.findall(
        r'\b(create|get|delete|update|volume|buckets|wes|ontap|gcp|aws|azure|onprem|add|remove|modify|snapshot|replication|kubernetes)\b',
        query.lower()
    )
    return list(set(keywords))

# Enrich each query
def enrich_query(query):
    tags = generate_semantic_tags(query)
    tag_str = " ".join(tags)
    return f"{query} [TAGS: {tag_str}]"

results_df['enriched_query'] = results_df['query'].apply(enrich_query)

# Embed enriched queries and predicted_url (treat as proxy for page text)
query_embeddings = model.encode(results_df['enriched_query'].tolist(), convert_to_tensor=False)
doc_embeddings = model.encode(results_df['predicted_url'].astype(str).tolist(), convert_to_tensor=False)

# Cosine similarity and new match logic
similarities = cosine_similarity(query_embeddings, doc_embeddings).diagonal()
results_df['new_similarity'] = similarities

# Threshold can be adjusted — start with 0.45
threshold = 0.45
results_df['new_match'] = results_df['new_similarity'] > threshold

# Accuracy comparison
old_accuracy = results_df['match'].mean()
new_accuracy = results_df['new_match'].mean()

print(f"🔵 Original Accuracy: {old_accuracy:.2%}")
print(f"🟢 New Accuracy with Semantic Tags: {new_accuracy:.2%}")

# Optional: See improved and worsened examples
improved = results_df[(results_df['match'] == False) & (results_df['new_match'] == True)]
worsened = results_df[(results_df['match'] == True) & (results_df['new_match'] == False)]

print("\n✅ Corrected Matches:")
print(improved[['query', 'predicted_url', 'new_similarity']].head())

print("\n⚠️ New Misses:")
print(worsened[['query', 'predicted_url', 'new_similarity']].head())

# Save updated results if needed
# results_df.to_csv("query_eval_enriched.csv", index=False)


🔵 Original Accuracy: 74.68%
🟢 New Accuracy with Semantic Tags: 81.65%

✅ Corrected Matches:
                              query  \
12  wf aws cloud create we capacity   
34           wf aws ontap get aggrs   
35            wf aws ontap get cifs   
37         wf aws ontap get volumes   
44           wf azure cloud get wes   

                                        predicted_url  new_similarity  
12  https://docs.netapp.com/us-en/bluexp-automatio...        0.692443  
34  https://docs.netapp.com/us-en/bluexp-automatio...        0.652191  
35  https://docs.netapp.com/us-en/bluexp-automatio...        0.573620  
37  https://docs.netapp.com/us-en/bluexp-automatio...        0.578940  
44  https://docs.netapp.com/us-en/bluexp-automatio...        0.538763  

⚠️ New Misses:
                query                                      predicted_url  \
3       api reference  https://docs.netapp.com/us-en/bluexp-automatio...   
8             prepare  https://docs.netapp.com/us-en/bluexp-automatio... 