In [1]:
# Warning control
import warnings
warnings.filterwarnings('ignore')

In [2]:
from IPython.display import JSON
from IPython.display import display

import json

%pip install unstructured-client
%pip install unstructured
%pip install python-pptx
%pip install PyYAML

import os
import yaml
import subprocess
import sys
import yaml

from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError


from unstructured.partition.text import partition_text
from pathlib import Path

%pip install chromadb
import chromadb #in-memory vector database

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [3]:
# Cell 2 Extract documentation texts
def extract_texts(repo_path, extensions=("adoc", "json", "yaml", "yml")):
    """
    Extract texts from files with specified extensions in the given repository path.
    """
    texts = []
    
    # Fix: Process all extensions, not just the first one
    for ext in extensions:
        file_paths = list(Path(repo_path).rglob(f"*.{ext}"))
        print(f"Found {len(file_paths)} .{ext} files")
        
        for path in file_paths:
            try:
                with open(path, 'r', encoding='utf-8', errors='ignore') as f:
                    content = f.read()
                    if content.strip():  # Only add non-empty files
                        texts.append((str(path), content, ext))
            except Exception as e:
                print(f"Error reading {path}: {e}")
                continue
    
    return texts

docs = extract_texts("../bluexp-dataset")
print(f"Extracted {len(docs)} files")

# Export extracted docs to a JSON file
try:
    with open('cell_2_extracted_docs.json', 'w', encoding='utf-8') as f:
        json.dump(docs, f, indent=2, ensure_ascii=False)
    print(f"‚úÖ Extracted docs exported to 'extracted_docs.json'")
except Exception as e:
    print(f"‚ùå Error exporting extracted docs: {e}")

print("\nEach inner array contains three elements:\n"
    "1. The file path (e.g., '../bluexp-dataset/legal-notices.adoc').\n"
    "2. The content of the file (a long string).\n"
    "3. The file type (e.g., 'adoc').\n"
    "Thus, each JSON array contains 3 elements.")


Found 158 .adoc files
Found 16 .json files
Found 1 .yaml files
Found 18 .yml files
Extracted 193 files
‚úÖ Extracted docs exported to 'extracted_docs.json'

Each inner array contains three elements:
1. The file path (e.g., '../bluexp-dataset/legal-notices.adoc').
2. The content of the file (a long string).
3. The file type (e.g., 'adoc').
Thus, each JSON array contains 3 elements.


## Cleaning and Preparation


In [4]:
##Cell 3

import re
import unicodedata
from typing import List, Tuple

def clean_asciidoc_content(content: str) -> str:
    """
    Comprehensive cleaning function for AsciiDoc content
    """
    # Remove AsciiDoc metadata and frontmatter
    content = re.sub(r'^---[\s\S]*?---\n?', '', content, flags=re.MULTILINE)
    
    # Remove AsciiDoc directives and attributes
    content = re.sub(r'^\s*:[^:\n]+:.*$', '', content, flags=re.MULTILINE)  # :attribute: value
    content = re.sub(r'^\s*\[.*?\]\s*$', '', content, flags=re.MULTILINE)    # [attribute]
    
    # Remove AsciiDoc include directives
    content = re.sub(r'include::[^\[\]]*\[.*?\]', '', content)
    
    # Remove AsciiDoc cross-references and anchors
    content = re.sub(r'<<[^>]*>>', '', content)
    content = re.sub(r'\[\[.*?\]\]', '', content)
    content = re.sub(r'anchor:[^\[\]]*\[.*?\]', '', content)
    
    # Remove AsciiDoc links but keep the text
    content = re.sub(r'https?://[^\s\[\]]+\[([^\]]*)\]', r'\1', content)  # link[text] -> text
    content = re.sub(r'link:[^\[\]]+\[([^\]]*)\]', r'\1', content)         # link:url[text] -> text
    
    # Remove or replace AsciiDoc formatting
    content = re.sub(r'\*\*([^*]+)\*\*', r'\1', content)  # **bold** -> text
    content = re.sub(r'\*([^*]+)\*', r'\1', content)      # *emphasis* -> text
    content = re.sub(r'`([^`]+)`', r'\1', content)        # `code` -> code
    content = re.sub(r'_([^_]+)_', r'\1', content)        # _italic_ -> text
    
    # Remove AsciiDoc headers but keep the text
    content = re.sub(r'^=+\s*(.+)$', r'\1', content, flags=re.MULTILINE)
    
    # Remove AsciiDoc comments
    content = re.sub(r'^//.*$', '', content, flags=re.MULTILINE)
    content = re.sub(r'////[\s\S]*?////', '', content, flags=re.MULTILINE)
    
    # Remove AsciiDoc tables and lists formatting
    content = re.sub(r'^\|.*$', '', content, flags=re.MULTILINE)  # Table rows
    content = re.sub(r'^\s*\*+\s+', '', content, flags=re.MULTILINE)  # Bullet lists
    content = re.sub(r'^\s*\.+\s+', '', content, flags=re.MULTILINE)  # Numbered lists
    
    # Remove AsciiDoc blocks
    content = re.sub(r'^-{4,}$.*?^-{4,}$', '', content, flags=re.MULTILINE | re.DOTALL)
    content = re.sub(r'^={4,}$.*?^={4,}$', '', content, flags=re.MULTILINE | re.DOTALL)
    content = re.sub(r'^\+{4,}$.*?^\+{4,}$', '', content, flags=re.MULTILINE | re.DOTALL)
    
    return content

def clean_general_text(text: str) -> str:
    """
    General text cleaning for any content type
    """
    # Normalize unicode characters
    text = unicodedata.normalize('NFKD', text)
    
    # Remove or replace special characters
    text = re.sub(r'[^\w\s\.\,\!\?\-\(\)\:\;]', ' ', text)
    
    # Clean up whitespace
    text = re.sub(r'\s+', ' ', text)  # Multiple spaces to single space
    text = re.sub(r'\n+', ' ', text)  # Multiple newlines to space
    text = re.sub(r'\t+', ' ', text)  # Tabs to space
    
    # Remove excessive punctuation
    text = re.sub(r'[\.]{3,}', '...', text)  # Multiple dots to ellipsis
    text = re.sub(r'[!]{2,}', '!', text)     # Multiple exclamations
    text = re.sub(r'[\?]{2,}', '?', text)    # Multiple questions
    
    # Remove standalone punctuation and short meaningless sequences
    text = re.sub(r'\b[^\w\s]{1,3}\b', ' ', text)
    
    # Clean up sentence structure
    text = re.sub(r'\s+([\.!?])', r'\1', text)  # Remove space before punctuation
    text = re.sub(r'([\.!?])\s*([a-z])', r'\1 \2', text)  # Ensure space after sentence end

    return text.strip()

cleaned_training_docs = []


for file_path, content, file_type in docs:
    print(f"Processing: {file_path}")
    
    # Step 1: Clean AsciiDoc-specific content if applicable
    if file_type == 'adoc':
        cleaned_content = clean_asciidoc_content(content)
    else:
        cleaned_content = content
    
    # Step 2: General text cleaning
    cleaned_content = clean_general_text(cleaned_content)

    # Append the cleaned content and file type to the training_texts list
    cleaned_training_docs.append((file_path, cleaned_content, file_type))

# Export cleaned training texts to a JSON file
output_file = 'cell_3_cleaned_training_docs.json'
try:
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(cleaned_training_docs, f, indent=2, ensure_ascii=False)
    print(f"‚úÖ Cleaned training texts exported to '{output_file}'")
except Exception as e:
    print(f"‚ùå Error exporting cleaned training texts: {e}")



Processing: ../bluexp-dataset/legal-notices.adoc
Processing: ../bluexp-dataset/platform/aa_concepts.adoc
Processing: ../bluexp-dataset/platform/create_user_token.adoc
Processing: ../bluexp-dataset/platform/http_details.adoc
Processing: ../bluexp-dataset/platform/get_identifiers.adoc
Processing: ../bluexp-dataset/platform/workflows_tasks.adoc
Processing: ../bluexp-dataset/platform/user_access_tokens.adoc
Processing: ../bluexp-dataset/platform/concepts.adoc
Processing: ../bluexp-dataset/platform/overview.adoc
Processing: ../bluexp-dataset/platform/register_service.adoc
Processing: ../bluexp-dataset/platform/api_explorer.adoc
Processing: ../bluexp-dataset/platform/get_nss_key.adoc
Processing: ../bluexp-dataset/platform/create_service_token.adoc
Processing: ../bluexp-dataset/platform/additional_considerations.adoc
Processing: ../bluexp-dataset/platform/use_rest_apis.adoc
Processing: ../bluexp-dataset/platform/connectors_clients.adoc
Processing: ../bluexp-dataset/platform/grant_types.adoc
P

## Process cleaned docs with unstructured library

In [5]:
##Cell 4
import json

def process_partition_text(docs):
    all_elements = []

    for i, doc in enumerate(docs):
        file_path = doc[0]
        print(f"Processing file: {file_path}")
        try:
            # Use partition_text for direct content processing
            elements = partition_text(filename=file_path)
            element_dict = [el.to_dict() for el in elements]
            all_elements.extend(element_dict)

            print(f"Extracted {len(element_dict)} elements from {doc[0]}")
        except ValueError as e:
            print(f"Error processing {doc[0]}: {e}")
        except Exception as e:
            print(f"Unexpected error: {e}")

    return all_elements

doc_element_results = process_partition_text(cleaned_training_docs)

Processing file: ../bluexp-dataset/legal-notices.adoc
Extracted 9 elements from ../bluexp-dataset/legal-notices.adoc
Processing file: ../bluexp-dataset/platform/aa_concepts.adoc
Extracted 21 elements from ../bluexp-dataset/platform/aa_concepts.adoc
Processing file: ../bluexp-dataset/platform/create_user_token.adoc
Extracted 124 elements from ../bluexp-dataset/platform/create_user_token.adoc
Processing file: ../bluexp-dataset/platform/http_details.adoc
Extracted 24 elements from ../bluexp-dataset/platform/http_details.adoc
Processing file: ../bluexp-dataset/platform/get_identifiers.adoc
Extracted 47 elements from ../bluexp-dataset/platform/get_identifiers.adoc
Processing file: ../bluexp-dataset/platform/workflows_tasks.adoc
Extracted 42 elements from ../bluexp-dataset/platform/workflows_tasks.adoc
Processing file: ../bluexp-dataset/platform/user_access_tokens.adoc
Extracted 63 elements from ../bluexp-dataset/platform/user_access_tokens.adoc
Processing file: ../bluexp-dataset/platform/co

In [6]:
# Export cleaned_docs to JSON after unstructured processing
try:
    with open('cell_4_cleaned_unstructured_doc.json', 'w', encoding='utf-8') as f:
        json.dump(doc_element_results, f, indent=2, ensure_ascii=False)
    
    print(f"   ‚úÖ Cleaned docs exported to 'cleaned_docs_final_v2.json'")
    print(f"   üìÑ File size: {Path('cleaned_docs_final.json').stat().st_size / 1024:.1f} KB")
    print(f"   üìä Contains {len(doc_element_results)} cleaned documents")
    
except Exception as e:
    print(f"   ‚ùå Error exporting cleaned_docs: {e}")

   ‚úÖ Cleaned docs exported to 'cleaned_docs_final_v2.json'
   ‚ùå Error exporting cleaned_docs: [Errno 2] No such file or directory: 'cleaned_docs_final.json'


## Load documents into a vector db

In [7]:
client = chromadb.PersistentClient(path="chroma_tmp", settings=chromadb.Settings(allow_reset=True))
client.reset()

True

In [8]:
# Create a new collection named "winter_sports" using the client object.
# The collection is configured with metadata specifying the similarity metric
# for the HNSW (Hierarchical Navigable Small World) algorithm as "cosine".
# "cosine" is typically used to measure the similarity between vectors.
ollection = client.create_collection(
    name="winter_sports",
    metadata={"hnsw:space": "cosine"}
)

In [9]:
import json
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

def process_document(doc):
    try:
        specific_metadata = {
            "file_directory": doc['metadata'].get("file_directory"),
            "filename": doc['metadata'].get("filename"),
            "filetype": doc['metadata'].get("filetype")
        }
        collection.add(
            documents=[doc['text']],
            metadatas=[specific_metadata],
            ids=doc["element_id"]
        )
    except Exception as e:
        print(f"Error adding document: {e}")

# Increase the number of threads
MAX_THREADS = 10  # Adjust this value based on your system's capabilities

with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
    list(tqdm(executor.map(process_document, doc_element_results), total=len(doc_element_results), desc="Processing documents"))

Processing documents:   3%|‚ñé         | 2796/86063 [04:01<11:47:50,  1.96it/s]

Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'


Processing documents:   7%|‚ñã         | 5953/86063 [08:01<8:46:18,  2.54it/s] 

Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'
Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'
Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'
Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'


Processing documents:  14%|‚ñà‚ñç        | 11920/86063 [16:10<10:55:48,  1.88it/s]

Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'
Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'


Processing documents:  21%|‚ñà‚ñà        | 17931/86063 [24:10<7:05:13,  2.67it/s] 

Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'
Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'


Processing documents:  31%|‚ñà‚ñà‚ñà‚ñè      | 27043/86063 [36:17<1:14:38, 13.18it/s] 

Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'
Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'


Processing documents:  35%|‚ñà‚ñà‚ñà‚ñç      | 30039/86063 [40:19<1:51:15,  8.39it/s] 

Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'
Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'
Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'


Processing documents:  45%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 38809/86063 [52:30<4:49:46,  2.72it/s]

Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'

Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'


Processing documents:  49%|‚ñà‚ñà‚ñà‚ñà‚ñâ     | 42043/86063 [56:34<3:08:03,  3.90it/s]

Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'


Processing documents:  52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 44960/86063 [1:00:39<4:13:30,  2.70it/s]

Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'


Processing documents:  59%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ    | 51090/86063 [1:40:37<04:59, 116.63it/s]     

Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'
Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'
Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'



Processing documents:  63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 54096/86063 [1:44:35<18:06, 29.42it/s]  

Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'

Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'


Processing documents:  66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 57105/86063 [1:48:33<10:15, 47.05it/s]  

Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'


Processing documents:  73%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 63114/86063 [1:59:06<10:05, 37.88it/s]  

Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'
Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'


Processing documents:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 66128/86063 [2:04:15<04:47, 69.31it/s]  

Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'


Processing documents:  84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 72052/86063 [2:14:39<2:26:55,  1.59it/s]

Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'
Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'


Processing documents:  87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 75111/86063 [2:19:43<05:37, 32.46it/s]  

Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'


Processing documents:  91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 78009/86063 [2:25:02<55:50,  2.40it/s]  

Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'


Processing documents:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 81102/86063 [2:30:08<02:34, 32.07it/s]  

Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'
Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'
Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'



Processing documents:  98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 84154/86063 [2:35:26<00:26, 72.66it/s]  

Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'
Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'
Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'
Error adding document: '8a1521f0-d952-44c4-8f4d-99b77570579aCollectionAddEvent'


Processing documents: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 86063/86063 [2:38:42<00:00,  9.04it/s] 


## See the elements in Vector DB

In [10]:
results = collection.peek()
print(results["documents"])

['--- sidebar: sidebar permalink: legal-notices.html keywords: copyrights, notice, trademarks, patents, privacy, open source summary: Legal notices provide access to copyright statements, trademarks, patents, and more. ---', ':nofooter:', ':imagesdir: ./media/', ':hardbreaks:', '// Include the common notices include::https://raw.githubusercontent.com/NetAppDocs/common/main/_include/common-legal-notices.adoc[]', '= Legal notices', '[.lead]', '--- sidebar: sidebar permalink: platform/aa_concepts.html keywords: bluexp, authorization, authentication, oauth2.0, access tokens, rest, apis summary: You should be familiar with the basic authentication and authorization concepts before using the BlueXP REST APIs. ---', ':linkattrs:', '[.lead] You should be familiar with the basic authentication and authorization concepts before using the BlueXP REST APIs.']


## Perform a hybrid search with metadata

In [11]:
result = collection.query(
    query_texts=["How do i setup auth for bluexp?"],
    n_results=5
)

print(json.dumps(result, indent=2))

{
  "ids": [
    [
      "901ae61b122c1572f9f39bb9e6b81535",
      "cb52a897d83f690d8f6acbffa0905b63",
      "ef19837a8ce5bb716956c72994b89c9a",
      "7cd85525616841d7226e13a58206b89a",
      "99ba15e14646096104dc1dc7061e7cc7"
    ]
  ],
  "embeddings": null,
  "documents": [
    [
      "Authentication resources You need to authenticate using a BlueXP account to run the API operations.",
      "bxp_bearer_auth:",
      "All NetApp BlueXP services use OAuth 2.0 for authorization. OAuth 2.0 is an open standard implemented by several authorization providers including *Auth0*. Connecting and communicating with a secure REST endpoint is a two-step process:",
      "bxp_bearer_auth: []",
      ".Have BlueXP credentials You'll need a NetApp account to acquire an access token required to issue the REST API calls."
    ]
  ],
  "uris": null,
  "included": [
    "metadatas",
    "documents",
    "distances"
  ],
  "data": null,
  "metadatas": [
    [
      {
        "filetype": "text/plain",
 