In [10]:
# Warning control
import warnings
warnings.filterwarnings('ignore')

In [10]:
from IPython.display import display

import json

%pip install unstructured-client
%pip install unstructured
%pip install python-pptx
%pip install PyYAML

import os
import yaml

from unstructured.partition.text import partition_text
from IPython.display import clear_output


from pathlib import Path

clear_output()


In [11]:
# Cell 2 Extract documentation texts
def extract_texts(repo_path, extensions=("adoc", "json", "yaml", "yml")):
    """
    Extract texts from files with specified extensions in the given repository path.
    """
    texts = []
    
    # Fix: Process all extensions, not just the first one
    for ext in extensions:
        file_paths = list(Path(repo_path).rglob(f"*.{ext}"))
        print(f"Found {len(file_paths)} .{ext} files")
        
        for path in file_paths:
            try:
                with open(path, 'r', encoding='utf-8', errors='ignore') as f:
                    content = f.read()
                    if content.strip():  # Only add non-empty files
                        texts.append((str(path), content, ext))
            except Exception as e:
                print(f"Error reading {path}: {e}")
                continue
    
    return texts

docs = extract_texts("../bluexp-dataset")
print(f"Extracted {len(docs)} files")

# Export extracted docs to a JSON file
try:
    with open('cell_2_extracted_docs.json', 'w', encoding='utf-8') as f:
        json.dump(docs, f, indent=2, ensure_ascii=False)
    print(f"✅ Extracted docs exported to 'extracted_docs.json'")
except Exception as e:
    print(f"❌ Error exporting extracted docs: {e}")

print("\nEach inner array contains three elements:\n"
    "1. The file path (e.g., '../bluexp-dataset/legal-notices.adoc').\n"
    "2. The content of the file (a long string).\n"
    "3. The file type (e.g., 'adoc').\n"
    "Thus, each JSON array contains 3 elements.")




Found 158 .adoc files
Found 16 .json files
Found 1 .yaml files
Found 18 .yml files
Extracted 193 files
✅ Extracted docs exported to 'extracted_docs.json'

Each inner array contains three elements:
1. The file path (e.g., '../bluexp-dataset/legal-notices.adoc').
2. The content of the file (a long string).
3. The file type (e.g., 'adoc').
Thus, each JSON array contains 3 elements.


## Cleaning and Preparation


In [12]:
##Cell 3

import re
import unicodedata
from typing import List, Tuple

def clean_asciidoc_content(content: str) -> str:
    """
    Comprehensive cleaning function for AsciiDoc content
    """
    # Remove AsciiDoc metadata and frontmatter
    content = re.sub(r'^---[\s\S]*?---\n?', '', content, flags=re.MULTILINE)
    
    # Remove AsciiDoc directives and attributes
    content = re.sub(r'^\s*:[^:\n]+:.*$', '', content, flags=re.MULTILINE)  # :attribute: value
    content = re.sub(r'^\s*\[.*?\]\s*$', '', content, flags=re.MULTILINE)    # [attribute]
    
    # Remove AsciiDoc include directives
    content = re.sub(r'include::[^\[\]]*\[.*?\]', '', content)
    
    # Remove AsciiDoc cross-references and anchors
    content = re.sub(r'<<[^>]*>>', '', content)
    content = re.sub(r'\[\[.*?\]\]', '', content)
    content = re.sub(r'anchor:[^\[\]]*\[.*?\]', '', content)
    
    # Remove AsciiDoc links but keep the text
    content = re.sub(r'https?://[^\s\[\]]+\[([^\]]*)\]', r'\1', content)  # link[text] -> text
    content = re.sub(r'link:[^\[\]]+\[([^\]]*)\]', r'\1', content)         # link:url[text] -> text
    
    # Remove or replace AsciiDoc formatting
    content = re.sub(r'\*\*([^*]+)\*\*', r'\1', content)  # **bold** -> text
    content = re.sub(r'\*([^*]+)\*', r'\1', content)      # *emphasis* -> text
    content = re.sub(r'`([^`]+)`', r'\1', content)        # `code` -> code
    content = re.sub(r'_([^_]+)_', r'\1', content)        # _italic_ -> text
    
    # Remove AsciiDoc headers but keep the text
    content = re.sub(r'^=+\s*(.+)$', r'\1', content, flags=re.MULTILINE)
    
    # Remove AsciiDoc comments
    content = re.sub(r'^//.*$', '', content, flags=re.MULTILINE)
    content = re.sub(r'////[\s\S]*?////', '', content, flags=re.MULTILINE)
    
    # Remove AsciiDoc tables and lists formatting
    content = re.sub(r'^\|.*$', '', content, flags=re.MULTILINE)  # Table rows
    content = re.sub(r'^\s*\*+\s+', '', content, flags=re.MULTILINE)  # Bullet lists
    content = re.sub(r'^\s*\.+\s+', '', content, flags=re.MULTILINE)  # Numbered lists
    
    # Remove AsciiDoc blocks
    content = re.sub(r'^-{4,}$.*?^-{4,}$', '', content, flags=re.MULTILINE | re.DOTALL)
    content = re.sub(r'^={4,}$.*?^={4,}$', '', content, flags=re.MULTILINE | re.DOTALL)
    content = re.sub(r'^\+{4,}$.*?^\+{4,}$', '', content, flags=re.MULTILINE | re.DOTALL)
    
    return content

def clean_general_text(text: str) -> str:
    """
    General text cleaning for any content type
    """
    # Normalize unicode characters
    text = unicodedata.normalize('NFKD', text)
    
    # Remove or replace special characters
    text = re.sub(r'[^\w\s\.\,\!\?\-\(\)\:\;]', ' ', text)
    
    # Clean up whitespace
    text = re.sub(r'\s+', ' ', text)  # Multiple spaces to single space
    text = re.sub(r'\n+', ' ', text)  # Multiple newlines to space
    text = re.sub(r'\t+', ' ', text)  # Tabs to space
    
    # Remove excessive punctuation
    text = re.sub(r'[\.]{3,}', '...', text)  # Multiple dots to ellipsis
    text = re.sub(r'[!]{2,}', '!', text)     # Multiple exclamations
    text = re.sub(r'[\?]{2,}', '?', text)    # Multiple questions
    
    # Remove standalone punctuation and short meaningless sequences
    text = re.sub(r'\b[^\w\s]{1,3}\b', ' ', text)
    
    # Clean up sentence structure
    text = re.sub(r'\s+([\.!?])', r'\1', text)  # Remove space before punctuation
    text = re.sub(r'([\.!?])\s*([a-z])', r'\1 \2', text)  # Ensure space after sentence end

    return text.strip()

cleaned_training_docs = []


for file_path, content, file_type in docs:
    print(f"Processing: {file_path}")
    
    # Step 1: Clean AsciiDoc-specific content if applicable
    if file_type == 'adoc':
        cleaned_content = clean_asciidoc_content(content)
    else:
        cleaned_content = content
    
    # Step 2: General text cleaning
    cleaned_content = clean_general_text(cleaned_content)

    # Append the cleaned content and file type to the training_texts list
    cleaned_training_docs.append((file_path, cleaned_content, file_type))

# Export cleaned training texts to a JSON file
output_file = 'cell_3_cleaned_training_docs.json'
try:
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(cleaned_training_docs, f, indent=2, ensure_ascii=False)
    print(f"✅ Cleaned training texts exported to '{output_file}'")
except Exception as e:
    print(f"❌ Error exporting cleaned training texts: {e}")



Processing: ../bluexp-dataset/legal-notices.adoc
Processing: ../bluexp-dataset/platform/aa_concepts.adoc
Processing: ../bluexp-dataset/platform/create_user_token.adoc
Processing: ../bluexp-dataset/platform/http_details.adoc
Processing: ../bluexp-dataset/platform/get_identifiers.adoc
Processing: ../bluexp-dataset/platform/workflows_tasks.adoc
Processing: ../bluexp-dataset/platform/user_access_tokens.adoc
Processing: ../bluexp-dataset/platform/concepts.adoc
Processing: ../bluexp-dataset/platform/overview.adoc
Processing: ../bluexp-dataset/platform/register_service.adoc
Processing: ../bluexp-dataset/platform/api_explorer.adoc
Processing: ../bluexp-dataset/platform/get_nss_key.adoc
Processing: ../bluexp-dataset/platform/create_service_token.adoc
Processing: ../bluexp-dataset/platform/additional_considerations.adoc
Processing: ../bluexp-dataset/platform/use_rest_apis.adoc
Processing: ../bluexp-dataset/platform/connectors_clients.adoc
Processing: ../bluexp-dataset/platform/grant_types.adoc
P

## Process cleaned docs with unstructured library

In [13]:
##Cell 4
import json

def categorize_content(text: str) -> str:
    """
    Categorize content type for better training data organization
    """
    text_lower = text.lower()
    
    # API documentation
    if any(keyword in text_lower for keyword in ['api', 'endpoint', 'request', 'response', 'json', 'curl']):
        return 'api'
    
    # Installation/Setup
    if any(keyword in text_lower for keyword in ['install', 'setup', 'configure', 'requirement', 'prerequisite']):
        return 'installation'
    
    # Tutorial/How-to
    if any(keyword in text_lower for keyword in ['tutorial', 'how to', 'step by step', 'guide', 'walkthrough']):
        return 'tutorial'
    
    # Configuration
    if any(keyword in text_lower for keyword in ['config', 'setting', 'parameter', 'option', 'property']):
        return 'configuration'
    
    # Troubleshooting
    if any(keyword in text_lower for keyword in ['error', 'troubleshoot', 'problem', 'issue', 'debug']):
        return 'troubleshooting'
    
    # Reference
    if any(keyword in text_lower for keyword in ['reference', 'spec', 'documentation', 'manual']):
        return 'reference'
    
    return 'general'



def process_partition_text(docs):
    all_elements = []

    for i, doc in enumerate(docs):
        file_path = doc[0]

        chunk_words = set(re.findall(r'\b[a-zA-Z]{2,}\b', doc[1].lower()))

        try:
            print(f"Processing file: {file_path} and lenght is {len(doc[1])}")
            if len(doc[1]) > 25:
                # Use partition_text for direct content processing
                elements = partition_text(filename=file_path)
                element_dict = [el.to_dict() for el in elements]
                # Add file metadata to each element
                for element in element_dict:
                    #print(f"Partitioned element {element}")
                    chunk_words = set(re.findall(r'\b[a-zA-Z]{2,}\b', element['text']))

                    if 'metadata' not in element:
                        element['metadata'] = {}
                    element['metadata']['category'] = categorize_content(doc[1])
                    element['metadata']['unique_words_count'] = len(element['text'])
                    element['metadata']['unique_words'] = list(chunk_words)
                    element['metadata']['char_count'] = len(element)
                    element['metadata']['file_size'] = len(doc[1].encode('utf-8'))
                    element['metadata']['original_file_path'] = file_path
                
                all_elements.extend(element_dict)
                print(f"Extracted {len(element_dict)} elements from {doc[0]} and lenght is {len(doc[1])}")
        except ValueError as e:
            print(f"Error processing {doc[0]}: {e}")
        except Exception as e:
            print(f"Unexpected error: {e}")

    return all_elements

doc_element_results = process_partition_text(cleaned_training_docs)

Processing file: ../bluexp-dataset/legal-notices.adoc and lenght is 13
Processing file: ../bluexp-dataset/platform/aa_concepts.adoc and lenght is 2873
Extracted 21 elements from ../bluexp-dataset/platform/aa_concepts.adoc and lenght is 2873
Processing file: ../bluexp-dataset/platform/create_user_token.adoc and lenght is 3287
Extracted 124 elements from ../bluexp-dataset/platform/create_user_token.adoc and lenght is 3287
Processing file: ../bluexp-dataset/platform/http_details.adoc and lenght is 749
Extracted 24 elements from ../bluexp-dataset/platform/http_details.adoc and lenght is 749
Processing file: ../bluexp-dataset/platform/get_identifiers.adoc and lenght is 2194
Extracted 47 elements from ../bluexp-dataset/platform/get_identifiers.adoc and lenght is 2194
Processing file: ../bluexp-dataset/platform/workflows_tasks.adoc and lenght is 3921
Extracted 42 elements from ../bluexp-dataset/platform/workflows_tasks.adoc and lenght is 3921
Processing file: ../bluexp-dataset/platform/user_a

In [14]:
# Export cleaned_docs to JSON after unstructured processing
try:
    with open('cell_4_cleaned_unstructured_doc.json', 'w', encoding='utf-8') as f:
        json.dump(doc_element_results, f, indent=2, ensure_ascii=False)
    
    print(f"   ✅ Cleaned docs exported to 'cell_4_cleaned_unstructured_doc.json'")
    print(f"   📄 File size: {Path('cell_4_cleaned_unstructured_doc.json').stat().st_size / 1024:.1f} KB")
    print(f"   📊 Contains {len(doc_element_results)} cleaned documents")
    
except Exception as e:
    print(f"   ❌ Error exporting cleaned_docs: {e}")

   ✅ Cleaned docs exported to 'cell_4_cleaned_unstructured_doc.json'
   📄 File size: 55693.0 KB
   📊 Contains 86054 cleaned documents
