In [26]:
# Warning control
# Installs a blanket "ignore" filter, so every Python warning raised after this
# cell runs is suppressed for the rest of the kernel session.
# NOTE(review): this also hides warnings that may matter (e.g. deprecations);
# consider narrowing the filter to specific categories or modules.
import warnings
warnings.filterwarnings('ignore')

In [32]:
import json
from pathlib import Path

from IPython.display import clear_output
from IPython.display import display
!pip install nltk

import nltk
from nltk.corpus import wordnet as wn
nltk.download('wordnet')
nltk.download('omw-1.4')

clear_output()


In [28]:
# Cell 2 Extract documentation texts
def extract_texts(repo_path, extensions=("adoc", "json", "yaml", "yml")):
    """
    Collect the text of every non-empty file under ``repo_path`` whose name
    ends with one of the given extensions.

    Args:
        repo_path (str | os.PathLike): Directory that is searched recursively.
        extensions (str | Iterable[str]): Extensions to collect, with or
            without a leading dot. A single string is treated as one
            extension rather than being iterated character by character.

    Returns:
        list[tuple[str, str, str]]: One ``(path, content, extension)`` tuple
        per non-empty file, grouped by extension in the order given and
        sorted by path within each group so repeated runs produce the same
        ordering.

    Side effects:
        Prints the number of files found per extension and one line for each
        file that could not be read (such files are skipped).
    """
    # A bare string would otherwise be looped over one character at a time.
    if isinstance(extensions, str):
        extensions = (extensions,)

    root = Path(repo_path)
    texts = []

    for ext in extensions:
        ext = ext.lstrip(".")  # accept ".adoc" as well as "adoc"
        # rglob yields in filesystem order; sort for reproducible output.
        file_paths = sorted(root.rglob(f"*.{ext}"))
        print(f"Found {len(file_paths)} .{ext} files")

        for path in file_paths:
            try:
                content = path.read_text(encoding='utf-8', errors='ignore')
            except Exception as e:
                # Best effort: report the unreadable entry and keep going.
                print(f"Error reading {path}: {e}")
                continue
            if content.strip():  # Only add non-empty files
                texts.append((str(path), content, ext))

    return texts

def extract_adoc_texts(repo_path):
    """
    Extract texts specifically from .adoc files in the given repository path.

    This is a thin convenience wrapper around ``extract_texts`` restricted to
    the ``adoc`` extension, so both functions share one implementation of the
    directory walk, the file reading and the empty-file filter.

    Args:
        repo_path (str | os.PathLike): Directory that is searched recursively.

    Returns:
        list[tuple[str, str, str]]: ``(path, content, 'adoc')`` tuples for
        every non-empty ``.adoc`` file found.
    """
    return extract_texts(repo_path, extensions=("adoc",))

def docs_to_json(docs, filename):
    """
    Export extracted docs to a JSON file.

    Args:
        docs: Any JSON-serializable object (here: the list of extracted
            documents or metadata dicts).
        filename (str | os.PathLike): Path of the file to write. It is used
            exactly as given; no extension is appended.

    Side effects:
        Writes ``filename`` (UTF-8, 2-space indent, non-ASCII kept as-is) and
        prints a success or failure message. Errors are reported, not raised.
    """
    try:
        # Serialize before opening the target so that a serialization error
        # cannot leave a truncated or half-written file behind.
        payload = json.dumps(docs, indent=2, ensure_ascii=False)
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(payload)
    except Exception as e:
        print(f"❌ Error exporting extracted docs to '{filename}': {e}")
    else:
        # Report the path that was actually written (previously '.json' was
        # appended to the message only, producing names like 'x.json.json').
        print(f"✅ Extracted docs exported to '{filename}'")

# Extract every supported file type from the dataset directory.
dataset_dir = "../data/bluexp-dataset"
docs = extract_texts(dataset_dir)
print(f"Extracted {len(docs)} files")

# Only the .adoc subset is written to disk here; exporting the complete
# `docs` list is intentionally left out.
adoc_docs = extract_adoc_texts(dataset_dir)
docs_to_json(adoc_docs, "cell_2_extracted_adoc_docs")



Found 158 .adoc files
Found 16 .json files
Found 1 .yaml files
Found 18 .yml files
Extracted 193 files
Found 158 .adoc files
✅ Extracted docs exported to 'cell_2_extracted_adoc_docs.json'


## Cleaning and Preparation


In [38]:
# Cell 3 metadata 

import os
from datetime import datetime
import re


# Extract section headings
def extract_section_headings(content):
    """
    Collect the AsciiDoc section headings found in ``content``.

    Each line is stripped of surrounding whitespace and then treated as a
    heading when it starts with two or more '=' characters followed by
    whitespace. A single '=' (the document title) is not matched.

    The reported level is the raw number of '=' characters, not the AsciiDoc
    section level:

        == Section A              -> "Level 2: Section A"
        === Subsection A.1        -> "Level 3: Subsection A.1"
        ==== Sub-subsection A.1.1 -> "Level 4: Sub-subsection A.1.1"

    Args:
        content (str): The text to scan.

    Returns:
        list: Strings of the form "Level {level}: {title}", in document
              order, where {level} is the count of '=' characters and
              {title} is the stripped heading text.
    """
    heading_pattern = re.compile(r"^(==+)\s+(.*)")
    candidates = (
        heading_pattern.match(raw_line.strip())
        for raw_line in content.splitlines()
    )
    return [
        f"Level {len(found.group(1))}: {found.group(2).strip()}"
        for found in candidates
        if found
    ]

def get_last_modified_datetime(file_path):
    """
    Return the modification time of ``file_path`` as a ``datetime``.

    The timestamp from ``os.path.getmtime`` is converted with
    ``datetime.fromtimestamp``. If the file does not exist, or any other
    error occurs, a message is printed and ``None`` is returned instead of
    raising.
    """
    try:
        mtime = os.path.getmtime(file_path)
        modified_at = datetime.fromtimestamp(mtime)
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        modified_at = None
    except Exception as exc:
        print(f"Error retrieving last modified datetime for {file_path}: {exc}")
        modified_at = None
    return modified_at

# Synonym expansion using WordNet
def get_synonyms(word, max_synonyms=3):
    """
    Look up synonyms for ``word`` via the WordNet interface bound to ``wn``.

    Lemma names from every synset of ``word`` are visited in the order
    returned; underscores are turned into spaces and names are lowercased.
    A candidate is kept only if it differs from ``word`` (case-insensitively),
    has not been seen before, and is longer than two characters.

    Args:
        word (str): Term to look up.
        max_synonyms (int): Maximum number of synonyms to return. Defaults
            to 3; a value of zero or less yields an empty list.

    Returns:
        list[str]: Up to ``max_synonyms`` unique synonyms, in discovery order.
    """
    if max_synonyms <= 0:
        return []

    target = word.lower()  # hoisted: does not change inside the loops
    synonyms = []
    seen = set()
    for syn in wn.synsets(word):
        for lemma in syn.lemmas():
            synonym = lemma.name().replace('_', ' ').lower()
            if synonym == target or synonym in seen or len(synonym) <= 2:
                continue
            seen.add(synonym)
            synonyms.append(synonym)
            if len(synonyms) >= max_synonyms:
                return synonyms
    return synonyms


def generate_synonym_data(title, keywords, section_headings):
    """
    Derive extra search terms from a document's title, keywords and headings
    using a small table of substring rules.

    Sources, as intended by the original design:
        keywords: main base
        title: most direct paraphrasable noun-verb pair
        section_headings: reinforce and diversify search terms

    Args:
        title (str | None): Document title.
        keywords (Iterable[str] | None): Document keywords.
        section_headings (Iterable[str] | None): Section heading strings.

    Returns:
        list[str]: The unique replacement terms whose trigger occurs
        (case-insensitively) in any input term, sorted alphabetically so the
        result is identical across runs.
    """
    # (trigger substring, term to add) -- naive rules; extend as needed.
    rules = (
        ("create", "generate"),
        ("token", "access key"),
    )

    # Unpacking accepts any iterable (list, tuple, set) and tolerates None,
    # unlike list concatenation.
    base_terms = {*(keywords or ()), title or "", *(section_headings or ())}

    enriched = set()
    for term in base_terms:
        lowered = term.lower()
        for trigger, replacement in rules:
            if trigger in lowered:
                enriched.add(replacement)

    # Sets iterate in hash order, which varies between interpreter runs.
    return sorted(enriched)


def _find_field_value(lines, key):
    """Return the stripped text after the first ``key:`` marker in ``lines``, or None if absent."""
    marker = f"{key}:"
    for line in lines:
        _, found, value = line.partition(marker)
        if found:
            return value.strip()
    return None


def _collect_base_terms(title, keywords, section_headings):
    """Gather lowercase candidate terms (longer than two characters, minus a few stop words)."""
    conjunctions = {"and", "or", "so", "for"}

    def keep(term):
        return len(term) > 2 and term not in conjunctions

    terms = {word.lower() for word in title.split() if keep(word.lower())}
    terms.update(kw.lower() for kw in keywords if keep(kw.lower()))
    for heading in section_headings:
        # Headings arrive formatted as "Level N: Title"; drop that prefix so
        # the artificial word "level" does not end up as a search term.
        heading_text = re.sub(r"^Level \d+:\s*", "", heading)
        terms.update(
            word for word in re.findall(r"\b\w+\b", heading_text.lower()) if keep(word)
        )
    return terms


def extract_metedata_from_docs(docs):
    """
    Extract metadata from the documentation texts.

    Args:
        docs (Iterable[tuple[str, str, str]]): ``(path, content, format)``
            entries as produced by ``extract_texts``.

    Returns:
        list[dict]: One dict per entry with the keys
            file_path, format   -- copied from the entry
            title               -- first line starting with "= ", prefix removed
            uuid, summary       -- text after the first "uuid:" / "summary:"
                                   marker, or None when no line contains it
            keywords            -- comma-separated values after "keywords:",
                                   stripped, empty items dropped ([] if absent)
            last_modified       -- ISO-8601 mtime of the file, or None
            section_headings    -- result of ``extract_section_headings``
            synonym_data        -- {term: synonyms} for each purely alphabetic
                                   term taken from title, keywords and headings
    """
    metadata_list = []
    # Terms recur across documents; look each one up only once.
    synonym_cache = {}

    for path, content, fmt in docs:
        lines = content.splitlines()

        # Remove only the leading "= " marker; a plain str.replace would also
        # delete any later "= " inside the title text.
        title_line = next((line for line in lines if line.startswith("= ")), "")
        title = title_line[2:].strip()

        keywords_value = _find_field_value(lines, "keywords")
        if keywords_value is None:
            keywords = []
        else:
            keywords = [kw.strip() for kw in keywords_value.split(",") if kw.strip()]

        # Convert datetime to string so the result stays JSON-serializable.
        last_modified = get_last_modified_datetime(path)
        section_headings = extract_section_headings(content)

        metadata = {
            "file_path": path,
            "format": fmt,
            "title": title,
            "uuid": _find_field_value(lines, "uuid"),
            "summary": _find_field_value(lines, "summary"),
            "keywords": keywords,
            "last_modified": last_modified.isoformat() if last_modified else None,
            "section_headings": section_headings,
        }

        # Sorted iteration keeps the key order stable between runs.
        synonym_data = {}
        for term in sorted(_collect_base_terms(title, keywords, section_headings)):
            if not term.isalpha():
                continue
            if term not in synonym_cache:
                synonym_cache[term] = get_synonyms(term)
            # Copy so documents never share (and accidentally mutate) one list.
            synonym_data[term] = list(synonym_cache[term])
        metadata["synonym_data"] = synonym_data

        metadata_list.append(metadata)

    return metadata_list
# Use the existing 'docs' variable from Cell 2
# `docs2` is only an alias: it refers to the same list object as `docs`, not a copy.
docs2 = docs


# Build one metadata dict per extracted document and export the list as JSON.
# docs_to_json opens `filename` exactly as given, so the ".json" extension is
# already part of the name passed here.
metadata_results = extract_metedata_from_docs(docs2)
docs_to_json(metadata_results, "cell_3_extracted_metadata.json")

✅ Extracted docs exported to 'cell_3_extracted_metadata.json.json'
