## Goal

- Here we inline the documentation into pages that only reference another page (through a `source:` entry in their front matter) for their content. <br />
For example: extractedData/v13.5.0/app/api-reference/components/index.mdx

- We also remove the `{/* ... */}` comments present in each file, and drop the parts of the documentation that belong to the other router (the `<AppOnly>` / `<PagesOnly>` blocks). <br />
For example: extractedData/v13.5.0/app/building-your-application/index.mdx

In [1]:
import os
import re

In [4]:
# Root folder the cleaned documentation is written to.
output_dir = "../docs"
# Root folder of the raw documentation. It must be a directory in the current
# folder and must not contain any os.path.sep symbols: paths are split on the
# separator and searched for this exact segment.
extracted_data = "extractedData"

# makedirs(exist_ok=True) replaces the check-then-create pair: it is free of
# the race between the check and the mkdir, and also creates missing parents.
os.makedirs(output_dir, exist_ok=True)


def getOutputPath(path):
    """Map a path inside the extracted-data tree to its mirror in the output tree.

    Only the first occurrence of ``extracted_data + os.path.sep`` is rewritten,
    so a nested folder that happens to share the root's name is left untouched.
    A path that does not contain that segment is returned unchanged.
    """
    return path.replace(
        extracted_data + os.path.sep,
        output_dir + os.path.sep,
        1,
    )


def getRouter(path):
    """Return the router folder ("app" or "pages") that a path belongs to.

    The expected layout is ``<extracted_data>/<version>/<router>/...``.

    Returns:
        "app" or "pages" when the segment two levels below the root is one of
        them; None when it is something else or when the path is too short to
        have such a segment. When the root folder is not part of the path at
        all, the first path segment is returned as-is.
    """
    path_dirs = path.split(os.path.sep)
    if extracted_data not in path_dirs:
        return path_dirs[0]

    # +1 is the version folder, +2 is the first folder inside that version.
    router_idx = path_dirs.index(extracted_data) + 2
    if router_idx >= len(path_dirs):
        # Path stops at the root or version folder: there is no router segment.
        return None

    candidate = path_dirs[router_idx]
    return candidate if candidate in ("app", "pages") else None


def getPathFromSource(curr_path, source_path):
    """Resolve the ``source`` value of a document to a file path on disk.

    The source is interpreted relative to the folder that contains the router
    folder of ``curr_path`` (i.e. everything before the segment returned by
    ``getRouter``).

    Args:
        curr_path: Path of the document that declares the source.
        source_path: The raw source value, optionally wrapped in single or
            double quotes, using ``os.path.sep`` as separator.

    Returns:
        The ``index.mdx`` inside the source when the source is a directory
        containing one; otherwise the source path with a ``.mdx`` extension
        (appended when missing). The returned file is not guaranteed to exist.

    Raises:
        ValueError: If no router folder can be located in ``curr_path``.
    """
    # Front matter values may be quoted with either quote style.
    if (
        len(source_path) >= 2
        and source_path[0] == source_path[-1]
        and source_path[0] in "'\""
    ):
        source_path = source_path[1:-1]

    router = getRouter(curr_path)
    curr_path_folders = curr_path.split(os.path.sep)
    if router is None or router not in curr_path_folders:
        raise ValueError(
            f"Cannot resolve source {source_path!r}: no router folder in {curr_path!r}"
        )

    base_path = os.path.sep.join(curr_path_folders[:curr_path_folders.index(router)])
    # An empty base means the router folder is the first segment; joining with
    # an empty prefix would wrongly produce a path rooted at the filesystem root.
    if base_path:
        abs_source_path = os.path.sep.join([base_path, source_path])
    else:
        abs_source_path = source_path

    if os.path.isdir(abs_source_path):
        index_file = os.path.join(abs_source_path, "index.mdx")
        if os.path.isfile(index_file):
            return index_file
        # A directory without index.mdx can never be opened as a document, so
        # fall through and try the sibling "<directory>.mdx" file instead.

    if not abs_source_path.endswith(".mdx"):
        return abs_source_path + ".mdx"

    return abs_source_path


def filterRouterContent(content, router):
    """Keep only the parts of ``content`` that apply to the given router.

    Blocks wrapped in the other router's tag are removed together with their
    content; the current router's opening and closing tags are stripped while
    their content is kept.

    Args:
        content: Document text that may contain ``<AppOnly>...</AppOnly>`` and
            ``<PagesOnly>...</PagesOnly>`` blocks.
        router: "app" selects the AppOnly blocks; any other value selects the
            PagesOnly blocks.

    Returns:
        The filtered text, with every run of three or more newlines collapsed
        to a single blank line.
    """
    if router == "app":
        keep, drop = "AppOnly", "PagesOnly"
    else:
        keep, drop = "PagesOnly", "AppOnly"

    # Remove the content of the other router (non-greedy, spanning newlines).
    content = re.sub(fr'<{drop}>.*?</{drop}>', '', content, flags=re.DOTALL)

    # Remove opening and closing tags of the current router, keeping the body.
    content = content.replace(f'<{keep}>', '').replace(f'</{keep}>', '')

    # Collapse to exactly one blank line: collapsing to a single newline would
    # glue the surrounding paragraphs together and change how they render.
    return re.sub(r'\n{3,}', '\n\n', content)


def processFile(path):
    """Read a document, inline its referenced source, and clean up the text.

    Steps: read the file, and if its front matter has a ``source:`` entry,
    replace the content with that of the referenced file; skip documents that
    consist of front matter only; strip ``{/* ... */}`` comments; collapse
    runs of blank lines; finally keep only the content of the document's own
    router.

    Args:
        path: Path of the document to process.

    Returns:
        The cleaned text, or None when the document (after source resolution)
        contains nothing but its front matter.

    Raises:
        ValueError: If the document has no closing ``---`` front-matter line.
        OSError: If ``path`` cannot be read.
    """
    with open(path, 'r', encoding='utf-8') as fp:
        content = fp.read()

    # Front matter is taken as everything between the first line and the next
    # "---" line (assumes the document starts with a front-matter block).
    lines = content.strip().split("\n")
    metadata_lines = lines[1:lines.index("---", 1)]
    source_line = next((s for s in metadata_lines if s.startswith("source:")), None)

    # If there is a source mentioned, use the referenced document instead.
    if source_line is not None:
        source_path = source_line.split(":")[-1].strip().replace("/", os.path.sep)
        try:
            source_path = getPathFromSource(path, source_path)
            with open(source_path, 'r', encoding='utf-8') as fp:
                source_content = fp.read()

            source_lines = source_content.strip().split("\n")
            source_metadata = source_lines[1:source_lines.index("---", 1)]
        except Exception as e:
            # Best effort: keep working with the original document, but say
            # which file the failure belongs to.
            print(f"Could not load source for {path}: {e}")
        else:
            # Swap everything at once so content, lines and metadata can never
            # describe two different documents after a partial failure.
            content = source_content
            lines = source_lines
            metadata_lines = source_metadata

    # Only the two "---" delimiters plus metadata: nothing worth indexing.
    if len(metadata_lines) + 2 == len(lines):
        return None

    # Get rid of comments
    content = re.sub(r'{\/\*.*?\*\/}', '', content, flags=re.DOTALL)
    # Collapse 3 or more consecutive newlines to one blank line; a single
    # newline would merge the neighbouring paragraphs.
    content = re.sub(r'\n{3,}', '\n\n', content)
    content = content.strip()

    router = getRouter(path)
    if router is not None:
        content = filterRouterContent(content, router)

    return content


def traverseFolders(curr_path, indent=0):
    """Recursively process every file under ``curr_path`` into the output tree.

    Each sub-folder is mirrored at the location given by ``getOutputPath`` and
    each file is passed through ``processFile``; files for which it returns
    None are not written. A failure on one file is reported and does not stop
    the traversal.

    Args:
        curr_path: Folder to walk.
        indent: Current recursion depth; only forwarded to nested calls.
    """
    for category in os.listdir(curr_path):
        path = os.path.join(curr_path, category)
        output_path = getOutputPath(path)

        if os.path.isdir(path):
            # makedirs also creates missing parents and tolerates an existing
            # folder, so no separate existence check is needed.
            os.makedirs(output_path, exist_ok=True)
            traverseFolders(path, indent + 1)
            continue

        try:
            content = processFile(path)
            if content is not None:
                with open(output_path, 'w', encoding='utf-8') as fp:
                    fp.write(content)
        except Exception as e:
            print(f"Failed for file: {path}")
            print(f"Error: {e}")

In [5]:
for version in os.listdir(extracted_data):
    version_path = os.path.join(extracted_data, version)
    # Skip stray files in the root: only folders can be traversed, and a file
    # here would otherwise get an output directory and then crash os.listdir.
    if not os.path.isdir(version_path):
        continue

    os.makedirs(os.path.join(output_dir, version), exist_ok=True)

    # Process files and folders inside this version.
    traverseFolders(version_path)