## Goal

- Here we will bring documentation to pages that reference other pages for documentation. <br />
For example: extractedData/v13.5.0/app/api-reference/components/index.mdx

- We will also remove comments present in the file and remove the irrelevant documentation(<>) part as well. <br />
For example: extractedData/v13.5.0/app/building-your-application/index.mdx

In [2]:
import os
import re
import yaml
import pandas as pd
from typing import List

In [25]:
output_dir = "./processedDocs"
# must be a directory in current folder and should not contain any os.path.sep symbols.
extracted_data = "extractedData"
staging_data = "stagingData"

chunk_count = 0
# Keeps track of all the metadata_tags across all files
metadata_tags = set()

if not os.path.exists(output_dir):
    os.mkdir(output_dir)

if not os.path.exists(staging_data):
    os.mkdir(staging_data)

In [4]:
def chunk_markdown(content: str) -> List[str]:
    heading_pattern = r"^(#{1,6})\s+(.+)$"
    
    def get_chunks(text, max_size, headings_stack):
        """Recursive function to chunk the markdown based on heading levels."""
        chunks = []
        lines = text.splitlines()
        buffer = []
        current_heading = ""
        in_code_block = False

        for line in lines:
            if line.strip().startswith("```"):
                in_code_block = not in_code_block

            if not in_code_block:
                heading_match = re.match(heading_pattern, line)
                if heading_match:
                    if buffer:
                        # Process the current buffer as a chunk
                        chunks += split_buffer(buffer, max_size, headings_stack)
                        buffer = []

                    current_heading = heading_match.group(0)  # Full heading with level
                    current_heading_level = len(heading_match.group(1))

                    # Only adjust the headings stack up to the current heading level
                    while (
                        len(headings_stack) > 0 and 
                        current_heading_level <= len(re.match(heading_pattern, headings_stack[-1]).group(1))
                    ):
                        headings_stack = headings_stack[:-1]

                    headings_stack.append(current_heading)
                    continue

            buffer.append(line)

        if buffer:
            # Process remaining buffer
            chunks += split_buffer(buffer, max_size, headings_stack)

        return chunks

    def split_buffer(buffer, max_size, headings_stack):
        """Split buffer content into chunks if it exceeds the size."""
        chunks = []
        content = "\n".join(buffer)
        words = content.split()

        if len(words) <= max_size:
            chunk_content = "\n".join(headings_stack).strip() + "\n" + content
            # Skip chunks that are just headings
            if not all(re.match(heading_pattern, line) for line in content.splitlines()):
                chunks.append(chunk_content)
        else:
            current_chunk = []
            word_count = 0

            for word in words:
                current_chunk.append(word)
                word_count += 1
                if word_count >= max_size:
                    # Ensure we don't split in the middle of a code block or list
                    joined_chunk = " ".join(current_chunk)
                    if re.search(r"```", joined_chunk) and joined_chunk.count("```") % 2 != 0:
                        continue
                    if re.search(r"\n\s*[-*]\s", joined_chunk):
                        continue

                    chunk_content = "\n".join(headings_stack).strip() + "\n" + " ".join(current_chunk)
                    chunks.append(chunk_content)
                    current_chunk = []
                    word_count = 0

            if current_chunk:
                chunk_content = "\n".join(headings_stack).strip() + "\n" + " ".join(current_chunk)
                chunks.append(chunk_content)

        return chunks

    if not content.strip():
        return []

    return get_chunks(content, 385, [])


In [38]:
def getOutputPath(path):
    return path.replace(
        extracted_data + os.path.sep, 
        output_dir + os.path.sep
    )


def getRouter(path):
    path_dirs = path.split(os.path.sep)
    if extracted_data in path_dirs:
        root_idx = path_dirs.index(extracted_data)
        # +1 is version and +2 is the first folder inside the root.
        return None if path_dirs[root_idx + 2] not in ["app", "pages"] else path_dirs[root_idx + 2]
    
    return path_dirs[0]


def getPathFromSource(curr_path, source_path):
    if source_path.startswith("'") and source_path.endswith("'"):
        source_path = source_path[1:-1]

    curr_path_folders = curr_path.split(os.path.sep)
    base_path = os.path.sep.join(curr_path_folders[:curr_path_folders.index(getRouter(curr_path))])
    abs_source_path = os.path.sep.join([base_path, source_path])

    if os.path.isdir(abs_source_path):
        abs_source_file = os.path.join(abs_source_path, "index.mdx")
        if not os.path.isdir(abs_source_file) and os.path.exists(abs_source_file):
            return abs_source_file
    elif not abs_source_path.endswith(".mdx"):
        return abs_source_path + ".mdx"

    return abs_source_path


def filterRouterContent(content, router):
    # Get the tag for current router.
    tag = "<AppOnly>" if router == "app" else "<PagesOnly>"
    remove_tag = "<AppOnly>" if tag == "<PagesOnly>" else "<PagesOnly>"

    remove_pattern = fr'{remove_tag}(.*?)<\/{remove_tag[1:]}'

    # Remove the content of other router.
    content = re.sub(remove_pattern, '', content, flags=re.DOTALL)

    # Remove opening and closing tags of the current router.
    content = re.sub(fr'{tag}', '', content, flags=re.DOTALL)
    content = re.sub(fr'<\/{tag[1:]}', '', content, flags=re.DOTALL)

    content = re.sub(r'\n{3,}', '\n', content, flags=re.DOTALL)

    return content


def processFile(path):
    global metadata_tags
    content = ""
    with open(path, 'r', encoding='utf-8') as fp:
        content = fp.read()

    # See if there is a source mentioned for this document.
    lines = content.strip().split("\n")
    metadata_lines = lines[1:lines.index("---", 1)]
    source_index = next((i for i, s in enumerate(metadata_lines) if s.startswith("source:")), None)

    # If there is a source mentioned.
    if source_index is not None:
        source_path = metadata_lines[source_index].split(":")[-1].strip().replace("/", os.path.sep)
        try:
            source_path = getPathFromSource(path, source_path)
            with open(source_path, 'r', encoding='utf-8') as fp:
                content = fp.read()
            
            lines = content.strip().split("\n")
            metadata_lines = lines[1:lines.index("---", 1)]

        except Exception as e:
            print(e)

    metadata_dict = yaml.safe_load("\n".join(metadata_lines))
    metadata_tags = metadata_tags.union(set(metadata_dict.keys()))

    # Remove metadata tag we have everything we need.
    content = "\n".join(content.split("\n")[len(metadata_lines) + 2:])

    # Get rid of comments
    content = re.sub(r'{\/\*.*?\*\/}', '', content, flags=re.DOTALL)
    # Get rid of 3 or more consecutive newline characters.
    content = re.sub(r'\n{3,}', '\n', content, flags=re.DOTALL)
    content = content.strip()

    # If no source is mentioned and file doesn't contain any content exclude it from indexing.
    if len(metadata_lines) + 2 == len(lines):
        return None, None, None
        
    router = getRouter(path)
    if router is not None:
        content = filterRouterContent(content, getRouter(path))

    return metadata_dict.get("title", ""), metadata_dict.get("description", ""), content.strip()


def traverseFolders(curr_path, df, version, indent = 0):
    global chunk_count
    for category in os.listdir(curr_path):
        path = os.path.join(curr_path, category)
        # print("\t" * indent + path)
        
        output_path = getOutputPath(path)
        if os.path.isdir(path):
            # Check if same output directory exists?
            if not os.path.isdir(output_path):
                os.mkdir(output_path)
            df = traverseFolders(path, df, version, indent + 1)
        else:
            try:
                title, description, content = processFile(path)
                
                # # We saw that the content with single line is not much useful for us.
                # if content is not None and len(content.split("\n")) > 1:
                if content is not None:
                    # Will be helpful for next step when chunking
                    if not content.startswith("#"):
                        content = f"# {title}\n\n" + content

                    with open(output_path, 'w', encoding='utf-8') as fp:
                        fp.write(content)

                    chunks = chunk_markdown(content)
                    num_chunks = len(chunks)
                    chunk_count += num_chunks
                    chunks_df = pd.DataFrame({
                        'path': [path[len(extracted_data) + len(version) + 2:]] * num_chunks, 
                        'title': [title] * num_chunks,
                        'description': [description] * num_chunks,
                        'content': chunks
                    })

                    df = pd.concat([df, chunks_df], ignore_index=True)
            except Exception as e:
                print(f"Failed for file: {path}")
                print(f"Error: {e}")

    return df


In [48]:
for version in os.listdir(extracted_data):
    chunk_count = 0
    if not os.path.exists(os.path.join(output_dir, version)):
        os.mkdir(os.path.join(output_dir, version))

    print("For version:", version)
    df = pd.DataFrame(columns=['path', 'title', 'description', 'content'])
    df = traverseFolders(os.path.join(extracted_data, version), df, version)
    df['version'] = version
    df.to_csv(f"{staging_data}/{version[1:]}.csv", index=False)
    df = pd.read_csv(f"{staging_data}/{version[1:]}.csv")

    df['length'] = df['content'].apply(lambda x: len(x.split()))
    print(f"Min Content Length: {min(df['length'])}, Max Content Length: {max(df['length'])}")
    print(f"Exceeding Chunk Size: {df[df['length'] > 385].shape[0]}/{df.shape[0]}")
    print(f"Files processed: {df.shape}, Chunk Count: {chunk_count}")
    print("-" * 50)

For version: v15.1.0
Min Content Length: 7, Max Content Length: 461
Exceeding Chunk Size: 60/2719
Files processed: (2719, 6), Chunk Count: 2719
--------------------------------------------------
For version: v15.0.0
Min Content Length: 7, Max Content Length: 461
Exceeding Chunk Size: 60/2594
Files processed: (2594, 6), Chunk Count: 2594
--------------------------------------------------
For version: v14.0.0
Min Content Length: 7, Max Content Length: 455
Exceeding Chunk Size: 39/1978
Files processed: (1978, 6), Chunk Count: 1978
--------------------------------------------------
For version: v14.2.0
Min Content Length: 7, Max Content Length: 456
Exceeding Chunk Size: 46/2368
Files processed: (2368, 6), Chunk Count: 2368
--------------------------------------------------
For version: v14.1.0
Min Content Length: 7, Max Content Length: 455
Exceeding Chunk Size: 41/2185
Files processed: (2185, 6), Chunk Count: 2185
--------------------------------------------------
For version: v13.5.0
Min 

In [17]:
metadata_tags

{'description', 'nav_title', 'related', 'title', 'version'}

In [47]:
content = ""
with open("./processedDocs/v14.0.0/index.mdx", 'r') as fp:
    content = fp.read()

chunks = chunk_markdown(content)
for chunk in chunks:
    print(chunk)
    print("-" * 80)

# Introduction

Welcome to the Next.js documentation!

--------------------------------------------------------------------------------
# Introduction
## What is Next.js?

Next.js is a React framework for building full-stack web applications. You use React Components to build user interfaces, and Next.js for additional features and optimizations.

Under the hood, Next.js also abstracts and automatically configures tooling needed for React, like bundling, compiling, and more. This allows you to focus on building your application instead of spending time with configuration.

Whether you're an individual developer or part of a larger team, Next.js can help you build interactive, dynamic, and fast React applications.

--------------------------------------------------------------------------------
# Introduction
## Main Features

Some of the main Next.js features include:

| Feature                                                                  | Description                              