In [9]:
import re

def _parse_fence(stripped_line):
    """
    Describes a candidate code-fence line.

    Args:
        stripped_line: A single line with leading/trailing whitespace removed.

    Returns:
        A tuple (fence_char, run_length, rest) when the line begins with a run
        of at least three backticks or three tildes, where ``rest`` is whatever
        follows that run; otherwise None.
    """
    if not stripped_line or stripped_line[0] not in '`~':
        return None
    fence_char = stripped_line[0]
    run_length = len(stripped_line) - len(stripped_line.lstrip(fence_char))
    if run_length < 3:
        return None
    return fence_char, run_length, stripped_line[run_length:]


def remove_code_blocks(markdown_text):
    """
    Removes fenced code blocks from the given Markdown text.

    A block opens on a line that (ignoring surrounding whitespace) starts with
    three or more backticks or tildes, optionally followed by an info string
    such as a language name. It closes on a later line made up only of the same
    fence character, repeated at least as many times as in the opening fence.
    Tracking the fence character and length lets a longer fence wrap a shorter
    one (e.g. a four-backtick block that shows a three-backtick example)
    without the inner fence ending the block early.

    The fence lines and everything between them are dropped. A block that is
    never closed extends to the end of the text.

    Args:
        markdown_text: Markdown source as a single string.

    Returns:
        The text with fenced code blocks removed; remaining lines are joined
        with '\\n' in their original order.
    """
    result_lines = []
    open_fence = None  # (fence_char, run_length) of the block we are inside, if any

    for line in markdown_text.split('\n'):
        fence = _parse_fence(line.strip())

        if open_fence is None:
            # A backtick fence may not carry backticks in its info string;
            # that shape is inline code such as "```x``` text", not a fence.
            is_opening = fence is not None and not (fence[0] == '`' and '`' in fence[2])
            if is_opening:
                open_fence = fence[:2]
            else:
                result_lines.append(line)
            continue

        # Inside a block: every line is dropped; only check whether it closes the block.
        if fence is not None:
            fence_char, run_length, rest = fence
            if fence_char == open_fence[0] and run_length >= open_fence[1] and not rest:
                open_fence = None

    return "\n".join(result_lines)

def extract_headings(markdown_text, *, include_level=False):
    """
    Extracts ATX-style headings from Markdown text.

    A heading is a line that, once surrounding whitespace is stripped, starts
    with one to six '#' characters followed by whitespace and then the heading
    text. Lines such as '#NoSpace' or ones with seven or more '#' do not match.

    Args:
        markdown_text: Markdown source as a single string.
        include_level: When False (the default) only the heading texts are
            returned. When True each entry is a (level, heading_text) tuple,
            where level is the number of leading '#' characters (1-6).

    Returns:
        A list of heading texts in document order, or a list of
        (level, heading_text) tuples when include_level is True.
    """
    heading_pattern = re.compile(r'^(#{1,6})\s+(.*)')
    headings = []
    for line in markdown_text.split('\n'):
        # Strip first so indented headings and trailing whitespace are tolerated.
        match = heading_pattern.match(line.strip())
        if not match:
            continue
        hashes, heading_text = match.groups()
        headings.append((len(hashes), heading_text) if include_level else heading_text)
    return headings

In [10]:
# Read the whole Markdown document; the handle is only needed for the read itself.
with open('./FINAL.md', 'r', encoding='utf-8') as file:
    markdown_content = file.read()

# Strip fenced code before scanning so that lines inside code samples which
# begin with '#' (e.g. Python comments) cannot match the heading pattern.
clean_markdown = remove_code_blocks(markdown_content)
heading_list = extract_headings(clean_markdown)

In [11]:
# Display the extracted heading texts as this cell's output.
heading_list

['Exploring the Foundations of ChatGPT: A Journey Through Language Models',
 "Understanding ChatGPT's Variability",
 'The Power of the Transformer Architecture',
 'Simplifying the Complex: A DIY Approach to Transformers',
 'Setting Up the Dataset',
 'Uncovering the Vocabulary',
 'Tokenizing the Text',
 'Building a Character-Level Language Model with the Tiny Shakespeare Dataset',
 'Data Acquisition and Preliminary Setup',
 'Understanding the Vocabulary',
 'Tokenization Strategy',
 'Data Preparation for Model Training',
 'Feeding Data into the Transformer Model',
 'Building a Character-Level Transformer Model for Language Processing',
 'Generating Training Examples',
 'Introducing the Batch Dimension',
 'Implementing a Bigram Language Model',
 'Evaluating the Model',
 'Character Prediction with Deep Learning using PyTorch',
 'Setting Up the Model for Character Prediction',
 'Understanding Logits and Predictions',
 'Evaluating Predictions with a Loss Function',
 'Generating Sequences',
 

In [13]:
import json

# Write the headings out as indented JSON; ensure_ascii=False keeps any
# non-ASCII characters as-is instead of \uXXXX escapes.
with open("headings.json", "w", encoding="utf-8") as f:
    json.dump(
        heading_list,
        f,
        ensure_ascii=False,
        indent=4,
    )