# Custom HTML/XML Parser
This notebook implements a simple HTML/XML parser that can handle basic tags, attributes, and nested elements.

In [2]:
class Node:
    """A single node of the parsed document tree.

    A node carrying a ``tag`` is rendered as an element, a node carrying
    only a ``value`` is rendered as plain text, and a node with neither
    renders as an empty line followed by its children.
    """

    def __init__(self, tag=None, value=None, attributes=None, parent=None):
        """Store the node's fields; the child list always starts empty.

        Args:
            tag: Element name, or None for a tagless node.
            value: Text carried by the node, or None.
            attributes: Mapping of attribute names to values. A falsy
                argument (None or an empty mapping) becomes a new dict.
            parent: Node this one hangs under, or None.
        """
        self.tag = tag
        self.value = value
        self.attributes = attributes if attributes else {}
        self.children = []
        self.parent = parent

    def add_child(self, child):
        """Append *child* to this node's children and set its parent to self."""
        self.children.append(child)
        child.parent = self

    def __str__(self, level=0):
        """Render this node and its descendants, two spaces per depth level.

        Args:
            level: Depth used to indent this node; children use level + 1.

        Returns:
            The rendered text. Every node contributes at least one line,
            and tagged nodes also contribute a closing-tag line.
        """
        pad = "  " * level
        pieces = [pad]

        if self.tag:
            opening = f"<{self.tag}"
            if self.attributes:
                rendered = [
                    f'{name}="{val}"' for name, val in self.attributes.items()
                ]
                opening += " " + " ".join(rendered)
            pieces.append(opening + ">")

        if self.value:
            pieces.append(self.value)
        pieces.append("\n")

        # Children are rendered one level deeper, in insertion order.
        pieces.extend(child.__str__(level + 1) for child in self.children)

        if self.tag:
            pieces.append(f"{pad}</{self.tag}>\n")
        return "".join(pieces)

In [7]:
class Parser:
    """Minimal, lenient HTML/XML parser that builds a tree of ``Node`` objects.

    The parser keeps its state on the instance: ``root`` is a tagless
    container node and ``current`` is the element new nodes are attached
    to. Both are set only in ``__init__``, so calling ``parse`` again on
    the same instance keeps appending to the same tree.

    Known simplifications: closing-tag names are not checked against the
    open element, self-closing tags and comments get no special treatment,
    and a ``>`` inside a quoted attribute value ends the tag early.
    """

    # name = "double quoted" | 'single quoted' | bare-token
    _ATTRIBUTE_PATTERN = r'''([^\s=]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S*))'''

    def __init__(self):
        # Tagless container that holds every top-level node.
        self.root = Node()
        # Element that newly parsed nodes are attached to.
        self.current = self.root

    def parse(self, text):
        """Parse *text* and attach the resulting nodes to the tree.

        Malformed input is handled leniently rather than raising:
        a ``<`` with no later ``>`` (and an empty ``<>``) is kept as literal
        text, and a closing tag seen while already at the root is ignored.

        Args:
            text: Markup to parse.

        Returns:
            The root ``Node`` of the tree.
        """
        i = 0
        length = len(text)
        while i < length:
            if text[i] != '<':
                # Text content runs up to the next tag (or the end of input).
                next_tag = text.find('<', i)
                if next_tag == -1:
                    next_tag = length
                self._add_text(text[i:next_tag])
                i = next_tag
                continue

            end = text.find('>', i)
            if end == -1:
                # Unterminated tag: without this the index would never
                # advance. Keep the remainder as literal text and stop.
                self._add_text(text[i:])
                break

            tag_content = text[i + 1:end]
            if tag_content.startswith('/'):
                # Closing tag: step up one level, but never above the root,
                # so a stray closing tag cannot leave ``current`` as None.
                if self.current.parent is not None:
                    self.current = self.current.parent
            else:
                # Opening tag: the first token is the name, the rest attributes.
                tag_parts = tag_content.split(None, 1)
                if tag_parts:
                    attr_text = tag_parts[1] if len(tag_parts) > 1 else ''
                    new_node = Node(
                        tag=tag_parts[0],
                        attributes=self._parse_attributes(attr_text),
                    )
                    self.current.add_child(new_node)
                    self.current = new_node
                else:
                    # "<>" or "<  >" has no tag name; keep it as text.
                    self._add_text(text[i:end + 1])
            i = end + 1

        return self.root

    def _add_text(self, raw):
        """Attach *raw*, stripped of surrounding whitespace, as a text node if non-empty."""
        content = raw.strip()
        if content:
            self.current.add_child(Node(value=content))

    @staticmethod
    def _parse_attributes(attr_text):
        """Return a dict of ``name=value`` pairs found in *attr_text*.

        Quoted values may contain whitespace. Tokens without ``=`` are
        ignored. Unquoted values have stray leading/trailing quote
        characters stripped.
        """
        import re  # local import: the notebook has no shared import cell

        attributes = {}
        for match in re.finditer(Parser._ATTRIBUTE_PATTERN, attr_text):
            key, double_quoted, single_quoted, bare = match.groups()
            if double_quoted is not None:
                value = double_quoted
            elif single_quoted is not None:
                value = single_quoted
            else:
                value = bare.strip('"\'')
            attributes[key] = value
        return attributes

In [8]:
# Test the parser: parse a small document that exercises nested elements,
# attributes, and inline markup inside text, then print the resulting tree.
test_html = """
<html>
    <head>
        <title>Test Page</title>
    </head>
    <body class="main" id="content">
        <h1>Hello World</h1>
        <p>This is a <b>test</b> paragraph.</p>
    </body>
</html>
"""

# A new Parser starts with a tagless root node as the current element.
parser = Parser()
# parse() returns that root node; print() renders it through Node.__str__.
# Because the root has no tag, the output begins with an empty line and the
# top-level <html> element is indented by one level.
result = parser.parse(test_html)
print(result)


  <html>
    <head>
      <title>
        Test Page
      </title>
    </head>
    <body class="main" id="content">
      <h1>
        Hello World
      </h1>
      <p>
        This is a
        <b>
          test
        </b>
        paragraph.
      </p>
    </body>
  </html>



The parser above implements:
1. A `Node` class to represent HTML/XML elements with tags, attributes, values, and child nodes
2. A `Parser` class that processes HTML/XML text and builds a tree structure
3. Support for:
   - Opening and closing tags
   - Text content
   - Tag attributes
   - Nested elements

Limitations:
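- Doesn't handle self-closing tags (a tag such as `<br/>` is treated as an ordinary opening tag, so later content is nested inside it)
- No support for comments (`<!-- ... -->` is parsed as if it were a tag)
- No validation of tag matching (a closing tag closes the current element whatever its name)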
- Basic attribute parsing

You can extend this parser by adding support for these features as needed.