In [16]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before
# each cell execution, so edits to local modules such as parse_utils are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
from bs4 import BeautifulSoup
import pandas as pd
import json
import os
from pathlib import Path

from parse_utils import init_function_dict, parse_table, parse_list, parse_note, parse_code

# functions

In [18]:
# Root output directory. The parsers in this notebook write into its methods/,
# components/ and additional/ subfolders and append to methods.csv inside it.
docs_dir = Path('data/docs/')

In [19]:
def parse_methods(
    docs, methods_start_idx, methods_stop_idx, tag_element_map, ignore_functions=()
):
    """Extract per-method documentation from a parsed page and write it to disk.

    Iterates over ``docs.contents[methods_start_idx:methods_stop_idx]``. An
    element whose tag name equals ``tag_element_map["function_name"]`` starts a
    new method, named after that element's first child. The ``p``, ``ul``,
    ``h4``, ``table`` and ``div`` (class ``alert`` or ``language-json``)
    elements that follow are converted to text and appended, one per line, to
    the current method's description. Any other element is appended to
    ``other_tags.html`` in the working directory for later inspection.

    Side effects:
        * appends one ``name,description`` row per kept method to
          ``docs_dir / "methods.csv"`` (no header row),
        * (over)writes ``docs_dir / "methods/<name>.md"`` for each kept method.

    Args:
        docs: Parsed container element whose ``contents`` are scanned.
        methods_start_idx: Index of the first child to scan.
        methods_stop_idx: Index one past the last child to scan.
        tag_element_map: Mapping whose ``"function_name"`` entry is the tag
            name that marks the start of a method.
        ignore_functions: Method names to leave out of every output. The
            filter applies to all methods, including the last one on the page.
    """
    records = []

    # init_function_dict comes from parse_utils; this code relies on it to
    # reset "name" and "description" in place (presumably to "" — TODO confirm).
    function_dict = {}
    init_function_dict(function_dict)

    for child in docs.contents[methods_start_idx:methods_stop_idx]:
        # A <div> without a class attribute has no "class" key in attrs, so
        # look it up with a default instead of indexing (avoids KeyError).
        css_classes = child.get("class", []) if child.name == "div" else []

        if child.name == tag_element_map["function_name"]:
            # Text seen before the first heading stays in the description and
            # is carried into the first method, because the dict is only reset
            # once a method name has been recorded.
            if function_dict["name"] != "":
                _keep_method(records, function_dict, ignore_functions)
                init_function_dict(function_dict)

            function_dict["name"] = str(child.contents[0])

        elif child.name == "p":
            function_dict["description"] += child.get_text() + "\n"

        elif child.name == "ul":
            function_dict["description"] += parse_list(child) + "\n"

        elif child.name == "h4":
            # Strip zero-width spaces from the heading text.
            function_dict["description"] += (
                child.get_text().replace("\u200b", "") + "\n"
            )

        elif child.name == "table":
            function_dict["description"] += parse_table(child) + "\n"

        elif child.name == "div" and "alert" in css_classes:
            function_dict["description"] += parse_note(child) + "\n"

        elif child.name == "div" and "language-json" in css_classes:
            function_dict["description"] += parse_code(child) + "\n"

        else:
            with open("other_tags.html", "a") as f:
                f.write("IN FUNCTIONS:\n")
                f.write(child.prettify())

    # The last method has no following heading to flush it; apply the same
    # name / ignore-list checks as for every other method.
    _keep_method(records, function_dict, ignore_functions)

    functions_df = pd.DataFrame(records, columns=["name", "description"])
    functions_df.to_csv(docs_dir / "methods.csv", index=False, header=False, mode="a")

    for record in records:
        with open(docs_dir / f'methods/{record["name"]}.md', "w") as f:
            f.write(f'Method name: {record["name"]}\n')
            f.write(f'Method description: {record["description"]}')


def _keep_method(records, function_dict, ignore_functions):
    """Append a snapshot of the current method to ``records`` unless it is unnamed or ignored."""
    name = function_dict["name"]
    if name == "" or name in ignore_functions:
        return
    # Copy the two fields: function_dict is reset and reused for the next method.
    records.append({"name": name, "description": function_dict["description"]})


def parse_component(docs, component_start_idx, component_stop_idx):
    """Write the introductory description of a component to a markdown file.

    Scans ``docs.contents[component_start_idx:component_stop_idx]``:

    * ``h1`` supplies the component name (the last ``h1`` seen wins),
    * ``p`` and ``ul`` elements are appended to the description, one per line,
    * anything else is appended to ``other_tags.html`` in the working directory.

    The type is re-evaluated after every ``p``: ``"service"`` if the
    description accumulated so far contains that word, otherwise
    ``"component"``. It stays empty when the slice contains no ``p``.

    The result is written to ``docs_dir / "components/<name>.md"``.
    """
    title = ""
    kind = ""
    body = ""

    for node in docs.contents[component_start_idx:component_stop_idx]:
        tag = node.name

        if tag == "h1":
            title = node.get_text()
            continue

        if tag == "p":
            body += node.get_text() + "\n"
            # Checked against the whole description so far, not just this paragraph.
            kind = "service" if "service" in body else "component"
            continue

        if tag == "ul":
            body += parse_list(node) + "\n"
            continue

        # Unhandled element: keep a copy for manual inspection.
        with open("other_tags.html", "a") as log:
            log.write("IN COMPONENT:\n")
            log.write(node.prettify())

    with open(docs_dir / f"components/{title}.md", "w") as out:
        out.write(f"{title} {kind} description: {body}\n")

def parse_config(config_contents, device_name):
    """Write a device's configuration-object description to a markdown file.

    Args:
        config_contents: Iterable of parsed elements making up the
            configuration section of the page.
        device_name: Device name; it is capitalised before being used in the
            file name and the text.

    The text is written to ``docs_dir / "additional/<Device>.Config.md"``.
    Elements that are not recognised are appended to ``other_tags.html``.
    """
    device_name = device_name.capitalize()
    config_description = _describe_section(config_contents, "CONFIG")

    with open(docs_dir / f'additional/{device_name}.Config.md', "w") as f:
        f.write(f'{device_name} config object description: {config_description}')


def parse_status(status_contents, device_name):
    """Write a device's status-object description to a markdown file.

    Args:
        status_contents: Iterable of parsed elements making up the status
            section of the page.
        device_name: Device name; it is capitalised before being used in the
            file name and the text.

    The text is written to ``docs_dir / "additional/<Device>.Status.md"``.
    Elements that are not recognised are appended to ``other_tags.html``.
    """
    device_name = device_name.capitalize()
    description = _describe_section(status_contents, "STATUS")

    with open(docs_dir / f'additional/{device_name}.Status.md', "w") as f:
        f.write(f'{device_name} status object description: {description}')


def _describe_section(contents, log_label):
    """Render a run of documentation elements to newline-separated text.

    Shared by parse_config and parse_status. ``log_label`` is the section name
    written in front of unrecognised elements in ``other_tags.html``.
    """
    description = ""
    for child in contents:
        # A <div> without a class attribute has no "class" key in attrs, so
        # look it up with a default instead of indexing (avoids KeyError).
        css_classes = child.get("class", []) if child.name == "div" else []

        if child.name == "p":
            description += child.get_text() + "\n"

        elif child.name == "ul":
            description += parse_list(child) + "\n"

        elif child.name == "h4":
            # Strip zero-width spaces from the heading text.
            description += child.get_text().replace("\u200b", "") + "\n"

        elif child.name == "table":
            description += parse_table(child) + "\n"

        elif child.name == "div" and "alert" in css_classes:
            description += parse_note(child) + "\n"

        elif child.name == "div" and "language-json" in css_classes:
            description += parse_code(child) + "\n"

        else:
            # Unhandled element: keep a copy for manual inspection.
            with open("other_tags.html", "a") as f:
                f.write(f"IN {log_label}:\n")
                f.write(child.prettify())

    return description

# copy status to GetStatus, and config to GetConfig and SetConfig
def copy_config(device_name):
    """Append a device's config-object description to its GetConfig/SetConfig docs.

    Reads ``docs_dir / "additional/<Device>.Config.md"`` and appends its text
    to ``methods/<Device>.GetConfig.md`` and ``methods/<Device>.SetConfig.md``
    (created if missing). ``device_name`` is capitalised first.

    The append is skipped for a target that already contains the config text,
    so re-running the cell does not duplicate the description.
    """
    device_name = device_name.capitalize()
    config = (docs_dir / f'additional/{device_name}.Config.md').read_text()

    for method in ("GetConfig", "SetConfig"):
        target = docs_dir / f'methods/{device_name}.{method}.md'
        existing = target.read_text() if target.exists() else ""
        if config in existing:
            continue  # already copied by an earlier run
        with open(target, "a") as f:
            f.write(config)
        
def copy_status(device_name):
    """Append a device's status-object description to its GetStatus doc.

    Reads ``docs_dir / "additional/<Device>.Status.md"`` and appends its text
    to ``methods/<Device>.GetStatus.md`` (created if missing). ``device_name``
    is capitalised first.

    The append is skipped when the target already contains the status text,
    so re-running the cell does not duplicate the description.
    """
    device_name = device_name.capitalize()
    status = (docs_dir / f'additional/{device_name}.Status.md').read_text()

    target = docs_dir / f'methods/{device_name}.GetStatus.md'
    existing = target.read_text() if target.exists() else ""
    if status not in existing:
        with open(target, "a") as f:
            f.write(status)

In [20]:
# Tag names by role. parse_methods treats every element whose tag name equals
# the "function_name" entry as the start of a new method.
TAG_ELEMENT_MAP = {"function_name": "h3"}

# cover

In [105]:
# Parse the saved "cover" page into component, method, config and status files.
device_name = "cover"

# NOTE(review): the page is read from the working directory here, whereas the
# smoke and humidity cells read data/html/index_<device>.html — confirm which
# location is current.
with open(f"index_{device_name}.html") as fp:
    html_doc = fp.read()

soup = BeautifulSoup(html_doc, "html.parser")

# Page body: first matching <div>; raises IndexError if the page has none.
docs = soup.find_all("div", class_="theme-doc-markdown markdown")[0]

# Component intro: children before the first <h2>.
component_start_idx = 0
component_stop_idx = docs.contents.index(docs.find("h2"))
parse_component(docs, component_start_idx, component_stop_idx)

# Methods: children after the first <h2>, up to the first <h2> whose id
# contains "http-endpoint-".
methods_start_idx = docs.contents.index(docs.find("h2")) + 1
methods_stop_idx = docs.contents.index(docs.select('h2[id*="http-endpoint-"]')[0])
parse_methods(docs, methods_start_idx, methods_stop_idx, TAG_ELEMENT_MAP)

# Configuration: children between <h2 id="configuration"> and <h2 id="status">.
config_start_idx = docs.contents.index(docs.select('h2[id="configuration"]')[0])+1
config_stop_idx = docs.contents.index(docs.select('h2[id="status"]')[0])
parse_config(docs.contents[config_start_idx:config_stop_idx], device_name)

# Status: children between <h2 id="status"> and <h2 id="webhook-events">.
status_start_idx = docs.contents.index(docs.select('h2[id="status"]')[0])+1
status_stop_idx = docs.contents.index(docs.select('h2[id="webhook-events"]')[0])
parse_status(docs.contents[status_start_idx:status_stop_idx], device_name)

# switch

In [110]:
# Parse the saved "switch" page into component, method, config and status files.
device_name = "switch"

# NOTE(review): the page is read from the working directory here, whereas the
# smoke and humidity cells read data/html/index_<device>.html — confirm which
# location is current.
with open(f"index_{device_name}.html") as fp:
    html_doc = fp.read()

soup = BeautifulSoup(html_doc, "html.parser")

# Page body: first matching <div>; raises IndexError if the page has none.
docs = soup.find_all("div", class_="theme-doc-markdown markdown")[0]

# Component intro: children before the first <h2>.
component_start_idx = 0
component_stop_idx = docs.contents.index(docs.find("h2"))
parse_component(docs, component_start_idx, component_stop_idx)

# Methods: children after the first <h2>, up to the first <h2> whose id
# contains "http-endpoint-".
methods_start_idx = docs.contents.index(docs.find("h2")) + 1
methods_stop_idx = docs.contents.index(docs.select('h2[id*="http-endpoint-"]')[0])
parse_methods(docs, methods_start_idx, methods_stop_idx, TAG_ELEMENT_MAP)

# Configuration: children between <h2 id="configuration"> and <h2 id="status">.
config_start_idx = docs.contents.index(docs.select('h2[id="configuration"]')[0])+1
config_stop_idx = docs.contents.index(docs.select('h2[id="status"]')[0])
parse_config(docs.contents[config_start_idx:config_stop_idx], device_name)

# Status: children between <h2 id="status"> and <h2 id="webhook-events">.
status_start_idx = docs.contents.index(docs.select('h2[id="status"]')[0])+1
status_stop_idx = docs.contents.index(docs.select('h2[id="webhook-events"]')[0])
parse_status(docs.contents[status_start_idx:status_stop_idx], device_name)

# light

In [111]:
# Parse the saved "light" page into component, method, config and status files.
device_name = "light"

# NOTE(review): the page is read from the working directory here, whereas the
# smoke and humidity cells read data/html/index_<device>.html — confirm which
# location is current.
with open(f"index_{device_name}.html") as fp:
    html_doc = fp.read()

soup = BeautifulSoup(html_doc, "html.parser")

# Page body: first matching <div>; raises IndexError if the page has none.
docs = soup.find_all("div", class_="theme-doc-markdown markdown")[0]

# Component intro: children before the first <h2>.
component_start_idx = 0
component_stop_idx = docs.contents.index(docs.find("h2"))
parse_component(docs, component_start_idx, component_stop_idx)

# Methods: children after the first <h2>, up to the first <h2> whose id
# contains "http-endpoint-".
methods_start_idx = docs.contents.index(docs.find("h2")) + 1
methods_stop_idx = docs.contents.index(docs.select('h2[id*="http-endpoint-"]')[0])
parse_methods(docs, methods_start_idx, methods_stop_idx, TAG_ELEMENT_MAP)

# Configuration: children between <h2 id="configuration"> and <h2 id="status">.
config_start_idx = docs.contents.index(docs.select('h2[id="configuration"]')[0])+1
config_stop_idx = docs.contents.index(docs.select('h2[id="status"]')[0])
parse_config(docs.contents[config_start_idx:config_stop_idx], device_name)

# Status: children between <h2 id="status"> and <h2 id="webhook-events">.
status_start_idx = docs.contents.index(docs.select('h2[id="status"]')[0])+1
status_stop_idx = docs.contents.index(docs.select('h2[id="webhook-events"]')[0])
parse_status(docs.contents[status_start_idx:status_stop_idx], device_name)

# input

In [115]:
# Method names passed to parse_methods as ignore_functions for this page.
input_ignore_functions = ['Input.CheckExpression']

# Parse the saved "input" page into component, method, config and status files.
device_name = "input"

# NOTE(review): the page is read from the working directory here, whereas the
# smoke and humidity cells read data/html/index_<device>.html — confirm which
# location is current.
with open(f"index_{device_name}.html") as fp:
    html_doc = fp.read()

soup = BeautifulSoup(html_doc, "html.parser")

# Page body: first matching <div>; raises IndexError if the page has none.
docs = soup.find_all("div", class_="theme-doc-markdown markdown")[0]

# Component intro: children before the first <h2>.
component_start_idx = 0
component_stop_idx = docs.contents.index(docs.find("h2"))
parse_component(docs, component_start_idx, component_stop_idx)

# Methods: children after the first <h2>, up to the next <h2> sibling
# (docs.contents[methods_start_idx-1] is that first <h2>).
methods_start_idx = docs.contents.index(docs.find("h2")) + 1
methods_stop_idx = docs.contents.index(docs.contents[methods_start_idx-1].find_next_sibling('h2'))
parse_methods(docs, methods_start_idx, methods_stop_idx, TAG_ELEMENT_MAP, input_ignore_functions)

# Configuration: children between <h2 id="configuration"> and <h2 id="status">.
config_start_idx = docs.contents.index(docs.select('h2[id="configuration"]')[0])+1
config_stop_idx = docs.contents.index(docs.select('h2[id="status"]')[0])
parse_config(docs.contents[config_start_idx:config_stop_idx], device_name)

# Status: children between <h2 id="status"> and <h2 id="webhook-events">.
status_start_idx = docs.contents.index(docs.select('h2[id="status"]')[0])+1
status_stop_idx = docs.contents.index(docs.select('h2[id="webhook-events"]')[0])
parse_status(docs.contents[status_start_idx:status_stop_idx], device_name)

Unknown table type


# temperature

In [116]:
# Parse the saved "temperature" page into component, method, config and status files.
device_name = "temperature"

# NOTE(review): the page is read from the working directory here, whereas the
# smoke and humidity cells read data/html/index_<device>.html — confirm which
# location is current.
with open(f"index_{device_name}.html") as fp:
    html_doc = fp.read()

soup = BeautifulSoup(html_doc, "html.parser")

# Page body: first matching <div>; raises IndexError if the page has none.
docs = soup.find_all("div", class_="theme-doc-markdown markdown")[0]

# Component intro: children before the first <h2>.
component_start_idx = 0
component_stop_idx = docs.contents.index(docs.find("h2"))
parse_component(docs, component_start_idx, component_stop_idx)

# Methods: children after the first <h2>, up to the next <h2> sibling
# (docs.contents[methods_start_idx-1] is that first <h2>).
methods_start_idx = docs.contents.index(docs.find("h2")) + 1
methods_stop_idx = docs.contents.index(docs.contents[methods_start_idx-1].find_next_sibling('h2'))
parse_methods(docs, methods_start_idx, methods_stop_idx, TAG_ELEMENT_MAP)

# Configuration: children between <h2 id="configuration"> and <h2 id="status">.
config_start_idx = docs.contents.index(docs.select('h2[id="configuration"]')[0])+1
config_stop_idx = docs.contents.index(docs.select('h2[id="status"]')[0])
parse_config(docs.contents[config_start_idx:config_stop_idx], device_name)

# Status: children between <h2 id="status"> and <h2 id="webhook-events">.
status_start_idx = docs.contents.index(docs.select('h2[id="status"]')[0])+1
status_stop_idx = docs.contents.index(docs.select('h2[id="webhook-events"]')[0])
parse_status(docs.contents[status_start_idx:status_stop_idx], device_name)

# smoke

In [14]:
# Parse the saved "smoke" page into component, method, config and status files.
device_name = "smoke"

# NOTE(review): the page is read from data/html/ here, whereas the cover,
# switch, light, input and temperature cells read index_<device>.html from the
# working directory — confirm which location is current.
with open(f"data/html/index_{device_name}.html") as fp:
    html_doc = fp.read()

soup = BeautifulSoup(html_doc, "html.parser")

# Page body: first matching <div>; raises IndexError if the page has none.
docs = soup.find_all("div", class_="theme-doc-markdown markdown")[0]

# Component intro: children before the first <h2>.
component_start_idx = 0
component_stop_idx = docs.contents.index(docs.find("h2"))
parse_component(docs, component_start_idx, component_stop_idx)

# Methods: children after the first <h2>, up to the next <h2> sibling
# (docs.contents[methods_start_idx-1] is that first <h2>).
methods_start_idx = docs.contents.index(docs.find("h2")) + 1
methods_stop_idx = docs.contents.index(docs.contents[methods_start_idx-1].find_next_sibling('h2'))
parse_methods(docs, methods_start_idx, methods_stop_idx, TAG_ELEMENT_MAP)

# Configuration: children between <h2 id="configuration"> and <h2 id="status">.
config_start_idx = docs.contents.index(docs.select('h2[id="configuration"]')[0])+1
config_stop_idx = docs.contents.index(docs.select('h2[id="status"]')[0])
parse_config(docs.contents[config_start_idx:config_stop_idx], device_name)

# Status: children between <h2 id="status"> and <h2 id="webhook-events">.
status_start_idx = docs.contents.index(docs.select('h2[id="status"]')[0])+1
status_stop_idx = docs.contents.index(docs.select('h2[id="webhook-events"]')[0])
parse_status(docs.contents[status_start_idx:status_stop_idx], device_name)

# humidity

In [21]:
# Parse the saved "humidity" page into component, method, config and status files.
device_name = "humidity"

# NOTE(review): the page is read from data/html/ here, whereas the cover,
# switch, light, input and temperature cells read index_<device>.html from the
# working directory — confirm which location is current.
with open(f"data/html/index_{device_name}.html") as fp:
    html_doc = fp.read()

soup = BeautifulSoup(html_doc, "html.parser")

# Page body: first matching <div>; raises IndexError if the page has none.
docs = soup.find_all("div", class_="theme-doc-markdown markdown")[0]

# Component intro: children before the first <h2>.
component_start_idx = 0
component_stop_idx = docs.contents.index(docs.find("h2"))
parse_component(docs, component_start_idx, component_stop_idx)

# Methods: children after the first <h2>, up to the next <h2> sibling
# (docs.contents[methods_start_idx-1] is that first <h2>).
methods_start_idx = docs.contents.index(docs.find("h2")) + 1
methods_stop_idx = docs.contents.index(docs.contents[methods_start_idx-1].find_next_sibling('h2'))
parse_methods(docs, methods_start_idx, methods_stop_idx, TAG_ELEMENT_MAP)

# Configuration: children between <h2 id="configuration"> and <h2 id="status">.
config_start_idx = docs.contents.index(docs.select('h2[id="configuration"]')[0])+1
config_stop_idx = docs.contents.index(docs.select('h2[id="status"]')[0])
parse_config(docs.contents[config_start_idx:config_stop_idx], device_name)

# Status: children between <h2 id="status"> and <h2 id="webhook-events">.
status_start_idx = docs.contents.index(docs.select('h2[id="status"]')[0])+1
status_stop_idx = docs.contents.index(docs.select('h2[id="webhook-events"]')[0])
parse_status(docs.contents[status_start_idx:status_stop_idx], device_name)

# config/status copy

In [9]:
# Capitalised names of the devices parsed in this notebook.
# NOTE(review): the copy loop later in the notebook iterates over a hard-coded
# ['humidity'] rather than this list — confirm whether it should use `devices`.
devices = ["cover", "switch", "light", "input", "temperature", "smoke", "humidity"]
devices = [device.capitalize() for device in devices]

In [11]:
import shutil

In [22]:
# Append the config/status object descriptions to the matching method docs.
# NOTE(review): only 'humidity' is processed; presumably the other devices were
# copied in earlier runs — confirm before relying on a fresh Run All.
for device in ['humidity']:
    copy_config(device)
    copy_status(device)
    # shutil.move(f'docs/{device}.Config.md', f'additional/{device}.Config.md')
    # shutil.move(f'docs/{device}.Status.md', f'additional/{device}.Status.md')