In [1]:
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
html = """
<table style="overflow:visible">
 <thead>
  <tr>
   <th>
    Property
   </th>
   <th>
    Type
   </th>
   <th>
    Description
   </th>
  </tr>
 </thead>
 <tbody>
  <tr>
   <td>
    <p>
     <code>
      methods
     </code>
    </p>
   </td>
   <td>
    <em>
     <p>
      array of type string
     </p>
    </em>
   </td>
   <td>
    <p>
     Names of the methods allowed
    </p>
   </td>
  </tr>
 </tbody>
</table>
"""

In [3]:
soup = BeautifulSoup(html, 'html.parser')

In [4]:
# The sample <table> only carries a `style` attribute, so indexing `attrs`
# with 'class' raises KeyError (see the output below).
soup.table.attrs['class']

KeyError: 'class'

In [9]:
# Turn every body row of the property table into a
# {"name", "type", "description"} record.
properties_list = []
for row in soup.css.select("table > tbody > tr"):
    cols = []
    for cell in row.css.select("tr > td"):
        # The sample cells wrap their text in a <p>; fall back to the cell
        # itself when there is none, instead of crashing on None.get_text().
        paragraph = cell.find("p")
        source = cell if paragraph is None else paragraph
        cols.append(source.get_text().strip())
    properties_list.append(
        {"name": cols[0], "type": cols[1], "description": cols[2]}
    )
print(properties_list)

[{'name': 'methods', 'type': 'array of type string', 'description': 'Names of the methods allowed'}]


In [25]:
html = """
<ul><li>When the newly selected <code>profile</code> is different than the active <code>profile</code>:<ul><li><code>Shelly.SetProfile</code> returns <code>profile_was</code>, followed by an automatic reboot</li></ul></li><li>When the newly selected <code>profile</code> is the same as the active <code>profile</code>:<ul><li><code>Shelly.SetProfile</code> returns <code>profile_was</code></li></ul></li></ul>
"""

In [26]:
soup = BeautifulSoup(html, 'html.parser')

In [32]:
# Flattened text of the first <ul>; get_text() already returns a str.
soup.ul.get_text()

'When the newly selected profile is different than the active profile:Shelly.SetProfile returns profile_was, followed by an automatic rebootWhen the newly selected profile is the same as the active profile:Shelly.SetProfile returns profile_was'

In [None]:
# select() returns a ResultSet (a list of tags), which has no `.children`
# attribute, so iterate over the matches directly. Note that "ul > li" also
# matches the nested <li> elements, so their text is printed twice: once as
# part of the parent item and once on its own.
for it in soup.css.select("ul > li"):
    print(it.get_text())

In [57]:
soup.ul.name

'ul'

In [55]:
# Flatten the two-level list into prose: each top-level item becomes
# "<lead text> <sub-item>. <sub-item>." and the items are joined by spaces.
sentences = []
for li in soup.css.select("ul > li:not(li li)"):
    lead_parts = []
    sub_texts = []
    for ch in li.children:
        if ch.name == 'ul':
            # One entry per direct sub-item so several sub-items do not get
            # glued together without a separator.
            sub_texts.extend(
                sub.get_text() for sub in ch.find_all("li", recursive=False)
            )
        else:
            lead_parts.append(ch.get_text())
    item = ''.join(lead_parts)
    # Only add the sub-item suffix when there is one; otherwise an item
    # without a nested list would end in a dangling " .".
    if sub_texts:
        item += ' ' + '. '.join(sub_texts) + '.'
    sentences.append(item)
items = ' '.join(sentences)
print(items)

When the newly selected profile is different than the active profile: Shelly.SetProfile returns profile_was, followed by an automatic reboot. When the newly selected profile is the same as the active profile: Shelly.SetProfile returns profile_was.


In [56]:
soup.css.select("ul > li:not(li li)")

[<li>When the newly selected <code>profile</code> is different than the active <code>profile</code>:<ul><li><code>Shelly.SetProfile</code> returns <code>profile_was</code>, followed by an automatic reboot</li></ul></li>,
 <li>When the newly selected <code>profile</code> is the same as the active <code>profile</code>:<ul><li><code>Shelly.SetProfile</code> returns <code>profile_was</code></li></ul></li>]

In [7]:
# inplace
def init_function_dict(di):
    """Reset ``di`` in place to the blank function-record template.

    Every template key is (over)written with its empty default; keys that
    are not part of the template are left untouched. Returns nothing.
    """
    di.update(
        function_name="",
        description="",
        request_properties=None,
        request_notes="",
        response_properties=None,
        response_notes="",
    )
# Demo: a key that is already populated gets overwritten by the reset.
function_dict = dict(description='This is a test')
init_function_dict(function_dict)
print(function_dict)

{'description': '', 'function_name': '', 'request_properties': None, 'request_notes': '', 'response_properties': None, 'response_notes': ''}


In [3]:
from bs4 import BeautifulSoup
import pandas as pd

# Decode the saved page explicitly as UTF-8 rather than with the platform's
# default encoding (assumes the file was saved as UTF-8 — TODO confirm).
with open("index_shelly.html", encoding="utf-8") as fp:
    html_doc = fp.read()

soup = BeautifulSoup(html_doc, "html.parser")

# Take the first <div class="theme-doc-markdown markdown"> and fail with a
# clear message (instead of a bare IndexError) if the page has none.
docs = soup.find("div", class_="theme-doc-markdown markdown")
if docs is None:
    raise ValueError(
        'no <div class="theme-doc-markdown markdown"> found in index_shelly.html'
    )

In [35]:
# Print each top-level node of the docs container, one per line.
for node in docs.children:
    print(node)

<h1>Shelly</h1>
<p>This service is common for all Gen2+ devices. It handles device management.
Here are listed all methods supported by the Shelly namespace.</p>
<h2 class="anchor anchorWithStickyNavbar_LWe7" id="shellygetstatus">Shelly.GetStatus<a aria-label="Direct link to Shelly.GetStatus" class="hash-link" href="#shellygetstatus" title="Direct link to Shelly.GetStatus">​</a></h2>
<p>This method returns the status of all the components of the device.</p>
<h4 class="anchor anchorWithStickyNavbar_LWe7" id="request">Request<a aria-label="Direct link to Request" class="hash-link" href="#request" title="Direct link to Request">​</a></h4>
<p>This method takes no parameters.</p>
<h4 class="anchor anchorWithStickyNavbar_LWe7" id="response">Response<a aria-label="Direct link to Response" class="hash-link" href="#response" title="Direct link to Response">​</a></h4>
<p>Attributes in the result are the status objects of all components in the device.</p>
<h2 class="anchor anchorWithStickyNavbar_

In [43]:
# Find the first <h2>, then fetch the element stored at its position in the
# list of top-level nodes.
first_h2 = docs.find('h2')
top_level_nodes = docs.contents
top_level_nodes[top_level_nodes.index(first_h2)]

<h2 class="anchor anchorWithStickyNavbar_LWe7" id="shellygetstatus">Shelly.GetStatus<a aria-label="Direct link to Shelly.GetStatus" class="hash-link" href="#shellygetstatus" title="Direct link to Shelly.GetStatus">​</a></h2>

In [46]:
# Position, among the top-level nodes, of the first <h2> whose id contains
# "http-endpoint-".
endpoint_headings = docs.select('h2[id*="http-endpoint-"]')
docs.contents.index(endpoint_headings[0])

117

In [7]:
html = """<h4 class="anchor anchorWithStickyNavbar_LWe7" id="3-during-calibration-obstruction-detection-is-ignored-and-the-cover-will-not-stop-if-the-power-consumption-rises-above-the-obstruction-detection-threshold">3) During calibration, obstruction detection is ignored and the <code>Cover</code> will not stop if the power consumption rises above the obstruction detection threshold.<a href="#3-during-calibration-obstruction-detection-is-ignored-and-the-cover-will-not-stop-if-the-power-consumption-rises-above-the-obstruction-detection-threshold" class="hash-link" aria-label="Direct link to 3-during-calibration-obstruction-detection-is-ignored-and-the-cover-will-not-stop-if-the-power-consumption-rises-above-the-obstruction-detection-threshold" title="Direct link to 3-during-calibration-obstruction-detection-is-ignored-and-the-cover-will-not-stop-if-the-power-consumption-rises-above-the-obstruction-detection-threshold">​</a></h4>"""

In [8]:
soup = BeautifulSoup(html, "html.parser")

In [10]:
soup.h4.contents[0]

'3) During calibration, obstruction detection is ignored and the '

In [12]:
# Full heading text with the zero-width space (U+200B) removed; the trailing
# hash-link <a> contributes that character to get_text().
soup.h4.get_text().replace('\u200b', '')

'3) During calibration, obstruction detection is ignored and the Cover will not stop if the power consumption rises above the obstruction detection threshold.'