In [1]:
import os
import json
from bs4 import BeautifulSoup, Tag
from dataclasses import dataclass, field
from typing import List, Optional, Dict, Any
from markdownify import markdownify as html_to_md

In [2]:
# Load the crawled crag index. Judging by the keys printed below, each entry is
# a dict carrying a nested "children" list.
# JSON files are UTF-8 by convention, so pin the encoding instead of relying on
# the platform's locale-dependent default (which is not UTF-8 everywhere).
with open(
    "/Users/liao/myProjects/VSCode_workspace/rockbook/resources/crags_urls.json",
    "r",
    encoding="utf-8",
) as f:
    info_list = json.load(f)

print(len(info_list))
print(info_list[0].keys())

31
dict_keys(['title', 'href', 'data_nid', 'data_subtype', 'children'])


In [3]:
@dataclass
class CragNode:
    """One node of the crag tree, with its child nodes and scraped page info.

    Identity is defined solely by ``data_nid``: equality and hashing ignore
    every other field, and a node also compares equal to the plain string
    holding its ``data_nid``.
    """

    title: str
    href: str = ""
    data_nid: str = ""
    data_subtype: str = ""
    children: List["CragNode"] = field(default_factory=list)
    # Free-form per-node data; defaults to a fresh empty dict per instance.
    info_dict: Optional[Dict[str, Any]] = field(default_factory=dict)

    @staticmethod
    def from_dict(d: Dict[str, Any]) -> "CragNode":
        """Recursively build a node (and its subtree) from a plain dict.

        Args:
            d: Mapping with a required ``"title"`` key. ``"href"``,
                ``"data_nid"``, ``"data_subtype"`` and ``"children"`` are
                optional and fall back to the dataclass defaults.

        Returns:
            The constructed node; ``info_dict`` starts out empty.

        Raises:
            KeyError: If ``"title"`` is missing.
        """
        return CragNode(
            title=d["title"],
            href=d.get("href", ""),
            data_nid=d.get("data_nid", ""),
            data_subtype=d.get("data_subtype", ""),
            # `or []` also tolerates an explicit null for "children".
            children=[CragNode.from_dict(c) for c in d.get("children") or []],
        )

    def __repr__(self) -> str:
        """Compact representation: children are summarised by their count."""
        return f"CragNode(title={self.title}, href={self.href}, data_nid={self.data_nid}, data_subtype={self.data_subtype}, children_count={len(self.children)}, info_dict={self.info_dict})"

    def __str__(self) -> str:
        """Same text as ``repr()``."""
        return self.__repr__()

    def __eq__(self, other: object) -> bool:
        """Compare by ``data_nid`` against another node or a raw id string.

        Any other operand type yields ``NotImplemented`` so Python falls back
        to its default handling (``node == 5`` is ``False``) instead of
        raising ``AttributeError``.
        """
        if isinstance(other, str):
            return self.data_nid == other
        if isinstance(other, CragNode):
            return self.data_nid == other.data_nid
        return NotImplemented

    def __hash__(self) -> int:
        """Hash of ``data_nid``, consistent with ``__eq__`` (including str)."""
        return hash(self.data_nid)

In [4]:
# Hang every top-level entry under one synthetic root so the whole tree can be
# walked starting from a single node.
root = {
    "title": "China",
    "href": "",
    "data_nid": "000",
    "data_subtype": "",
    "children": info_list,
}
root_node = CragNode.from_dict(root)

# Lookup table: both a node's data_nid and its href map to the node object.
crag_node_map = {}
# Maps a child's data_nid to its parent's data_nid.
child_to_parent_map = {}


def traverse(node: CragNode):
    """Register `node` and all of its descendants in the two lookup maps.

    The node is indexed before its children (depth-first, pre-order); each
    parent link is recorded right after that child's subtree has been walked.
    """
    for lookup_key in (node.data_nid, node.href):
        crag_node_map[lookup_key] = node
    for sub_node in node.children:
        traverse(sub_node)
        child_to_parent_map[sub_node.data_nid] = node.data_nid


traverse(root_node)
print(len(crag_node_map))

4052


In [5]:
# Read the scraped pages: one JSON object per line.
# Pin UTF-8 rather than depending on the platform's locale default.
with open(
    "/Users/liao/myProjects/VSCode_workspace/rockbook/resources/thecrag1/thecrag.jsonl",
    "r",
    encoding="utf-8",
) as f:
    # Skip blank lines so json.loads is never handed an empty document.
    data = [json.loads(line) for line in f if line.strip()]

print(len(data))
print(data[0].keys())
html = data[0]["result"]

347
dict_keys(['input', 'result'])


In [6]:
# Candidate input pages (presumably alternatives tried during development):
# /Users/liao/Desktop/thecrag/精灵谷 Jinglinggu, 攀登 _ theCrag.html
# /Users/liao/Desktop/thecrag/A区, 运动攀登 | theCrag.html
# /Users/liao/myProjects/VSCode_workspace/rockbook/resources/thecrag3/4895530659.html
with open(
    "/Users/liao/myProjects/VSCode_workspace/rockbook/resources/thecrag3/4895530659.html",
    "r",
    encoding="utf-8",  # do not depend on the platform's locale default
) as f:
    html = f.read()

soup = BeautifulSoup(html, "lxml")

# The page's node id is read from <body data-nid="...">; use it to find the
# matching CragNode in the index built earlier.
body = soup.find("body")
data_nid = body.get("data-nid") if body is not None else None
target_crag_node = crag_node_map.get(data_nid)
if target_crag_node is None:
    # Fail here with a clear message instead of an AttributeError further down.
    raise KeyError(f"No CragNode registered for data-nid {data_nid!r}")

main_contain = soup.select_one("#wrapper div.regions__content div.regions__read")
if main_contain is None:
    raise ValueError("Unexpected page layout: 'div.regions__read' not found")

info_nodes = main_contain.select("div.regions__read div.node-info")
for info_node in info_nodes:
    # Title: only the <h2>'s own text nodes, so nested tags (e.g. the
    # <small class="from"> link) do not leak into the title.
    ele_h2 = info_node.select_one("h2")
    if ele_h2 is None:
        # Without a heading there is no key to store this section under.
        continue
    title = "".join(t.strip() for t in ele_h2.find_all(string=True, recursive=False) if t.strip())
    print(f"Info Title: {title}")

    # Content: converted to Markdown when the section has a markdown body.
    ele_content = info_node.select_one("div.content div.markdown")
    print(f"Info Content (HTML):\n{ele_content}\n")
    if ele_content is not None:
        md_content = html_to_md(str(ele_content), heading_style="ATX")
        print(f"Info Content (Markdown):\n{md_content}\n")
        target_crag_node.info_dict[title] = {"content": md_content}

    # Content source: the heading may link to the node the text is taken from.
    ele_small_a = ele_h2.select_one("small.from a")
    if ele_small_a is not None:
        content_from_href = ele_small_a.get("href", "")
        # An empty href must not resolve to the synthetic root, which is
        # registered under "" in crag_node_map.
        ref_node = crag_node_map.get(content_from_href) if content_from_href else None
        if ref_node is None:
            print(f"Content From: <unknown node> ({content_from_href})")
        else:
            print(f"Content From: {ref_node.title} ({content_from_href})")
            # setdefault: the section may have a source link but no markdown
            # body, in which case no entry for `title` exists yet.
            target_crag_node.info_dict.setdefault(title, {})["ref_id"] = ref_node.data_nid


# sub_info_tables = main_contain.select_one("div.regions__read form")
# sub_info_nodes = sub_info_tables.select("div[data-nid]")

# print(sub_info_nodes[0].get("class", []))

# print(len(sub_info_nodes))  # 子 div 数量
# sub_info_nodes[0]

Info Title: 地图
Info Content (HTML):
None

Info Title: 总结
Info Content (HTML):
<div class="markdown"><p>路书Guidebook: <a href="http://www.itinerantclimberscollective.com/s/WildWestChina_GuidebookFinal_Nov2015_9.pdf">Wild West China Exploration by Ryder Stroud</a></p>
</div>

Info Content (Markdown):
路书Guidebook: [Wild West China Exploration by Ryder Stroud](http://www.itinerantclimberscollective.com/s/WildWestChina_GuidebookFinal_Nov2015_9.pdf)

Info Title: 描述
Info Content (HTML):
<div class="markdown"><p>Walk down from back side for all routes.</p>
</div>

Info Content (Markdown):
Walk down from back side for all routes.

Info Title: 标签
Info Content (HTML):
None

Info Title: 规划您的旅程
Info Content (HTML):
None



In [7]:
# Show the node that the page-parsing cell wrote into, to check its info_dict.
target_crag_node

CragNode(title=Jedi Tower, href=/zh_hans/climbing/china/area/4895530659, data_nid=4895530659, data_subtype=crag, children_count=0, info_dict={'总结': {'content': '路书Guidebook: [Wild West China Exploration by Ryder Stroud](http://www.itinerantclimberscollective.com/s/WildWestChina_GuidebookFinal_Nov2015_9.pdf)'}, '描述': {'content': 'Walk down from back side for all routes.'}})