-
Notifications
You must be signed in to change notification settings - Fork 92
/
Copy pathmd.py
145 lines (115 loc) · 4.59 KB
/
md.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import os
import re
import itertools
from dataclasses import dataclass, field
import markdown
import bs4
def BeautifulSoup(*args):
    """Build a ``bs4.BeautifulSoup`` tree, always using the lxml parser.

    All positional arguments are forwarded unchanged to the
    ``bs4.BeautifulSoup`` constructor; only the ``features`` keyword is
    fixed here so every caller in this module gets the same parser.
    """
    parser_options = {'features': 'lxml'}
    return bs4.BeautifulSoup(*args, **parser_options)
@dataclass
class markdown_section:
    """A single heading of a parsed Markdown document and what sits below it."""

    # Numeric heading level (the digit of the <hN> tag the section was built from).
    level: int
    # Text of the heading itself.
    heading: str
    # Everything between this heading and the next one, joined together.
    content: str
    # Only the first piece of ``content`` (empty string when there is none).
    first_node_content: str
    # Nested sections; each instance receives its own fresh list.
    children: list = field(default_factory=list)
def parse_document(*, fn=None, data=None, linesep="", as_text=True):
    """Parse a Markdown document into a tree of ``markdown_section`` nodes.

    The Markdown is rendered to HTML; every heading element found in the
    result becomes one section, and the sibling nodes that follow a heading
    (up to the next heading) become that section's content.

    Args:
        fn: Path of a UTF-8 encoded Markdown file. Used whenever it is truthy.
        data: Markdown source text, used when ``fn`` is not given. Must be
            non-empty in that case.
        linesep: Separator placed between the text fragments of a section
            when ``as_text`` is true.
        as_text: When true, section content is the stripped text of the
            nodes following the heading; otherwise it is their serialized
            markup.

    Returns:
        The section built from the first heading of the document, with the
        remaining sections attached below it through ``children``. An empty
        level-1 section is returned when the rendered document has nothing
        to attach a heading to. ``None`` is returned (after printing ``fn``)
        when a section cannot be attached because the heading stack has no
        parent slot, which happens for a heading whose level is below 1.
    """
    if fn:
        # Context manager so the file handle is closed deterministically.
        with open(fn, encoding="utf-8") as source:
            data = source.read()
    else:
        assert data

    soup = BeautifulSoup(
        markdown.markdown(data,
                          extensions=['tables', 'fenced_code', 'sane_lists'])
    )

    if not soup.h1:
        # In a recent PR most of the top-level headings were removed
        # from the Markdown documents, but the code here still relies
        # on them for building the document tree, because parsing
        # happens based on the various headings and their number.
        if soup.body is None:
            return markdown_section(1, '', '', '', [])
        first_child = next(soup.body.children, None)
        if first_child is None:
            # A body without children is just as empty as a missing body.
            return markdown_section(1, '', '', '', [])
        # new_tag()'s second positional parameter is the namespace, not the
        # tag's text, so the label has to be assigned through .string.
        root_heading = soup.new_tag('h1')
        root_heading.string = 'DocumentRoot'
        first_child.insert_before(root_heading)

    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    headings = soup.find_all(re.compile(r"h\d"))
    following = headings[1:] + [None]

    root = None
    # stack[0] is a permanent "no parent" sentinel; the last slot holds the
    # most recently seen section and the slot before it is its parent.
    stack = [None]

    for h, next_h in zip(headings, following):
        # Sibling nodes after this heading, cut off at the next heading.
        # The lambda is consumed within this iteration, so it always sees
        # the current next_h.
        selected = itertools.takewhile(
            lambda n: next_h is None or n != next_h,
            h.next_siblings,
        )

        concat = ""
        first = ""
        if as_text:
            stripped = (getattr(n, 'text', '').strip() for n in selected)
            strings = [s for s in stripped if s]
            concat = linesep.join(strings)
            first = strings[0] if strings else ""
        else:
            selected = list(selected)
            if selected:
                concat = "".join(map(str, selected))
                first = str(selected[0])

        section = markdown_section(int(h.name[1:]), h.text, concat, first)

        if section.level == len(stack):
            # One level deeper than the current depth: open a new slot.
            stack.append(None)
        elif section.level == len(stack) - 1:
            # Same depth as the previous section: reuse the last slot.
            pass
        else:
            # Any other level: drop the slots from that level onwards and
            # open a single fresh one.
            stack[section.level:] = [None]
        stack[-1] = section

        if len(stack) < 2:
            # No parent slot exists for this section; report the offending
            # file and give up, as callers treat None as "no document".
            print(fn)
            return None
        parent = stack[-2]
        if parent is not None:
            parent.children.append(section)

        if root is None:
            root = section

    return root
class markdown_attribute_parser:
    """Read the attribute subsections out of a parsed Markdown document.

    The document is parsed once on construction. Iterating the parser
    yields ``(name, text)`` pairs for every subsection found under the
    single top-level section whose heading equals ``heading_name``, and
    records a per-heading status in ``self.status`` along the way.
    """

    def __init__(self, *, fn=None, data=None, as_text=True, heading_name="Attributes", short=False, linesep=""):
        """Parse the document given by ``fn`` or ``data`` and reset the bookkeeping."""
        self.heading_name = heading_name
        self.root = parse_document(fn=fn, data=data, as_text=as_text, linesep=linesep)
        # Filled during iteration: yielded name -> nested sections.
        self.children = {}
        # Filled during iteration/definition(): heading -> (status, code).
        self.status = {}
        self.short = short

    def definition(self, short=False):
        """Return the root section's text, or ``None`` if nothing was parsed.

        With ``short`` true only the first piece of the root content is
        returned. When there is no root, a ``NO_CONTENT`` entry is stored
        under ``"DEFINITION"`` in ``self.status``.
        """
        root = self.root
        if root is None:
            self.status["DEFINITION"] = ("NO_CONTENT", -1)
            return None
        if short:
            return root.first_node_content
        return root.content

    def _attribute_sections(self):
        """Return the subsections of the unique ``heading_name`` section, else ``None``."""
        top_level = self.root.children if self.root else []
        matching = [s for s in top_level if s.heading == self.heading_name]
        if len(matching) != 1:
            return None
        return matching[0].children

    def __iter__(self):
        """Yield ``(name, text)`` for each attribute subsection.

        A heading containing a bracketed tag such as ``Foo [Bar]`` is
        yielded as the tuple ``("Bar", "Foo")``; other headings are yielded
        as plain strings. ``self.status`` is keyed by the untouched heading,
        ``self.children`` by the yielded name.
        """
        sections = self._attribute_sections()
        if sections is None:
            self.status["ALL"] = ("NO_HEADING", -1)
            return

        for section in sections:
            heading = section.heading
            has_text = bool(section.content.strip())
            self.status[heading] = ("OK", 0) if has_text else ("NO_CONTENT", 0)

            key = heading
            tagged = re.search(r"\[([\w\- ]+)\]", heading)
            if tagged:
                # Cut the bracketed tag out of the heading and pair the two.
                start, end = tagged.span()
                remainder = heading[:start] + heading[end:]
                key = (tagged.group(1), remainder.strip())

            self.children[key] = section.children
            text = section.first_node_content if self.short else section.content
            yield key, text

    def get_children(self, name):
        """Map nested heading -> content for the attribute headed ``name``.

        Returns ``None`` when the attribute section is missing or ambiguous,
        or when no attribute carries exactly that heading.
        """
        sections = self._attribute_sections()
        if sections is None:
            return None
        for section in sections:
            if section.heading == name:
                return {sub.heading: sub.content for sub in section.children}
        return None
if __name__ == "__main__":
    # Manual smoke run: print the short definition of one schema document,
    # followed by a table of its attributes.
    import tabulate

    module_dir = os.path.dirname(__file__)
    sample_path = os.path.join(module_dir, "../docs/schemas/resource/IfcActorResource/Entities/IfcActorRole.md")
    parser = markdown_attribute_parser(fn=sample_path, as_text=True, short=True)

    print(parser.definition(short=True))
    attribute_rows = list(parser)
    print(tabulate.tabulate(attribute_rows))