extract_definition.py
import re
import os
import html

MARKER = "\n\n<!-- end of short definition -->\n"
MULTIPLE_LINEBREAK_PATTERN = re.compile("\n+")
FORMULA_PATTERN = re.compile(r'(Base|General) formula')
DOLAR_PATTERN = re.compile(r'\$\$')
ASTERIX_PATTERN = re.compile(r'\*\*')
SENTENCE_WITH_PSET_PATTERN = re.compile(r'[^.?!]*\b(_Pset|_Qto)\b[^.?!]*[.?!]')
HEADING_PATTERN = re.compile(r'\#+')
KEYWORDS = ['NOTE', 'Note: ', 'DIAGRAM', 'CHANGE', 'IFC4', 'HISTORY',
            'REFERENCE', 'EXAMPLE', 'DEPRECATION']


def extract_definition(txt, return_short=True, return_marked=False, print_split=False):
    """Parse the original text and return only the semantic definition,
    or place a marker at its end."""
    heading = txt.split('\n\n', 1)[0] + '\n\n'
    txt = txt[len(heading):]
    if not return_short and not return_marked:
        return txt
    try:
        txt = html.unescape(txt)
    except TypeError:
        # Error when content is a link, for example:
        # "https://github.com/buildingSMART/IFC4.3.x-development/edit/master/docs/schemas/core/
        # IfcProductExtension/Types/IfcAlignmentTypeEnum.md#L0 has no content"
        txt = ''
    # Candidate cut-off positions: formulas, paragraph breaks, annotation
    # keywords, sentences mentioning property/quantity sets, and sub-headings.
    s1 = re.search(DOLAR_PATTERN, txt).start() if re.search(DOLAR_PATTERN, txt) else -1
    s2 = re.search(FORMULA_PATTERN, txt).start() if re.search(FORMULA_PATTERN, txt) else -1
    s3 = re.search(MULTIPLE_LINEBREAK_PATTERN, txt).start() if re.search(MULTIPLE_LINEBREAK_PATTERN, txt) else -1
    s4 = -1
    for k in KEYWORDS:
        x = txt.find(k)
        if x != -1 and (s4 == -1 or x < s4):
            # we found a match without a previous match, or in front of the previous match
            s4 = x
    s5 = re.search(SENTENCE_WITH_PSET_PATTERN, txt).start() if re.search(SENTENCE_WITH_PSET_PATTERN, txt) else -1
    s6 = re.search(HEADING_PATTERN, txt).start() if re.search(HEADING_PATTERN, txt) else -1
    i = int(min([x for x in [s1, s2, s3, s4, s5, s6, 1e5] if x >= 0]))
    # If the definition ends with a list introduction, extend the cut-off to
    # the end of the bullet list (guard against i == 0 and the 1e5 sentinel).
    if 0 < i < len(txt) and txt[i - 1] in [":", "-", "–"]:
        i = find_last_bullet_end_position(txt, i)
    if i >= 0 and i != 1e5:
        if print_split:
            print(
                "\n\n\033[92m"
                + heading
                + txt[:i].rstrip()
                + "\033[91m"
                + txt[i : i + 500]
                + "...\033[0m"
            )
        if return_short:
            return txt[:i]
        elif return_marked:
            return heading + txt[:i].rstrip() + MARKER + txt[i:]
        else:
            return ""
    else:
        return txt
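
# Illustrative usage (the sample text below is made up and only sketches the
# expected behaviour; it is not taken from the IFC documentation):
#
#     extract_definition(
#         "# IfcWall\n\n"
#         "The wall is a vertical construction that bounds spaces.\n\n"
#         "NOTE  Definition according to ISO 6707-1.\n"
#     )
#     # -> "The wall is a vertical construction that bounds spaces."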


def find_last_bullet_end_position(text, i):
    """Find the end position of the last bullet point of the current list."""
    lines = text[i:].split('\n')
    bullet_length = 0
    for line in lines:
        if line.strip() == '':
            bullet_length += 1
        elif line.strip().startswith('*'):
            bullet_length += len(line) + 1
        elif line.strip().startswith('and'):
            bullet_length += len(line) + 1
        else:
            return i + bullet_length - 1
    # The bullet list runs to the end of the text.
    return i + bullet_length - 1
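
# Illustrative behaviour (made-up input): for
#     find_last_bullet_end_position("list:\n* first\n* second\nTrailing.", 5)
# the two bullet lines are accumulated and the function returns 22, the offset
# of the newline just before "Trailing.".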


def enrich_all_markdowns(directory_path, save=False):
    """Parse all markdown files in subdirectories and process their definitions."""
    for root, dirs, files in os.walk(directory_path):
        for file in files:
            if file.endswith('.md') and file != 'README.md':
                file_path = os.path.join(root, file)
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        new_def = f.read()
                    # Normalise non-breaking spaces to regular spaces (the
                    # replacement character is ambiguous in the source;
                    # U+00A0 is assumed here).
                    new_def = new_def.replace("\u00a0", " ")
                    new_def = extract_definition(
                        new_def,
                        return_short=False,
                        return_marked=True,
                        print_split=True,
                    )
                    if save:
                        with open(file_path, 'w', encoding='utf-8') as f:
                            f.write(new_def)
                except Exception as e:
                    print(f"Failed to process {file_path}: {e}")


if __name__ == "__main__":
    DIR_PATH = r".\docs\schemas"
    enrich_all_markdowns(DIR_PATH, save=False)
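
# To actually write the end-of-definition markers back into the markdown
# files, call the helper with save=True (assumed workflow, same Windows-style
# path as above), e.g.:
#     enrich_all_markdowns(r".\docs\schemas", save=True)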