/
postprocess.py
103 lines (90 loc) · 3.2 KB
/
postprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import re
import sys
# HTML fragment, delimited by "en only" marker comments, stating that no
# English text is available. Each of the three lines ends with a newline.
ENGLISH_UNAVAILABLE = "\n".join(
    (
        "<!-- begin en only -->",
        '<p style="font-style:italic">English text is not available in this practice contest.</p>',
        "<!-- end en only -->",
        "",  # yields the trailing newline after the closing marker
    )
)
def sub_while_unchange(pattern, repl, str):
    """Apply ``re.sub(pattern, repl, ...)`` repeatedly until a fixed point.

    Args:
        pattern: Regular expression passed to ``re.sub``.
        repl: Replacement passed to ``re.sub``.
        str: Text to rewrite. (The name shadows the builtin; it is kept
            because it is part of the function's signature.)

    Returns:
        The text after the first substitution pass that leaves it unchanged.
        An empty input is returned as-is without running any substitution.
    """
    previous = ""
    current = str
    while current != previous:
        # Remember the text before this pass so the next check can tell
        # whether the pass changed anything.
        previous, current = current, re.sub(pattern, repl, current)
    return current
# LaTeX commands that are rendered as plain text / Unicode, keyed by the
# command name without its leading backslash.
_LATEX_SYMBOLS = {
    "leq": "≤",
    "le": "≤",
    "geq": "≥",
    "ge": "≥",
    "neq": "≠",
    "ne": "≠",
    "ldots": "...",
    "vdots": "...",
    "sum": "Σ",
    "min": "min",
    "max": "max",
    "in": "∈",
    "times": "×",
}

# A LaTeX control word: a backslash followed by the longest run of letters.
# Matching the whole word keeps e.g. ``\infty`` from being read as ``\in``.
_LATEX_COMMAND = re.compile(r"\\([A-Za-z]+)")


def _replace_latex_command(match):
    """Return the replacement for a known LaTeX command, else the text unchanged."""
    return _LATEX_SYMBOLS.get(match.group(1), match.group(0))


def postprocess(html_text):
    """Apply a fixed series of textual rewrites to an HTML document.

    The rewrites, in order:

    1. Remove the ``<p>`` wrapper directly inside ``<blockquote>`` and turn
       ``<pre><code>`` blocks into ``<blockquote>`` blocks.
    2. Render ``$...$`` as ``<i>...</i>``, convert ``_`` / ``^`` inside
       ``<i>`` elements to ``<sub>`` / ``<sup>``, and replace a fixed set of
       LaTeX commands with text or Unicode symbols.
    3. Add ``c_table`` / ``c_th`` / ``c_td`` / ``c_thead`` classes to bare
       table tags.
    4. Wrap the content that follows each ``<h3>`` heading in a ``<div>``.

    Args:
        html_text: The HTML document as a string.

    Returns:
        The rewritten HTML string.
    """
    # --- block quotes and code blocks -------------------------------------
    html_text = html_text.replace("<blockquote>\n<p>", "<blockquote>")
    html_text = html_text.replace("</p>\n</blockquote>", "</blockquote>")
    # ``[^>]*`` keeps each match inside its own tag; a greedy ``.*`` would
    # swallow the content (and closing tags) of a single-line code block.
    html_text = re.sub(r"<pre[^>]*><code[^>]*>", "<blockquote>", html_text)
    html_text = html_text.replace("</code></pre>", "</blockquote>")

    # --- inline math ------------------------------------------------------
    html_text = html_text.replace("{,}", ",")
    html_text = re.sub(r"\$(.*?)\$", r"<i>\1</i>", html_text)
    # Each pass converts at most one sub/superscript per <i> element, so the
    # substitutions are repeated until nothing changes. Braced forms are
    # handled before single-character forms.
    html_text = sub_while_unchange(
        r"<i>(.*?)\_\{(.*?)\}(.*?)</i>", r"<i>\1<sub>\2</sub>\3</i>", html_text
    )
    html_text = sub_while_unchange(
        r"<i>(.*?)\_(.)(.*?)\</i>", r"<i>\1<sub>\2</sub>\3</i>", html_text
    )
    html_text = sub_while_unchange(
        r"<i>(.*?)\^\{(.*?)\}(.*?)</i>", r"<i>\1<sup>\2</sup>\3</i>", html_text
    )
    html_text = sub_while_unchange(
        r"<i>(.*?)\^(.)(.*?)\</i>", r"<i>\1<sup>\2</sup>\3</i>", html_text
    )
    # Known commands are replaced; unknown ones (``\infty``, ``\left``, ...)
    # are left untouched instead of being corrupted by a prefix match.
    html_text = _LATEX_COMMAND.sub(_replace_latex_command, html_text)

    # --- table styling ----------------------------------------------------
    html_text = html_text.replace("<table>", '<table class="c_table">')
    html_text = html_text.replace("<th>", '<th class="c_th">')
    html_text = html_text.replace("<td>", '<td class="c_td">')
    html_text = html_text.replace("<thead>", '<thead class="c_thead">')

    # --- section <div> wrapping -------------------------------------------
    # NOTE: tagging sections as "ja only" / "en only" is currently disabled;
    # sections are only wrapped in plain <div> elements.
    # Close the previous section before every heading and open a new one
    # after it.
    html_text = html_text.replace("<h3>", "</div>\n<h3>")
    html_text = html_text.replace("</h3>", "</h3>\n<div>")
    # The first heading has no preceding section, so drop its "</div>".
    html_text = html_text.replace("</div>\n<h3>", "<h3>", 1)
    # Close the last section's <div> just before the final "</td>", if any.
    # NOTE(review): presumably the document body sits in a table cell; confirm.
    head, closing_td, tail = html_text.rpartition("</td>")
    if closing_td:
        html_text = head + "</div>\n" + closing_td + tail
    # When an <h2> is followed (after whitespace only) by a section-closing
    # "</div>", move that "</div>" in front of the <h2>.
    html_text = re.sub(
        r"([\s\S]*?)<h2>(.*?)</h2>(\s*?)</div>([\s\S]*?)<h3>",
        r"\1</div>\n<h2>\2</h2>\3\4<h3>",
        html_text,
    )
    # Remove empty headings.
    html_text = html_text.replace("<h3></h3>", "")
    return html_text
def main():
    """Read HTML from standard input and print the post-processed result."""
    converted = postprocess(sys.stdin.read())
    print(converted)
# Run the stdin-to-stdout filter only when executed as a script, not on import.
if __name__ == "__main__":
    main()