-
Notifications
You must be signed in to change notification settings - Fork 41
/
Copy pathget_notes.py
73 lines (48 loc) · 1.83 KB
/
get_notes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
__author__ = "ipetrash"
from bs4 import BeautifulSoup
from common import get_attribute_value_by_local_name
def get_note_links(root) -> list[tuple[str, str]]:
    """Collect the footnote references found under *root*.

    A footnote reference is an ``<a>`` element whose ``type`` attribute is
    ``"note"``, for example::

        ...in the style of Robert Adam <a l:href="#note1" type="note">[1]</a>

    Args:
        root: Parsed document (or sub-tree) that provides ``select()``.

    Returns:
        One ``(href, text)`` pair per matching link, in the order
        ``select()`` returns them. ``href`` is whatever
        ``get_attribute_value_by_local_name(link, "href")`` yields and
        ``text`` is the link text with surrounding whitespace stripped.
    """
    # NOTE(review): the helper presumably matches the attribute by local name
    # so that the namespace prefix ("l:href") does not matter -- confirm in
    # the `common` module.
    return [
        (get_attribute_value_by_local_name(anchor, "href"), anchor.text.strip())
        for anchor in root.select('a[type="note"]')
    ]
def get_notes(root) -> list[tuple[str, str, str]]:
    """Extract the footnotes stored in the ``<body name="notes">`` element.

    Example of the markup being parsed::

        <body name="notes">
            <title>
                <p>Notes</p>
            </title>
            <section id="v1_ch1_cite_note-1">
                <title>
                    <p>1</p>
                </title>
                <p>Onii-sama: in case anyone does not know, this is a
                   respectful way of addressing an older brother.
            </section>

    Args:
        root: Parsed document that provides ``select()``.

    Returns:
        One ``(note_id, title, text)`` tuple per ``<section>`` that is a
        direct child of the notes body. ``note_id`` is the section's ``id``
        attribute, ``title`` is the stripped text of its ``<title>`` and
        ``text`` is the stripped text of the rest of the section. A missing
        ``id`` or ``<title>`` is reported as an empty string instead of
        raising.

    Note:
        The ``<title>`` of every processed section is removed from *root*
        (the tree is modified in place), so a second call on the same tree
        returns empty titles.
    """
    items = []
    for note in root.select('body[name="notes"] > section'):
        # Fall back to "" so a section without an id does not raise KeyError.
        note_id = note.attrs.get("id", "")

        # `note.title` is None when the section has no <title> (or it was
        # already removed by an earlier call), so guard before using it.
        title_tag = note.title
        if title_tag is None:
            title = ""
        else:
            title = title_tag.text.strip()
            # Drop <title> so the heading does not leak into the note text.
            title_tag.decompose()

        text = note.text.strip()
        items.append((note_id, title, text))
    return items
if __name__ == "__main__":
    # Demo run: print the footnote links and the footnotes of every FB2 file
    # matched by "input/*.fb2".
    import glob

    for path in glob.glob("input/*.fb2"):
        with open(path, encoding="utf-8") as book_file:
            soup = BeautifulSoup(book_file, "html.parser")

        print(path)
        # Links are collected before the notes, as get_notes() removes the
        # <title> elements of the note sections from the tree.
        print("note_links:", get_note_links(soup))
        print("notes:", get_notes(soup))
        # print() appends its own newline, leaving two blank lines per file.
        print("\n")