-
Notifications
You must be signed in to change notification settings - Fork 6
/
yomituki.py
106 lines (85 loc) · 2.61 KB
/
yomituki.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# coding: utf-8
import itertools
import bs4
from janome.tokenizer import Tokenizer
from pykakasi import kakasi
# Shared janome tokenizer, created once at import time and used by yomituki().
to = Tokenizer()
# NOTE: this rebinds the imported `kakasi` class name to an instance, so the
# class itself is no longer reachable under that name after this line.
kakasi = kakasi()
# Mode "K" -> "H": presumably katakana-to-hiragana conversion in pykakasi's
# setMode API (hantei() treats the converted reading as hiragana) -- confirm
# against the installed pykakasi version.
kakasi.setMode("K", "H")
# Converter object used by hantei() through conv.do(...).
conv = kakasi.getConverter()
def is_kana(character):
    """Return True when *character*'s code point lies in U+3041..U+30FE.

    The bounds are the same inclusive range as the decimal values
    12353..12542. *character* must be a single character, because it is
    passed straight to ``ord``.
    """
    code_point = ord(character)
    return 0x3041 <= code_point <= 0x30FE
# def splitter(org):
# start = 0
# iskana_p = False
# last_kanji = -1
# for index, iskana in enumerate(map(is_kana, org)):
# if iskana_p and not iskana:
# yield org[start:index], last_kanji - start
# start = index
# if not iskana:
# last_kanji = index
# iskana_p = iskana
# yield org[start:], last_kanji - start
#
#
# def analyzer(org, yomi):
# _yomi = yomi[:]
# pindex = 0
# for word, index in list(splitter(org))[::-1]:
# _yomi = _yomi[:pindex] if pindex else _yomi
# kana_part = word[index+1:]
# yield _yomi[_yomi.rfind(kana_part) - 1:]
# yield kana_part
# pindex -= len(kana_part)
def hantei(word):
    """Decide whether a token needs a ruby annotation.

    *word* is any object exposing ``surface`` and ``reading`` attributes
    (the tokens produced by the module-level tokenizer are passed in by
    ``yomituki``).

    Returns a 3-tuple ``(source, needs_ruby, yomi)``:

    * ``((surface,), False, None)`` when the reading is ``'*'``, when the
      reading equals the surface, or when the reading converted through
      ``conv`` equals the surface;
    * ``(surface, True, converted_reading)`` otherwise.
    """
    surface = word.surface
    reading = word.reading
    unannotated = (surface,), False, None

    # Reading is the '*' placeholder or identical to the surface text.
    if reading == '*' or surface == reading:
        return unannotated

    converted = conv.do(reading)
    if surface != converted:
        return surface, True, converted
    # The surface already equals the converted reading.
    return unannotated
def cut_end(org, hira):
    """Split off the trailing characters shared by a surface and its reading.

    The characters that *org* (surface text) and *hira* (its reading) have in
    common at the end are emitted as plain text, so only the differing head
    is paired with a reading.

    Returns a tuple of pieces, where a piece is either a plain ``str`` or an
    ``(org_part, reading_part)`` pair:

    * ``((org, hira),)`` when the last characters differ;
    * ``((org_head, hira_head), shared_tail)`` when a proper tail is shared;
    * ``(org,)`` when no head is left to annotate: *org* is entirely a
      suffix of *hira*, *hira* is entirely a suffix of *org*, or either
      string is empty.

    The last two sub-cases (reading shorter than the surface, empty input)
    previously raised ``IndexError``; they now fall back to the plain
    surface text.
    """
    if not org or not hira:
        return org,

    # Count how many trailing characters the two strings have in common.
    # zip() stops at the shorter string, so indexing can never run off the
    # front of either one.
    shared = 0
    for surface_char, reading_char in zip(reversed(org), reversed(hira)):
        if surface_char != reading_char:
            break
        shared += 1

    if shared == 0:
        return (org, hira),
    if shared == len(org) or shared == len(hira):
        # One string is wholly contained in the other's tail: nothing
        # sensible to annotate, so emit the surface unchanged.
        return org,
    return (org[:-shared], hira[:-shared]), hira[-shared:]
def yomituki(sentence):
    """Tokenise *sentence* and lazily yield one tuple of pieces per token.

    Each token is classified by ``hantei``. Tokens flagged as needing ruby
    are passed to ``cut_end`` together with their reading; the others are
    yielded as the source value ``hantei`` returned. The result is a
    generator expression, so the per-token work happens as it is consumed.
    """
    classified = map(hantei, to.tokenize(sentence))
    return (
        cut_end(surface, reading) if needs_ruby else surface
        for surface, needs_ruby, reading in classified
    )
def ruby_wrap(org, yomi):
    """Return a ``<ruby>`` element with *org* as base and *yomi* as ruby text.

    ``<rp>`` parentheses are included around the ``<rt>`` element. Both
    arguments are inserted as-is; no HTML escaping is applied here.
    """
    template = '<ruby><rb>{}</rb><rp>(</rp><rt>{}</rt><rp>)</rp></ruby>'
    return template.format(org, yomi)
def ruby_text(text):
    """Return *text* as a string with ``<ruby>`` markup added.

    The per-token piece tuples from ``yomituki`` are flattened. ``str``
    pieces are copied through unchanged, and every other piece is unpacked
    into ``ruby_wrap``.

    NOTE(review): plain pieces are concatenated without HTML escaping --
    confirm callers only pass text that is safe to embed in markup.
    """
    pieces = itertools.chain.from_iterable(yomituki(text))
    return ''.join(
        piece if isinstance(piece, str) else ruby_wrap(*piece)
        for piece in pieces
    )
def ruby_p(p):
    """Render the children of *p* inside a ``<p>`` element.

    ``NavigableString`` children are converted with ``ruby_text``. ``Tag``
    children are serialised unchanged with ``str``. Any other child is
    skipped. The wrapper is always a bare ``<p>...</p>``, whatever element
    *p* itself is.
    """
    chunks = ['<p>']
    for child in p:
        if isinstance(child, bs4.element.NavigableString):
            chunks.append(ruby_text(str(child)))
        elif isinstance(child, bs4.element.Tag):
            chunks.append(str(child))
    chunks.append('</p>')
    return ''.join(chunks)
def ruby_div(div):
    """Render the children of *div* inside a ``<div>`` element.

    ``NavigableString`` children that are not whitespace-only are converted
    with ``ruby_text``; whitespace-only ones are dropped. ``Tag`` children
    are rendered through ``ruby_p``. Any other child is skipped.
    """
    chunks = ['<div>']
    for child in div:
        if isinstance(child, bs4.element.NavigableString):
            # Skip strings that are empty once stripped.
            if not child.strip():
                continue
            chunks.append(ruby_text(str(child)))
        elif isinstance(child, bs4.element.Tag):
            chunks.append(ruby_p(child))
    chunks.append('</div>')
    return ''.join(chunks)