-
Notifications
You must be signed in to change notification settings - Fork 4
/
hyphenate.py
128 lines (109 loc) · 2.62 KB
/
hyphenate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
""" High precision hyphenator
Uses a set of known word segments to find hyphenation points in a word,
then applies corrections, such as moving a hyphen so that it falls
between the two consonants of a double consonant.
"""
import regex
# Known word segments. Every occurrence of one of these inside a long word is
# wrapped in hyphens by hyphenate() and so becomes a candidate break point.
# (The entries look like romanized Pali; that is not stated in this file.)
segments = {
    "dhamma",
    "putta",
    "deva",
    "khema",
    "vibhaṅga",
    "suñña",
    "mutta",
    "gotta",
    "yata",
    "mogga",
    "sevi",
    "saṅk",
    "rīsa",
    "mahā",
    "pari",
    "bodhi",
    "vitakka",
    "bahu",
    "khemā",
    "ratha",
    "rāja",
    "nibbāna",
    "sati",
    "dukkha",
    "vinī",
    "gatā",
    "cūḷa",
    "sacca",
    "rāhu",
    "piṇḍi",
    "Ānanda",
    "bhadde",
    "kaḷā",
    "bara",
    "indriya",
    "sakula",
    "samaṇa",
    "giri",
    "kumāra",
    "bala",
    "thulla",
    "caṇḍala",
    "pokkha",
    "loma",
    "kana",
    "iccha",
    "aṅguttara",
    "kattha",
    "koccha",
    "nimmā",
    "eka",
    "hatthi",
    "pada",
    "saka",
    "bāla",
    "komāra",
    "sammā",
    "diṭṭhi",
    "tiṭṭhi",
    "patti",
    "janīya",
    "thaddha",
    "kopama",
    "gamā",
    "dūpama",
    "bhacca",
    "khamma",
    "kacca",
    "puṇḍa",
}
# One consonant unit, as a non-capturing group: the pair "br"; one of
# k g c j t ṭ d ḍ b p followed by "h"; one of k g c j t ṭ d ḍ p NOT followed
# by "h"; one of the listed single letters; or "b" not followed by "r" or "h".
cons = "(?:br|[kgcjtṭdḍbp]h|[kgcjtṭdḍp](?!h)|[mnyrlvshṅṇṃṁñḷ]|b(?![rh]))"

# Vowel inventory and the character classes derived from it.
vowel_chars = 'aioueāīū'
vowel_pattern = '[{}]'.format(vowel_chars.lower())
# Anything that is neither a vowel nor a hyphen.
vowel_antipattern = '[^{}-]'.format(vowel_chars.lower())

# Longest segments first, so the alternation prefers the longest match.
# A segment's final vowel is replaced by the vowel character class, so the
# segment matches with any vowel in that position.
segments_revoweled = [
    regex.sub(vowel_pattern + '$', vowel_pattern, seg, flags=regex.I)
    for seg in sorted(segments, key=len, reverse=True)
]
segment_rex = regex.compile('(' + '|'.join(segments_revoweled) + ')', flags=regex.I)

# A run of alphabetic characters, i.e. one piece between hyphens.
alpha_rex = regex.compile(r'\p{alpha}+')
def add_hyphens(match):
    """Return the matched text with a hyphen on each non-vowel edge.

    ``match`` is a match object (anything where ``match[0]`` is the matched
    text). A hyphen is prepended when the text does not start with a vowel
    and appended when it does not end with one; an edge that is a vowel is
    left bare. An empty match is returned unchanged.
    """
    segment = match[0]
    if not segment:
        return segment
    # The previous code tested against an undefined name ``vowels`` and so
    # raised NameError on every call. ``vowel_chars`` is the module's vowel
    # inventory; it is lower-case only, so fold case before testing
    # (segments such as "Ānanda" start with an upper-case vowel).
    starts_with_vowel = segment[0].lower() in vowel_chars
    ends_with_vowel = segment[-1].lower() in vowel_chars
    if not starts_with_vowel:
        segment = '-' + segment
    if not ends_with_vowel:
        segment = segment + '-'
    return segment
def fix_hyphens(word):
    """Adjust provisional hyphen positions in ``word`` and return the result.

    Four rewrite rules are applied:

    1. A hyphen directly before two consonant units moves between them
       ("-XY" becomes "X-Y").
    2. A hyphen between one of k g c j ḍ ṭ d t p b and a following
       "h" + vowel moves to after that vowel ("k-ha" becomes "kha-").
    3. A hyphen preceded by at most three letters at the start of the word
       is removed.
    4. A hyphen followed by at most three letters at the end of the word
       is removed.
    """
    double_consonant = r'-({})({})'.format(cons, cons)
    stop_then_h = r'([kgcjḍṭdtpb])-(h{})'.format(vowel_pattern)

    # NOTE(review): rules 1 and 2 are taken to be the body of the two-pass
    # loop, with rules 3 and 4 applied once afterwards - confirm against the
    # intended loop extent.
    for _ in range(0, 2):
        word = regex.sub(double_consonant, r'\1-\2', word, flags=regex.I)
        word = regex.sub(stop_then_h, r'\1\2-', word, flags=regex.I)

    # Drop hyphens that would leave a very short fragment at either edge.
    word = regex.sub(r'^(\p{alpha}{0,3})-', r'\1', word)
    return regex.sub(r'-(\p{alpha}{0,3})$', r'\1', word)
def hyphenate(word, max_length):
    """Insert soft hyphens into ``word`` when it is longer than ``max_length``.

    Args:
        word: The word to hyphenate.
        max_length: Words of at most this many characters are returned
            unchanged.

    Returns:
        ``word`` itself if it is short enough; otherwise a copy in which
        every break point is marked with U+00AD (soft hyphen). Any ``-``
        already present in a long word is also turned into a soft hyphen by
        the final replacement.

    Side effects:
        Prints ``Segment too long: ...`` for each alphabetic piece of the
        result that is still longer than ``max_length``.
    """
    if len(word) <= max_length:
        return word

    # Wrap every known segment in hyphens to mark candidate break points.
    word = segment_rex.sub(r'-\1-', word)

    # Collapse runs of hyphens to a single one. A single replace('--', '-')
    # pass only handles non-overlapping pairs, so a run of three (e.g. an
    # existing hyphen between two matched segments) would leave "--" behind
    # and end up as two adjacent soft hyphens.
    while '--' in word:
        word = word.replace('--', '-')

    word = word.strip('-')
    word = fix_hyphens(word)

    # Best-effort diagnostic: report pieces that could not be broken down
    # far enough; the word is still returned.
    for segment in alpha_rex.findall(word):
        if len(segment) > max_length:
            print('Segment too long: {}'.format(segment))

    return word.replace('-', '\xad')