-
-
Notifications
You must be signed in to change notification settings - Fork 33.3k
/
Copy pathlanguage.py
210 lines (166 loc) · 5.74 KB
/
language.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
"""Helper methods for language selection in Home Assistant."""
from __future__ import annotations
from collections.abc import Iterable
from dataclasses import dataclass
import math
import operator
import re
from homeassistant.const import MATCH_ALL
# Language tags may separate their subtags with either "-" or "_".
SEPARATOR_RE = re.compile(r"[-_]")

# Pairs of language codes that are treated as the same language.
# Each pair must be listed in sorted order: lookups build the candidate
# pair with sorted() before testing membership in this tuple.
SAME_LANGUAGES: tuple[tuple[str, str], ...] = (
    # nb = written Norwegian (Bokmål), no = spoken Norwegian
    ("nb", "no"),
    # he = new Hebrew code, iw = old Hebrew code
    ("he", "iw"),
)
def preferred_regions(
    language: str,
    country: str | None = None,
    code: str | None = None,
) -> Iterable[str]:
    """Yield regions for a language, most preferred first.

    The country hint (upper-cased) always comes first when given. English
    without a country hint falls back to "US". Chinese adds "HK" and "TW"
    when the code is "Hant", otherwise "CN". The upper-cased language is
    always yielded last (e.g. fr -> FR).

    Callers without better information should try the regions in the
    order they are produced.
    """
    if country is not None:
        # An explicit country hint outranks every built-in default.
        yield country.upper()
    elif language == "en":
        # Prefer U.S. English when nothing else is known.
        yield "US"

    if language == "zh":
        if code == "Hant":
            # Traditional Chinese
            yield from ("HK", "TW")
        else:
            yield "CN"

    # Last resort: region named after the language itself (fr -> fr-FR).
    yield language.upper()
# Second subtags that are known to be a script/code rather than a region,
# keyed by language. The comparison is case-sensitive.
_SCRIPT_CODES: dict[str, tuple[str, ...]] = {
    "es": ("419",),
    "sr": ("Latn",),
    "zh": ("Hans", "Hant"),
}


def is_region(language: str, region: str | None) -> bool:
    """Return true if region is not known to be a script/code instead.

    Only a few languages have known script/code subtags; for every other
    language any value (including None) is treated as a region.
    """
    return region not in _SCRIPT_CODES.get(language, ())
def is_language_match(lang_1: str, lang_2: str) -> bool:
    """Return true if two languages are considered the same.

    Languages match when they are equal, or when the pair is listed in
    SAME_LANGUAGES (e.g. nb/no). The comparison is case-sensitive.
    """
    if lang_1 == lang_2:
        # Identical codes need no alias lookup.
        return True

    # Alias pairs are stored sorted, so sort the candidate pair as well.
    candidate = tuple(sorted((lang_1, lang_2)))
    return candidate in SAME_LANGUAGES
@dataclass
class Dialect:
    """Language with optional region and script/code.

    The language is stored lower-cased and the region upper-cased (see
    __post_init__); the code is stored exactly as given.
    """

    language: str
    region: str | None
    code: str | None = None

    def __post_init__(self) -> None:
        """Fix casing of language/region."""
        # Languages are lower-cased
        self.language = self.language.casefold()

        if self.region is not None:
            # Regions are upper-cased
            self.region = self.region.upper()

    def score(
        self, dialect: Dialect, country: str | None = None
    ) -> tuple[float, float]:
        """Return score for match with another dialect where higher is better.

        The result is a 2-tuple meant to be compared as a whole: the first
        element is the main score, the second breaks ties in favour of an
        exact (non-alias) language match when the regions are identical.

        Score < 0 indicates a failure to match.
        """
        if not is_language_match(self.language, dialect.language):
            # Not a match
            return (-1, 0)

        is_exact_language = self.language == dialect.language

        if (self.region is None) and (dialect.region is None):
            # Weak match with no region constraint
            # Prefer exact language match
            return (2 if is_exact_language else 1, 0)

        if (self.region is not None) and (dialect.region is not None):
            if self.region == dialect.region:
                # Same language + region match
                # Prefer exact language match
                return (
                    math.inf,
                    1 if is_exact_language else 0,
                )

            # Regions are both set, but don't match
            return (0, 0)

        # From here on exactly one of the two dialects has a region.
        # Generate ordered list of preferred regions, using this dialect's
        # language/code and the caller's country hint.
        pref_regions = list(
            preferred_regions(
                self.language,
                country=country,
                code=self.code,
            )
        )

        try:
            # Determine score based on position in the preferred regions list.
            if self.region is not None:
                region_idx = pref_regions.index(self.region)
            elif dialect.region is not None:
                region_idx = pref_regions.index(dialect.region)

            # More preferred regions are at the front.
            # Add 1 to boost above a weak match where no regions are set.
            return (1 + (len(pref_regions) - region_idx), 0)
        except ValueError:
            # Region was not in preferred list
            pass

        # Not a preferred region
        return (0, 0)

    @staticmethod
    def parse(tag: str) -> Dialect:
        """Parse language tag into language/region/code.

        The tag is split once on "-" or "_". The part after the separator
        becomes the code when is_region() reports it as a known script/code
        for the language, and the region otherwise.
        """
        parts = SEPARATOR_RE.split(tag, maxsplit=1)

        # Normalise the language before asking is_region(): that check
        # compares against lower-case language codes, so an upper-case tag
        # such as "ZH-Hant" would otherwise have its script stored as a
        # region. __post_init__ applies the same casefold to the result.
        language = parts[0].casefold()
        region: str | None = None
        code: str | None = None

        if len(parts) > 1:
            region_or_code = parts[1]
            if is_region(language, region_or_code):
                # US, GB, etc.
                region = region_or_code
            else:
                # Hant, 419, etc.
                code = region_or_code

        return Dialect(
            language=language,
            region=region,
            code=code,
        )
def matches(
    target: str, supported: Iterable[str], country: str | None = None
) -> list[str]:
    """Return supported language tags that match a target tag, best first.

    A target equal to MATCH_ALL returns every supported tag in its
    original order. Otherwise each supported tag is scored against the
    target (using the optional country hint) and non-matching tags are
    dropped.
    """
    if target == MATCH_ALL:
        return list(supported)

    wanted = Dialect.parse(target)

    # Pair every tag with its score against the target dialect.
    scored_tags = [
        (wanted.score(Dialect.parse(tag), country=country), tag)
        for tag in supported
    ]

    # Higher score is better; the sort is stable, so ties keep input order.
    scored_tags.sort(key=operator.itemgetter(0), reverse=True)

    # A negative first score component means the languages did not match.
    return [tag for tag_score, tag in scored_tags if tag_score[0] >= 0]
def intersect(languages_1: set[str], languages_2: set[str]) -> set[str]:
    """Intersect two sets of languages, treating aliases as equal.

    Returns the members of languages_1 that match at least one member of
    languages_2 according to is_language_match.
    """
    return {
        candidate
        for candidate in languages_1
        if any(is_language_match(candidate, other) for other in languages_2)
    }