Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions itn/chinese/data/measure/units_en.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
千卡 kcal
千克力 kgf
千克 kg
公斤 kg
千赫兹 khz
平方千米 km²
公里 km
Expand Down
14 changes: 14 additions & 0 deletions itn/chinese/rules/cardinal.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,13 @@ def build_tagger(self):
(number + accep('亿') + delete('零').ques).ques + number)
# optional leading sign and optional fractional part, e.g. 负的xxx, 1.11, 1.01
number = sign.ques + number + (dot + digits.plus).ques
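# approximate range from two adjacent digits sharing one unit (十/百/千/万):
# 五六万 => 50000~60000, 三五千 => 3000~5000, 六七百 => 600~700, 三四十 => 30~40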
number |= add_weight(
(digit + insert("0~") + digit + cross("十", "0")) |
(digit + insert("00~") + digit + cross("百", "00")) |
(digit + insert("000~") + digit + cross("千", "000")) |
(digit + insert("0000~") + digit + cross("万", "0000")), -1.0
)
self.number = number.optimize()

# 十/百/千/万
Expand All @@ -87,6 +94,13 @@ def build_tagger(self):
(number_exclude_0_to_9 | digits) +
(dot + digits.plus).plus
)
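# approximate range from two adjacent digits sharing one unit (十/百/千/万):
# 五六万 => 50000~60000, 三五千 => 3000~5000, 六七百 => 600~700, 三四十 => 30~40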
number_exclude_0_to_9 |= add_weight(
(digit + insert("0~") + digit + cross("十", "0")) |
(digit + insert("00~") + digit + cross("百", "00")) |
(digit + insert("000~") + digit + cross("千", "000")) |
(digit + insert("0000~") + digit + cross("万", "0000")), -1.0
)
self.number_exclude_0_to_9 = (sign.ques + number_exclude_0_to_9).optimize() # noqa

# cardinal string like 127.0.0.1, used in ID, IP, etc.
Expand Down
3 changes: 3 additions & 0 deletions itn/chinese/rules/measure.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def __init__(self, exclude_one=True, enable_0_to_9=True):
def build_tagger(self):
units_en = string_file('itn/chinese/data/measure/units_en.tsv')
units_zh = string_file('itn/chinese/data/measure/units_zh.tsv')
digit = string_file('itn/chinese/data/number/digit.tsv') # 1 ~ 9
sign = string_file('itn/chinese/data/number/sign.tsv') # + -
to = cross('到', '~') | cross('到百分之', '~')

Expand All @@ -48,6 +49,8 @@ def build_tagger(self):

# 十千米每小时 => 10km/h, 十一到一百千米每小时 => 11~100km/h
measure = number + (to + number).ques + units
# two adjacent single digits before a unit form a range, e.g. 七八公斤 => 7~8kg (also 七八块钱)
measure |= add_weight(digit + insert("~") + digit + units, -1.0)
tagger = insert('value: "') + (measure | percent) + insert('"')

# 每小时十千米 => 10km/h, 每小时三十到三百一十一千米 => 30~311km/h
Expand Down
5 changes: 5 additions & 0 deletions itn/chinese/test/data/cardinal.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
幺幺零 => 110
幺二七点零点零点幺 => 127.0.0.1
这是手机一八五四四一三九一二一 => 这是手机18544139121
三五百 => 300~500
三五千 => 3000~5000
三五万 => 30000~50000
三四万 => 30000~40000
五六十 => 50~60
5 changes: 5 additions & 0 deletions itn/chinese/test/data/measure.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,8 @@
百分之三十一到百分之百 => 31~100%
十一到一百千米每小时 => 11~100km/h
每小时三十到三百一十一千米 => 30~311km/h
七八公斤 => 7~8kg
五六十块钱 => 50~60块钱
三五百公里 => 300~500km
八九千美元 => $8000~9000
三四万吨 => 30000~40000吨