From 5d21469c9091764b972a31561b5346669e52ff1d Mon Sep 17 00:00:00 2001 From: xingchensong Date: Thu, 12 Oct 2023 18:15:21 +0800 Subject: [PATCH] =?UTF-8?q?fix(itn):=20=E4=B8=83=E5=85=AB=E5=85=AC?= =?UTF-8?q?=E6=96=A4=20=3D>=207~8kg,=20=E4=B8=83=E5=85=AB=E7=99=BE?= =?UTF-8?q?=E5=9D=97=20=3D>=20700~800=E5=9D=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- itn/chinese/data/measure/units_en.tsv | 1 + itn/chinese/rules/cardinal.py | 14 ++++++++++++++ itn/chinese/rules/measure.py | 3 +++ itn/chinese/test/data/cardinal.txt | 5 +++++ itn/chinese/test/data/measure.txt | 5 +++++ 5 files changed, 28 insertions(+) diff --git a/itn/chinese/data/measure/units_en.tsv b/itn/chinese/data/measure/units_en.tsv index e045fb1c..d13d358c 100644 --- a/itn/chinese/data/measure/units_en.tsv +++ b/itn/chinese/data/measure/units_en.tsv @@ -24,6 +24,7 @@ 千卡 kcal 千克力 kgf 千克 kg +公斤 kg 千赫兹 khz 平方千米 km² 公里 km diff --git a/itn/chinese/rules/cardinal.py b/itn/chinese/rules/cardinal.py index 967ab22a..a9b5f207 100644 --- a/itn/chinese/rules/cardinal.py +++ b/itn/chinese/rules/cardinal.py @@ -72,6 +72,13 @@ def build_tagger(self): (number + accep('亿') + delete('零').ques).ques + number) # 负的xxx 1.11, 1.01 number = sign.ques + number + (dot + digits.plus).ques + # 五六万,三五千,六七百,三四十 + number |= add_weight( + (digit + insert("0~") + digit + cross("十", "0")) | + (digit + insert("00~") + digit + cross("百", "00")) | + (digit + insert("000~") + digit + cross("千", "000")) | + (digit + insert("0000~") + digit + cross("万", "0000")), -1.0 + ) self.number = number.optimize() # 十/百/千/万 @@ -87,6 +94,13 @@ def build_tagger(self): (number_exclude_0_to_9 | digits) + (dot + digits.plus).plus ) + # 五六万,三五千,六七百,三四十 + number_exclude_0_to_9 |= add_weight( + (digit + insert("0~") + digit + cross("十", "0")) | + (digit + insert("00~") + digit + cross("百", "00")) | + (digit + insert("000~") + digit + cross("千", "000")) | + (digit + insert("0000~") + digit + cross("万", "0000")), -1.0 + ) self.number_exclude_0_to_9 = (sign.ques + number_exclude_0_to_9).optimize() # noqa # cardinal string like 127.0.0.1, used in ID, IP, etc. diff --git a/itn/chinese/rules/measure.py b/itn/chinese/rules/measure.py index 2415cdc4..17d1ec10 100644 --- a/itn/chinese/rules/measure.py +++ b/itn/chinese/rules/measure.py @@ -31,6 +31,7 @@ def __init__(self, exclude_one=True, enable_0_to_9=True): def build_tagger(self): units_en = string_file('itn/chinese/data/measure/units_en.tsv') units_zh = string_file('itn/chinese/data/measure/units_zh.tsv') + digit = string_file('itn/chinese/data/number/digit.tsv') # 1 ~ 9 sign = string_file('itn/chinese/data/number/sign.tsv') # + - to = cross('到', '~') | cross('到百分之', '~') @@ -48,6 +49,8 @@ def build_tagger(self): # 十千米每小时 => 10km/h, 十一到一百千米每小时 => 11~100km/h measure = number + (to + number).ques + units + # 七八块钱 + measure |= add_weight(digit + insert("~") + digit + units, -1.0) tagger = insert('value: "') + (measure | percent) + insert('"') # 每小时十千米 => 10km/h, 每小时三十到三百一十一千米 => 30~311km/h diff --git a/itn/chinese/test/data/cardinal.txt b/itn/chinese/test/data/cardinal.txt index 002edde5..8316fee1 100644 --- a/itn/chinese/test/data/cardinal.txt +++ b/itn/chinese/test/data/cardinal.txt @@ -1,3 +1,8 @@ 幺幺零 => 110 幺二七点零点零点幺 => 127.0.0.1 这是手机一八五四四一三九一二一 => 这是手机18544139121 +三五百 => 300~500 +三五千 => 3000~5000 +三五万 => 30000~50000 +三四万 => 30000~40000 +五六十 => 50~60 diff --git a/itn/chinese/test/data/measure.txt b/itn/chinese/test/data/measure.txt index 3499a0fb..a58b975a 100644 --- a/itn/chinese/test/data/measure.txt +++ b/itn/chinese/test/data/measure.txt @@ -32,3 +32,8 @@ 百分之三十一到百分之百 => 31~100% 十一到一百千米每小时 => 11~100km/h 每小时三十到三百一十一千米 => 30~311km/h +七八公斤 => 7~8kg +五六十块钱 => 50~60块钱 +三五百公里 => 300~500km +八九千美元 => $8000~9000 +三四万吨 => 30000~40000吨