-
Notifications
You must be signed in to change notification settings - Fork 102
fix(itn): set 0to9 for measure & money #109
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -14,7 +14,7 @@ | |||||||||||
|
|
||||||||||||
| from tn.processor import Processor | ||||||||||||
|
|
||||||||||||
| from pynini import string_file | ||||||||||||
| from pynini import string_file, accep | ||||||||||||
| from pynini.lib.pynutil import delete, insert | ||||||||||||
|
|
||||||||||||
|
|
||||||||||||
|
|
@@ -29,25 +29,31 @@ def build_tagger(self): | |||||||||||
| digit = string_file('itn/chinese/data/number/digit.tsv') # 1 ~ 9 | ||||||||||||
| zero = string_file('itn/chinese/data/number/zero.tsv') # 0 | ||||||||||||
|
|
||||||||||||
| yyyy = digit + (digit | zero)**3 | ||||||||||||
| yy = digit**2 | ||||||||||||
| yyyy = digit + (digit | zero)**3 # 二零零八年 | ||||||||||||
| yyy = digit + (digit | zero)**2 # 公元一六八年 | ||||||||||||
| yy = (digit | zero)**2 # 零八年奥运会 | ||||||||||||
|
Comment on lines
+32
to
+34
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 这个是为了过单元测试,顺手附带修的bug,现在单元测试分成了四组,若这里不添加yy,则其中一组单元测试过不去:
|
||||||||||||
| mm = string_file('itn/chinese/data/date/mm.tsv') | ||||||||||||
| dd = string_file('itn/chinese/data/date/dd.tsv') | ||||||||||||
|
|
||||||||||||
| year = insert('year: "') + (yyyy | yy) + delete('年') + insert('" ') | ||||||||||||
| year = insert('year: "') + (yyyy | yyy | yy) + \ | ||||||||||||
| delete('年') + insert('" ') | ||||||||||||
| year_only = insert('year: "') + (yyyy | yyy | yy) + \ | ||||||||||||
| accep('年') + insert('"') | ||||||||||||
| month = insert('month: "') + mm + insert('"') | ||||||||||||
| day = insert(' day: "') + dd + insert('"') | ||||||||||||
|
|
||||||||||||
| # yyyy/mm/dd | yyyy/mm | mm/dd | ||||||||||||
| # yyyy/mm/dd | yyyy/mm | mm/dd | yyyy | ||||||||||||
| date = ((year + month + day) | ||||||||||||
| | (year + month) | ||||||||||||
| | (month + day)) | ||||||||||||
| | (month + day)) | year_only | ||||||||||||
| self.tagger = self.add_tokens(date) | ||||||||||||
|
|
||||||||||||
| def build_verbalizer(self): | ||||||||||||
| addsign = insert("/") | ||||||||||||
| year = delete('year: "') + self.SIGMA + delete('" ') | ||||||||||||
| year_only = delete('year: "') + self.SIGMA + delete('"') | ||||||||||||
| month = delete('month: "') + self.SIGMA + delete('"') | ||||||||||||
| day = delete(' day: "') + self.SIGMA + delete('"') | ||||||||||||
| verbalizer = (year + addsign).ques + month + (addsign + day).ques | ||||||||||||
| verbalizer |= year_only | ||||||||||||
| self.verbalizer = self.delete_tokens(verbalizer) | ||||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,9 +21,10 @@ | |
|
|
||
| class Measure(Processor): | ||
|
|
||
| def __init__(self, exclude_one=True): | ||
| def __init__(self, exclude_one=True, enable_0_to_9=True): | ||
| super().__init__(name='measure') | ||
| self.exclude_one = exclude_one | ||
| self.enable_0_to_9 = enable_0_to_9 | ||
| self.build_tagger() | ||
| self.build_verbalizer() | ||
|
|
||
|
|
@@ -34,17 +35,15 @@ def build_tagger(self): | |
| units = add_weight(units_en, -1.0) | \ | ||
| ((accep('亿') | accep('兆') | accep('万')).ques + units_zh) | ||
|
|
||
| number = Cardinal().number | ||
| number = Cardinal().number if self.enable_0_to_9 else \ | ||
| Cardinal().number_exclude_0_to_9 | ||
| # 百分之三十, 百分三十, 百分之百 | ||
| percent = ((sign + delete('的').ques).ques + delete('百分') + | ||
| delete('之').ques + (number | cross('百', '100')) | ||
| delete('之').ques + (Cardinal().number | cross('百', '100')) | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 这里没用number而是用cardinal().number是因为百分数不应该区分是否0~9,比如“百分之二”理应被转换为“2%” |
||
| + insert('%')) | ||
|
|
||
| # 十千米每小时 => 10km/h | ||
| measure = number + units | ||
| if self.exclude_one: | ||
| measure |= number + number.plus + units | ||
| measure |= (add_weight(accep('一'), -1.0) + units_zh) | ||
| tagger = insert('value: "') + (measure | percent) + insert('"') | ||
|
|
||
| # 每小时十千米 => 10km/h | ||
|
|
||
This file was deleted.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
之前enable_0_to_9只有在cardinal的tagger中才生效,在遇到量词时 (比如“九天”) measure中用的是cardinal.number,而非cardinal.tagger,此时设置enable_0_to_9=False,“九天”依旧会被转成“9天”
