Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion inverse_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def main():
parser.add_argument('--text', help='input string')
parser.add_argument('--file', help='input file path')
parser.add_argument('--overwrite_cache', action='store_true',
help='rebuild *.far')
help='rebuild *.fst')
parser.add_argument('--enable_standalone_number', type=str,
default='True',
help='enable standalone number')
Expand Down
2 changes: 1 addition & 1 deletion normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def main():
parser.add_argument('--text', help='input string')
parser.add_argument('--file', help='input file path')
parser.add_argument('--overwrite_cache', action='store_true',
help='rebuild *.far')
help='rebuild *.fst')
args = parser.parse_args()

normalizer = Normalizer(cache_dir='tn',
Expand Down
16 changes: 3 additions & 13 deletions runtime/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,19 +29,9 @@ endif()
include(openfst)
include_directories(${PROJECT_SOURCE_DIR})

add_library(processor STATIC
processor/processor.cc
processor/token_parser.cc
utils/utf8_string.cc
)
if(MSVC)
target_link_libraries(processor PUBLIC fst)
else()
target_link_libraries(processor PUBLIC dl fst)
endif()

add_executable(processor_main bin/processor_main.cc)
target_link_libraries(processor_main PUBLIC processor)
add_subdirectory(utils)
add_subdirectory(processor)
add_subdirectory(bin)

if(BUILD_TESTING)
include(gtest)
Expand Down
2 changes: 2 additions & 0 deletions runtime/bin/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
add_executable(processor_main processor_main.cc)
target_link_libraries(processor_main PUBLIC processor)
8 changes: 4 additions & 4 deletions runtime/bin/processor_main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,19 +34,19 @@ int main(int argc, char* argv[]) {
wetext::Processor processor(FLAGS_tagger, FLAGS_verbalizer);

if (!FLAGS_text.empty()) {
std::string tagged_text = processor.tag(FLAGS_text);
std::string tagged_text = processor.Tag(FLAGS_text);
std::cout << tagged_text << std::endl;
std::string normalized_text = processor.verbalize(tagged_text);
std::string normalized_text = processor.Verbalize(tagged_text);
std::cout << normalized_text << std::endl;
}

if (!FLAGS_file.empty()) {
std::ifstream file(FLAGS_file);
std::string line;
while (getline(file, line)) {
std::string tagged_text = processor.tag(line);
std::string tagged_text = processor.Tag(line);
std::cout << tagged_text << std::endl;
std::string normalized_text = processor.verbalize(tagged_text);
std::string normalized_text = processor.Verbalize(tagged_text);
std::cout << normalized_text << std::endl;
}
}
Expand Down
9 changes: 9 additions & 0 deletions runtime/processor/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
add_library(processor STATIC
processor.cc
token_parser.cc
)
if(MSVC)
target_link_libraries(processor PUBLIC fst utils)
else()
target_link_libraries(processor PUBLIC dl fst utils)
endif()
23 changes: 13 additions & 10 deletions runtime/processor/processor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ Processor::Processor(const std::string& tagger_path,
}
}

std::string Processor::shortest_path(const StdVectorFst& lattice) {
std::string Processor::ShortestPath(const StdVectorFst& lattice) {
StdVectorFst shortest_path;
fst::ShortestPath(lattice, &shortest_path, 1, true);

Expand All @@ -43,31 +43,34 @@ std::string Processor::shortest_path(const StdVectorFst& lattice) {
return output;
}

std::string Processor::compose(const std::string& input,
std::string Processor::Compose(const std::string& input,
const StdVectorFst* fst) {
StdVectorFst input_fst;
compiler_->operator()(input, &input_fst);

StdVectorFst lattice;
fst::Compose(input_fst, *fst, &lattice);
return shortest_path(lattice);
return ShortestPath(lattice);
}

std::string Processor::tag(const std::string& input) {
return compose(input, tagger_.get());
std::string Processor::Tag(const std::string& input) {
return Compose(input, tagger_.get());
}

std::string Processor::verbalize(const std::string& input) {
std::string Processor::Verbalize(const std::string& input) {
if (input.empty()) {
return "";
}
TokenParser parser(parse_type_);
std::string output = parser.reorder(input);
return compose(output, verbalizer_.get());
std::string output = parser.Reorder(input);

output = Compose(output, verbalizer_.get());
output.erase(std::remove(output.begin(), output.end(), '\0'), output.end());
return output;
}

std::string Processor::normalize(const std::string& input) {
return verbalize(tag(input));
std::string Processor::Normalize(const std::string& input) {
return Verbalize(Tag(input));
}

} // namespace wetext
10 changes: 5 additions & 5 deletions runtime/processor/processor.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,13 @@ namespace wetext {
class Processor {
public:
Processor(const std::string& tagger_path, const std::string& verbalizer_path);
std::string tag(const std::string& input);
std::string verbalize(const std::string& input);
std::string normalize(const std::string& input);
std::string Tag(const std::string& input);
std::string Verbalize(const std::string& input);
std::string Normalize(const std::string& input);

private:
std::string shortest_path(const StdVectorFst& lattice);
std::string compose(const std::string& input, const StdVectorFst* fst);
std::string ShortestPath(const StdVectorFst& lattice);
std::string Compose(const std::string& input, const StdVectorFst* fst);

ParseType parse_type_;
std::shared_ptr<StdVectorFst> tagger_ = nullptr;
Expand Down
112 changes: 56 additions & 56 deletions runtime/processor/token_parser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#include "processor/token_parser.h"

#include "utils/log.h"
#include "utils/utf8_string.h"
#include "utils/string.h"

namespace wetext {
const std::string EOS = "<EOS>";
Expand All @@ -41,113 +41,113 @@ const std::unordered_map<std::string, std::vector<std::string>> ITN_ORDERS = {

TokenParser::TokenParser(ParseType type) {
if (type == ParseType::kTN) {
orders = TN_ORDERS;
orders_ = TN_ORDERS;
} else {
orders = ITN_ORDERS;
orders_ = ITN_ORDERS;
}
}

void TokenParser::load(const std::string& input) {
string2chars(input, &text);
CHECK_GT(text.size(), 0);
index = 0;
ch = text[0];
void TokenParser::Load(const std::string& input) {
SplitUTF8StringToChars(input, &text_);
CHECK_GT(text_.size(), 0);
index_ = 0;
ch_ = text_[0];
}

bool TokenParser::read() {
if (index < text.size() - 1) {
index += 1;
ch = text[index];
bool TokenParser::Read() {
if (index_ < text_.size() - 1) {
index_ += 1;
ch_ = text_[index_];
return true;
}
ch = EOS;
ch_ = EOS;
return false;
}

bool TokenParser::parse_ws() {
bool not_eos = ch != EOS;
while (not_eos && ch == " ") {
not_eos = read();
bool TokenParser::ParseWs() {
bool not_eos = ch_ != EOS;
while (not_eos && ch_ == " ") {
not_eos = Read();
}
return not_eos;
}

bool TokenParser::parse_char(const std::string& exp) {
if (ch == exp) {
read();
bool TokenParser::ParseChar(const std::string& exp) {
if (ch_ == exp) {
Read();
return true;
}
return false;
}

bool TokenParser::parse_chars(const std::string& exp) {
bool TokenParser::ParseChars(const std::string& exp) {
bool ok = false;
std::vector<std::string> chars;
string2chars(exp, &chars);
SplitUTF8StringToChars(exp, &chars);
for (const auto& x : chars) {
ok |= parse_char(x);
ok |= ParseChar(x);
}
return ok;
}

std::string TokenParser::parse_key() {
CHECK_NE(ch, EOS);
CHECK_EQ(UTF8_WHITESPACE.count(ch), 0);
std::string TokenParser::ParseKey() {
CHECK_NE(ch_, EOS);
CHECK_EQ(UTF8_WHITESPACE.count(ch_), 0);

std::string key = "";
while (ASCII_LETTERS.count(ch) > 0) {
key += ch;
read();
while (ASCII_LETTERS.count(ch_) > 0) {
key += ch_;
Read();
}
return key;
}

std::string TokenParser::parse_value() {
CHECK_NE(ch, EOS);
std::string TokenParser::ParseValue() {
CHECK_NE(ch_, EOS);
bool escape = false;

std::string value = "";
while (ch != "\"") {
value += ch;
escape = ch == "\\" && !escape;
read();
while (ch_ != "\"") {
value += ch_;
escape = ch_ == "\\" && !escape;
Read();
if (escape) {
value += ch;
read();
value += ch_;
Read();
}
}
return value;
}

void TokenParser::parse(const std::string& input) {
load(input);
while (parse_ws()) {
std::string name = parse_key();
parse_chars(" { ");
void TokenParser::Parse(const std::string& input) {
Load(input);
while (ParseWs()) {
std::string name = ParseKey();
ParseChars(" { ");

Token token(name);
while (parse_ws()) {
if (ch == "}") {
parse_char("}");
while (ParseWs()) {
if (ch_ == "}") {
ParseChar("}");
break;
}
std::string key = parse_key();
parse_chars(": \"");
std::string value = parse_value();
parse_char("\"");
token.append(key, value);
std::string key = ParseKey();
ParseChars(": \"");
std::string value = ParseValue();
ParseChar("\"");
token.Append(key, value);
}
tokens.emplace_back(token);
tokens_.emplace_back(token);
}
}

std::string TokenParser::reorder(const std::string& input) {
parse(input);
std::string TokenParser::Reorder(const std::string& input) {
Parse(input);
std::string output = "";
for (auto& token : tokens) {
output += token.string(orders) + " ";
for (auto& token : tokens_) {
output += token.String(orders_) + " ";
}
return trim(output);
return Trim(output);
}

} // namespace wetext
34 changes: 17 additions & 17 deletions runtime/processor/token_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,12 @@ struct Token {

Token(const std::string& name) : name(name) {}

void append(const std::string& key, const std::string& value) {
void Append(const std::string& key, const std::string& value) {
order.emplace_back(key);
members[key] = value;
}

std::string string(
std::string String(
const std::unordered_map<std::string, std::vector<std::string>>& orders) {
std::string output = name + " {";
if (orders.count(name) > 0) {
Expand All @@ -67,25 +67,25 @@ enum ParseType {
class TokenParser {
public:
TokenParser(ParseType type);
std::string reorder(const std::string& input);
std::string Reorder(const std::string& input);

private:
void load(const std::string& input);
bool read();
bool parse_ws();
bool parse_char(const std::string& exp);
bool parse_chars(const std::string& exp);
std::string parse_key();
std::string parse_value();
void parse(const std::string& input);
void Load(const std::string& input);
bool Read();
bool ParseWs();
bool ParseChar(const std::string& exp);
bool ParseChars(const std::string& exp);
std::string ParseKey();
std::string ParseValue();
void Parse(const std::string& input);

int index;
std::string ch;
std::vector<std::string> text;
std::vector<Token> tokens;
std::unordered_map<std::string, std::vector<std::string>> orders;
int index_;
std::string ch_;
std::vector<std::string> text_;
std::vector<Token> tokens_;
std::unordered_map<std::string, std::vector<std::string>> orders_;
};

} // wetext
} // namespace wetext

#endif // PROCESSOR_TOKEN_PARSER_H_
6 changes: 3 additions & 3 deletions runtime/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@ enable_testing()
link_libraries(gtest_main gmock)
include(GoogleTest)

add_executable(utf8_string_test utf8_string_test.cc)
target_link_libraries(utf8_string_test PUBLIC utils)
gtest_discover_tests(utf8_string_test)
add_executable(string_test string_test.cc)
target_link_libraries(string_test PUBLIC utils)
gtest_discover_tests(string_test)

if(NOT MSVC)
# token_parser_test uses the macro to access the private members
Expand Down
Loading