RegexTokenizer implementation
PiperOrigin-RevId: 322275214
flamearrow authored and tflite-support-robot committed Jul 28, 2020
1 parent 45a2c21 commit f06da09
Showing 8 changed files with 363 additions and 15 deletions.
18 changes: 18 additions & 0 deletions tensorflow_lite_support/cc/text/tokenizers/BUILD
@@ -158,6 +158,7 @@ cc_library(
],
deps = [
":bert_tokenizer",
":regex_tokenizer",
":sentencepiece_tokenizer",
":tokenizer",
"//tensorflow_lite_support/cc:common",
@@ -168,3 +169,20 @@ cc_library(
"@com_google_absl//absl/status",
],
)

cc_library(
name = "regex_tokenizer",
srcs = [
"regex_tokenizer.cc",
],
hdrs = [
"regex_tokenizer.h",
],
deps = [
":tokenizer",
"//tensorflow_lite_support/cc/utils:common_utils",
"@com_google_absl//absl/container:node_hash_map",
"@com_google_absl//absl/strings",
"@com_googlesource_code_re2//:re2",
],
)
119 changes: 119 additions & 0 deletions tensorflow_lite_support/cc/text/tokenizers/regex_tokenizer.cc
@@ -0,0 +1,119 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow_lite_support/cc/text/tokenizers/regex_tokenizer.h"

#include <iostream>

#include "absl/strings/str_cat.h"
#include "absl/strings/substitute.h"
#include "tensorflow_lite_support/cc/utils/common_utils.h"
namespace tflite::support::text::tokenizer {

namespace {
constexpr char kStart[] = "<START>";
constexpr char kPad[] = "<PAD>";
constexpr char kUnknown[] = "<UNKNOWN>";

void buildIndexTokenMap(
const absl::node_hash_map<std::string, int>& token_index_map,
absl::node_hash_map<int, absl::string_view>* index_token_map) {
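  // Note: absl::node_hash_map keeps its elements at stable addresses, so the
  // string_views stored in *index_token_map can safely alias the std::string
  // keys owned by token_index_map.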
for (const auto& [token, index] : token_index_map) {
(*index_token_map)[index] = token.data();
}
}

} // namespace

// RE2::FindAndConsume requires delim_re_ to contain a capturing group so that
// the matched delimiter can be captured. Surround the regex with parentheses
// to create such a group; it is fine if the regex is already parenthesized.
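// For example (illustrative only), a delimiter pattern such as "[^\w']+" is
// stored internally as "([^\w']+)" so that FindAndConsume can report the text
// it consumed for each match.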
RegexTokenizer::RegexTokenizer(const std::string& regex_pattern,
const std::string& path_to_vocab)
: delim_re_{absl::Substitute("($0)", regex_pattern)},
token_index_map_{utils::LoadVocabAndIndexFromFile(path_to_vocab)} {
buildIndexTokenMap(token_index_map_, &index_token_map_);
}

RegexTokenizer::RegexTokenizer(const std::string& regex_pattern,
const char* vocab_buffer_data,
size_t vocab_buffer_size)
: delim_re_{absl::Substitute("($0)", regex_pattern)},
token_index_map_{utils::LoadVocabAndIndexFromBuffer(vocab_buffer_data,
vocab_buffer_size)} {
buildIndexTokenMap(token_index_map_, &index_token_map_);
}

TokenizerResult RegexTokenizer::Tokenize(const std::string& input) {
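  // Illustrative behavior (assuming a delimiter pattern of "[^\w']+"): the
  // input "It's a test." yields the subwords {"It's", "a", "test"}; the
  // matched delimiters themselves are dropped.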
absl::string_view leftover(input.data());
absl::string_view last_end = leftover;

TokenizerResult result;

// Keep looking for split points until we have reached the end of the input.
absl::string_view extracted_delim_token;
while (RE2::FindAndConsume(&leftover, delim_re_, &extracted_delim_token)) {
absl::string_view token(last_end.data(),
extracted_delim_token.data() - last_end.data());
bool has_non_empty_token = token.length() > 0;

last_end = leftover;

// Emit the previous token, but only if it is non-empty.
if (has_non_empty_token) {
result.subwords.push_back(std::string(token));
}
}

// Close the last token.
if (!leftover.empty()) {
result.subwords.push_back(std::string(leftover));
}

return result;
}

bool RegexTokenizer::LookupId(absl::string_view key, int* result) const {
auto it = token_index_map_.find(key);
if (it == token_index_map_.end()) {
return false;
}
*result = it->second;
return true;
}

bool RegexTokenizer::LookupWord(int vocab_id,
                                absl::string_view* result) const {
auto it = index_token_map_.find(vocab_id);
if (it == index_token_map_.end()) {
return false;
}
*result = it->second;
return true;
}

bool RegexTokenizer::GetStartToken(int* start_token) {
return LookupId(kStart, start_token);
}

bool RegexTokenizer::GetPadToken(int* pad_token) {
return LookupId(kPad, pad_token);
}

bool RegexTokenizer::GetUnknownToken(int* unknown_token) {
return LookupId(kUnknown, unknown_token);
}

} // namespace tflite::support::text::tokenizer
53 changes: 53 additions & 0 deletions tensorflow_lite_support/cc/text/tokenizers/regex_tokenizer.h
@@ -0,0 +1,53 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_LITE_SUPPORT_CC_TEXT_TOKENIZERS_REGEX_TOKENIZER_H_
#define TENSORFLOW_LITE_SUPPORT_CC_TEXT_TOKENIZERS_REGEX_TOKENIZER_H_

#include "absl/container/node_hash_map.h"
#include "re2/re2.h"
#include "tensorflow_lite_support/cc/text/tokenizers/tokenizer.h"

namespace tflite::support::text::tokenizer {

// Tokenizer that loads a vocabulary and splits the input text with a regular
// expression.
class RegexTokenizer : public Tokenizer {
public:
explicit RegexTokenizer(const std::string& regex_pattern,
const std::string& path_to_vocab);

explicit RegexTokenizer(const std::string& regex_pattern,
const char* vocab_buffer_data,
size_t vocab_buffer_size);

TokenizerResult Tokenize(const std::string& input) override;

bool LookupId(absl::string_view key, int* result) const override;

bool LookupWord(int vocab_id, absl::string_view* result) const override;

bool GetStartToken(int* start_token);
bool GetPadToken(int* pad_token);
bool GetUnknownToken(int* unknown_token);

private:
RE2 delim_re_;
absl::node_hash_map<std::string, int> token_index_map_;
absl::node_hash_map<int, absl::string_view> index_token_map_;
};

} // namespace tflite::support::text::tokenizer

#endif // TENSORFLOW_LITE_SUPPORT_CC_TEXT_TOKENIZERS_REGEX_TOKENIZER_H_
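As an informal illustration of the API declared above (not part of this commit), the sketch below builds a RegexTokenizer from a delimiter pattern and a vocabulary path, tokenizes a sentence, and looks up token ids. The pattern "[^\w']+" and the path /tmp/vocab.txt are assumptions made only for this example.

#include <iostream>
#include <string>

#include "tensorflow_lite_support/cc/text/tokenizers/regex_tokenizer.h"

int main() {
  using tflite::support::text::tokenizer::RegexTokenizer;
  using tflite::support::text::tokenizer::TokenizerResult;

  // Hypothetical delimiter pattern and vocabulary path.
  RegexTokenizer tokenizer("[^\\w']+", "/tmp/vocab.txt");

  // Split the input on runs of non-word characters.
  TokenizerResult result = tokenizer.Tokenize("This movie was great");
  for (const std::string& token : result.subwords) {
    int id = 0;
    if (tokenizer.LookupId(token, &id)) {
      std::cout << token << " -> " << id << std::endl;
    } else {
      std::cout << token << " -> (not in vocabulary)" << std::endl;
    }
  }
  return 0;
}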
95 changes: 95 additions & 0 deletions tensorflow_lite_support/cc/text/tokenizers/tokenizer_utils.cc
@@ -19,6 +19,7 @@ limitations under the License.
#include "tensorflow_lite_support/cc/common.h"
#include "tensorflow_lite_support/cc/port/status_macros.h"
#include "tensorflow_lite_support/cc/text/tokenizers/bert_tokenizer.h"
#include "tensorflow_lite_support/cc/text/tokenizers/regex_tokenizer.h"
#include "tensorflow_lite_support/cc/text/tokenizers/sentencepiece_tokenizer.h"
#include "tensorflow_lite_support/metadata/metadata_schema_generated.h"

@@ -28,6 +29,25 @@ using ::tflite::ProcessUnit;
using ::tflite::SentencePieceTokenizerOptions;
using ::tflite::support::CreateStatusWithPayload;

namespace {

StatusOr<absl::string_view> CheckAndLoadFirstAssociatedFile(
const flatbuffers::Vector<flatbuffers::Offset<tflite::AssociatedFile>>*
associated_files,
const tflite::metadata::ModelMetadataExtractor* metadata_extractor) {
if (associated_files == nullptr || associated_files->size() < 1 ||
associated_files->Get(0)->name() == nullptr) {
return CreateStatusWithPayload(
absl::StatusCode::kInvalidArgument,
"Invalid vocab_file from input process unit.",
TfLiteSupportStatus::kMetadataInvalidTokenizerError);
}
ASSIGN_OR_RETURN(absl::string_view vocab_buffer,
metadata_extractor->GetAssociatedFile(
associated_files->Get(0)->name()->str()));
return vocab_buffer;
}

StatusOr<std::unique_ptr<Tokenizer>> CreateTokenizerFromMetadata(
const tflite::metadata::ModelMetadataExtractor& metadata_extractor) {
const ProcessUnit* tokenizer_process_unit =
@@ -86,4 +106,79 @@ StatusOr<std::unique_ptr<Tokenizer>> CreateTokenizerFromMetadata(
}
}

} // namespace
StatusOr<std::unique_ptr<Tokenizer>> CreateTokenizerFromProcessUnit(
const tflite::ProcessUnit* tokenizer_process_unit,
const tflite::metadata::ModelMetadataExtractor* metadata_extractor) {
if (metadata_extractor == nullptr || tokenizer_process_unit == nullptr) {
return CreateStatusWithPayload(
absl::StatusCode::kInvalidArgument,
"No metadata or input process unit found.",
TfLiteSupportStatus::kMetadataInvalidTokenizerError);
}
switch (tokenizer_process_unit->options_type()) {
case ProcessUnitOptions_BertTokenizerOptions: {
const tflite::BertTokenizerOptions* options =
tokenizer_process_unit->options_as<tflite::BertTokenizerOptions>();
ASSIGN_OR_RETURN(absl::string_view vocab_buffer,
CheckAndLoadFirstAssociatedFile(options->vocab_file(),
metadata_extractor));
return absl::make_unique<BertTokenizer>(vocab_buffer.data(),
vocab_buffer.size());
}
case ProcessUnitOptions_SentencePieceTokenizerOptions: {
const tflite::SentencePieceTokenizerOptions* options =
tokenizer_process_unit->options_as<SentencePieceTokenizerOptions>();
ASSIGN_OR_RETURN(absl::string_view model_buffer,
CheckAndLoadFirstAssociatedFile(
options->sentencePiece_model(), metadata_extractor));
// TODO(b/160647204): Extract sentence piece model vocabulary
return absl::make_unique<SentencePieceTokenizer>(model_buffer.data(),
model_buffer.size());
}
case ProcessUnitOptions_RegexTokenizerOptions: {
const tflite::RegexTokenizerOptions* options =
tokenizer_process_unit->options_as<RegexTokenizerOptions>();
ASSIGN_OR_RETURN(absl::string_view vocab_buffer,
CheckAndLoadFirstAssociatedFile(options->vocab_file(),
metadata_extractor));
if (options->delim_regex_pattern() == nullptr) {
return CreateStatusWithPayload(
absl::StatusCode::kInvalidArgument,
"Invalid delim_regex_pattern from input process unit.",
TfLiteSupportStatus::kMetadataInvalidTokenizerError);
}

std::unique_ptr<RegexTokenizer> regex_tokenizer =
absl::make_unique<RegexTokenizer>(
options->delim_regex_pattern()->str(), vocab_buffer.data(),
vocab_buffer.size());

int unknown_token_id = 0;
if (!regex_tokenizer->GetUnknownToken(&unknown_token_id)) {
return CreateStatusWithPayload(
absl::StatusCode::kInvalidArgument,
"RegexTokenizer doesn't have <UNKNOWN> token.",
TfLiteSupportStatus::kMetadataInvalidTokenizerError);
}

int pad_token_id = 0;
if (!regex_tokenizer->GetPadToken(&pad_token_id)) {
return CreateStatusWithPayload(
absl::StatusCode::kInvalidArgument,
"RegexTokenizer doesn't have <PAD> token.",
TfLiteSupportStatus::kMetadataInvalidTokenizerError);
}

return regex_tokenizer;
}
default:
return CreateStatusWithPayload(
absl::StatusCode::kNotFound,
absl::StrCat("Incorrect options_type:",
tokenizer_process_unit->options_type()),
TfLiteSupportStatus::kMetadataInvalidTokenizerError);
}
}

} // namespace tflite::support::text::tokenizer
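Note that the RegexTokenizer branch above requires the vocabulary to define <UNKNOWN> and <PAD> entries (and RegexTokenizer::GetStartToken additionally looks up <START>). Purely as an illustration, and assuming the format read by LoadVocabAndIndexFromBuffer is one "token index" pair per line, such a vocabulary file could look like:

<PAD> 0
<START> 1
<UNKNOWN> 2
the 3
movie 4
was 5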
6 changes: 5 additions & 1 deletion tensorflow_lite_support/cc/text/tokenizers/tokenizer_utils.h
@@ -29,6 +29,10 @@ inline constexpr int kTokenizerProcessUnitIndex = 0;
StatusOr<std::unique_ptr<Tokenizer>> CreateTokenizerFromMetadata(
const tflite::metadata::ModelMetadataExtractor& metadata_extractor);

} // namespace tflite::support::text::tokenizer
// Creates a Tokenizer from the given tokenizer process unit, using the
// metadata extractor to load associated files such as the vocabulary.
StatusOr<std::unique_ptr<Tokenizer>> CreateTokenizerFromProcessUnit(
const tflite::ProcessUnit* tokenizer_process_unit,
const tflite::metadata::ModelMetadataExtractor* metadata_extractor);

} // namespace tflite::support::text::tokenizer
#endif // THIRD_PARTY_TENSORFLOW_LITE_SUPPORT_CC_TEXT_TOKENIZERS_TOKENIZER_UTILS_H_
4 changes: 4 additions & 0 deletions tensorflow_lite_support/cc/utils/BUILD
@@ -25,4 +25,8 @@ cc_library(
hdrs = [
"common_utils.h",
],
deps = [
"@com_google_absl//absl/container:node_hash_map",
"@com_google_absl//absl/strings",
],
)
