Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add the RegexTokenizer to the text classifier. #41

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions tensorflow_lite_support/metadata/cc/metadata_version.cc
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ enum class SchemaMembers {
kProcessUnitOptionsSentencePieceTokenizerOptions = 4,
kSubGraphMetadataInputTensorGroups = 5,
kSubGraphMetadataOutputTensorGroups = 6,
kProcessUnitOptionsRegexTokenizerOptions = 7,
};

// Helper class to compare semantic versions in terms of three integers, major,
Expand Down Expand Up @@ -104,6 +105,8 @@ Version GetMemberVersion(SchemaMembers member) {
return Version(1, 2, 0);
case SchemaMembers::kSubGraphMetadataOutputTensorGroups:
return Version(1, 2, 0);
case SchemaMembers::kProcessUnitOptionsRegexTokenizerOptions:
return Version(1, 2, 1);
default:
// Should never happen.
TFLITE_LOG(FATAL) << "Unsupported schema member: "
Expand Down Expand Up @@ -165,6 +168,12 @@ void UpdateMinimumVersionForTable<tflite::ProcessUnit>(
SchemaMembers::kProcessUnitOptionsSentencePieceTokenizerOptions),
min_version);
}
if (process_unit_type == ProcessUnitOptions_RegexTokenizerOptions) {
UpdateMinimumVersion(
GetMemberVersion(
SchemaMembers::kProcessUnitOptionsRegexTokenizerOptions),
min_version);
}
}

template <>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ public final class MetadataParser {
* The version of the metadata parser that this metadata extractor library is depending on. The
* value should match the value of "Schema Semantic version" in metadata_schema.fbs.
*/
public static final String VERSION = "1.2.0";
public static final String VERSION = "1.2.1";

private MetadataParser() {}
}
20 changes: 18 additions & 2 deletions tensorflow_lite_support/metadata/metadata_schema.fbs
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ namespace tflite;
// for which they were added.
//
// LINT.IfChange
// Schema Semantic version: 1.2.0
// Schema Semantic version: 1.2.1
// LINT.ThenChange(//tensorflow_lite_support/\
// metadata/java/src/java/org/tensorflow/lite/support/metadata/\
// MetadataParser.java)
Expand All @@ -68,6 +68,7 @@ file_identifier "M001";
// Added output_process_units to SubGraphMetadata.
// 1.2.0 - Added input_tensor_group to SubGraphMetadata.
// Added output_tensor_group to SubGraphMetadata.
// 1.2.1 - Added RegexTokenizerOptions to ProcessUnitOptions.

// File extension of any written files.
file_extension "tflitemeta";
Expand Down Expand Up @@ -448,6 +449,19 @@ table SentencePieceTokenizerOptions {
vocab_file:[AssociatedFile];
}

// Splits strings by the occurrences of pattern and converts the tokens into
// ids. For example, given
// pattern: "\W+",
// string: "Words, words, words.",
// the tokens after split are: "Words", "words", "words", "".
// And then the tokens can be converted into ids according to the vocab_file.
// Added in: 1.2.1
table RegexTokenizerOptions {
// The regular expression pattern; input strings are split at each occurrence
// of this pattern to produce the tokens.
pattern:string;
// The vocabulary files used to convert these tokens into ids.
vocab_file:[AssociatedFile];
}

// Options that are used when processing the tensor.
union ProcessUnitOptions {
NormalizationOptions,
Expand All @@ -456,7 +470,9 @@ union ProcessUnitOptions {
// Added in: 1.1.0
BertTokenizerOptions,
// Added in: 1.1.0
SentencePieceTokenizerOptions
SentencePieceTokenizerOptions,
// Added in: 1.2.1
RegexTokenizerOptions
}

// A process unit that is used to process the tensor out-of-graph.
Expand Down