Permalink
Browse files

New transition systems and features for syntaxnet (#301)

* Morpher and segmenter transition systems and new features (quotes, punctuation, capitalization, character ngrams, morphology attributes).
  • Loading branch information...
1 parent a591478 commit 64675fc72f7d6184baa983d5659f0e08f2652b3f @calberti calberti committed on GitHub Aug 4, 2016
Showing with 4,257 additions and 62 deletions.
  1. +3 −2 syntaxnet/README.md
  2. +105 −23 syntaxnet/syntaxnet/BUILD
  3. +102 −0 syntaxnet/syntaxnet/binary_segment_state.cc
  4. +99 −0 syntaxnet/syntaxnet/binary_segment_state.h
  5. +218 −0 syntaxnet/syntaxnet/binary_segment_state_test.cc
  6. +121 −0 syntaxnet/syntaxnet/binary_segment_transitions.cc
  7. +111 −0 syntaxnet/syntaxnet/binary_segment_transitions_test.cc
  8. +845 −0 syntaxnet/syntaxnet/char_properties.cc
  9. +362 −0 syntaxnet/syntaxnet/char_properties.h
  10. +364 −0 syntaxnet/syntaxnet/char_properties_test.cc
  11. +4 −2 syntaxnet/syntaxnet/document_filters.cc
  12. +2 −0 syntaxnet/syntaxnet/document_format.h
  13. +12 −1 syntaxnet/syntaxnet/lexicon_builder.cc
  14. +25 −1 syntaxnet/syntaxnet/lexicon_builder_test.py
  15. +298 −0 syntaxnet/syntaxnet/morpher_transitions.cc
  16. +91 −0 syntaxnet/syntaxnet/morphology_label_set.cc
  17. +110 −0 syntaxnet/syntaxnet/morphology_label_set.h
  18. +101 −0 syntaxnet/syntaxnet/morphology_label_set_test.cc
  19. +0 −1 syntaxnet/syntaxnet/parser_eval.py
  20. +18 −0 syntaxnet/syntaxnet/parser_features.cc
  21. +4 −2 syntaxnet/syntaxnet/proto_io.h
  22. +85 −0 syntaxnet/syntaxnet/segmenter_utils.cc
  23. +93 −0 syntaxnet/syntaxnet/segmenter_utils.h
  24. +149 −0 syntaxnet/syntaxnet/segmenter_utils_test.cc
  25. +15 −0 syntaxnet/syntaxnet/sentence.proto
  26. +1 −1 syntaxnet/syntaxnet/sentence_batch.cc
  27. +233 −3 syntaxnet/syntaxnet/sentence_features.cc
  28. +329 −5 syntaxnet/syntaxnet/sentence_features.h
  29. +123 −4 syntaxnet/syntaxnet/sentence_features_test.cc
  30. +40 −5 syntaxnet/syntaxnet/tagger_transitions.cc
  31. +6 −0 syntaxnet/syntaxnet/testdata/context.pbtxt
  32. +107 −7 syntaxnet/syntaxnet/text_formats.cc
  33. +59 −1 syntaxnet/syntaxnet/utils.h
  34. +2 −0 syntaxnet/syntaxnet/workspace.h
  35. +6 −0 syntaxnet/util/utf8/unicodetext.h
  36. +0 −4 syntaxnet/util/utf8/unicodetext_unittest.cc
  37. +14 −0 syntaxnet/util/utf8/unilib_utf8_utils.h
View
@@ -107,8 +107,8 @@ Bazel should complete reporting all tests passed.
You can also compile SyntaxNet in a [Docker](https://www.docker.com/what-docker)
container using this [Dockerfile](Dockerfile).
-**Note:** If you are running Docker on OSX, make sure that you have enough memory allocated
-for your Docker VM.
+**Note:** If you are running Docker on OSX, make sure that you have enough
+memory allocated for your Docker VM.
## Getting Started
@@ -612,6 +612,7 @@ Original authors of the code in this package include (in alphabetical order):
* David Weiss
* Emily Pitler
* Greg Coppola
+* Ji Ma
* Keith Hall
* Kuzman Ganchev
* Michael Collins
View
@@ -159,6 +159,31 @@ cc_library(
)
cc_library(
+ name = "char_properties",
+ srcs = ["char_properties.cc"],
+ hdrs = ["char_properties.h"],
+ deps = [
+ ":registry",
+ ":utils",
+ "//util/utf8:unicodetext",
+ ],
+ alwayslink = 1,
+)
+
+cc_library(
+ name = "segmenter_utils",
+ srcs = ["segmenter_utils.cc"],
+ hdrs = ["segmenter_utils.h"],
+ deps = [
+ ":base",
+ ":char_properties",
+ ":sentence_proto",
+ "//util/utf8:unicodetext",
+ ],
+ alwayslink = 1,
+)
+
+cc_library(
name = "feature_extractor",
srcs = ["feature_extractor.cc"],
hdrs = [
@@ -199,6 +224,7 @@ cc_library(
":affix",
":feature_extractor",
":registry",
+ ":segmenter_utils",
],
)
@@ -251,24 +277,50 @@ cc_library(
)
cc_library(
+ name = "morphology_label_set",
+ srcs = ["morphology_label_set.cc"],
+ hdrs = ["morphology_label_set.h"],
+ deps = [
+ ":document_format",
+ ":feature_extractor",
+ ":proto_io",
+ ":registry",
+ ":sentence_proto",
+ ":utils",
+ ],
+)
+
+cc_library(
name = "parser_transitions",
srcs = [
"arc_standard_transitions.cc",
+ "binary_segment_state.cc",
+ "binary_segment_transitions.cc",
+ "morpher_transitions.cc",
+ "parser_features.cc",
"parser_state.cc",
"parser_transitions.cc",
"tagger_transitions.cc",
],
hdrs = [
+ "binary_segment_state.h",
+ "parser_features.h",
"parser_state.h",
"parser_transitions.h",
],
deps = [
+ ":affix",
+ ":feature_extractor",
":kbest_syntax_proto",
+ ":morphology_label_set",
":registry",
+ ":segmenter_utils",
+ ":sentence_features",
":sentence_proto",
":shared_store",
":task_context",
":term_frequency_map",
+ ":workspace",
],
alwayslink = 1,
)
@@ -289,29 +341,11 @@ cc_library(
)
cc_library(
- name = "parser_features",
- srcs = ["parser_features.cc"],
- hdrs = ["parser_features.h"],
- deps = [
- ":affix",
- ":feature_extractor",
- ":parser_transitions",
- ":registry",
- ":sentence_features",
- ":task_context",
- ":term_frequency_map",
- ":workspace",
- ],
- alwayslink = 1,
-)
-
-cc_library(
name = "embedding_feature_extractor",
srcs = ["embedding_feature_extractor.cc"],
hdrs = ["embedding_feature_extractor.h"],
deps = [
":feature_extractor",
- ":parser_features",
":parser_transitions",
":sparse_proto",
":task_context",
@@ -326,7 +360,6 @@ cc_library(
deps = [
":embedding_feature_extractor",
":feature_extractor",
- ":parser_features",
":parser_transitions",
":sentence_proto",
":sparse_proto",
@@ -344,7 +377,6 @@ cc_library(
"reader_ops.cc",
],
deps = [
- ":parser_features",
":parser_transitions",
":sentence_batch",
":sentence_proto",
@@ -360,7 +392,6 @@ cc_library(
srcs = ["document_filters.cc"],
deps = [
":document_format",
- ":parser_features",
":parser_transitions",
":sentence_batch",
":sentence_proto",
@@ -376,8 +407,8 @@ cc_library(
deps = [
":dictionary_proto",
":document_format",
- ":parser_features",
":parser_transitions",
+ ":segmenter_utils",
":sentence_batch",
":sentence_proto",
":task_context",
@@ -439,6 +470,18 @@ filegroup(
)
cc_test(
+ name = "binary_segment_state_test",
+ size = "small",
+ srcs = ["binary_segment_state_test.cc"],
+ deps = [
+ ":base",
+ ":parser_transitions",
+ ":term_frequency_map",
+ ":test_main",
+ ],
+)
+
+cc_test(
name = "shared_store_test",
size = "small",
srcs = ["shared_store_test.cc"],
@@ -449,6 +492,26 @@ cc_test(
)
cc_test(
+ name = "char_properties_test",
+ srcs = ["char_properties_test.cc"],
+ deps = [
+ ":char_properties",
+ ":test_main",
+ ],
+)
+
+cc_test(
+ name = "segmenter_utils_test",
+ srcs = ["segmenter_utils_test.cc"],
+ deps = [
+ ":base",
+ ":segmenter_utils",
+ ":sentence_proto",
+ ":test_main",
+ ],
+)
+
+cc_test(
name = "sentence_features_test",
size = "medium",
srcs = ["sentence_features_test.cc"],
@@ -466,6 +529,15 @@ cc_test(
)
cc_test(
+ name = "morphology_label_set_test",
+ srcs = ["morphology_label_set_test.cc"],
+ deps = [
+ ":morphology_label_set",
+ ":test_main",
+ ],
+)
+
+cc_test(
name = "arc_standard_transitions_test",
size = "small",
srcs = ["arc_standard_transitions_test.cc"],
@@ -480,6 +552,17 @@ cc_test(
)
cc_test(
+ name = "binary_segment_transitions_test",
+ size = "small",
+ srcs = ["binary_segment_transitions_test.cc"],
+ deps = [
+ ":parser_transitions",
+ ":sentence_proto",
+ ":test_main",
+ ],
+)
+
+cc_test(
name = "tagger_transitions_test",
size = "small",
srcs = ["tagger_transitions_test.cc"],
@@ -499,7 +582,6 @@ cc_test(
srcs = ["parser_features_test.cc"],
deps = [
":feature_extractor",
- ":parser_features",
":parser_transitions",
":populate_test_inputs",
":sentence_proto",
@@ -0,0 +1,102 @@
+/* Copyright 2016 Google Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "syntaxnet/binary_segment_state.h"
+
+#include <string>
+#include "syntaxnet/segmenter_utils.h"
+#include "syntaxnet/sentence.pb.h"
+
+namespace syntaxnet {
+
+ParserTransitionState *BinarySegmentState::Clone() const {  // Returns a freshly heap-allocated BinarySegmentState.
+ return new BinarySegmentState();  // Default-constructed: nothing is copied from *this. Raw `new`; presumably the caller takes ownership -- TODO confirm.
+}
+
+string BinarySegmentState::ToString(const ParserState &state) const {  // Renders `state` as "[seg seg ...] rest": segments built so far, then the unconsumed input.
+ string str("[");
+ for (int i = NumStarts(state) - 1; i >=0; --i) {  // Walk start indices from the highest i down to 0; segment i ends right before the start of segment i - 1.
+ int start = LastStart(i, state);  // Token index at which segment i begins.
+ int end = 0;  // Inclusive token index at which segment i ends; set by exactly one branch below.
+ if (i - 1 >= 0) {
+ end = LastStart(i - 1, state) - 1;  // Not the final segment: stop just before the next segment's start.
+ } else if (state.EndOfInput()) {
+ end = state.sentence().token_size() - 1;  // Final segment with input exhausted: extend to the last token.
+ } else {
+ end = state.Next() - 1;  // Final segment, input remaining: extend to the token before Next().
+ }
+ for (int k = start; k <= end; ++k) {
+ str.append(state.GetToken(k).word());  // Words inside a segment are concatenated with no separator.
+ }
+ if (i >= 1) str.append(" ");  // Single space between segments, none after the last one.
+ }
+
+ str.append("] ");
+ for (int i = state.Next(); i < state.NumTokens(); ++i) {  // Tokens from Next() onwards are emitted after the bracket.
+ str.append(state.GetToken(i).word());  // Appended back-to-back, again with no separator.
+ }
+ return str;
+}
+
+void BinarySegmentState::AddParseToDocument(const ParserState &state,  // Replaces the tokens of `sentence` with tokens merged according to the starts recorded in `state`.
+ bool rewrite_root_labels,  // Not referenced anywhere in this body.
+ Sentence *sentence) const {  // In/out: its token list is read and then swapped for the merged one.
+ if (sentence->token_size() == 0) return;  // Nothing to merge; leave the sentence untouched.
+ vector<bool> is_starts(sentence->token_size(), false);  // is_starts[i] is true when `state` marks input token i as a start.
+ for (int i = 0; i < NumStarts(state); ++i) {
+ is_starts[LastStart(i, state)] = true;  // Indexed without a bounds check; assumes LastStart() < token_size() -- TODO confirm.
+ }
+
+ // The break level given to a new token is derived from the input token just before it (see the loop tail).
+ Token::BreakLevel break_level = Token::NO_BREAK;  // Level that will be assigned to the next token created.
+ bool is_first_token = true;  // Stays true until the first output token has been created.
+ Sentence new_sentence;  // Scratch message that accumulates the merged tokens.
+ for (int i = 0; i < sentence->token_size(); ++i) {
+ const Token &token = sentence->token(i);
+ const string &word = token.word();
+ bool is_break = SegmenterUtils::IsBreakChar(word);  // Break characters never become, or become part of, an output token.
+ if (is_starts[i] || is_first_token) {  // A marked start, or no output token exists yet.
+ if (!is_break) {
+ // The current input token (a single character) opens a new output token/word.
+ Token *new_token = new_sentence.add_token();
+ new_token->set_start(token.start());  // Only start, end, word and break_level are set on the new token.
+ new_token->set_end(token.end());
+ new_token->set_word(word);
+
+ // Original note: for the first token, keep the old break level so that the number of sentences stays unchanged.
+ // NOTE(review): the value set below is the running `break_level`, not the input token's own level -- confirm intent.
+ new_token->set_break_level(break_level);
+ is_first_token = false;
+ }
+ } else {
+ // Not a start: extend the most recently created output token with this character.
+ if (!is_break) {
+ int index = new_sentence.token_size() - 1;  // Valid here: this branch requires is_first_token == false, so a token exists.
+ auto *last_token = new_sentence.mutable_token(index);
+ last_token->mutable_word()->append(word);
+ last_token->set_end(token.end());  // The merged token now ends where this character ends.
+ }
+ }
+
+ // Update the break level for the next token. The transition system does not
+ // introduce new sentences, so any level beyond a line break is reduced to a
+ // line break.
+ break_level = is_break ? SegmenterUtils::BreakLevel(word) : Token::NO_BREAK;  // Overwritten on every iteration, so only the immediately preceding input token counts.
+ if (break_level >= Token::LINE_BREAK) break_level = Token::LINE_BREAK;  // Clamp to LINE_BREAK.
+ }
+ sentence->mutable_token()->Swap(new_sentence.mutable_token());  // Only the token field is exchanged; other fields of `sentence` are not touched here.
+}
+
+} // namespace syntaxnet
Oops, something went wrong.

0 comments on commit 64675fc

Please sign in to comment.