From b1ad6f076c734f9b4823af7f05f1ca0d643afd83 Mon Sep 17 00:00:00 2001 From: Deasuke Date: Wed, 12 Jul 2017 06:34:41 +0000 Subject: [PATCH 1/2] replace unicode with six.text_type --- tensor2tensor/data_generators/text_encoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py index 5d628fa4a..4184a7e87 100644 --- a/tensor2tensor/data_generators/text_encoder.py +++ b/tensor2tensor/data_generators/text_encoder.py @@ -473,7 +473,7 @@ def _escape_token(self, token): Returns: escaped_token: a unicode string """ - assert isinstance(token, unicode) + assert isinstance(token, six.text_type) token = token.replace(u"\\", u"\\\\").replace(u"_", u"\\u") + u"_" ret = u"" for c in token: From 89ddffe054f25b5cea3d4c6d543562d51d511099 Mon Sep 17 00:00:00 2001 From: Deasuke Date: Wed, 12 Jul 2017 07:13:50 +0000 Subject: [PATCH 2/2] use decode instead of unicode in PY3 --- tensor2tensor/data_generators/wiki.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensor2tensor/data_generators/wiki.py b/tensor2tensor/data_generators/wiki.py index 5ccbf14d9..99a9e64e6 100644 --- a/tensor2tensor/data_generators/wiki.py +++ b/tensor2tensor/data_generators/wiki.py @@ -25,6 +25,7 @@ # Dependency imports import six +from six import PY2 from tensor2tensor.data_generators import generator_utils from tensor2tensor.data_generators import text_encoder from tensor2tensor.data_generators import tokenizer @@ -60,7 +61,7 @@ def page_generator(tmp_dir, max_docs=None): count = 0 corpus_filepath = _maybe_download_corpus(tmp_dir) for line in bz2.BZ2File(corpus_filepath, "r"): - line = unicode(line, "utf-8") + line = unicode(line, "utf-8") if PY2 else line.decode("utf-8") if not doc and line != u" \n": continue doc += line