diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py
index 5d628fa4a..4184a7e87 100644
--- a/tensor2tensor/data_generators/text_encoder.py
+++ b/tensor2tensor/data_generators/text_encoder.py
@@ -473,7 +473,7 @@ def _escape_token(self, token):
     Returns:
       escaped_token: a unicode string
     """
-    assert isinstance(token, unicode)
+    assert isinstance(token, six.text_type)
     token = token.replace(u"\\", u"\\\\").replace(u"_", u"\\u") + u"_"
     ret = u""
     for c in token:
diff --git a/tensor2tensor/data_generators/wiki.py b/tensor2tensor/data_generators/wiki.py
index 5ccbf14d9..99a9e64e6 100644
--- a/tensor2tensor/data_generators/wiki.py
+++ b/tensor2tensor/data_generators/wiki.py
@@ -25,6 +25,7 @@
 # Dependency imports
 
 import six
+from six import PY2
 from tensor2tensor.data_generators import generator_utils
 from tensor2tensor.data_generators import text_encoder
 from tensor2tensor.data_generators import tokenizer
@@ -60,7 +61,7 @@ def page_generator(tmp_dir, max_docs=None):
   count = 0
   corpus_filepath = _maybe_download_corpus(tmp_dir)
   for line in bz2.BZ2File(corpus_filepath, "r"):
-    line = unicode(line, "utf-8")
+    line = unicode(line, "utf-8") if PY2 else line.decode("utf-8")
     if not doc and line != u"  <page>\n":
       continue
     doc += line
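A minimal standalone sketch (not part of the patch; the helper name to_text is made up for illustration) of the cross-version behavior both hunks rely on: six.text_type resolves to unicode on Python 2 and str on Python 3, and bytes.decode("utf-8") returns that same text type on both, so the PY2 branch above exists only to preserve the py2-only unicode() builtin call.

import six

def to_text(raw_bytes):
  # bz2.BZ2File yields byte strings; .decode("utf-8") returns unicode on
  # Python 2 and str on Python 3, i.e. six.text_type on both.
  text = raw_bytes.decode("utf-8")
  assert isinstance(text, six.text_type)  # mirrors the _escape_token assert
  return text

print(to_text(b"  <page>\n"))  # works unchanged under Python 2 and 3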