From 054260b40dda4f9f902d9269969d6484a47b19a1 Mon Sep 17 00:00:00 2001
From: Val Markovic <val@markovic.io>
Date: Tue, 9 Feb 2016 13:58:17 -0800
Subject: [PATCH] Tests for ToUnicode and ToBytes + bugfixes

---
 CONTRIBUTING.md                               |   4 +
 ycmd/completers/cs/solutiondetection.py       |   4 +-
 ycmd/completers/general/filename_completer.py |   6 +-
 ycmd/completers/go/gocode_completer.py        |   6 +-
 ycmd/hmac_plugin.py                           |   4 +-
 ycmd/request_wrap.py                          |   4 +-
 ycmd/tests/utils_test.py                      | 131 ++++++++++++++++++
 ycmd/utils.py                                 |  29 +++-
 8 files changed, 171 insertions(+), 17 deletions(-)
 create mode 100644 ycmd/tests/utils_test.py

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index afd840af15..36a4b2a8fa 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -132,6 +132,10 @@ Here's what you should watch out for:
   which will convert `bytes` to a real `str` (again, only on py2). Heed this
   advice for your own sanity; behind it are 40 hours of debugging and an
   instance of tears-down-the-cheek crying at 2 am.
+- **Use the `ToBytes()` and `ToUnicode()` helper functions from
+  `ycmd/utils.py`.** They work around weirdness, complexity and bugs in
+  `python-future` and behave as you would expect. They're also extensively
+  covered with tests.
 - Use `from future.utils import iteritems`
   then `for key, value in iteritems( dict_obj )` to efficiently iterate dicts on
   py2 & py3
diff --git a/ycmd/completers/cs/solutiondetection.py b/ycmd/completers/cs/solutiondetection.py
index 4c8c802010..b8d7d7e543 100644
--- a/ycmd/completers/cs/solutiondetection.py
+++ b/ycmd/completers/cs/solutiondetection.py
@@ -29,7 +29,7 @@
 import logging
 from inspect import getfile
 from ycmd import extra_conf_store
-from ycmd.utils import ToUnicodeIfNeeded
+from ycmd.utils import ToUnicode
 
 
 __logger = logging.getLogger( __name__ )
@@ -58,7 +58,7 @@ def PollModule( module, filepath ):
     try:
       module_hint = module.CSharpSolutionFile( filepath )
       __logger.info( u'extra_conf_store suggests {0} as solution file'.format(
-        ToUnicodeIfNeeded( module_hint ) ) )
+        ToUnicode( module_hint ) ) )
       if module_hint:
         # received a full path or one relative to the config's location?
         candidates = [ module_hint,
diff --git a/ycmd/completers/general/filename_completer.py b/ycmd/completers/general/filename_completer.py
index 6d55b64deb..e4986e07f1 100644
--- a/ycmd/completers/general/filename_completer.py
+++ b/ycmd/completers/general/filename_completer.py
@@ -33,7 +33,7 @@
                                               GetIncludeStatementValue )
 from ycmd.completers.cpp.clang_completer import InCFamilyFile
 from ycmd.completers.cpp.flags import Flags
-from ycmd.utils import ToUnicodeIfNeeded, OnWindows
+from ycmd.utils import ToUnicode, OnWindows
 from ycmd import responses
 
 EXTRA_INFO_MAP = { 1 : '[File]', 2 : '[Dir]', 3 : '[File&Dir]' }
@@ -145,7 +145,7 @@ def GetPathsIncludeCase( self, path_dir, quoted_include, filepath,
       include_paths.extend( quoted_include_paths )
 
     for include_path in include_paths:
-      unicode_path = ToUnicodeIfNeeded( os.path.join( include_path, path_dir ) )
+      unicode_path = ToUnicode( os.path.join( include_path, path_dir ) )
       try:
         # We need to pass a unicode string to get unicode strings out of
         # listdir.
@@ -194,7 +194,7 @@ def _GetPathsStandardCase( path_dir, use_working_dir, filepath, working_dir ):
   try:
     # We need to pass a unicode string to get unicode strings out of
     # listdir.
-    relative_paths = os.listdir( ToUnicodeIfNeeded( absolute_path_dir ) )
+    relative_paths = os.listdir( ToUnicode( absolute_path_dir ) )
   except:
     relative_paths = []
 
diff --git a/ycmd/completers/go/gocode_completer.py b/ycmd/completers/go/gocode_completer.py
index 1e72e55b60..df9fe3c1a0 100755
--- a/ycmd/completers/go/gocode_completer.py
+++ b/ycmd/completers/go/gocode_completer.py
@@ -30,7 +30,7 @@
 
 from ycmd import responses
 from ycmd import utils
-from ycmd.utils import ToBytes, ToUnicodeIfNeeded
+from ycmd.utils import ToBytes, ToUnicode
 from ycmd.completers.completer import Completer
 
 GO_FILETYPES = set( [ 'go' ] )
@@ -99,7 +99,7 @@ def ComputeCandidatesInner( self, request_data ):
                                       contents = contents )
 
     try:
-      resultdata = json.loads( ToUnicodeIfNeeded( stdoutdata ) )
+      resultdata = json.loads( ToUnicode( stdoutdata ) )
     except ValueError:
       _logger.error( PARSE_ERROR_MESSAGE )
       raise RuntimeError( PARSE_ERROR_MESSAGE )
@@ -217,7 +217,7 @@ def _GoToDefinition( self, request_data ):
 
 
   def _ConstructGoToFromResponse( self, response_str ):
-    parsed = json.loads( ToUnicodeIfNeeded( response_str ) )
+    parsed = json.loads( ToUnicode( response_str ) )
     if 'filename' in parsed and 'column' in parsed:
       return responses.BuildGoToResponse( parsed[ 'filename' ],
                                           int( parsed[ 'line' ] ),
diff --git a/ycmd/hmac_plugin.py b/ycmd/hmac_plugin.py
index 218dbf9d72..431bb5aa52 100644
--- a/ycmd/hmac_plugin.py
+++ b/ycmd/hmac_plugin.py
@@ -29,7 +29,7 @@
 from base64 import b64decode, b64encode
 from bottle import request, response, abort
 from ycmd import hmac_utils
-from ycmd.utils import ToBytes, ToUnicodeIfNeeded
+from ycmd.utils import ToBytes, ToUnicode
 
 _HMAC_HEADER = 'x-ycm-hmac'
 _HOST_HEADER = 'host'
@@ -91,5 +91,5 @@ def RequestAuthenticated( method, path, body, hmac_secret ):
 
 
 def SetHmacHeader( body, hmac_secret ):
-  response.headers[ _HMAC_HEADER ] = ToUnicodeIfNeeded( b64encode(
+  response.headers[ _HMAC_HEADER ] = ToUnicode( b64encode(
       hmac_utils.CreateHmac( ToBytes( body ), ToBytes( hmac_secret ) ) ) )
diff --git a/ycmd/request_wrap.py b/ycmd/request_wrap.py
index 55c5136a1e..1fdfb92fbd 100644
--- a/ycmd/request_wrap.py
+++ b/ycmd/request_wrap.py
@@ -23,7 +23,7 @@
 standard_library.install_aliases()
 from builtins import *  # noqa
 
-from ycmd.utils import ToUnicodeIfNeeded, ToUtf8IfNeeded
+from ycmd.utils import ToUnicode, ToUtf8IfNeeded
 from ycmd.identifier_utils import StartOfLongestIdentifierEndingAtIndex
 from ycmd.request_validation import EnsureRequestValid
 
@@ -104,7 +104,7 @@ def CompletionStartColumn( line_value, column_num, filetype ):
   # to walk codepoints for identifier checks.
 
   utf8_line_value = ToUtf8IfNeeded( line_value )
-  unicode_line_value = ToUnicodeIfNeeded( line_value )
+  unicode_line_value = ToUnicode( line_value )
   codepoint_column_num = len(
       str( utf8_line_value[ : column_num -1 ], 'utf8' ) ) + 1
 
diff --git a/ycmd/tests/utils_test.py b/ycmd/tests/utils_test.py
new file mode 100644
index 0000000000..f7e659268d
--- /dev/null
+++ b/ycmd/tests/utils_test.py
@@ -0,0 +1,131 @@
+#!/usr/bin/env python
+#
+# Copyright (C) 2016  ycmd contributors.
+#
+# This file is part of YouCompleteMe.
+#
+# YouCompleteMe is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# YouCompleteMe is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with YouCompleteMe.  If not, see <http://www.gnu.org/licenses/>.
+
+# Intentionally not importing unicode_literals!
+from __future__ import division
+from __future__ import print_function
+from __future__ import absolute_import
+from future import standard_library
+standard_library.install_aliases()
+from builtins import *  # noqa
+from future.utils import PY2
+
+from nose.tools import eq_, ok_
+from ycmd import utils
+
+# NOTE: isinstance() vs type() is carefully used in this test file. Before
+# changing things here, read the comments in utils.ToBytes.
+
+
+if PY2:
+  def ToBytes_Py2Bytes_test():
+    value = utils.ToBytes( bytes( 'abc' ) )
+    eq_( value, bytes( 'abc' ) )
+    eq_( type( value ), bytes )
+
+
+  def ToBytes_Py2Str_test():
+    value = utils.ToBytes( 'abc' )
+    eq_( value, bytes( 'abc' ) )
+    eq_( type( value ), bytes )
+
+
+  def ToBytes_Py2FutureStr_test():
+    value = utils.ToBytes( str( 'abc' ) )
+    eq_( value, bytes( 'abc' ) )
+    eq_( type( value ), bytes )
+
+
+  def ToBytes_Py2Unicode_test():
+    value = utils.ToBytes( u'abc' )
+    eq_( value, bytes( 'abc' ) )
+    eq_( type( value ), bytes )
+
+
+  def ToBytes_Py2Int_test():
+    value = utils.ToBytes( 123 )
+    eq_( value, bytes( '123' ) )
+    eq_( type( value ), bytes )
+
+
+def ToBytes_Bytes_test():
+  value = utils.ToBytes( bytes( b'abc' ) )
+  eq_( value, bytes( b'abc' ) )
+  eq_( type( value ), bytes )
+
+
+def ToBytes_Str_test():
+  value = utils.ToBytes( u'abc' )
+  eq_( value, bytes( b'abc' ) )
+  eq_( type( value ), bytes )
+
+
+def ToBytes_Int_test():
+  value = utils.ToBytes( 123 )
+  eq_( value, bytes( b'123' ) )
+  eq_( type( value ), bytes )
+
+
+if PY2:
+  def ToUnicode_Py2Bytes_test():
+    value = utils.ToUnicode( bytes( 'abc' ) )
+    eq_( value, u'abc' )
+    ok_( isinstance( value, str ) )
+
+
+  def ToUnicode_Py2Str_test():
+    value = utils.ToUnicode( 'abc' )
+    eq_( value, u'abc' )
+    ok_( isinstance( value, str ) )
+
+
+  def ToUnicode_Py2FutureStr_test():
+    value = utils.ToUnicode( str( 'abc' ) )
+    eq_( value, u'abc' )
+    ok_( isinstance( value, str ) )
+
+
+  def ToUnicode_Py2Unicode_test():
+    value = utils.ToUnicode( u'abc' )
+    eq_( value, u'abc' )
+    ok_( isinstance( value, str ) )
+
+
+  def ToUnicode_Py2Int_test():
+    value = utils.ToUnicode( 123 )
+    eq_( value, u'123' )
+    ok_( isinstance( value, str ) )
+
+
+def ToUnicode_Bytes_test():
+  value = utils.ToUnicode( bytes( b'abc' ) )
+  eq_( value, u'abc' )
+  ok_( isinstance( value, str ) )
+
+
+def ToUnicode_Str_test():
+  value = utils.ToUnicode( u'abc' )
+  eq_( value, u'abc' )
+  ok_( isinstance( value, str ) )
+
+
+def ToUnicode_Int_test():
+  value = utils.ToUnicode( 123 )
+  eq_( value, u'123' )
+  ok_( isinstance( value, str ) )
diff --git a/ycmd/utils.py b/ycmd/utils.py
index 4df93bd410..91e3c99d1f 100644
--- a/ycmd/utils.py
+++ b/ycmd/utils.py
@@ -65,7 +65,9 @@ def ToUtf8IfNeeded( value ):
   return bytes( str( value ) )
 
 
-def ToUnicodeIfNeeded( value ):
+# Returns a unicode type; either the new python-future str type or the real
+# unicode type. The difference shouldn't matter.
+def ToUnicode( value ):
   if isinstance( value, str ):
     return value
   if isinstance( value, bytes ):
@@ -74,6 +76,8 @@ def ToUnicodeIfNeeded( value ):
   return str( value )
 
 
+# Consistently returns the new bytes() type from python-future. Assumes incoming
+# strings are either UTF-8 or unicode (which is converted to UTF-8).
 def ToBytes( value ):
   # This is tricky. On py2, the bytes type from builtins (from python-future) is
   # a subclass of str. So all of the following are true:
@@ -81,17 +85,32 @@ def ToBytes( value ):
   #   isinstance(bytes(), str)
   # But they don't behave the same in one important aspect: iterating over a
   # bytes instance yields ints, while iterating over a (raw, py2) str yields
-  # chars.
+  # chars. We want consistent behavior so we force the use of bytes().
   if type( value ) == bytes:
     return value
 
   # This is meant to catch Python 2's str type and the str on Python 3, which is
   # unicode.
-  if isinstance( value, bytes ) or isinstance( value, str ):
-    return bytes( value, encoding = 'utf-8' )
+  if isinstance( value, bytes ):
+    return bytes( value, encoding = 'utf8' )
+
+  if isinstance( value, str ):
+    # On py2, with `from builtins import *` imported, the following is true:
+    #
+    #   bytes(str(u'abc'), 'utf8') == b"b'abc'"
+    #
+    # Obviously this is a bug in python-future. So we work around it. Also filed
+    # upstream at: https://github.com/PythonCharmers/python-future/issues/193
+    # We can't just return value.encode( 'utf8' ) on both py2 & py3 because on
+    # py2 that returns the built-in str type instead of the newbytes type from
+    # python-future.
+    if PY2:
+      return bytes( value.encode( 'utf8' ), encoding = 'utf8' )
+    else:
+      return bytes( value, encoding = 'utf8' )
 
   # This is meant to catch `int` and similar non-string/bytes types.
-  return bytes( str( value ), encoding = 'utf-8' )
+  return ToBytes( str( value ) )
 
 
 def PathToTempDir():