Avoided raw unicode strings for Python 3 compatibility

Raw unicode string syntax (e.g. ur'mystring') is not valid on Python 3; see https://www.python.org/dev/peps/pep-0414/#exclusion-of-raw-unicode-literals for details. This patch uses various techniques to work around this issue: removing either the raw or the unicode marker when it is not essential, or using unicode_literals to mimic the Python 3 behavior.
translate · Aug 19, 2015 · 1880265 · 1880265
1 parent 1cd3097
commit 1880265
Show file tree

Hide file tree

Showing 8 changed files with 91 additions and 75 deletions.
diff --git a/translate/convert/test_po2ical.py b/translate/convert/test_po2ical.py
@@ -49,7 +49,7 @@ def merge2ical(self, propsource, posource):
 
     def test_simple_summary(self):
         """test that we output correctly for Inno files."""
-        posource = ur'''#: [uid1@example.com]SUMMARY
+        posource = u'''#: [uid1@example.com]SUMMARY
 msgid "Value"
 msgstr "Waarde"
 '''

diff --git a/translate/convert/test_po2ini.py b/translate/convert/test_po2ini.py
@@ -1,6 +1,8 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
+from __future__ import unicode_literals
+
 from pytest import importorskip
 
 from translate.convert import po2ini, test_convert
@@ -95,13 +97,13 @@ def test_empty_value(self):
 
     def test_dialects_inno(self):
         """test that we output correctly for Inno files."""
-        posource = ur'''#: [section]prop
+        posource = r'''#: [section]prop
 msgid "value\tvalue2\n"
 msgstr "ṽḁḽṻḝ\tṽḁḽṻḝ2\n"
 '''
         initemplate = '''[section]\nprop  =  value%tvalue%n\n'''
         iniexpected = '''[section]\nprop  =  ṽḁḽṻḝ%tṽḁḽṻḝ2%n\n'''
-        inifile = self.merge2ini(initemplate, posource, "inno")
+        inifile = self.merge2ini(initemplate, posource, "inno").decode('utf-8')
         print(inifile)
         assert inifile == iniexpected
 

diff --git a/translate/lang/common.py b/translate/lang/common.py
@@ -60,6 +60,8 @@
   - phrases
 """
 
+from __future__ import unicode_literals
+
 import logging
 import re
 import six
@@ -120,58 +122,58 @@ class Common(object):
     """This of languages that has different plural formula in Mozilla than the
     standard one in Gettext."""
 
-    listseperator = u", "
+    listseperator = ", "
     """This string is used to separate lists of textual elements. Most
     languages probably can stick with the default comma, but Arabic and some
     Asian languages might want to override this."""
 
-    specialchars = u""
+    specialchars = ""
     """Characters used by the language that might not be easy to input with
     common keyboard layouts"""
 
-    commonpunc = u".,;:!?-@#$%^*_()[]{}/\\'`\"<>"
+    commonpunc = ".,;:!?-@#$%^*_()[]{}/\\'`\"<>"
     """These punctuation marks are common in English and most languages that
     use latin script."""
 
-    quotes = u"‘’‛“”„‟′″‴‵‶‷‹›«»"
+    quotes = "‘’‛“”„‟′″‴‵‶‷‹›«»"
     """These are different quotation marks used by various languages."""
 
-    invertedpunc = u"¿¡"
+    invertedpunc = "¿¡"
     """Inverted punctuation sometimes used at the beginning of sentences in
     Spanish, Asturian, Galician, and Catalan."""
 
-    rtlpunc = u"،؟؛÷"
+    rtlpunc = "،؟؛÷"
     """These punctuation marks are used by Arabic and Persian, for example."""
 
-    CJKpunc = u"。、，；！？「」『』【】"
+    CJKpunc = "。、，；！？「」『』【】"
     """These punctuation marks are used in certain circumstances with CJK
     languages."""
 
-    indicpunc = u"।॥॰"
+    indicpunc = "।॥॰"
     """These punctuation marks are used by several Indic languages."""
 
-    ethiopicpunc = u"።፤፣"
+    ethiopicpunc = "።፤፣"
     """These punctuation marks are used by several Ethiopic languages."""
 
-    miscpunc = u"…±°¹²³·©®×£¥€"
+    miscpunc = "…±°¹²³·©®×£¥€"
     """The middle dot (·) is used by Greek and Georgian."""
 
-    punctuation = u"".join([commonpunc, quotes, invertedpunc, rtlpunc, CJKpunc,
-                            indicpunc, ethiopicpunc, miscpunc])
+    punctuation = "".join([commonpunc, quotes, invertedpunc, rtlpunc, CJKpunc,
+                           indicpunc, ethiopicpunc, miscpunc])
     """We include many types of punctuation here, simply since this is only
     meant to determine if something is punctuation. Hopefully we catch some
     languages which might not be represented with modules. Most languages won't
     need to override this."""
 
-    sentenceend = u".!?…։؟।。！？።\u06d4"
+    sentenceend = ".!?…։؟।。！？።\u06d4"
     """These marks can indicate a sentence end. Once again we try to account
     for many languages. Most langauges won't need to override this."""
 
     #The following tries to account for a lot of things. For the best idea of
     #what works, see test_common.py. We try to ignore abbreviations, for
     #example, by checking that the following sentence doesn't start with lower
     #case or numbers.
-    sentencere = re.compile(ur"""
+    sentencere = re.compile(r"""
         (?s)        # make . also match newlines
         .*?         # anything, but match non-greedy
         [%s]        # the puntuation for sentence ending
@@ -243,24 +245,24 @@ def punctranslate(cls, text):
         #TODO: look at po::escapeforpo() for performance idea
         if not text:
             return text
-        ellipses_end = text.endswith(u"...")
+        ellipses_end = text.endswith("...")
         if ellipses_end:
             text = text[:-3]
         for source, target in six.iteritems(cls.puncdict):
             text = text.replace(source, target)
         if ellipses_end:
-            if u"..." in cls.puncdict:
-                text += cls.puncdict[u"..."]
+            if "..." in cls.puncdict:
+                text += cls.puncdict["..."]
             else:
-                text += u"..."
+                text += "..."
         # Let's account for cases where a punctuation symbol plus a space is
         # replaced, but the space won't exist at the end of the source message.
         # As a simple improvement for messages ending in ellipses (...), we
         # test that the last character is different from the second last
         # This is only relevant if the string has two characters or more
-        if ((text[-1] + u" " in cls.puncdict) and
+        if ((text[-1] + " " in cls.puncdict) and
             (len(text) < 2 or text[-2] != text[-1])):
-            text = text[:-1] + cls.puncdict[text[-1] + u" "].rstrip()
+            text = text[:-1] + cls.puncdict[text[-1] + " "].rstrip()
         return text
 
     @classmethod
@@ -293,14 +295,14 @@ def alter_it(text):
             if l > 9:
                 extra = cls.length_difference(l)
                 if extra > 0:
-                    text = text[:extra].replace(u'\n', u'') + text
+                    text = text[:extra].replace('\n', '') + text
                 else:
                     text = text[-extra:]
             return text
         expanded = []
-        for subtext in text.split(u"\n\n"):
+        for subtext in text.split("\n\n"):
             expanded.append(alter_it(subtext))
-        text = u"\n\n".join(expanded)
+        text = "\n\n".join(expanded)
         return text
 
     @classmethod

diff --git a/translate/lang/el.py b/translate/lang/el.py
@@ -23,6 +23,8 @@
 .. seealso:: http://en.wikipedia.org/wiki/Greek_language
 """
 
+from __future__ import unicode_literals
+
 import re
 
 from translate.lang import common
@@ -32,9 +34,9 @@ class el(common.Common):
     """This class represents Greek."""
 
     # Greek uses ; as question mark and the middot instead
-    sentenceend = u".!;…"
+    sentenceend = ".!;…"
 
-    sentencere = re.compile(ur"""
+    sentencere = re.compile(r"""
         (?s)        # make . also match newlines
         .*?         # anything, but match non-greedy
         [%s]        # the puntuation for sentence ending
@@ -43,19 +45,19 @@ class el(common.Common):
         """ % sentenceend, re.VERBOSE | re.UNICODE)
 
     puncdict = {
-        u"?": u";",
-        u";": u"·",
+        "?": ";",
+        ";": "·",
     }
 
     # Valid latin characters for use as accelerators
-    valid_latin_accel = u"abcdefghijklmnopqrstuvwxyz" + \
-                        u"ABCDEFGHIJKLMNOPQRSTUVWXYZ" + \
-                        u"1234567890"
+    valid_latin_accel = ("abcdefghijklmnopqrstuvwxyz"
+                         "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+                         "1234567890")
 
     # Valid greek characters for use as accelerators (accented characters
     # and "ς" omitted)
-    valid_greek_accel = u"αβγδεζηθικλμνξοπρστυφχψω" + \
-                        u"ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ"
+    valid_greek_accel = ("αβγδεζηθικλμνξοπρστυφχψω"
+                         "ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ")
 
     # Valid accelerators
-    validaccel = u"".join([valid_latin_accel, valid_greek_accel])
+    validaccel = "".join([valid_latin_accel, valid_greek_accel])
diff --git a/translate/lang/hy.py b/translate/lang/hy.py
@@ -23,6 +23,8 @@
 .. seealso:: http://en.wikipedia.org/wiki/Armenian_language
 """
 
+from __future__ import unicode_literals
+
 import re
 
 from translate.lang import common
@@ -31,14 +33,14 @@
 class hy(common.Common):
     """This class represents Armenian."""
 
-    armenianpunc = u"։՝՜՞"
+    armenianpunc = "։՝՜՞"
 
-    punctuation = u"".join([common.Common.commonpunc, common.Common.quotes,
-                            common.Common.miscpunc, armenianpunc])
+    punctuation = "".join([common.Common.commonpunc, common.Common.quotes,
+                           common.Common.miscpunc, armenianpunc])
 
-    sentenceend = u"։՝՜…"
+    sentenceend = "։՝՜…"
 
-    sentencere = re.compile(ur"""
+    sentencere = re.compile(r"""
         (?s)        # make . also match newlines
         .*?         # anything, but match non-greedy
         [%s]        # the puntuation for sentence ending
@@ -47,10 +49,10 @@ class hy(common.Common):
         """ % sentenceend, re.VERBOSE | re.UNICODE)
 
     puncdict = {
-        u".": u"։",
-        u":": u"՝",
-        u"!": u"՜",
-        u"?": u"՞",
+        ".": "։",
+        ":": "՝",
+        "!": "՜",
+        "?": "՞",
     }
 
     ignoretests = ["startcaps", "simplecaps"]

diff --git a/translate/storage/placeables/general.py b/translate/storage/placeables/general.py
@@ -71,7 +71,7 @@ class NumberPlaceable(Ph):
     """Placeable for numbers."""
 
     istranslatable = False
-    regex = re.compile(ur"[-+]?[0-9]+([\u00a0.,][0-9]+)*")
+    regex = re.compile(u"[-+]?[0-9]+([\u00a0.,][0-9]+)*")
     parse = classmethod(regex_parse)
 
 
@@ -211,17 +211,20 @@ class PunctuationPlaceable(Ph):
     # FIXME this should really be a list created as being the inverse of what
     # is available on the translators keyboard.  Or easily expanded by their
     # configuration.
-    regex = re.compile(ur'''([™©®]|          # Marks
-                             [℃℉°]|          # Degree related
-                             [±πθ×÷−√∞∆Σ′″]| # Maths
-                             [‘’ʼ‚‛“”„‟]|    # Quote characters
-                             [«»]|           # Guillemets
-                             [£¥€]|          # Currencies
-                             …|              # U2026 - horizontal ellipsis
-                             —|              # U2014 - em dash
-                             –|              # U2013 - en dash
-                             [ ]             # U202F - narrow no-break space
-                            )+''', re.VERBOSE)
+    regex = re.compile(
+        u'''([™©®]|          # Marks
+             [℃℉°]|          # Degree related
+             [±πθ×÷−√∞∆Σ′″]| # Maths
+             [‘’ʼ‚‛“”„‟]|    # Quote characters
+             [«»]|           # Guillemets
+             [£¥€]|          # Currencies
+             …|              # U2026 - horizontal ellipsis
+             —|              # U2014 - em dash
+             –|              # U2013 - en dash
+             [ ]             # U202F - narrow no-break space
+            )+''',
+        re.VERBOSE
+    )
     parse = classmethod(regex_parse)
 
 

diff --git a/translate/storage/test_po.py b/translate/storage/test_po.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 
+import six
 from pytest import mark, raises
 
 from translate.misc import wStringIO
@@ -848,7 +849,7 @@ def test_unfinished_lines(self):
         assert str(pofile1) == str(pofile2)
 
     def test_encoding_change(self):
-        posource = ur'''
+        posource = r'''
 msgid ""
 msgstr ""
 "PO-Revision-Date: 2006-02-09 23:33+0200\n"
@@ -858,7 +859,9 @@ def test_encoding_change(self):
 
 msgid "a"
 msgstr "d"
-'''.encode('iso-8859-1')
+'''
+        if six.PY3:
+            posource = posource.encode()
         pofile = self.poparse(posource)
         unit = pofile.units[1]
         unit.target = u"ḓ"
@@ -868,7 +871,7 @@ def test_encoding_change(self):
 
     def test_istranslated(self):
         """checks that istranslated works ok."""
-        posource = ur'''
+        posource = r'''
 msgid ""
 msgstr ""
 "PO-Revision-Date: 2006-02-09 23:33+0200\n"
@@ -880,6 +883,8 @@ def test_istranslated(self):
 msgid_plural "aa"
 msgstr[0] ""
 '''
+        if six.PY2:
+            posource = posource.decode()
         pofile = self.poparse(posource)
         unit = pofile.units[1]
         print(str(unit))