Skip to content

Commit

Permalink
Avoided raw unicode strings for Python 3 compatibility
Browse files Browse the repository at this point in the history
Raw unicode string syntax (e.g. ur'mystring') is not valid on
Python 3.
Read https://www.python.org/dev/peps/pep-0414/#exclusion-of-raw-unicode-literals
This patch uses various techniques to work around this issue: removing either
the raw or the unicode marker when it is not essential, or using unicode_literals
to mimic the Python 3 behavior.
  • Loading branch information
claudep committed Aug 19, 2015
1 parent 1cd3097 commit 1880265
Show file tree
Hide file tree
Showing 8 changed files with 91 additions and 75 deletions.
2 changes: 1 addition & 1 deletion translate/convert/test_po2ical.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def merge2ical(self, propsource, posource):

def test_simple_summary(self):
"""test that we output correctly for Inno files."""
posource = ur'''#: [uid1@example.com]SUMMARY
posource = u'''#: [uid1@example.com]SUMMARY
msgid "Value"
msgstr "Waarde"
'''
Expand Down
6 changes: 4 additions & 2 deletions translate/convert/test_po2ini.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import unicode_literals

from pytest import importorskip

from translate.convert import po2ini, test_convert
Expand Down Expand Up @@ -95,13 +97,13 @@ def test_empty_value(self):

def test_dialects_inno(self):
"""test that we output correctly for Inno files."""
posource = ur'''#: [section]prop
posource = r'''#: [section]prop
msgid "value\tvalue2\n"
msgstr "ṽḁḽṻḝ\tṽḁḽṻḝ2\n"
'''
initemplate = '''[section]\nprop = value%tvalue%n\n'''
iniexpected = '''[section]\nprop = ṽḁḽṻḝ%tṽḁḽṻḝ2%n\n'''
inifile = self.merge2ini(initemplate, posource, "inno")
inifile = self.merge2ini(initemplate, posource, "inno").decode('utf-8')
print(inifile)
assert inifile == iniexpected

Expand Down
48 changes: 25 additions & 23 deletions translate/lang/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@
- phrases
"""

from __future__ import unicode_literals

import logging
import re
import six
Expand Down Expand Up @@ -120,58 +122,58 @@ class Common(object):
"""This of languages that has different plural formula in Mozilla than the
standard one in Gettext."""

listseperator = u", "
listseperator = ", "
"""This string is used to separate lists of textual elements. Most
languages probably can stick with the default comma, but Arabic and some
Asian languages might want to override this."""

specialchars = u""
specialchars = ""
"""Characters used by the language that might not be easy to input with
common keyboard layouts"""

commonpunc = u".,;:!?-@#$%^*_()[]{}/\\'`\"<>"
commonpunc = ".,;:!?-@#$%^*_()[]{}/\\'`\"<>"
"""These punctuation marks are common in English and most languages that
use latin script."""

quotes = u"‘’‛“”„‟′″‴‵‶‷‹›«»"
quotes = "‘’‛“”„‟′″‴‵‶‷‹›«»"
"""These are different quotation marks used by various languages."""

invertedpunc = u"¿¡"
invertedpunc = "¿¡"
"""Inverted punctuation sometimes used at the beginning of sentences in
Spanish, Asturian, Galician, and Catalan."""

rtlpunc = u"،؟؛÷"
rtlpunc = "،؟؛÷"
"""These punctuation marks are used by Arabic and Persian, for example."""

CJKpunc = u"。、,;!?「」『』【】"
CJKpunc = "。、,;!?「」『』【】"
"""These punctuation marks are used in certain circumstances with CJK
languages."""

indicpunc = u"।॥॰"
indicpunc = "।॥॰"
"""These punctuation marks are used by several Indic languages."""

ethiopicpunc = u"።፤፣"
ethiopicpunc = "።፤፣"
"""These punctuation marks are used by several Ethiopic languages."""

miscpunc = u"…±°¹²³·©®×£¥€"
miscpunc = "…±°¹²³·©®×£¥€"
"""The middle dot (·) is used by Greek and Georgian."""

punctuation = u"".join([commonpunc, quotes, invertedpunc, rtlpunc, CJKpunc,
indicpunc, ethiopicpunc, miscpunc])
punctuation = "".join([commonpunc, quotes, invertedpunc, rtlpunc, CJKpunc,
indicpunc, ethiopicpunc, miscpunc])
"""We include many types of punctuation here, simply since this is only
meant to determine if something is punctuation. Hopefully we catch some
languages which might not be represented with modules. Most languages won't
need to override this."""

sentenceend = u".!?…։؟।。!?።\u06d4"
sentenceend = ".!?…։؟।。!?።\u06d4"
"""These marks can indicate a sentence end. Once again we try to account
for many languages. Most langauges won't need to override this."""

#The following tries to account for a lot of things. For the best idea of
#what works, see test_common.py. We try to ignore abbreviations, for
#example, by checking that the following sentence doesn't start with lower
#case or numbers.
sentencere = re.compile(ur"""
sentencere = re.compile(r"""
(?s) # make . also match newlines
.*? # anything, but match non-greedy
[%s] # the puntuation for sentence ending
Expand Down Expand Up @@ -243,24 +245,24 @@ def punctranslate(cls, text):
#TODO: look at po::escapeforpo() for performance idea
if not text:
return text
ellipses_end = text.endswith(u"...")
ellipses_end = text.endswith("...")
if ellipses_end:
text = text[:-3]
for source, target in six.iteritems(cls.puncdict):
text = text.replace(source, target)
if ellipses_end:
if u"..." in cls.puncdict:
text += cls.puncdict[u"..."]
if "..." in cls.puncdict:
text += cls.puncdict["..."]
else:
text += u"..."
text += "..."
# Let's account for cases where a punctuation symbol plus a space is
# replaced, but the space won't exist at the end of the source message.
# As a simple improvement for messages ending in ellipses (...), we
# test that the last character is different from the second last
# This is only relevant if the string has two characters or more
if ((text[-1] + u" " in cls.puncdict) and
if ((text[-1] + " " in cls.puncdict) and
(len(text) < 2 or text[-2] != text[-1])):
text = text[:-1] + cls.puncdict[text[-1] + u" "].rstrip()
text = text[:-1] + cls.puncdict[text[-1] + " "].rstrip()
return text

@classmethod
Expand Down Expand Up @@ -293,14 +295,14 @@ def alter_it(text):
if l > 9:
extra = cls.length_difference(l)
if extra > 0:
text = text[:extra].replace(u'\n', u'') + text
text = text[:extra].replace('\n', '') + text
else:
text = text[-extra:]
return text
expanded = []
for subtext in text.split(u"\n\n"):
for subtext in text.split("\n\n"):
expanded.append(alter_it(subtext))
text = u"\n\n".join(expanded)
text = "\n\n".join(expanded)
return text

@classmethod
Expand Down
22 changes: 12 additions & 10 deletions translate/lang/el.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
.. seealso:: http://en.wikipedia.org/wiki/Greek_language
"""

from __future__ import unicode_literals

import re

from translate.lang import common
Expand All @@ -32,9 +34,9 @@ class el(common.Common):
"""This class represents Greek."""

# Greek uses ; as the question mark and the middot (·) in place of the semicolon
sentenceend = u".!;…"
sentenceend = ".!;…"

sentencere = re.compile(ur"""
sentencere = re.compile(r"""
(?s) # make . also match newlines
.*? # anything, but match non-greedy
[%s] # the puntuation for sentence ending
Expand All @@ -43,19 +45,19 @@ class el(common.Common):
""" % sentenceend, re.VERBOSE | re.UNICODE)

puncdict = {
u"?": u";",
u";": u"·",
"?": ";",
";": "·",
}

# Valid latin characters for use as accelerators
valid_latin_accel = u"abcdefghijklmnopqrstuvwxyz" + \
u"ABCDEFGHIJKLMNOPQRSTUVWXYZ" + \
u"1234567890"
valid_latin_accel = ("abcdefghijklmnopqrstuvwxyz"
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"1234567890")

# Valid greek characters for use as accelerators (accented characters
# and "ς" omitted)
valid_greek_accel = u"αβγδεζηθικλμνξοπρστυφχψω" + \
u"ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ"
valid_greek_accel = ("αβγδεζηθικλμνξοπρστυφχψω"
"ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ")

# Valid accelerators
validaccel = u"".join([valid_latin_accel, valid_greek_accel])
validaccel = "".join([valid_latin_accel, valid_greek_accel])
20 changes: 11 additions & 9 deletions translate/lang/hy.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
.. seealso:: http://en.wikipedia.org/wiki/Armenian_language
"""

from __future__ import unicode_literals

import re

from translate.lang import common
Expand All @@ -31,14 +33,14 @@
class hy(common.Common):
"""This class represents Armenian."""

armenianpunc = u"։՝՜՞"
armenianpunc = "։՝՜՞"

punctuation = u"".join([common.Common.commonpunc, common.Common.quotes,
common.Common.miscpunc, armenianpunc])
punctuation = "".join([common.Common.commonpunc, common.Common.quotes,
common.Common.miscpunc, armenianpunc])

sentenceend = u"։՝՜…"
sentenceend = "։՝՜…"

sentencere = re.compile(ur"""
sentencere = re.compile(r"""
(?s) # make . also match newlines
.*? # anything, but match non-greedy
[%s] # the puntuation for sentence ending
Expand All @@ -47,10 +49,10 @@ class hy(common.Common):
""" % sentenceend, re.VERBOSE | re.UNICODE)

puncdict = {
u".": u"։",
u":": u"՝",
u"!": u"՜",
u"?": u"՞",
".": "։",
":": "՝",
"!": "՜",
"?": "՞",
}

ignoretests = ["startcaps", "simplecaps"]
Expand Down
27 changes: 15 additions & 12 deletions translate/storage/placeables/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ class NumberPlaceable(Ph):
"""Placeable for numbers."""

istranslatable = False
regex = re.compile(ur"[-+]?[0-9]+([\u00a0.,][0-9]+)*")
regex = re.compile(u"[-+]?[0-9]+([\u00a0.,][0-9]+)*")
parse = classmethod(regex_parse)


Expand Down Expand Up @@ -211,17 +211,20 @@ class PunctuationPlaceable(Ph):
# FIXME: this should really be a list created as the inverse of what
# is available on the translator's keyboard, or be easily expandable through
# their configuration.
regex = re.compile(ur'''([™©®]| # Marks
[℃℉°]| # Degree related
[±πθ×÷−√∞∆Σ′″]| # Maths
[‘’ʼ‚‛“”„‟]| # Quote characters
[«»]| # Guillemets
[£¥€]| # Currencies
…| # U2026 - horizontal ellipsis
—| # U2014 - em dash
–| # U2013 - en dash
[ ] # U202F - narrow no-break space
)+''', re.VERBOSE)
regex = re.compile(
u'''([™©®]| # Marks
[℃℉°]| # Degree related
[±πθ×÷−√∞∆Σ′″]| # Maths
[‘’ʼ‚‛“”„‟]| # Quote characters
[«»]| # Guillemets
[£¥€]| # Currencies
…| # U2026 - horizontal ellipsis
—| # U2014 - em dash
–| # U2013 - en dash
[ ] # U202F - narrow no-break space
)+''',
re.VERBOSE
)
parse = classmethod(regex_parse)


Expand Down
11 changes: 8 additions & 3 deletions translate/storage/test_po.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import six
from pytest import mark, raises

from translate.misc import wStringIO
Expand Down Expand Up @@ -848,7 +849,7 @@ def test_unfinished_lines(self):
assert str(pofile1) == str(pofile2)

def test_encoding_change(self):
posource = ur'''
posource = r'''
msgid ""
msgstr ""
"PO-Revision-Date: 2006-02-09 23:33+0200\n"
Expand All @@ -858,7 +859,9 @@ def test_encoding_change(self):
msgid "a"
msgstr "d"
'''.encode('iso-8859-1')
'''
if six.PY3:
posource = posource.encode()
pofile = self.poparse(posource)
unit = pofile.units[1]
unit.target = u"ḓ"
Expand All @@ -868,7 +871,7 @@ def test_encoding_change(self):

def test_istranslated(self):
"""checks that istranslated works ok."""
posource = ur'''
posource = r'''
msgid ""
msgstr ""
"PO-Revision-Date: 2006-02-09 23:33+0200\n"
Expand All @@ -880,6 +883,8 @@ def test_istranslated(self):
msgid_plural "aa"
msgstr[0] ""
'''
if six.PY2:
posource = posource.decode()
pofile = self.poparse(posource)
unit = pofile.units[1]
print(str(unit))
Expand Down

0 comments on commit 1880265

Please sign in to comment.