Skip to content

Commit

Permalink
quote: performance improvements, code cleanups
Browse files Browse the repository at this point in the history
- use lists to generate output and join in the end instead of building
  intermediate strings using +=
- use f-strings where applicable
- add type hints
  • Loading branch information
nijel committed Mar 7, 2024
1 parent ee2e6a6 commit 78c6ac9
Showing 1 changed file with 64 additions and 58 deletions.
122 changes: 64 additions & 58 deletions translate/misc/quote.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,14 @@
delimiters.
"""

from __future__ import annotations

import html.entities
import logging
import re


def find_all(searchin, substr):
def find_all(searchin: str, substr: str) -> list[int]:
"""
Returns a list of locations where substr occurs in searchin; locations
are not allowed to overlap.
Expand Down Expand Up @@ -236,14 +238,14 @@ def extractwithoutquotes(


# TODO: investigate if ord is needed
def _encode_entity_char(char: str, codepoint2name: dict[int, str]) -> str:
    """
    Return the entity reference for ``char``, or ``char`` itself.

    :param char: A single character (it is passed to :func:`ord`)
    :param codepoint2name: Dictionary mapping integer code points to entity
                           names (without the leading ``&`` or the trailing
                           ``;``)
    :return: ``&name;`` when the code point of ``char`` is a key of
             ``codepoint2name``, otherwise ``char`` unchanged
    """
    # The table is keyed by integer code point, hence the ord() conversion.
    charnum = ord(char)
    if charnum in codepoint2name:
        return f"&{codepoint2name[charnum]};"
    return char


def entityencode(source, codepoint2name):
def entityencode(source: str, codepoint2name: dict[str, str]) -> str:
"""
Encode ``source`` using entities from ``codepoint2name``.
Expand All @@ -252,7 +254,7 @@ def entityencode(source, codepoint2name):
(without the leading ``&`` or the trailing ``;``)
:type codepoint2name: :meth:`dict`
"""
output = ""
output = []
inentity = False
for char in source:
if char == "&":
Expand All @@ -261,27 +263,27 @@ def entityencode(source, codepoint2name):
continue
if inentity:
if char == ";":
output += "&" + possibleentity + ";"
output.append(f"&{possibleentity};")
inentity = False
elif char == " ":
output += _encode_entity_char("&", codepoint2name) + entityencode(
possibleentity + char, codepoint2name
)
output.append(_encode_entity_char("&", codepoint2name))
output.append(entityencode(possibleentity + char, codepoint2name))
inentity = False
else:
possibleentity += char
else:
output += _encode_entity_char(char, codepoint2name)
output.append(_encode_entity_char(char, codepoint2name))
if inentity:
# Handle nonentities at end of string.
output += _encode_entity_char("&", codepoint2name) + entityencode(
possibleentity, codepoint2name
output.append(
_encode_entity_char("&", codepoint2name)
+ entityencode(possibleentity, codepoint2name)
)

return output
return "".join(output)


def _has_entity_end(source):
def _has_entity_end(source: str) -> bool:
for char in source:
if char == ";":
return True
Expand All @@ -290,7 +292,7 @@ def _has_entity_end(source):
return False


def entitydecode(source, name2codepoint):
def entitydecode(source: str, name2codepoint: dict[str, str]) -> str:
"""
Decode ``source`` using entities from ``name2codepoint``.
Expand All @@ -299,7 +301,7 @@ def entitydecode(source, name2codepoint):
the leading ``&`` or the trailing ``;``) to code points
:type name2codepoint: :meth:`dict`
"""
output = ""
output = []
inentity = False
for i, char in enumerate(source):
char = source[i]
Expand All @@ -312,24 +314,24 @@ def entitydecode(source, name2codepoint):
if len(possibleentity) > 0 and possibleentity in name2codepoint:
entchar = chr(name2codepoint[possibleentity])
if entchar == "&" and _has_entity_end(source[i + 1 :]):
output += "&" + possibleentity + ";"
output.append(f"&{possibleentity};")
else:
output += entchar
output.append(entchar)
inentity = False
else:
output += "&" + possibleentity + ";"
output.append(f"&{possibleentity};")
inentity = False
elif char == " ":
output += "&" + possibleentity + char
output.append(f"&{possibleentity}{char}")
inentity = False
else:
possibleentity += char
else:
output += char
output.append(char)
if inentity:
# Handle nonentities at end of string.
output += "&" + possibleentity
return output
output.append(f"&{possibleentity}")
return "".join(output)


def htmlentityencode(source):
Expand All @@ -341,7 +343,7 @@ def htmlentityencode(source):
return entityencode(source, html.entities.codepoint2name)


def htmlentitydecode(source):
def htmlentitydecode(source: str) -> str:
"""
Decode source using HTML entities e.g. ``&copy;`` -> ©.
Expand All @@ -350,55 +352,55 @@ def htmlentitydecode(source):
return entitydecode(source, html.entities.name2codepoint)


def javapropertiesencode(source: str) -> str:
    """
    Encodes source in the escaped-unicode encoding used by Java
    .properties files.

    A leading space is prefixed with a backslash, characters found in
    ``controlchars`` are replaced by their mapped escape, other ASCII
    characters are copied verbatim and everything else is written as an
    upper-case ``\\uXXXX`` escape.

    :param source: Text to encode
    :return: The encoded text
    """
    output = []
    if source.startswith(" "):
        output.append("\\")
    for char in source:
        if char in controlchars:
            output.append(controlchars[char])
            continue
        charnum = ord(char)
        if charnum < 128:
            output.append(char)
        else:
            # ``04X`` is a minimum width: code points above U+FFFF yield more
            # than four hex digits.
            # NOTE(review): confirm consumers of the output accept that form.
            output.append(f"\\u{charnum:04X}")
    return "".join(output)


def java_utf8_properties_encode(source: str) -> str:
    """
    Encodes source in the escaped-unicode encoding used by java utf-8
    .properties files.

    Only characters found in ``controlchars`` are replaced (by their mapped
    escape); every other character, ASCII or not, is kept as is.

    :param source: Text to encode
    :return: The encoded text
    """
    return "".join(controlchars.get(char, char) for char in source)


def xwiki_properties_encode(source: str, encoding: str) -> str:
    """
    Encode ``source`` for an XWiki .properties file.

    When ``source`` contains a numbered placeholder such as ``{0}``, every
    single quote is doubled before encoding.

    :param source: Text to encode
    :param encoding: ``"utf-8"`` selects :func:`java_utf8_properties_encode`;
                     any other value selects :func:`javapropertiesencode`
    :return: The encoded text
    """
    if re.search(r"\{[0-9]+\}", source):
        source = source.replace("'", "''")
    if encoding == "utf-8":
        return java_utf8_properties_encode(source)
    return javapropertiesencode(source)


def escapespace(char: str) -> str:
    """
    Return ``char`` as an upper-case ``\\uXXXX`` escape if it is whitespace.

    :param char: Exactly one character (checked with an ``assert``)
    :return: The escape sequence for a whitespace character, otherwise
             ``char`` unchanged
    """
    assert len(char) == 1
    if char.isspace():
        return f"\\u{ord(char):04X}"
    return char


def mozillaescapemarginspaces(source):
def mozillaescapemarginspaces(source: str) -> str:
"""Escape leading and trailing spaces for Mozilla .properties files."""
if not source:
return ""
Expand Down Expand Up @@ -436,14 +438,14 @@ def mozillaescapemarginspaces(source):
}


def escapecontrols(source: str) -> str:
    """
    Escape control characters in the given string.

    Each key of ``controlchars`` is replaced by its mapped value, one
    ``str.replace`` pass per entry, in the mapping's iteration order.

    :param source: Text to escape
    :return: The escaped text
    """
    for key, value in controlchars.items():
        source = source.replace(key, value)
    return source


def propertiesdecode(source):
def propertiesdecode(source: str) -> str:
"""
Decodes source from the escaped-unicode encoding used by .properties
files.
Expand All @@ -454,7 +456,7 @@ def propertiesdecode(source):
don't want to, we reimplemented the algorithm from Python Objects/unicode.c
in Python and modified it to retain escaped control characters.
"""
output = ""
output = []
s = 0

def unichr2(i):
Expand All @@ -468,27 +470,27 @@ def unichr2(i):
# we just return the character, unescaped
# if people want to escape them they can use escapecontrols
return chr(i)
return "\\u%04x" % i
return f"\\u{i:04x}"

while s < len(source):
c = source[s]
if c != "\\":
output += c
output.append(c)
s += 1
continue
s += 1
if s >= len(source):
# this is an escape at the end of the line, which implies
# a continuation..., return the escape to inform the parser
output += c
output.append(c)
continue
c = source[s]
s += 1
if c == "\n":
pass
# propertyescapes lookups
elif c in propertyescapes:
output += propertyescapes[c]
output.append(propertyescapes[c])
# \uXXXX escapes
# \UXXXX escapes
elif c in "uU":
Expand All @@ -509,46 +511,48 @@ def unichr2(i):
digits = digit
break
s += digits
output += unichr2(x)
output.append(unichr2(x))
elif c == "N":
if source[s] != "{":
logging.warning("Invalid named unicode escape: no { after \\N")
output += "\\" + c
output.append(f"\\{c}")
continue
s += 1
e = source.find("}", s)
if e == -1:
logging.warning("Invalid named unicode escape: no } after \\N{")
output += "\\" + c
output.append(f"\\{c}")
continue
import unicodedata

name = source[s:e]
output += unicodedata.lookup(name)
output.append(unicodedata.lookup(name))
s = e + 1
else:
output += c # Drop any \ that we don't specifically handle
return output
output.append(c) # Drop any \ that we don't specifically handle
return "".join(output)


def xwiki_properties_decode(source: str) -> str:
    """
    Decode ``source`` read from an XWiki .properties file.

    When ``source`` contains a numbered placeholder such as ``{0}``, doubled
    single quotes are collapsed to one before the text is handed to
    :func:`propertiesdecode`.

    :param source: Text to decode
    :return: The decoded text
    """
    if re.search(r"\{[0-9]+\}", source):
        source = source.replace("''", "'")
    return propertiesdecode(source)


def findend(string: str, substring: str) -> int:
    """
    Return the index just past the first occurrence of ``substring``.

    :param string: Text to search in
    :param substring: Text to search for
    :return: End index of the first match, or ``-1`` when ``substring`` does
             not occur in ``string``
    """
    s = string.find(substring)
    if s != -1:
        s += len(substring)
    return s


def rstripeol(string: str) -> str:
    """
    Return ``string`` with all trailing ``\\r`` and ``\\n`` characters removed.

    :param string: Text to strip
    :return: The text without trailing end-of-line characters
    """
    return string.rstrip("\r\n")


def stripcomment(comment, startstring="<!--", endstring="-->"):
def stripcomment(
comment: str, startstring: str = "<!--", endstring: str = "-->"
) -> str:
cstart = comment.find(startstring)
if cstart == -1:
cstart = 0
Expand All @@ -558,5 +562,7 @@ def stripcomment(comment, startstring="<!--", endstring="-->"):
return comment[cstart:cend].strip()


def unstripcomment(
    comment: str, startstring: str = "<!-- ", endstring: str = " -->\n"
) -> str:
    """
    Wrap ``comment`` in comment delimiters.

    :param comment: Comment text; surrounding whitespace is stripped first
    :param startstring: Text placed before the comment
    :param endstring: Text placed after the comment
    :return: ``startstring`` + stripped ``comment`` + ``endstring``
    """
    return startstring + comment.strip() + endstring

0 comments on commit 78c6ac9

Please sign in to comment.