Skip to content

Commit

Permalink
Reimplemented JSON (un)escaping
Browse files Browse the repository at this point in the history
  • Loading branch information
kbairak committed Feb 8, 2017
1 parent bb00d39 commit 4dea4a8
Show file tree
Hide file tree
Showing 2 changed files with 139 additions and 30 deletions.
116 changes: 100 additions & 16 deletions openformats/formats/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,9 @@ def _extract(self, parsed, nest=None):

@staticmethod
def _escape_key(key):
    """Escape a single key so it can be used as one segment of a dotted path.

    Backslashes are doubled and dots are prefixed with a backslash.

    :param key: the raw key text.
    :return: the escaped key.
    """
    backslash = DumbJson.BACKSLASH
    # Backslashes have to be doubled first; otherwise the backslashes
    # introduced in front of the dots below would be doubled as well.
    key = key.replace(backslash, backslash * 2)
    return key.replace(u".", backslash + u".")

def compile(self, template, stringset, **kwargs):
Expand Down Expand Up @@ -255,20 +256,103 @@ def _get_next_string(self):
except IndexError:
return None

@classmethod
def escape(cls, string):
    """Return `string` with the escapes produced by `_escape_generator`.

    :param string: the raw text to escape.
    :return: the escaped pieces joined into a single unicode string.
    """
    # NOTE: this looks close to
    #     json.dumps(string, ensure_ascii=False)[1:-1]
    # but the equivalence has not been verified here.
    escaped_pieces = cls._escape_generator(string)
    return u''.join(escaped_pieces)

@staticmethod
def _escape_generator(string):
    r"""Yield the pieces of `string` escaped for use inside a JSON string.

    Double quotes and backslashes are prefixed with a backslash; backspace,
    form feed, newline, carriage return and tab use their short forms
    (\b, \f, \n, \r, \t). Any other character below U+0020 is emitted as a
    six-character \u00XX escape, because JSON does not allow raw control
    characters inside strings. Everything else is yielded unchanged.

    :param string: the raw text to escape.
    :return: a generator of unicode fragments; join them to get the result.
    """
    # Character -> the symbol that follows the backslash in its short escape.
    short_escapes = {
        DumbJson.DOUBLE_QUOTES: DumbJson.DOUBLE_QUOTES,
        DumbJson.BACKSLASH: DumbJson.BACKSLASH,
        DumbJson.BACKSPACE: u'b',
        DumbJson.FORMFEED: u'f',
        DumbJson.NEWLINE: u'n',
        DumbJson.CARRIAGE_RETURN: u'r',
        DumbJson.TAB: u't',
    }
    for symbol in string:
        if symbol in short_escapes:
            yield DumbJson.BACKSLASH
            yield short_escapes[symbol]
        elif ord(symbol) < 0x20:
            # Control character without a short form.
            yield u'\\u{:04x}'.format(ord(symbol))
        else:
            yield symbol

@classmethod
def unescape(cls, string):
    """Return `string` with the escapes decoded by `_unescape_generator`.

    :param string: the escaped text.
    :return: the decoded pieces joined into a single unicode string.
    """
    # NOTE: this looks close to
    #     json.loads(u'"{}"'.format(string))
    # but the equivalence has not been verified here.
    unescaped_pieces = cls._unescape_generator(string)
    return u''.join(unescaped_pieces)

@staticmethod
def _unescape_generator(string):
    r"""Yield the characters of `string` with JSON escapes decoded.

    Handles the two-character escapes (\", \/, \\, \b, \f, \n, \r, \t) and
    the six-character \uXXXX form. A high surrogate escape immediately
    followed by a low surrogate escape is combined into one character.

    The function is deliberately lenient: a trailing backslash, an unknown
    escape or a malformed \u sequence is passed through verbatim instead of
    raising.

    :param string: the escaped text.
    :return: a generator of unicode fragments; join them to get the result.
    """
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3

    # Symbol following the backslash -> the character it stands for.
    simple_escapes = {
        DumbJson.DOUBLE_QUOTES: DumbJson.DOUBLE_QUOTES,
        DumbJson.FORWARD_SLASH: DumbJson.FORWARD_SLASH,
        DumbJson.BACKSLASH: DumbJson.BACKSLASH,
        u'b': DumbJson.BACKSPACE,
        u'f': DumbJson.FORMFEED,
        u'n': DumbJson.NEWLINE,
        u'r': DumbJson.CARRIAGE_RETURN,
        u't': DumbJson.TAB,
    }
    hex_digits = frozenset(u'0123456789abcdefABCDEF')

    def read_code_unit(position):
        r"""Return the value of a \uXXXX escape at `position`, else None."""
        chunk = string[position:position + 6]
        if len(chunk) != 6:
            return None
        if chunk[0] != DumbJson.BACKSLASH or chunk[1] != u'u':
            return None
        digits = chunk[2:]
        # Checked by hand because int() would also accept signs, spaces etc.
        if any(digit not in hex_digits for digit in digits):
            return None
        return int(digits, 16)

    # An explicit pointer is used (rather than plain iteration) because an
    # escape consumes a variable number of characters.
    length = len(string)
    ptr = 0
    while ptr < length:
        symbol = string[ptr]

        if symbol != DumbJson.BACKSLASH:
            yield symbol
            ptr += 1
            continue

        if ptr + 1 >= length:
            # Lone backslash at the very end: keep it as it is.
            yield DumbJson.BACKSLASH
            ptr += 1
            continue

        next_symbol = string[ptr + 1]

        if next_symbol in simple_escapes:
            yield simple_escapes[next_symbol]
            ptr += 2
        elif next_symbol == u'u':
            code_unit = read_code_unit(ptr)
            if code_unit is None:
                # Malformed escape: emit the prefix verbatim and carry on
                # with whatever follows it.
                yield DumbJson.BACKSLASH
                yield u'u'
                ptr += 2
                continue

            if 0xD800 <= code_unit <= 0xDBFF:
                low = read_code_unit(ptr + 6)
                if low is not None and 0xDC00 <= low <= 0xDFFF:
                    combined = (0x10000 +
                                ((code_unit - 0xD800) << 10) +
                                (low - 0xDC00))
                    try:
                        character = to_char(combined)
                    except ValueError:
                        # Narrow Python 2 build: characters outside the
                        # BMP are stored as the surrogate pair itself.
                        character = to_char(code_unit) + to_char(low)
                    yield character
                    ptr += 12
                    continue

            yield to_char(code_unit)
            ptr += 6
        else:
            # Unknown escape: keep the backslash; the following character
            # is handled as a regular one on the next iteration.
            yield symbol
            ptr += 1
53 changes: 39 additions & 14 deletions openformats/utils/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,21 @@ class DumbJson(object):
>>> assert list(DumbJson('[null]')) == [(None, 2)]
"""

# Symbols
# Single-character constants for the characters that get special treatment
# when scanning, escaping or unescaping JSON strings. A loop at the end of
# this module asserts that each of them is exactly one character long.
BACKSLASH = u'\\'
DOUBLE_QUOTES = u'"'
FORWARD_SLASH = u'/'
BACKSPACE = u'\b'
FORMFEED = u'\f'
NEWLINE = u'\n'
CARRIAGE_RETURN = u'\r'
TAB = u'\t'

def __init__(self, source, start=0):
self.source = source
self._end = None
starting_symbol, self.start = self._find_next('{[', start)
starting_symbol, self.start = self._find_next('{[', start,
require_whitespace=True)
if starting_symbol == '{':
self.type = dict
elif starting_symbol == '[':
Expand All @@ -77,29 +88,32 @@ def _iter_dict(self):
start = self.start + 1

# Maybe it's an empty dict
end, end_p = self._find_next('"}', start)
end, end_p = self._find_next([self.DOUBLE_QUOTES, '}'], start,
require_whitespace=True)
if end == "}":
self.end = end_p
return

while True:
# Lets find our key
_, start_key_quote_p = self._find_next('"', start)
_, start_key_quote_p = self._find_next(self.DOUBLE_QUOTES, start,
require_whitespace=True)
key_p = start_key_quote_p + 1
_, end_key_quote_p = self._find_next('"', key_p,
_, end_key_quote_p = self._find_next(self.DOUBLE_QUOTES, key_p,
require_whitespace=False)
key = self.source[key_p:end_key_quote_p]
_, colon_p = self._find_next(':', end_key_quote_p + 1)
_, colon_p = self._find_next(':', end_key_quote_p + 1,
require_whitespace=True)
value_start_string, value_start_computed, value_start_p =\
self._process_value(colon_p + 1)

# Our job in each case is to yield something and set 'next_p' to
# where we should search for our next item
if value_start_string == '"':
if value_start_string == self.DOUBLE_QUOTES:
# We found a string!
value_p = value_start_p + 1
_, value_end_quote_p = self._find_next(
'"', value_p, require_whitespace=False
self.DOUBLE_QUOTES, value_p, require_whitespace=False
)
value = self.source[value_p:value_end_quote_p]
yield key, key_p, value, value_p
Expand All @@ -118,7 +132,9 @@ def _iter_dict(self):
# Something went wrong
raise ValueError("No JSON value could be decoded")

next_symbol, next_symbol_p = self._find_next(',}', next_p)
next_symbol, next_symbol_p = self._find_next(
',}', next_p, require_whitespace=True
)
if next_symbol == ',':
start = next_symbol_p + 1
elif next_symbol == '}':
Expand All @@ -144,10 +160,11 @@ def _iter_list(self):

# Our job in each case is to yield something and set 'next_p' to
# where we should search for our next item
if item_start_string == '"':
if item_start_string == self.DOUBLE_QUOTES:
# We found a string!
item_p = item_start_p + 1
_, end_item_quote_p = self._find_next('"', item_p,
_, end_item_quote_p = self._find_next(self.DOUBLE_QUOTES,
item_p,
require_whitespace=False)
item = self.source[item_p:end_item_quote_p]
yield item, item_p
Expand All @@ -166,7 +183,9 @@ def _iter_list(self):
# Something went wrong
raise ValueError("No JSON value could be decoded")

next_symbol, next_symbol_p = self._find_next(',]', next_p)
next_symbol, next_symbol_p = self._find_next(
',]', next_p, require_whitespace=True
)
if next_symbol == ',':
start = next_symbol_p + 1
elif next_symbol == ']':
Expand All @@ -181,14 +200,14 @@ def _find_next(self, symbols, start=0, require_whitespace=True):
if candidate == '\\':
after_backslash = not after_backslash
if candidate in symbols:
if candidate == '"' and after_backslash:
if candidate == self.DOUBLE_QUOTES and after_backslash:
after_backslash = False
continue
return candidate, ptr
if candidate != '\\':
after_backslash = False
if require_whitespace and not candidate.isspace():
newline_count = self.source.count('\n', 0, ptr)
newline_count = self.source.count(self.NEWLINE, 0, ptr)
raise ValueError(
u"Was expecting whitespace or one of `{symbols}` on line "
u"{line_no}, found `{candidate}` instead".format(
Expand Down Expand Up @@ -231,7 +250,7 @@ def _process_value(self, start):
if match:
spaces, value = match.groups()
value_start = start + len(spaces)
if value in ('{', '[', '"'):
if value in ('{', '[', self.DOUBLE_QUOTES):
return value, None, value_start
else:
# We either have true/false/null or a number of sorts
Expand All @@ -252,3 +271,9 @@ def end(self):
@end.setter
def end(self, value):
    """Store `value` in the backing `_end` attribute."""
    self._end = value


# Import-time sanity check: every symbol constant must be exactly one
# character long, since the code compares them against single characters.
for symbol in (
    DumbJson.BACKSLASH,
    DumbJson.DOUBLE_QUOTES,
    DumbJson.FORWARD_SLASH,
    DumbJson.BACKSPACE,
    DumbJson.FORMFEED,
    DumbJson.NEWLINE,
    DumbJson.CARRIAGE_RETURN,
    DumbJson.TAB,
):
    assert len(symbol) == 1

0 comments on commit 4dea4a8

Please sign in to comment.