diff --git a/openformats/formats/json.py b/openformats/formats/json.py index 63f5ffc4..265f0928 100644 --- a/openformats/formats/json.py +++ b/openformats/formats/json.py @@ -92,8 +92,9 @@ def _extract(self, parsed, nest=None): @staticmethod def _escape_key(key): - key = key.replace(u"\\", u"\\\\") - key = key.replace(u".", u"\\.") + key = key.replace(DumbJson.BACKSLASH, + u''.join([DumbJson.BACKSLASH, DumbJson.BACKSLASH])) + key = key.replace(u".", u''.join([DumbJson.BACKSLASH, '.'])) return key def compile(self, template, stringset, **kwargs): @@ -255,20 +256,103 @@ def _get_next_string(self): except IndexError: return None + @classmethod + def escape(cls, string): + return u''.join(cls._escape_generator(string)) + # btw, this seems equivalent to + # return json.dumps(string, ensure_ascii=False)[1:-1] + @staticmethod - def escape(string): - escaped_string = string.replace( - '\\', r'\\' - ).replace( - '"', '\\"' - ).replace('\n', r'\n').replace('\r', r'\r') - return escaped_string + def _escape_generator(string): + for symbol in string: + if symbol == DumbJson.DOUBLE_QUOTES: + yield DumbJson.BACKSLASH + yield DumbJson.DOUBLE_QUOTES + elif symbol == DumbJson.BACKSLASH: + yield DumbJson.BACKSLASH + yield DumbJson.BACKSLASH + elif symbol == DumbJson.BACKSPACE: + yield DumbJson.BACKSLASH + yield u'b' + elif symbol == DumbJson.FORMFEED: + yield DumbJson.BACKSLASH + yield u'f' + elif symbol == DumbJson.NEWLINE: + yield DumbJson.BACKSLASH + yield u'n' + elif symbol == DumbJson.CARRIAGE_RETURN: + yield DumbJson.BACKSLASH + yield u'r' + elif symbol == DumbJson.TAB: + yield DumbJson.BACKSLASH + yield u't' + else: + yield symbol + + @classmethod + def unescape(cls, string): + return u''.join(cls._unescape_generator(string)) + # btw, this seems equivalent to + # return json.loads(u'"{}"'.format(string)) @staticmethod - def unescape(string): - unescaped_string = string.replace( - r'\\', '\\' - ).replace( - r'\"', '"' - ).replace(r'\n', '\n').replace(r'\r', '\r') - return unescaped_string + def _unescape_generator(string): + # I don't like this aldschool approach, but we may have to rewind a bit + ptr = 0 + while True: + if ptr >= len(string): + break + + symbol = string[ptr] + + if symbol != DumbJson.BACKSLASH: + yield symbol + ptr += 1 + continue + + try: + next_symbol = string[ptr + 1] + except IndexError: + yield DumbJson.BACKSLASH + ptr += 1 + continue + + if next_symbol in (DumbJson.DOUBLE_QUOTES, DumbJson.FORWARD_SLASH, + DumbJson.BACKSLASH): + yield next_symbol + ptr += 2 + elif next_symbol == u'b': + yield DumbJson.BACKSPACE + ptr += 2 + elif next_symbol == u'f': + yield DumbJson.FORMFEED + ptr += 2 + elif next_symbol == u'n': + yield DumbJson.NEWLINE + ptr += 2 + elif next_symbol == u'r': + yield DumbJson.CARRIAGE_RETURN + ptr += 2 + elif next_symbol == u't': + yield DumbJson.TAB + ptr += 2 + elif next_symbol == u'u': + unicode_escaped = string[ptr:ptr + 6] + try: + unescaped = unicode_escaped.decode('unicode-escape') + except Exception: + yield DumbJson.BACKSLASH + yield u'u' + ptr += 2 + continue + if len(unescaped) != 1: + yield DumbJson.BACKSLASH + yield u'u' + ptr += 2 + continue + yield unescaped + ptr += 6 + + else: + yield symbol + ptr += 1 diff --git a/openformats/utils/json.py b/openformats/utils/json.py index bdbde6ee..08a65333 100644 --- a/openformats/utils/json.py +++ b/openformats/utils/json.py @@ -54,10 +54,21 @@ class DumbJson(object): >>> assert list(DumbJson('[null]')) == [(None, 2)] """ + # Symbols + BACKSLASH = u'\\' + DOUBLE_QUOTES = u'"' + FORWARD_SLASH = u'/' + BACKSPACE = u'\b' + FORMFEED = u'\f' + NEWLINE = u'\n' + CARRIAGE_RETURN = u'\r' + TAB = u'\t' + def __init__(self, source, start=0): self.source = source self._end = None - starting_symbol, self.start = self._find_next('{[', start) + starting_symbol, self.start = self._find_next('{[', start, + require_whitespace=True) if starting_symbol == '{': self.type = dict elif starting_symbol == '[': @@ -77,29 +88,32 @@ def _iter_dict(self): start = self.start + 1 # Maybe it's an empty dict - end, end_p = self._find_next('"}', start) + end, end_p = self._find_next([self.DOUBLE_QUOTES, '}'], start, + require_whitespace=True) if end == "}": self.end = end_p return while True: # Lets find our key - _, start_key_quote_p = self._find_next('"', start) + _, start_key_quote_p = self._find_next(self.DOUBLE_QUOTES, start, + require_whitespace=True) key_p = start_key_quote_p + 1 - _, end_key_quote_p = self._find_next('"', key_p, + _, end_key_quote_p = self._find_next(self.DOUBLE_QUOTES, key_p, require_whitespace=False) key = self.source[key_p:end_key_quote_p] - _, colon_p = self._find_next(':', end_key_quote_p + 1) + _, colon_p = self._find_next(':', end_key_quote_p + 1, + require_whitespace=True) value_start_string, value_start_computed, value_start_p =\ self._process_value(colon_p + 1) # Our job in each case is to yield something and set 'next_p' to # where we should search for our next item - if value_start_string == '"': + if value_start_string == self.DOUBLE_QUOTES: # We found a string! value_p = value_start_p + 1 _, value_end_quote_p = self._find_next( - '"', value_p, require_whitespace=False + self.DOUBLE_QUOTES, value_p, require_whitespace=False ) value = self.source[value_p:value_end_quote_p] yield key, key_p, value, value_p @@ -118,7 +132,9 @@ def _iter_dict(self): # Something went wrong raise ValueError("No JSON value could be decoded") - next_symbol, next_symbol_p = self._find_next(',}', next_p) + next_symbol, next_symbol_p = self._find_next( + ',}', next_p, require_whitespace=True + ) if next_symbol == ',': start = next_symbol_p + 1 elif next_symbol == '}': @@ -144,10 +160,11 @@ def _iter_list(self): # Our job in each case is to yield something and set 'next_p' to # where we should search for our next item - if item_start_string == '"': + if item_start_string == self.DOUBLE_QUOTES: # We found a string! item_p = item_start_p + 1 - _, end_item_quote_p = self._find_next('"', item_p, + _, end_item_quote_p = self._find_next(self.DOUBLE_QUOTES, + item_p, require_whitespace=False) item = self.source[item_p:end_item_quote_p] yield item, item_p @@ -166,7 +183,9 @@ def _iter_list(self): # Something went wrong raise ValueError("No JSON value could be decoded") - next_symbol, next_symbol_p = self._find_next(',]', next_p) + next_symbol, next_symbol_p = self._find_next( + ',]', next_p, require_whitespace=True + ) if next_symbol == ',': start = next_symbol_p + 1 elif next_symbol == ']': @@ -181,14 +200,14 @@ def _find_next(self, symbols, start=0, require_whitespace=True): if candidate == '\\': after_backslash = not after_backslash if candidate in symbols: - if candidate == '"' and after_backslash: + if candidate == self.DOUBLE_QUOTES and after_backslash: after_backslash = False continue return candidate, ptr if candidate != '\\': after_backslash = False if require_whitespace and not candidate.isspace(): - newline_count = self.source.count('\n', 0, ptr) + newline_count = self.source.count(self.NEWLINE, 0, ptr) raise ValueError( u"Was expecting whitespace or one of `{symbols}` on line " u"{line_no}, found `{candidate}` instead".format( @@ -231,7 +250,7 @@ def _process_value(self, start): if match: spaces, value = match.groups() value_start = start + len(spaces) - if value in ('{', '[', '"'): + if value in ('{', '[', self.DOUBLE_QUOTES): return value, None, value_start else: # We either have true/false/null or a number of sorts @@ -252,3 +271,9 @@ def end(self): @end.setter def end(self, value): self._end = value + + +for symbol in (DumbJson.BACKSLASH, DumbJson.DOUBLE_QUOTES, + DumbJson.FORWARD_SLASH, DumbJson.BACKSPACE, DumbJson.FORMFEED, + DumbJson.NEWLINE, DumbJson.CARRIAGE_RETURN, DumbJson.TAB): + assert len(symbol) == 1