ultrajson · Erotemic · Apr 17, 2022 · Apr 4, 2022 · Apr 5, 2022 · Apr 5, 2022
diff --git a/lib/ultrajson.h b/lib/ultrajson.h
@@ -258,7 +258,10 @@ typedef struct __JSONObjectEncoder
 
   /*
   Configuration for spaces of indent */
-  int indent;
+  int indentLength;
+  const char* indentChars;
+  int indentIsSpace;  // encodes if the indent is given in indentChars or if it should just be pure spaces
+  int indentEnabled;  // the user can request an indent of length 0. This encodes if the indent is enabled or not.
 
   /*
   If true, NaN will be encoded as a string matching the Python standard library's JSON behavior.
@@ -300,18 +303,21 @@ obj - An anonymous type representing the object
 enc - Function definitions for querying JSOBJ type
 buffer - Preallocated buffer to store result in. If NULL function allocates own buffer
 cbBuffer - Length of buffer (ignored if buffer is NULL)
+outLen - Will store the length of the encoded string
 
 Returns:
-Encoded JSON object as a null terminated char string.
+Encoded JSON object as a char string of outLen bytes. The string is NOT null terminated.
 
 NOTE:
 If the supplied buffer wasn't enough to hold the result the function will allocate a new buffer.
 Life cycle of the provided buffer must still be handled by caller.
 
 If the return value doesn't equal the specified buffer caller must release the memory using
 JSONObjectEncoder.free or free() as specified when calling this function.
+
+If an error occurs during encoding, NULL is returned and no outLen is stored.
 */
-EXPORTFUNCTION char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *buffer, size_t cbBuffer);
+EXPORTFUNCTION char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *buffer, size_t cbBuffer, size_t *outLen);
 
 typedef struct __JSONObjectDecoder
 {

diff --git a/lib/ultrajsonenc.c b/lib/ultrajsonenc.c
@@ -57,6 +57,8 @@ Numeric decoder derived from from TCL library
 #define snprintf sprintf_s
 #endif
 
+
+
 /*
 Worst cases being:
 
@@ -544,16 +546,27 @@ static FASTCALL_ATTR INLINE_PREFIX void FASTCALL_MSVC strreverse(char* begin, ch
 
 static void Buffer_AppendIndentNewlineUnchecked(JSONObjectEncoder *enc)
 {
-  if (enc->indent > 0) Buffer_AppendCharUnchecked(enc, '\n');
+  if (enc->indentEnabled) Buffer_AppendCharUnchecked(enc, '\n');  // keyed on indentEnabled, not indentLength, so a requested zero-length indent still gets line breaks
 }
 
 static void Buffer_AppendIndentUnchecked(JSONObjectEncoder *enc, JSINT32 value)
 {
   int i;
-  if (enc->indent > 0)
-    while (value-- > 0)
-      for (i = 0; i < enc->indentLength; i++)
-        Buffer_AppendCharUnchecked(enc, ' ');
+  // Writes `value` repetitions of the indent unit (indentLength bytes each) without reserving space.
+  const int useSpaces = (enc->indentIsSpace == 1);
+  // Nothing is written unless indentation has been requested.
+  if (!enc->indentEnabled)
+  {
+    return;
+  }
+  for (; value > 0; value--)
+  {
+    for (i = 0; i < enc->indentLength; i++)
+    {
+      // indentIsSpace == 1 selects literal spaces; otherwise the bytes of indentChars are copied.
+      Buffer_AppendCharUnchecked(enc, (useSpaces ? ' ' : enc->indentChars[i]));
+    }
+  }
 }
 
 static void Buffer_AppendLongUnchecked(JSONObjectEncoder *enc, JSINT64 value)
@@ -655,7 +668,7 @@ static void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t c
     Buffer_AppendCharUnchecked(enc, '\"');
 
     Buffer_AppendCharUnchecked (enc, ':');
-    if (enc->indent)
+    if (enc->indentEnabled)
     {
       Buffer_AppendCharUnchecked (enc, ' ');
     }
@@ -698,7 +711,7 @@ static void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t c
       while (enc->iterNext(obj, &tc))
       {
         // The extra 2 bytes cover the comma and (optional) newline.
-        Buffer_Reserve (enc, enc->indent * (enc->level + 1) + 2);
+        Buffer_Reserve (enc, enc->indentLength * (enc->level + 1) + 2);
 
         if (count > 0)
         {
@@ -725,7 +738,7 @@ static void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t c
 
       if (count > 0) {
         // Reserve space for the indentation plus the newline.
-        Buffer_Reserve (enc, enc->indent * enc->level + 1);
+        Buffer_Reserve (enc, enc->indentLength * enc->level + 1);
         Buffer_AppendIndentNewlineUnchecked (enc);
         Buffer_AppendIndentUnchecked (enc, enc->level);
       }
@@ -743,7 +756,7 @@ static void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t c
       while ((res = enc->iterNext(obj, &tc)))
       {
         // The extra 2 bytes cover the comma and optional newline.
-        Buffer_Reserve (enc, enc->indent * (enc->level + 1) + 2);
+        Buffer_Reserve (enc, enc->indentLength * (enc->level + 1) + 2);
 
         if(res < 0)
         {
@@ -778,7 +791,7 @@ static void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t c
       enc->iterEnd(obj, &tc);
 
       if (count > 0) {
-        Buffer_Reserve (enc, enc->indent * enc->level + 1);
+        Buffer_Reserve (enc, enc->indentLength * enc->level + 1);
         Buffer_AppendIndentNewlineUnchecked (enc);
         Buffer_AppendIndentUnchecked (enc, enc->level);
       }
@@ -905,7 +918,7 @@ static void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t c
   enc->level--;
 }
 
-char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *_buffer, size_t _cbBuffer)
+char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *_buffer, size_t _cbBuffer, size_t *_outLen)
 {
   enc->malloc = enc->malloc ? enc->malloc : malloc;
   enc->free =  enc->free ? enc->free : free;
@@ -941,12 +954,11 @@ char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *_buffer, size_t
 
   encode (obj, enc, NULL, 0);
 
-  Buffer_Reserve(enc, 1);
   if (enc->errorMsg)
   {
     return NULL;
   }
-  Buffer_AppendCharUnchecked(enc, '\0');
 
+  *_outLen = enc->offset - enc->start;
   return enc->start;
 }
diff --git a/python/objToJSON.c b/python/objToJSON.c
@@ -114,29 +114,39 @@ static void *PyStringToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, siz
   return PyBytes_AsString(obj);
 }
 
-static void *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen)
+static char *PyUnicodeToUTF8Raw(JSOBJ _obj, size_t *_outLen, PyObject **pBytesObj)
 {
+  /*
+  Converts the PyUnicode object to a UTF-8 char* whose size is stored in _outLen.
+  Compact ASCII strings are returned directly and leave *pBytesObj untouched. Otherwise an
+  intermediate PyBytes object is created: the returned char* is its internal buffer, and the new
+  reference is stored in *pBytesObj so it can be DECREF'd once the buffer is no longer needed.
+  */
   PyObject *obj = (PyObject *) _obj;
-  PyObject *newObj;
+
 #ifndef Py_LIMITED_API
   if (PyUnicode_IS_COMPACT_ASCII(obj))
   {
     Py_ssize_t len;
-    char *data = PyUnicode_AsUTF8AndSize(obj, &len);
+    const char *data = PyUnicode_AsUTF8AndSize(obj, &len);
     *_outLen = len;
     return data;
   }
 #endif
-  newObj = PyUnicode_AsUTF8String(obj);
-  if(!newObj)
+  // Hand the new reference out through *pBytesObj: assigning to a by-value parameter would lose (leak) it.
+  *pBytesObj = PyUnicode_AsEncodedString (obj, "utf-8", "surrogatepass");
+  if (!*pBytesObj)
   {
     return NULL;
   }
 
-  GET_TC(tc)->newObj = newObj;
+  *_outLen = PyBytes_Size(*pBytesObj);
+  return PyBytes_AsString(*pBytesObj);
+}
 
-  *_outLen = PyBytes_Size(newObj);
-  return PyBytes_AsString(newObj);
+static void *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen)
+{
+  return PyUnicodeToUTF8Raw(_obj, _outLen, &(GET_TC(tc)->newObj));
 }
 
 static void *PyRawJSONToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen)
@@ -240,7 +250,7 @@ static int Dict_iterNext(JSOBJ obj, JSONTypeContext *tc)
   if (PyUnicode_Check(GET_TC(tc)->itemName))
   {
     itemNameTmp = GET_TC(tc)->itemName;
-    GET_TC(tc)->itemName = PyUnicode_AsUTF8String (GET_TC(tc)->itemName);
+    GET_TC(tc)->itemName = PyUnicode_AsEncodedString (GET_TC(tc)->itemName, "utf-8", "surrogatepass");
     Py_DECREF(itemNameTmp);
   }
   else
@@ -263,7 +273,7 @@ static int Dict_iterNext(JSOBJ obj, JSONTypeContext *tc)
       return -1;
     }
     itemNameTmp = GET_TC(tc)->itemName;
-    GET_TC(tc)->itemName = PyUnicode_AsUTF8String (GET_TC(tc)->itemName);
+    GET_TC(tc)->itemName = PyUnicode_AsEncodedString (GET_TC(tc)->itemName, "utf-8", "surrogatepass");
     Py_DECREF(itemNameTmp);
   }
   PRINTMARK();
@@ -332,7 +342,7 @@ static int SortedDict_iterNext(JSOBJ obj, JSONTypeContext *tc)
       // Subject the key to the same type restrictions and conversions as in Dict_iterGetValue.
       if (PyUnicode_Check(key))
       {
-        key = PyUnicode_AsUTF8String(key);
+        key = PyUnicode_AsEncodedString(key, "utf-8", "surrogatepass");
       }
       else if (!PyBytes_Check(key))
       {
@@ -342,7 +352,7 @@ static int SortedDict_iterNext(JSOBJ obj, JSONTypeContext *tc)
           goto error;
         }
         keyTmp = key;
-        key = PyUnicode_AsUTF8String(key);
+        key = PyUnicode_AsEncodedString(key, "utf-8", "surrogatepass");
         Py_DECREF(keyTmp);
       }
       else
@@ -754,6 +764,32 @@ static char *Object_iterGetName(JSOBJ obj, JSONTypeContext *tc, size_t *outLen)
   return GET_TC(tc)->iterGetName(obj, tc, outLen);
 }
 
+
+static const char *_PyUnicodeToChars(PyObject *obj, int *_outLen)
+{
+  // Indent-only helper: returns the UTF-8 bytes of obj and stores the byte count in *_outLen; on failure returns NULL with *_outLen == -1.
+  // NOTE(review): on the non-ASCII path the result is newObj's internal buffer and newObj is never DECREF'd here - confirm who should own it.
+  PyObject *newObj;
+#ifndef Py_LIMITED_API
+  if (PyUnicode_IS_COMPACT_ASCII(obj))
+  {
+    Py_ssize_t len = 0;
+    const char *data = PyUnicode_AsUTF8AndSize(obj, &len);
+    *_outLen = data ? (int) len : -1;
+    return data;
+  }
+#endif
+  newObj = PyUnicode_AsEncodedString(obj, "utf-8", "surrogatepass");
+  if(!newObj)
+  {
+    *_outLen = -1;
+    return NULL;
+  }
+
+  *_outLen = (int) PyBytes_Size(newObj);
+  return PyBytes_AsString(newObj);
+}
+
 PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs)
 {
   static char *kwlist[] = { "obj", "ensure_ascii", "encode_html_chars", "escape_forward_slashes", "sort_keys", "indent", "allow_nan", "reject_bytes", "default", NULL };
@@ -768,8 +804,10 @@ PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs)
   PyObject *oescapeForwardSlashes = NULL;
   PyObject *osortKeys = NULL;
   PyObject *odefaultFn = NULL;
+  PyObject *oindent = NULL;
   int allowNan = -1;
   int orejectBytes = -1;
+  size_t retLen;
 
   JSONObjectEncoder encoder =
   {
@@ -792,7 +830,10 @@ PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs)
     0, //encodeHTMLChars
     1, //escapeForwardSlashes
     0, //sortKeys
-    0, //indent
+    0, //indentLength
+    NULL, //indentChars
+    0, // indentIsSpace
+    0, // indentEnabled
     1, //allowNan
     1, //rejectBytes
     NULL, //prv
@@ -801,7 +842,7 @@ PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs)
 
   PRINTMARK();
 
-  if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OOOOiiiO", kwlist, &oinput, &oensureAscii, &oencodeHTMLChars, &oescapeForwardSlashes, &osortKeys, &encoder.indent, &allowNan, &orejectBytes, &odefaultFn))
+  if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O|OOOOOiiO", kwlist, &oinput, &oensureAscii, &oencodeHTMLChars, &oescapeForwardSlashes, &osortKeys, &oindent, &allowNan, &orejectBytes, &odefaultFn))
   {
     return NULL;
   }
@@ -826,6 +867,47 @@ PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs)
     encoder.sortKeys = 1;
   }
 
+  if (oindent != NULL)
+  {
+    // Handle multiple input types
+    if (oindent == Py_None)
+    {
+        // Case where the indent is specified as None
+        // This should be exactly the same as if oindent is NULL
+        encoder.indentLength = 0;
+    }
+    else if (PyLong_Check(oindent))
+    {
+        // Case where the indent is specified as an integer
+        // In this case the indent characters should only be
+        // space chars - i.e. chr(32)
+        encoder.indentLength = PyLong_AsLong(oindent);
+        encoder.indentIsSpace = 1;
+        encoder.indentEnabled = 1;
+        if (encoder.indentLength < 0)
+        {
+            encoder.indentLength = 0;
+        }
+    }
+    else if (PyUnicode_Check(oindent))
+    {
+        // Case where custom UTF-8 indent is specified.
+        encoder.indentLength = -1; // set to -1 to indicate an error
+        encoder.indentChars = _PyUnicodeToChars(oindent, &encoder.indentLength);
+        encoder.indentEnabled = 1;
+        if(encoder.indentChars == NULL && encoder.indentLength == -1)
+        {
+            PyErr_SetString(PyExc_ValueError, "malformed indent");
+            return NULL;
+        }
+    }
+    else
+    {
+        PyErr_Format (PyExc_TypeError, "expected integer, None, or str indent");
+        return NULL;
+    }
+  }
+
   if (allowNan != -1)
   {
     encoder.allowNan = allowNan;
@@ -853,7 +935,7 @@ PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs)
                  csInf, csNan, 'e', DCONV_DECIMAL_IN_SHORTEST_LOW, DCONV_DECIMAL_IN_SHORTEST_HIGH, 0, 0);
 
   PRINTMARK();
-  ret = JSON_EncodeObject (oinput, &encoder, buffer, sizeof (buffer));
+  ret = JSON_EncodeObject (oinput, &encoder, buffer, sizeof (buffer), &retLen);
   PRINTMARK();
 
   dconv_d2s_free(&encoder.d2s);
@@ -874,15 +956,14 @@ PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs)
     return NULL;
   }
 
-  newobj = PyUnicode_FromString (ret);
+  newobj = PyUnicode_DecodeUTF8(ret, retLen, "surrogatepass");
 
   if (ret != buffer)
   {
     encoder.free (ret);
   }
 
   PRINTMARK();
-
   return newobj;
 }