Skip to content

Commit

Permalink
pythongh-119182: Add PyUnicodeWriter_DecodeUTF8Stateful()
Browse files Browse the repository at this point in the history
Add PyUnicodeWriter_WriteWideChar() and
PyUnicodeWriter_DecodeUTF8Stateful() functions.
  • Loading branch information
vstinner committed Jun 17, 2024
1 parent 5c4235c commit 8aa73b7
Show file tree
Hide file tree
Showing 5 changed files with 209 additions and 3 deletions.
33 changes: 30 additions & 3 deletions Doc/c-api/unicode.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1551,9 +1551,17 @@ object.
On success, return ``0``.
On error, set an exception, leave the writer unchanged, and return ``-1``.
To use a different error handler than ``strict``,
:c:func:`PyUnicode_DecodeUTF8` can be used with
:c:func:`PyUnicodeWriter_WriteStr`.
See also :c:func:`PyUnicodeWriter_DecodeUTF8Stateful`.
.. c:function:: int PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *writer, wchar_t *str, Py_ssize_t size)
Write the wide string *str* into *writer*.
*size* is a number of wide characters. If *size* is equal to ``-1``, call
``wcslen(str)`` to get the string length.
On success, return ``0``.
On error, set an exception, leave the writer unchanged, and return ``-1``.
.. c:function:: int PyUnicodeWriter_WriteStr(PyUnicodeWriter *writer, PyObject *obj)
Expand Down Expand Up @@ -1586,3 +1594,22 @@ object.
On success, return ``0``.
On error, set an exception, leave the writer unchanged, and return ``-1``.
.. c:function:: int PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer, const char *string, Py_ssize_t length, const char *errors, Py_ssize_t *consumed)
Decode the string *string* from UTF-8 with the *errors* error handler and write
the output into *writer*.
*length* is the string length in bytes. If *length* is equal to ``-1``, call
``strlen(string)`` to get the string length.
*errors* is an error handler name, such as ``"replace"``. If *errors* is
``NULL``, use the strict error handler.
If *consumed* is not ``NULL``, set *\*consumed* to the number of decoded
bytes on success, or to ``0`` on error.
On success, return ``0``.
On error, set an exception, leave the writer unchanged, and return ``-1``.
See also :c:func:`PyUnicodeWriter_WriteUTF8`.
2 changes: 2 additions & 0 deletions Doc/whatsnew/3.14.rst
Original file line number Diff line number Diff line change
Expand Up @@ -291,10 +291,12 @@ New Features
* :c:func:`PyUnicodeWriter_Finish`.
* :c:func:`PyUnicodeWriter_WriteChar`.
* :c:func:`PyUnicodeWriter_WriteUTF8`.
* :c:func:`PyUnicodeWriter_WriteWideChar`.
* :c:func:`PyUnicodeWriter_WriteStr`.
* :c:func:`PyUnicodeWriter_WriteRepr`.
* :c:func:`PyUnicodeWriter_WriteSubstring`.
* :c:func:`PyUnicodeWriter_Format`.
* :c:func:`PyUnicodeWriter_DecodeUTF8Stateful`.

(Contributed by Victor Stinner in :gh:`119182`.)

Expand Down
10 changes: 10 additions & 0 deletions Include/cpython/unicodeobject.h
Original file line number Diff line number Diff line change
Expand Up @@ -459,6 +459,10 @@ PyAPI_FUNC(int) PyUnicodeWriter_WriteUTF8(
PyUnicodeWriter *writer,
const char *str,
Py_ssize_t size);
/* Write the wide character string 'str' into 'writer'.

   'size' is a number of wide characters; if it is negative, the length is
   computed with wcslen(str).

   Return 0 on success. Return -1 with an exception set on error. */
PyAPI_FUNC(int) PyUnicodeWriter_WriteWideChar(
PyUnicodeWriter *writer,
wchar_t *str,
Py_ssize_t size);

PyAPI_FUNC(int) PyUnicodeWriter_WriteStr(
PyUnicodeWriter *writer,
Expand All @@ -475,6 +479,12 @@ PyAPI_FUNC(int) PyUnicodeWriter_Format(
PyUnicodeWriter *writer,
const char *format,
...);
/* Decode the UTF-8 encoded 'string' using the 'errors' error handler and
   write the decoded text into 'writer'.

   If 'length' is negative, strlen(string) is used as the length.
   If 'consumed' is not NULL, it is reset to 0 when the call fails.

   Return 0 on success. Return -1 on error, with the writer position
   restored to its value before the call. */
PyAPI_FUNC(int) PyUnicodeWriter_DecodeUTF8Stateful(
PyUnicodeWriter *writer,
const char *string, /* UTF-8 encoded string */
Py_ssize_t length, /* size of string */
const char *errors, /* error handling */
Py_ssize_t *consumed); /* bytes consumed */


/* --- Private _PyUnicodeWriter API --------------------------------------- */
Expand Down
121 changes: 121 additions & 0 deletions Modules/_testcapi/unicode.c
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,88 @@ test_unicodewriter_recover_error(PyObject *self, PyObject *Py_UNUSED(args))
}


/* Test PyUnicodeWriter_DecodeUTF8Stateful() with the "ignore" and "replace"
   error handlers, and check the text accumulated in the writer.

   Return None on success, or NULL with an exception set on failure. */
static PyObject *
test_unicodewriter_decode_utf8(PyObject *self, PyObject *Py_UNUSED(args))
{
    PyUnicodeWriter *writer = PyUnicodeWriter_Create(0);
    if (writer == NULL) {
        return NULL;
    }

    // The three writes run in order; the first failure short-circuits the
    // chain and the partially filled writer is discarded.
    if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "ign\xFFore", -1, "ignore", NULL) < 0
        || PyUnicodeWriter_WriteChar(writer, '-') < 0
        || PyUnicodeWriter_DecodeUTF8Stateful(writer, "replace\xFF", -1, "replace", NULL) < 0)
    {
        PyUnicodeWriter_Discard(writer);
        return NULL;
    }

    PyObject *text = PyUnicodeWriter_Finish(writer);
    if (text == NULL) {
        return NULL;
    }
    // "ignore" drops the invalid 0xFF byte; "replace" turns it into U+FFFD
    // (EF BF BD once encoded to UTF-8).
    assert(PyUnicode_EqualToUTF8(text, "ignore-replace\xef\xbf\xbd"));
    Py_DECREF(text);

    Py_RETURN_NONE;
}


/* Test the 'consumed' output argument of PyUnicodeWriter_DecodeUTF8Stateful():
   - it receives the number of decoded bytes on success,
   - it is reset to 0 when decoding fails,
   - bytes skipped by the "ignore" error handler count as consumed.

   Return None on success, or NULL with an exception set on failure. */
static PyObject *
test_unicodewriter_decode_utf8_consumed(PyObject *self, PyObject *Py_UNUSED(args))
{
    PyUnicodeWriter *writer = PyUnicodeWriter_Create(0);
    if (writer == NULL) {
        return NULL;
    }
    Py_ssize_t consumed;
    int res;

    // valid string: 'consumed' starts at a sentinel value so that the
    // assertion proves the function really wrote to it.
    consumed = 12345;
    if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "text", -1, NULL, &consumed) < 0) {
        goto error;
    }
    assert(consumed == 4);

    if (PyUnicodeWriter_WriteChar(writer, '-') < 0) {
        goto error;
    }

    // consumed is 0 if write fails.
    // The call is made outside of assert(): an expression inside assert() is
    // not evaluated at all when NDEBUG is defined, which would silently skip
    // the call under test.
    consumed = 12345;
    res = PyUnicodeWriter_DecodeUTF8Stateful(writer, "invalid\xFF", -1,
                                             NULL, &consumed);
    assert(res < 0);
    (void)res;  // avoid an unused-variable warning when assert() is a no-op
    PyErr_Clear();
    assert(consumed == 0);

    // ignore error handler: the invalid byte is skipped but still consumed
    consumed = 12345;
    if (PyUnicodeWriter_DecodeUTF8Stateful(writer, "more\xFF", -1, "ignore", &consumed) < 0) {
        goto error;
    }
    assert(consumed == 5);

    PyObject *result = PyUnicodeWriter_Finish(writer);
    if (result == NULL) {
        return NULL;
    }
    // Nothing from the failed "invalid\xFF" write appears in the result.
    assert(PyUnicode_EqualToUTF8(result, "text-more"));
    Py_DECREF(result);

    Py_RETURN_NONE;

error:
    PyUnicodeWriter_Discard(writer);
    return NULL;
}


static PyObject *
test_unicodewriter_format(PyObject *self, PyObject *Py_UNUSED(args))
{
Expand Down Expand Up @@ -436,6 +518,42 @@ test_unicodewriter_format_recover_error(PyObject *self, PyObject *Py_UNUSED(args
}


/* Test PyUnicodeWriter_WriteWideChar() with an explicit size, a
   single-character string and a size of -1.

   Return None on success, or NULL with an exception set on failure. */
static PyObject *
test_unicodewriter_widechar(PyObject *self, PyObject *Py_UNUSED(args))
{
    PyUnicodeWriter *writer = PyUnicodeWriter_Create(0);
    if (writer == NULL) {
        return NULL;
    }

    // Writes run in order; the first failure short-circuits the chain.
    // The first call passes size=8, so " IGNORED" must not be written.
    // The third call passes size=-1, so the whole string is written.
    if (PyUnicodeWriter_WriteWideChar(writer, L"latin1=\xE9 IGNORED", 8) < 0
        || PyUnicodeWriter_WriteWideChar(writer, L"-", 1) < 0
        || PyUnicodeWriter_WriteWideChar(writer, L"euro=\u20AC", -1) < 0
        || PyUnicodeWriter_WriteChar(writer, '.') < 0)
    {
        PyUnicodeWriter_Discard(writer);
        return NULL;
    }

    PyObject *text = PyUnicodeWriter_Finish(writer);
    if (text == NULL) {
        return NULL;
    }
    assert(PyUnicode_EqualToUTF8(text,
                                 "latin1=\xC3\xA9-euro=\xE2\x82\xAC."));
    Py_DECREF(text);

    Py_RETURN_NONE;
}


static PyMethodDef TestMethods[] = {
{"unicode_new", unicode_new, METH_VARARGS},
{"unicode_fill", unicode_fill, METH_VARARGS},
Expand All @@ -448,8 +566,11 @@ static PyMethodDef TestMethods[] = {
{"test_unicodewriter_utf8", test_unicodewriter_utf8, METH_NOARGS},
{"test_unicodewriter_invalid_utf8", test_unicodewriter_invalid_utf8, METH_NOARGS},
{"test_unicodewriter_recover_error", test_unicodewriter_recover_error, METH_NOARGS},
{"test_unicodewriter_decode_utf8", test_unicodewriter_decode_utf8, METH_NOARGS},
{"test_unicodewriter_decode_utf8_consumed", test_unicodewriter_decode_utf8_consumed, METH_NOARGS},
{"test_unicodewriter_format", test_unicodewriter_format, METH_NOARGS},
{"test_unicodewriter_format_recover_error", test_unicodewriter_format_recover_error, METH_NOARGS},
{"test_unicodewriter_widechar", test_unicodewriter_widechar, METH_NOARGS},
{NULL},
};

Expand Down
46 changes: 46 additions & 0 deletions Objects/unicodeobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -13500,6 +13500,52 @@ PyUnicodeWriter_WriteUTF8(PyUnicodeWriter *writer,
return res;
}


/* Decode the UTF-8 encoded 'string' with the 'errors' error handler and
   write the result into 'writer'.

   A negative 'length' means that 'string' is NUL-terminated and its length
   is computed with strlen().

   Return the result of unicode_decode_utf8_writer(). If it is negative, the
   writer position is restored to its value before the call and, if
   'consumed' is not NULL, *consumed is set to 0. */
int
PyUnicodeWriter_DecodeUTF8Stateful(PyUnicodeWriter *writer,
                                   const char *string,
                                   Py_ssize_t length,
                                   const char *errors,
                                   Py_ssize_t *consumed)
{
    _PyUnicodeWriter *impl = (_PyUnicodeWriter *)writer;
    const Py_ssize_t nbytes = (length < 0) ? (Py_ssize_t)strlen(string)
                                           : length;
    // Remember the write position so that a failed decode can be rolled back.
    const Py_ssize_t saved_pos = impl->pos;

    // NOTE(review): _Py_ERROR_UNKNOWN presumably lets the helper derive the
    // handler from the 'errors' name -- confirm against the helper.
    int res = unicode_decode_utf8_writer(impl, string, nbytes,
                                         _Py_ERROR_UNKNOWN, errors, consumed);
    if (res >= 0) {
        return res;
    }

    // Failure: hide any partially written output and report nothing consumed.
    impl->pos = saved_pos;
    if (consumed != NULL) {
        *consumed = 0;
    }
    return res;
}


/* Write the wide character string 'str' into 'writer'.

   'size' is a number of wide characters; a negative value means that the
   length is computed with wcslen(str).

   The wide string is first converted to a temporary str object with
   PyUnicode_FromWideChar(), which is then written with
   _PyUnicodeWriter_WriteStr().

   Return -1 if the conversion fails, otherwise return the result of
   _PyUnicodeWriter_WriteStr(). */
int
PyUnicodeWriter_WriteWideChar(PyUnicodeWriter *writer,
                              wchar_t *str,
                              Py_ssize_t size)
{
    const Py_ssize_t nchars = (size < 0) ? (Py_ssize_t)wcslen(str) : size;

    PyObject *tmp = PyUnicode_FromWideChar(str, nchars);
    if (tmp == NULL) {
        return -1;
    }

    int res = _PyUnicodeWriter_WriteStr((_PyUnicodeWriter *)writer, tmp);
    // The temporary object is no longer needed after the write call.
    Py_DECREF(tmp);
    return res;
}


int
_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
const char *str, Py_ssize_t len)
Expand Down

0 comments on commit 8aa73b7

Please sign in to comment.