From 5b489fc57c08b5e2b8344f116d2fd89afbe5d173 Mon Sep 17 00:00:00 2001 From: Serhiy Storchaka Date: Tue, 20 Feb 2024 18:09:50 +0200 Subject: [PATCH] gh-115712: Support CSV dialects with delimiter=' ' and skipinitialspace=True (GH-115721) Restore support of such combination, disabled in gh-113796. csv.writer() now quotes empty fields if delimiter is a space and skipinitialspace is true and raises exception if quoting is not possible. --- Lib/test/test_csv.py | 67 ++++++++++++++++--- ...-02-20-16-42-54.gh-issue-115712.EXVMXw.rst | 4 ++ Modules/_csv.c | 36 ++++++++-- 3 files changed, 90 insertions(+), 17 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2024-02-20-16-42-54.gh-issue-115712.EXVMXw.rst diff --git a/Lib/test/test_csv.py b/Lib/test/test_csv.py index 21a4cb586ff6658..5217f2a71ec0f96 100644 --- a/Lib/test/test_csv.py +++ b/Lib/test/test_csv.py @@ -64,8 +64,7 @@ def _test_arg_valid(self, ctor, arg): ctor(arg, delimiter='\t', skipinitialspace=True) ctor(arg, escapechar='\t', skipinitialspace=True) ctor(arg, quotechar='\t', skipinitialspace=True) - self.assertRaises(ValueError, ctor, arg, - delimiter=' ', skipinitialspace=True) + ctor(arg, delimiter=' ', skipinitialspace=True) self.assertRaises(ValueError, ctor, arg, escapechar=' ', skipinitialspace=True) self.assertRaises(ValueError, ctor, arg, @@ -192,9 +191,6 @@ def _write_error_test(self, exc, fields, **kwargs): def test_write_arg_valid(self): self._write_error_test(csv.Error, None) - self._write_test((), '') - self._write_test([None], '""') - self._write_error_test(csv.Error, [None], quoting = csv.QUOTE_NONE) # Check that exceptions are passed up the chain self._write_error_test(OSError, BadIterable()) class BadList: @@ -208,7 +204,6 @@ class BadItem: def __str__(self): raise OSError self._write_error_test(OSError, [BadItem()]) - def test_write_bigfield(self): # This exercises the buffer realloc functionality bigstring = 'X' * 50000 @@ -315,6 +310,49 @@ def test_writerows_with_none(self): fileobj.seek(0) self.assertEqual(fileobj.read(), 'a\r\n""\r\n') + + def test_write_empty_fields(self): + self._write_test((), '') + self._write_test([''], '""') + self._write_error_test(csv.Error, [''], quoting=csv.QUOTE_NONE) + self._write_test([''], '""', quoting=csv.QUOTE_STRINGS) + self._write_test([''], '""', quoting=csv.QUOTE_NOTNULL) + self._write_test([None], '""') + self._write_error_test(csv.Error, [None], quoting=csv.QUOTE_NONE) + self._write_error_test(csv.Error, [None], quoting=csv.QUOTE_STRINGS) + self._write_error_test(csv.Error, [None], quoting=csv.QUOTE_NOTNULL) + self._write_test(['', ''], ',') + self._write_test([None, None], ',') + + def test_write_empty_fields_space_delimiter(self): + self._write_test([''], '""', delimiter=' ', skipinitialspace=False) + self._write_test([''], '""', delimiter=' ', skipinitialspace=True) + self._write_test([None], '""', delimiter=' ', skipinitialspace=False) + self._write_test([None], '""', delimiter=' ', skipinitialspace=True) + + self._write_test(['', ''], ' ', delimiter=' ', skipinitialspace=False) + self._write_test(['', ''], '"" ""', delimiter=' ', skipinitialspace=True) + self._write_test([None, None], ' ', delimiter=' ', skipinitialspace=False) + self._write_test([None, None], '"" ""', delimiter=' ', skipinitialspace=True) + + self._write_test(['', ''], ' ', delimiter=' ', skipinitialspace=False, + quoting=csv.QUOTE_NONE) + self._write_error_test(csv.Error, ['', ''], + delimiter=' ', skipinitialspace=True, + quoting=csv.QUOTE_NONE) + for quoting in csv.QUOTE_STRINGS, csv.QUOTE_NOTNULL: + self._write_test(['', ''], '"" ""', delimiter=' ', skipinitialspace=False, + quoting=quoting) + self._write_test(['', ''], '"" ""', delimiter=' ', skipinitialspace=True, + quoting=quoting) + + for quoting in csv.QUOTE_NONE, csv.QUOTE_STRINGS, csv.QUOTE_NOTNULL: + self._write_test([None, None], ' ', delimiter=' ', skipinitialspace=False, + quoting=quoting) + self._write_error_test(csv.Error, [None, None], + delimiter=' ', skipinitialspace=True, + quoting=quoting) + def test_writerows_errors(self): with TemporaryFile("w+", encoding="utf-8", newline='') as fileobj: writer = csv.writer(fileobj) @@ -429,6 +467,14 @@ def test_read_skipinitialspace(self): [[None, None, None]], skipinitialspace=True, quoting=csv.QUOTE_STRINGS) + def test_read_space_delimiter(self): + self._read_test(['a b', ' a ', ' ', ''], + [['a', '', '', 'b'], ['', '', 'a', '', ''], ['', '', ''], []], + delimiter=' ', skipinitialspace=False) + self._read_test(['a b', ' a ', ' ', ''], + [['a', 'b'], ['a', ''], [''], []], + delimiter=' ', skipinitialspace=True) + def test_read_bigfield(self): # This exercises the buffer realloc functionality and field size # limits. @@ -555,10 +601,10 @@ class space(csv.excel): escapechar = "\\" with TemporaryFile("w+", encoding="utf-8") as fileobj: - fileobj.write("abc def\nc1ccccc1 benzene\n") + fileobj.write("abc def\nc1ccccc1 benzene\n") fileobj.seek(0) reader = csv.reader(fileobj, dialect=space()) - self.assertEqual(next(reader), ["abc", "def"]) + self.assertEqual(next(reader), ["abc", "", "", "def"]) self.assertEqual(next(reader), ["c1ccccc1", "benzene"]) def compare_dialect_123(self, expected, *writeargs, **kwwriteargs): @@ -1164,8 +1210,9 @@ class mydialect(csv.Dialect): self.assertRaises(csv.Error, create_invalid, field_name, 5) self.assertRaises(ValueError, create_invalid, field_name, "\n") self.assertRaises(ValueError, create_invalid, field_name, "\r") - self.assertRaises(ValueError, create_invalid, field_name, " ", - skipinitialspace=True) + if field_name != "delimiter": + self.assertRaises(ValueError, create_invalid, field_name, " ", + skipinitialspace=True) class TestSniffer(unittest.TestCase): diff --git a/Misc/NEWS.d/next/Library/2024-02-20-16-42-54.gh-issue-115712.EXVMXw.rst b/Misc/NEWS.d/next/Library/2024-02-20-16-42-54.gh-issue-115712.EXVMXw.rst new file mode 100644 index 000000000000000..8b19064dba779d2 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-02-20-16-42-54.gh-issue-115712.EXVMXw.rst @@ -0,0 +1,4 @@ +Restore support of space delimiter with ``skipinitialspace=True`` in +:mod:`csv`. :func:`csv.writer()` now quotes empty fields if delimiter is a +space and skipinitialspace is true and raises exception if quoting is not +possible. diff --git a/Modules/_csv.c b/Modules/_csv.c index 3aa648b8e9cec44..8d0472885afd96d 100644 --- a/Modules/_csv.c +++ b/Modules/_csv.c @@ -332,9 +332,9 @@ dialect_check_quoting(int quoting) } static int -dialect_check_char(const char *name, Py_UCS4 c, DialectObj *dialect) +dialect_check_char(const char *name, Py_UCS4 c, DialectObj *dialect, bool allowspace) { - if (c == '\r' || c == '\n' || (dialect->skipinitialspace && c == ' ')) { + if (c == '\r' || c == '\n' || (c == ' ' && !allowspace)) { PyErr_Format(PyExc_ValueError, "bad %s value", name); return -1; } @@ -535,9 +535,11 @@ dialect_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) PyErr_SetString(PyExc_TypeError, "lineterminator must be set"); goto err; } - if (dialect_check_char("delimiter", self->delimiter, self) || - dialect_check_char("escapechar", self->escapechar, self) || - dialect_check_char("quotechar", self->quotechar, self) || + if (dialect_check_char("delimiter", self->delimiter, self, true) || + dialect_check_char("escapechar", self->escapechar, self, + !self->skipinitialspace) || + dialect_check_char("quotechar", self->quotechar, self, + !self->skipinitialspace) || dialect_check_chars("delimiter", "escapechar", self->delimiter, self->escapechar) || dialect_check_chars("delimiter", "quotechar", @@ -1221,6 +1223,7 @@ join_check_rec_size(WriterObj *self, Py_ssize_t rec_len) static int join_append(WriterObj *self, PyObject *field, int quoted) { + DialectObj *dialect = self->dialect; int field_kind = -1; const void *field_data = NULL; Py_ssize_t field_len = 0; @@ -1231,6 +1234,19 @@ join_append(WriterObj *self, PyObject *field, int quoted) field_data = PyUnicode_DATA(field); field_len = PyUnicode_GET_LENGTH(field); } + if (!field_len && dialect->delimiter == ' ' && dialect->skipinitialspace) { + if (dialect->quoting == QUOTE_NONE || + (field == NULL && + (dialect->quoting == QUOTE_STRINGS || + dialect->quoting == QUOTE_NOTNULL))) + { + PyErr_Format(self->error_obj, + "empty field must be quoted if delimiter is a space " + "and skipinitialspace is true"); + return 0; + } + quoted = 1; + } rec_len = join_append_data(self, field_kind, field_data, field_len, "ed, 0); if (rec_len < 0) @@ -1282,6 +1298,7 @@ csv_writerow(WriterObj *self, PyObject *seq) { DialectObj *dialect = self->dialect; PyObject *iter, *field, *line, *result; + bool null_field = false; iter = PyObject_GetIter(seq); if (iter == NULL) { @@ -1318,11 +1335,12 @@ csv_writerow(WriterObj *self, PyObject *seq) break; } + null_field = (field == Py_None); if (PyUnicode_Check(field)) { append_ok = join_append(self, field, quoted); Py_DECREF(field); } - else if (field == Py_None) { + else if (null_field) { append_ok = join_append(self, NULL, quoted); Py_DECREF(field); } @@ -1348,7 +1366,11 @@ csv_writerow(WriterObj *self, PyObject *seq) return NULL; if (self->num_fields > 0 && self->rec_len == 0) { - if (dialect->quoting == QUOTE_NONE) { + if (dialect->quoting == QUOTE_NONE || + (null_field && + (dialect->quoting == QUOTE_STRINGS || + dialect->quoting == QUOTE_NOTNULL))) + { PyErr_Format(self->error_obj, "single empty field record must be quoted"); return NULL;