csvclean: Use standard output and standard error, and use exit code 1…

… if errors #781 #195
wireservice · Apr 27, 2024 · 8d700e8 · 8d700e8
1 parent d00ea20
commit 8d700e8
Show file tree

Hide file tree

Showing 6 changed files with 103 additions and 145 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -1,3 +1,10 @@
+2.0.0 - Unreleased
+------------------
+
+**BACKWARDS-INCOMPATIBLE CHANGES**
+
+* :doc:`/scripts/csvclean` now writes its output to standard output and its errors to standard error, instead of to ``basename_out.csv`` and ``basename_err.csv`` files. Consequently, it no longer supports a :code:`--dry-run` flag to output summary information like ``No errors.``, ``42 errors logged to basename_err.csv`` or ``42 rows were joined/reduced to 24 rows after eliminating expected internal line breaks.``.
+
 1.5.0 - March 28, 2024
 ----------------------
 

diff --git a/csvkit/cleanup.py b/csvkit/cleanup.py
@@ -5,8 +5,10 @@
 
 def join_rows(rows, joiner=' '):
     """
-    Given a series of rows, return them as a single row where the inner edge cells are merged. By default joins with a
-    single space character, but you can specify new-line, empty string, or anything else with the 'joiner' kwarg.
+    Given a series of rows, return them as a single row where the inner edge cells are merged.
+
+    :param joiner:
+        The separator between cells, a single space by default.
     """
     rows = list(rows)
     fixed_row = rows[0][:]
@@ -33,8 +35,6 @@ def __init__(self, reader):
         except StopIteration:
             self.column_names = []
         self.errors = []
-        self.rows_joined = 0
-        self.joins = 0
 
     def checked_rows(self):
         """
@@ -69,9 +69,6 @@ def checked_rows(self):
                             break
 
                         if len(fixed_row) == length:
-                            self.rows_joined += len(joinable_row_errors)
-                            self.joins += 1
-
                             yield fixed_row
 
                             for fixed in joinable_row_errors:

diff --git a/csvkit/cli.py b/csvkit/cli.py
@@ -68,19 +68,26 @@ class CSVKitUtility:
     epilog = ''
     override_flags = ''
 
-    def __init__(self, args=None, output_file=None):
+    def __init__(self, args=None, output_file=None, error_file=None):
         """
         Perform argument processing and other setup for a CSVKitUtility.
         """
         self._init_common_parser()
         self.add_arguments()
         self.args = self.argparser.parse_args(args)
+
         # Output file is only set during testing.
         if output_file is None:
             self.output_file = sys.stdout
         else:
             self.output_file = output_file
 
+        # Error file is only set during testing.
+        if error_file is None:
+            self.error_file = sys.stderr
+        else:
+            self.error_file = error_file
+
         self.reader_kwargs = self._extract_csv_reader_kwargs()
         self.writer_kwargs = self._extract_csv_writer_kwargs()
 

diff --git a/csvkit/utilities/csvclean.py b/csvkit/utilities/csvclean.py
@@ -1,7 +1,6 @@
 #!/usr/bin/env python
 
 import sys
-from os.path import splitext
 
 import agate
 
@@ -14,75 +13,28 @@ class CSVClean(CSVKitUtility):
     override_flags = ['L', 'blanks', 'date-format', 'datetime-format']
 
     def add_arguments(self):
-        self.argparser.add_argument(
-            '-n', '--dry-run', dest='dryrun', action='store_true',
-            help='Do not create output files. Information about what would have been done will be printed to STDERR.')
+        pass
 
     def main(self):
         if self.additional_input_expected():
             sys.stderr.write('No input file or piped data provided. Waiting for standard input:\n')
 
         reader = agate.csv.reader(self.skip_lines(), **self.reader_kwargs)
 
-        if self.args.dryrun:
-            checker = RowChecker(reader)
+        checker = RowChecker(reader)
 
-            for _row in checker.checked_rows():
-                pass
+        output_writer = agate.csv.writer(self.output_file, **self.writer_kwargs)
+        output_writer.writerow(checker.column_names)
+        for row in checker.checked_rows():
+            output_writer.writerow(row)
 
-            if checker.errors:
-                for e in checker.errors:
-                    self.output_file.write('Line %i: %s\n' % (e.line_number, e.msg))
-            else:
-                self.output_file.write('No errors.\n')
+        if checker.errors:
+            error_writer = agate.csv.writer(self.error_file, **self.writer_kwargs)
+            error_writer.writerow(['line_number', 'msg'] + checker.column_names)
+            for error in checker.errors:
+                error_writer.writerow([error.line_number, error.msg] + error.row)
 
-            if checker.joins:
-                self.output_file.write('%i rows would have been joined/reduced to %i rows after eliminating expected '
-                                       'internal line breaks.\n' % (checker.rows_joined, checker.joins))
-        else:
-            if self.input_file == sys.stdin:
-                base = 'stdin'  # "<stdin>_out.csv" is invalid on Windows
-            else:
-                base = splitext(self.input_file.name)[0]
-
-            with open(f'{base}_out.csv', 'w') as f:
-                clean_writer = agate.csv.writer(f, **self.writer_kwargs)
-
-                checker = RowChecker(reader)
-                clean_writer.writerow(checker.column_names)
-
-                for row in checker.checked_rows():
-                    clean_writer.writerow(row)
-
-            if checker.errors:
-                error_filename = f'{base}_err.csv'
-
-                with open(error_filename, 'w') as f:
-                    error_writer = agate.csv.writer(f, **self.writer_kwargs)
-
-                    error_header = ['line_number', 'msg']
-                    error_header.extend(checker.column_names)
-                    error_writer.writerow(error_header)
-
-                    error_count = len(checker.errors)
-
-                    for e in checker.errors:
-                        error_writer.writerow(self._format_error_row(e))
-
-                self.output_file.write('%i error%s logged to %s\n' % (
-                    error_count, '' if error_count == 1 else 's', error_filename))
-            else:
-                self.output_file.write('No errors.\n')
-
-            if checker.joins:
-                self.output_file.write('%i rows were joined/reduced to %i rows after eliminating expected internal '
-                                       'line breaks.\n' % (checker.rows_joined, checker.joins))
-
-    def _format_error_row(self, error):
-        row = [error.line_number, error.msg]
-        row.extend(error.row)
-
-        return row
+            sys.exit(1)
 
 
 def launch_new_instance():

diff --git a/docs/scripts/csvclean.rst b/docs/scripts/csvclean.rst
@@ -18,13 +18,13 @@ Note that every csvkit tool does the following:
 * changes the quote character to a double-quotation mark, if the character is set with the `--quotechar` (`-q`) option
 * changes the character encoding to UTF-8, if the input encoding is set with the `--encoding` (`-e`) option
 
-Outputs [basename]_out.csv and [basename]_err.csv, the former containing all valid rows and the latter containing all error rows along with line numbers and descriptions:
+All valid rows are written to standard output, and all error rows along with line numbers and descriptions are written to standard error. If there are error rows, the exit code will be 1::
 
 .. code-block:: none
 
    usage: csvclean [-h] [-d DELIMITER] [-t] [-q QUOTECHAR] [-u {0,1,2,3}] [-b]
                    [-p ESCAPECHAR] [-z FIELD_SIZE_LIMIT] [-e ENCODING] [-S] [-H]
-                   [-K SKIP_LINES] [-v] [-l] [--zero] [-V] [-n]
+                   [-K SKIP_LINES] [-v] [-l] [--zero] [-V]
                    [FILE]
 
    Fix common errors in a CSV file.
@@ -35,8 +35,6 @@ Outputs [basename]_out.csv and [basename]_err.csv, the former containing all val
 
    optional arguments:
      -h, --help            show this help message and exit
-     -n, --dry-run         Do not create output files. Information about what
-                           would have been done will be printed to STDERR.
 
 See also: :doc:`../common_arguments`.
 
@@ -47,9 +45,13 @@ Test a file with known bad rows:
 
 .. code-block:: console
 
-   $ csvclean -n examples/bad.csv
-   Line 1: Expected 3 columns, found 4 columns
-   Line 2: Expected 3 columns, found 2 columns
+   $ csvclean examples/bad.csv 2> errors.csv
+   column_a,column_b,column_c
+   0,mixed types.... uh oh,17
+   $ cat errors.csv
+   line_number,msg,column_a,column_b,column_c
+   1,"Expected 3 columns, found 4 columns",1,27,,I'm too long!
+   2,"Expected 3 columns, found 2 columns",,I'm too short!
 
 To change the line ending from line feed (LF or ``\n``) to carriage return and line feed (CRLF or ``\r\n``) use:
 

diff --git a/tests/test_utilities/test_csvclean.py b/tests/test_utilities/test_csvclean.py
@@ -3,6 +3,8 @@
 import sys
 from unittest.mock import patch
 
+import agate
+
 from csvkit.utilities.csvclean import CSVClean, launch_new_instance
 from tests.utils import CSVKitTestCase, EmptyFileTests
 
@@ -15,98 +17,89 @@ def tearDown(self):
         if os.path.isfile(output_file):
             os.remove(output_file)
 
-    def assertCleaned(self, basename, output_lines, error_lines, additional_args=[]):
-        args = [f'examples/{basename}.csv'] + additional_args
+    def assertCleaned(self, args, output_rows, error_rows=[]):
         output_file = io.StringIO()
+        error_file = io.StringIO()
 
-        utility = CSVClean(args, output_file)
-        utility.run()
+        utility = CSVClean(args, output_file, error_file)
 
-        output_file.close()
+        if error_rows:
+            with self.assertRaises(SystemExit) as e:
+                utility.run()
+
+            self.assertEqual(e.exception.code, 1)
+        else:
+            utility.run()
+
+        output_file.seek(0)
+        error_file.seek(0)
 
-        output_file = f'examples/{basename}_out.csv'
-        error_file = f'examples/{basename}_err.csv'
-
-        self.assertEqual(os.path.exists(output_file), bool(output_lines))
-        self.assertEqual(os.path.exists(error_file), bool(error_lines))
-
-        try:
-            if output_lines:
-                with open(output_file) as f:
-                    for line in output_lines:
-                        self.assertEqual(next(f), line)
-                    self.assertRaises(StopIteration, next, f)
-            if error_lines:
-                with open(error_file) as f:
-                    for line in error_lines:
-                        self.assertEqual(next(f), line)
-                    self.assertRaises(StopIteration, next, f)
-        finally:
-            if output_lines:
-                os.remove(output_file)
-            if error_lines:
-                os.remove(error_file)
+        if output_rows:
+            reader = agate.csv.reader(output_file)
+            for row in output_rows:
+                self.assertEqual(next(reader), row)
+            self.assertRaises(StopIteration, next, reader)
+        if error_rows:
+            reader = agate.csv.reader(error_file)
+            for row in error_rows:
+                self.assertEqual(next(reader), row)
+            self.assertRaises(StopIteration, next, reader)
+
+        output_file.close()
+        error_file.close()
 
     def test_launch_new_instance(self):
-        with patch.object(sys, 'argv', [self.Utility.__name__.lower(), 'examples/bad.csv']):
+        with patch.object(sys, 'argv', [self.Utility.__name__.lower(), 'examples/dummy.csv']):
             launch_new_instance()
 
     def test_skip_lines(self):
-        self.assertCleaned('bad_skip_lines', [
-            'column_a,column_b,column_c\n',
-            '0,mixed types.... uh oh,17\n',
+        self.assertCleaned(['--skip-lines', '3', 'examples/bad_skip_lines.csv'], [
+            ['column_a', 'column_b', 'column_c'],
+            ['0', 'mixed types.... uh oh', '17'],
         ], [
-            'line_number,msg,column_a,column_b,column_c\n',
-            '1,"Expected 3 columns, found 4 columns",1,27,,I\'m too long!\n',
-            '2,"Expected 3 columns, found 2 columns",,I\'m too short!\n',
-        ], ['--skip-lines', '3'])
+            ['line_number', 'msg', 'column_a', 'column_b', 'column_c'],
+            ['1', 'Expected 3 columns, found 4 columns', '1', '27', '', "I'm too long!"],
+            ['2', 'Expected 3 columns, found 2 columns', '', "I'm too short!"],
+        ])
 
     def test_simple(self):
-        self.assertCleaned('bad', [
-            'column_a,column_b,column_c\n',
-            '0,mixed types.... uh oh,17\n',
+        self.assertCleaned(['examples/bad.csv'], [
+            ['column_a', 'column_b', 'column_c'],
+            ['0', 'mixed types.... uh oh', '17'],
         ], [
-            'line_number,msg,column_a,column_b,column_c\n',
-            '1,"Expected 3 columns, found 4 columns",1,27,,I\'m too long!\n',
-            '2,"Expected 3 columns, found 2 columns",,I\'m too short!\n',
+            ['line_number', 'msg', 'column_a', 'column_b', 'column_c'],
+            ['1', 'Expected 3 columns, found 4 columns', '1', '27', '', "I'm too long!"],
+            ['2', 'Expected 3 columns, found 2 columns', '', "I'm too short!"],
         ])
 
     def test_no_header_row(self):
-        self.assertCleaned('no_header_row', [
-            '1,2,3\n',
+        self.assertCleaned(['examples/no_header_row.csv'], [
+            ['1', '2', '3'],
         ], [])
 
     def test_removes_optional_quote_characters(self):
-        self.assertCleaned('optional_quote_characters', [
-            'a,b,c\n',
-            '1,2,3\n',
-        ], [])
+        self.assertCleaned(['examples/optional_quote_characters.csv'], [
+            ['a', 'b', 'c'],
+            ['1', '2', '3'],
+        ])
 
     def test_changes_line_endings(self):
-        self.assertCleaned('mac_newlines', [
-            'a,b,c\n',
-            '1,2,3\n',
-            '"Once upon\n',
-            'a time",5,6\n',
-        ], [])
+        self.assertCleaned(['examples/mac_newlines.csv'], [
+            ['a', 'b', 'c'],
+            ['1', '2', '3'],
+            ['Once upon\na time', '5', '6'],
+        ])
 
     def test_changes_character_encoding(self):
-        self.assertCleaned('test_latin1', [
-            'a,b,c\n',
-            '1,2,3\n',
-            '4,5,©\n',
-        ], [], ['-e', 'latin1'])
+        self.assertCleaned(['-e', 'latin1', 'examples/test_latin1.csv'], [
+            ['a', 'b', 'c'],
+            ['1', '2', '3'],
+            ['4', '5', u'©'],
+        ])
 
     def test_removes_bom(self):
-        self.assertCleaned('test_utf8_bom', [
-            'foo,bar,baz\n',
-            '1,2,3\n',
-            '4,5,ʤ\n',
-        ], [], [])
-
-    def test_dry_run(self):
-        output = self.get_output_as_io(['-n', 'examples/bad.csv'])
-        self.assertFalse(os.path.exists('examples/bad_err.csv'))
-        self.assertFalse(os.path.exists('examples/bad_out.csv'))
-        self.assertEqual(next(output)[:6], 'Line 1')
-        self.assertEqual(next(output)[:6], 'Line 2')
+        self.assertCleaned(['examples/test_utf8_bom.csv'], [
+            ['foo', 'bar', 'baz'],
+            ['1', '2', '3'],
+            ['4', '5', 'ʤ'],
+        ])