Merge eb86e1c into 120d7c9

wireservice · Sep 4, 2020 · f9272e2 · f9272e2
2 parents 120d7c9 + eb86e1c
commit f9272e2
Show file tree

Hide file tree

Showing 2 changed files with 57 additions and 2 deletions.
diff --git a/csvkit/utilities/csvformat.py b/csvkit/utilities/csvformat.py
@@ -1,5 +1,8 @@
 #!/usr/bin/env python
 
+import csv
+from decimal import Decimal
+import io
 import sys
 
 import agate
@@ -49,9 +52,37 @@ def main(self):
         if self.additional_input_expected():
             sys.stderr.write('No input file or piped data provided. Waiting for standard input:\n')
 
-        reader = agate.csv.reader(self.skip_lines(), **self.reader_kwargs)
+        input_file = self.skip_lines()
+
+        # When writing with -U 2 (QUOTE_NONNUMERIC), we need to know which input columns are numeric so that they are not quoted in the output.
+        # If the input is not itself read with QUOTE_NONNUMERIC quoting, the reader cannot tell which columns are numeric,
+        # so we detect them ourselves. This is much slower and reads the whole input into memory, so it is handled as a special case.
+        detect_numeric_columns = False
+        numeric_columns = []
+        if 'quoting' in self.writer_kwargs and self.writer_kwargs['quoting'] == csv.QUOTE_NONNUMERIC:
+            if 'quoting' not in self.reader_kwargs or self.reader_kwargs['quoting'] != csv.QUOTE_NONNUMERIC:
+                detect_numeric_columns = True
+
+        # Find out which columns are numeric if this is required
+        if detect_numeric_columns:
+            input_data = input_file.read() # we need to cache the file's contents to use it twice
+            input_file = io.StringIO(input_data)
+            table = agate.Table.from_csv(input_file, **self.reader_kwargs)
+            numeric_columns = [n for n in range(0, len(table.column_types)) if isinstance(table.column_types[n], agate.Number)]
+            input_file = io.StringIO(input_data) # reload it from the cache for use by the csv reader
+
+        # Read and write CSV
+        reader = agate.csv.reader(input_file, **self.reader_kwargs)
         writer = agate.csv.writer(self.output_file, **self.writer_kwargs)
-        writer.writerows(reader)
+        if detect_numeric_columns:
+            # Special case where we need to convert numeric columns from strings to decimals
+            if 'header' not in self.reader_kwargs or self.reader_kwargs['header']:
+                writer.writerow(next(reader))
+            for row in reader:
+                writer.writerow([Decimal(row[n]) if n in numeric_columns and row[n] != '' else row[n] for n in range(0, len(row))])
+        else:
+            # The usual and much quicker case: pipe from the reader to the writer
+            writer.writerows(reader)
 
 
 def launch_new_instance():

diff --git a/tests/test_utilities/test_csvformat.py b/tests/test_utilities/test_csvformat.py
@@ -83,6 +83,30 @@ def test_escapechar(self):
 
         input_file.close()
 
+    def test_out_quoting(self):
+        # Output quoting with -U 2 (QUOTE_NONNUMERIC) when the first row is a header.
+        # Input columns 2 and 3 contain only numbers or empty cells below the header;
+        # columns 1 and 4 contain letters.
+        input_file = six.StringIO('4,b,6,d\na,2,3,d\n8,9,,z\n')
+
+        with stdin_as_string(input_file):
+            self.assertLines(['-U', '2'], [
+                # Header row: every cell is quoted, even the numeric-looking "4" and "6".
+                '"4","b","6","d"',
+                # Data rows: cells in the numeric columns are written unquoted...
+                '"a",2,3,"d"',
+                # ...except an empty cell, which is written as a quoted empty string.
+                '"8",9,"","z"',
+            ])
+
+        input_file.close()
+
+    def test_out_quoting_no_header_row(self):
+        # Output quoting with -U 2 (QUOTE_NONNUMERIC) combined with --no-header-row:
+        # the first line is treated as data rather than as a header.
+        input_file = six.StringIO('4,5,6,7\na,2,3,d\n8,9,,z\n')
+
+        with stdin_as_string(input_file):
+            self.assertLines(['-U', '2', '--no-header-row'], [
+                # First line is quoted per column like any other row: "4" and "7" stay
+                # quoted because their columns also hold letters further down.
+                '"4",5,6,"7"',
+                '"a",2,3,"d"',
+                # The empty cell in a numeric column is written as a quoted empty string.
+                '"8",9,"","z"',
+            ])
+
+        input_file.close()
+
     def test_lineterminator(self):
         self.assertLines(['-M', 'XYZ', 'examples/dummy.csv'], [
             'a,b,cXYZ1,2,3XYZ',