Skip to content

Commit

Permalink
Merge eb86e1c into 120d7c9
Browse files Browse the repository at this point in the history
  • Loading branch information
lcorbasson committed Sep 4, 2020
2 parents 120d7c9 + eb86e1c commit f9272e2
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 2 deletions.
35 changes: 33 additions & 2 deletions csvkit/utilities/csvformat.py
@@ -1,5 +1,8 @@
#!/usr/bin/env python

import csv
from decimal import Decimal
import io
import sys

import agate
Expand Down Expand Up @@ -49,9 +52,37 @@ def main(self):
if self.additional_input_expected():
sys.stderr.write('No input file or piped data provided. Waiting for standard input:\n')

reader = agate.csv.reader(self.skip_lines(), **self.reader_kwargs)
input_file = self.skip_lines()

# When using -U 2 (QUOTE_NONNUMERIC), we have to know which columns are numeric in the input file, to avoid quoting them in the output.
# If the input file is not in the same QUOTE_NONNUMERIC quoting format, the reader cannot determine which columns are numeric;
# we'll have to lend it a hand, but it will be much slower and memory-consuming, so we make this a special case.
detect_numeric_columns = False
numeric_columns = []
if 'quoting' in self.writer_kwargs and self.writer_kwargs['quoting'] == csv.QUOTE_NONNUMERIC:
if 'quoting' not in self.reader_kwargs or self.reader_kwargs['quoting'] != csv.QUOTE_NONNUMERIC:
detect_numeric_columns = True

# Find out which columns are numeric if this is required
if detect_numeric_columns:
input_data = input_file.read() # we need to cache the file's contents to use it twice
input_file = io.StringIO(input_data)
table = agate.Table.from_csv(input_file, **self.reader_kwargs)
numeric_columns = [n for n in range(0, len(table.column_types)) if isinstance(table.column_types[n], agate.Number)]
input_file = io.StringIO(input_data) # reload it from the cache for use by the csv reader

# Read and write CSV
reader = agate.csv.reader(input_file, **self.reader_kwargs)
writer = agate.csv.writer(self.output_file, **self.writer_kwargs)
writer.writerows(reader)
if detect_numeric_columns:
# Special case where we need to convert numeric columns from strings to decimals
if 'header' not in self.reader_kwargs or self.reader_kwargs['header']:
writer.writerow(next(reader))
for row in reader:
writer.writerow([Decimal(row[n]) if n in numeric_columns and row[n] != '' else row[n] for n in range(0, len(row))])
else:
# The usual and much quicker case: pipe from the reader to the writer
writer.writerows(reader)


def launch_new_instance():
Expand Down
24 changes: 24 additions & 0 deletions tests/test_utilities/test_csvformat.py
Expand Up @@ -83,6 +83,30 @@ def test_escapechar(self):

input_file.close()

def test_out_quoting(self):
    # -U 2 asks the writer for QUOTE_NONNUMERIC output while the input is
    # plain unquoted CSV. The first row is treated as a header and is
    # quoted in full; in the data rows only the values of the two middle
    # columns are left bare, and the empty cell is still emitted as "".
    args = ['-U', '2']
    expected_lines = [
        '"4","b","6","d"',
        '"a",2,3,"d"',
        '"8",9,"","z"',
    ]
    input_file = six.StringIO('4,b,6,d\na,2,3,d\n8,9,,z\n')

    with stdin_as_string(input_file):
        self.assertLines(args, expected_lines)

    input_file.close()

def test_out_quoting_no_header_row(self):
    # Same -U 2 (QUOTE_NONNUMERIC) output, but with --no-header-row the
    # first line is data too. The outer columns hold a non-numeric value
    # in some row, so even their numeric-looking cells ("4", "7", "8")
    # stay quoted; the two middle columns are written bare, except for
    # the empty cell, which is emitted as "".
    args = ['-U', '2', '--no-header-row']
    expected_lines = [
        '"4",5,6,"7"',
        '"a",2,3,"d"',
        '"8",9,"","z"',
    ]
    input_file = six.StringIO('4,5,6,7\na,2,3,d\n8,9,,z\n')

    with stdin_as_string(input_file):
        self.assertLines(args, expected_lines)

    input_file.close()

def test_lineterminator(self):
self.assertLines(['-M', 'XYZ', 'examples/dummy.csv'], [
'a,b,cXYZ1,2,3XYZ',
Expand Down

0 comments on commit f9272e2

Please sign in to comment.