Skip to content

Commit

Permalink
Improve documentation for csvclean, closes #748
Browse files Browse the repository at this point in the history
  • Loading branch information
James McKinney committed Jan 17, 2017
1 parent f1180b3 commit 86a4b88
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 22 deletions.
9 changes: 8 additions & 1 deletion docs/scripts/csvclean.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,14 @@ csvclean
Description
===========

Cleans a CSV file of common syntax errors. Outputs [basename]_out.csv and [basename]_err.csv, the former containing all valid rows and the latter containing all error rows along with line numbers and descriptions::
Cleans a CSV file of common syntax errors and normalizes its formatting. Specifically, it:

* reports rows that have a different number of columns than the header row
* removes optional quote characters
* changes the record delimiter to a line feed
* changes the character encoding to UTF-8

Outputs [basename]_out.csv and [basename]_err.csv, the former containing all valid rows and the latter containing all error rows along with line numbers and descriptions::

usage: csvclean [-h] [-d DELIMITER] [-t] [-q QUOTECHAR] [-u {0,1,2,3}] [-b]
[-p ESCAPECHAR] [-z MAXFIELDSIZE] [-e ENCODING] [-S] [-v] [-l]
Expand Down
2 changes: 2 additions & 0 deletions examples/optional_quote_characters.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
a,b,c
"1","2","3"
78 changes: 57 additions & 21 deletions tests/test_utilities/test_csvclean.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,37 +17,73 @@
class TestCSVClean(CSVKitTestCase, EmptyFileTests):
Utility = CSVClean

def test_launch_new_instance(self):
    # Swap sys.argv for the duration of the call so the entry point sees
    # the lower-cased utility name followed by a single input file.
    fake_argv = [self.Utility.__name__.lower(), 'examples/bad.csv']
    with patch.object(sys, 'argv', fake_argv):
        launch_new_instance()

def test_simple(self):
args = ['examples/bad.csv']
def assertCleaned(self, basename, output_lines, error_lines, additional_args=None):
    """Run CSVClean on ``examples/<basename>.csv`` and check the files it leaves behind.

    :param basename: Name of the fixture in ``examples/`` without the
        ``.csv`` extension.
    :param output_lines: Exact lines expected in
        ``examples/<basename>_out.csv``; an empty list means the file must
        not exist.
    :param error_lines: Exact lines expected in
        ``examples/<basename>_err.csv``; an empty list means the file must
        not exist.
    :param additional_args: Optional extra command-line arguments appended
        after the input path.
    """
    # Build a fresh list on every call instead of sharing one mutable
    # default object between all callers.
    extra_args = [] if additional_args is None else list(additional_args)
    args = ['examples/%s.csv' % basename] + extra_args

    output_path = 'examples/%s_out.csv' % basename
    error_path = 'examples/%s_err.csv' % basename

    try:
        stdout = six.StringIO()
        try:
            CSVClean(args, stdout).run()
        finally:
            stdout.close()

        self.assertEqual(os.path.exists(output_path), bool(output_lines))
        self.assertEqual(os.path.exists(error_path), bool(error_lines))

        # Both files are verified the same way: every expected line in
        # order, then nothing more.
        for path, expected_lines in ((output_path, output_lines), (error_path, error_lines)):
            if expected_lines:
                with open(path) as f:
                    for line in expected_lines:
                        self.assertEqual(next(f), line)
                    self.assertRaises(StopIteration, next, f)
    finally:
        # Clean up based on what is actually on disk, so a failed assertion
        # (including an unexpectedly present file) never leaks files into
        # later tests.
        for path in (output_path, error_path):
            if os.path.exists(path):
                os.remove(path)


def test_launch_new_instance(self):
    # Swap sys.argv for the duration of the call so the entry point sees
    # the lower-cased utility name followed by a single input file.
    fake_argv = [self.Utility.__name__.lower(), 'examples/bad.csv']
    with patch.object(sys, 'argv', fake_argv):
        launch_new_instance()

def test_simple(self):
    # Expected contents of the "_out" file: the header and the one row
    # that was not reported as an error.
    expected_output = [
        'column_a,column_b,column_c\n',
        '0,mixed types.... uh oh,17\n',
    ]
    # Expected contents of the "_err" file: a header, then one entry per
    # row whose column count differs from the expected three.
    expected_errors = [
        'line_number,msg,column_a,column_b,column_c\n',
        '1,"Expected 3 columns, found 4 columns",1,27,,I\'m too long!\n',
        '2,"Expected 3 columns, found 2 columns",,I\'m too short!\n',
    ]
    self.assertCleaned('bad', expected_output, expected_errors)

def test_removes_optional_quote_characters(self):
    # The expected output carries no quote characters, and no error file
    # is expected.
    expected_output = ['a,b,c\n', '1,2,3\n']
    no_errors = []
    self.assertCleaned('optional_quote_characters', expected_output, no_errors)

def test_changes_line_endings(self):
    # Every expected line ends in "\n", including the line break inside
    # the quoted multi-line field; no error file is expected.
    expected_output = [
        'a,b,c\n',
        '1,2,3\n',
        '"Once upon\n',
        'a time",5,6\n',
    ]
    no_errors = []
    self.assertCleaned('mac_newlines', expected_output, no_errors)

def test_changes_character_encoding(self):
    # The input encoding is declared with "-e latin1"; the non-ASCII
    # character must still be present in the cleaned output.
    expected_output = [
        'a,b,c\n',
        '1,2,3\n',
        '4,5,©\n',
    ]
    no_errors = []
    encoding_args = ['-e', 'latin1']
    self.assertCleaned('test_latin1', expected_output, no_errors, encoding_args)

def test_dry_run(self):
output = self.get_output_as_io(['-n', 'examples/bad.csv'])
Expand Down

0 comments on commit 86a4b88

Please sign in to comment.