From 453a864306488edf27fad436906315d97e2d1f82 Mon Sep 17 00:00:00 2001 From: James McKinney Date: Sat, 30 Jan 2016 15:56:29 -0500 Subject: [PATCH 1/4] Trim down csvkit.table.Table --- csvkit/table.py | 88 -------------------------------------- tests/test_table.py | 101 -------------------------------------------- 2 files changed, 189 deletions(-) diff --git a/csvkit/table.py b/csvkit/table.py index d11052b93..1ade85fa3 100644 --- a/csvkit/table.py +++ b/csvkit/table.py @@ -104,68 +104,6 @@ def __init__(self, columns=[], name='new_table'): list.__init__(self, columns) self.name = name - def __str__(self): - return str(self.__unicode__()) - - def __unicode__(self): - """ - Stringify a description of all columns in this table. - """ - return '\n'.join([six.text_type(c) for c in self]) - - def _reindex_columns(self): - """ - Update order properties of all columns in table. - """ - for i, c in enumerate(self): - c.order = i - - def _deduplicate_column_name(self, column): - while column.name in self.headers(): - try: - i = column.name.rindex('_') - counter = int(column.name[i + 1:]) - column.name = '%s_%i' % (column.name[:i], counter + 1) - except: - column.name += '_2' - - return column.name - - def append(self, column): - """Implements list append.""" - self._deduplicate_column_name(column) - - list.append(self, column) - column.index = len(self) - 1 - - def insert(self, i, column): - """Implements list insert.""" - self._deduplicate_column_name(column) - - list.insert(self, i, column) - self._reindex_columns() - - def extend(self, columns): - """Implements list extend.""" - for c in columns: - self._deduplicate_column_name(c) - - list.extend(self, columns) - self._reindex_columns() - - def remove(self, column): - """Implements list remove.""" - list.remove(self, column) - self._reindex_columns() - - def sort(self): - """Forbids list sort.""" - raise NotImplementedError() - - def reverse(self): - """Forbids list reverse.""" - raise NotImplementedError() - def headers(self): return [c.name for c in self] @@ -177,20 +115,6 @@ def count_rows(self): return 0 - def row(self, i): - """ - Fetch a row of data from this table. - """ - if i < 0: - raise IndexError('Negative row numbers are not valid.') - - if i >= self.count_rows(): - raise IndexError('Row number exceeds the number of rows in the table.') - - row_data = [c[i] for c in self] - - return row_data - @classmethod def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None, blanks_as_nulls=True, zero_based=False, infer_types=True, no_header_row=False, **kwargs): """ @@ -281,15 +205,3 @@ def to_rows(self, serialize_dates=False): return list(zip(*out_columns)) else: return list(zip(*self)) - - def to_csv(self, output, **kwargs): - """ - Serializes the table to CSV and writes it to any file-like object. - """ - rows = self.to_rows(serialize_dates=True) - - # Insert header row - rows.insert(0, self.headers()) - - csv_writer = agate.writer(output, **kwargs) - csv_writer.writerows(rows) diff --git a/tests/test_table.py b/tests/test_table.py index ea702e499..638b3d706 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -99,82 +99,6 @@ def test_from_csv_dev_null(self): with open('/dev/null', 'r') as f: table.Table.from_csv(f) - def test_to_csv(self): - raise SkipTest - with open('examples/testfixed_converted.csv', 'r') as f: - contents = f.read() - f.seek(0) - o = six.StringIO() - table.Table.from_csv(f).to_csv(o) - conversion = o.getvalue() - o.close() - - self.assertEqual(contents, conversion) - - def test_table_append(self): - c = table.Column(0, u'test', [u'test', u'column', u'']) - t = table.Table() - t.append(c) - self.assertEqual(len(t), 1) - self.assertEqual(t[0], c) - - def test_table_append_duplicate_name(self): - c = table.Column(0, u'test', [u'test', u'column', u'']) - c2 = table.Column(0, u'test', [u'test', u'column', u'']) - c3 = table.Column(0, u'test', [u'test', u'column', u'']) - t = table.Table() - t.append(c) - t.append(c2) - t.append(c3) - self.assertEqual(t[0].name, 'test') - self.assertEqual(t[1].name, 'test_2') - self.assertEqual(t[2].name, 'test_3') - - def test_table_insert(self): - c = table.Column(0, u'test', [u'test', u'column', u'']) - c2 = table.Column(0, u'test', [u'test', u'column', u'']) - t = table.Table([c]) - t.insert(0, c2) - self.assertEqual(len(t), 2) - self.assertEqual(t[0], c2) - self.assertEqual(t[1], c) - self.assertEqual(t[0].order, 0) - self.assertEqual(t[1].order, 1) - - def test_table_extend(self): - c = table.Column(0, u'test', [u'test', u'column', u'']) - c2 = table.Column(0, u'test', [u'test', u'column', u'']) - c3 = table.Column(0, u'test', [u'test', u'column', u'']) - t = table.Table([c]) - t.extend([c2, c3]) - self.assertEqual(len(t), 3) - self.assertEqual(t[0], c) - self.assertEqual(t[1], c2) - self.assertEqual(t[2], c3) - self.assertEqual(t[0].order, 0) - self.assertEqual(t[1].order, 1) - self.assertEqual(t[2].order, 2) - - def test_table_remove(self): - c = table.Column(0, u'test', [u'test', u'column', u'']) - c2 = table.Column(0, u'test', [u'test', u'column', u'']) - c3 = table.Column(0, u'test', [u'test', u'column', u'']) - t = table.Table([c, c2, c3]) - t.remove(c2) - self.assertEqual(len(t), 2) - self.assertEqual(t[0], c) - self.assertEqual(t[1], c3) - self.assertEqual(t[0].order, 0) - self.assertEqual(t[1].order, 1) - - def test_table_sort(self): - t = table.Table() - self.assertRaises(NotImplementedError, t.sort) - - def test_table_reverse(self): - t = table.Table() - self.assertRaises(NotImplementedError, t.reverse) - def test_table_count_rows(self): c = table.Column(0, u'test', [u'test', u'column', u'']) c_short = table.Column(0, u'test', [u'test']) @@ -187,28 +111,3 @@ def test_table_count_rows(self): self.assertEqual(t.count_rows(), 3) t.append(c_long) self.assertEqual(t.count_rows(), 4) - - def test_table_row(self): - c = table.Column(0, u'test', [u'test', u'column', u'']) - c2 = table.Column(0, u'test', [u'test', u'column', u'']) - c3 = table.Column(0, u'test', [u'test', u'column', u'']) - t = table.Table([c, c2, c3]) - self.assertEqual(t.row(1), [u'column', u'column', u'column']) - - def test_table_row_out_of_bounds(self): - c = table.Column(0, u'test', [u'test', u'column', u'']) - c2 = table.Column(0, u'test', [u'test', u'column', u'']) - c3 = table.Column(0, u'test', [u'test', u'column', u'']) - t = table.Table([c, c2, c3]) - self.assertRaises(IndexError, t.row, -1) - self.assertRaises(IndexError, t.row, 3) - - def test_table_uneven_columns(self): - c = table.Column(0, u'test', [u'test', u'column', u'']) - c_short = table.Column(0, u'test', [u'test']) - c_long = table.Column(0, u'test', [u'', u'', u'', u'way out here']) - t = table.Table([c, c_short, c_long]) - self.assertEqual(t.row(0), [u'test', u'test', None]) - self.assertEqual(t.row(1), [u'column', None, None]) - self.assertEqual(t.row(2), [None, None, None]) - self.assertEqual(t.row(3), [None, None, u'way out here']) From 1b81ab696a3c556dde3d67c35295d639769c34e2 Mon Sep 17 00:00:00 2001 From: James McKinney Date: Sat, 30 Jan 2016 16:00:29 -0500 Subject: [PATCH 2/4] Trim down csvkit.typeinference --- csvkit/typeinference.py | 46 -------------------------- tests/test_typeinference.py | 65 ------------------------------------- 2 files changed, 111 deletions(-) diff --git a/csvkit/typeinference.py b/csvkit/typeinference.py index 3b01743a2..9d713fc57 100644 --- a/csvkit/typeinference.py +++ b/csvkit/typeinference.py @@ -201,49 +201,3 @@ def normalize_column_type(l, normal_type=None, blanks_as_nulls=True): return six.text_type, [x if x != '' else None for x in l] else: return six.text_type, l - - -def normalize_table(rows, normal_types=None, accumulate_errors=False, blanks_as_nulls=True): - """ - Given a sequence of sequences, normalize the lot. - - Optionally accepts a normal_types parameter which is a list of - types that the columns must normalize to. - """ - data_columns = [] - column_count = 0 - row_count = 0 - - for row in rows: - while column_count < len(row): - data_columns.append([None] * row_count) - column_count += 1 - - for i, value in enumerate(row): - data_columns[i].append(value) - - row_count += 1 - - new_normal_types = [] - new_normal_columns = [] - errors = {} - - for i, column in enumerate(data_columns): - try: - if normal_types: - t, c = normalize_column_type(column, normal_types[i], blanks_as_nulls=blanks_as_nulls) - else: - t, c = normalize_column_type(column, blanks_as_nulls=blanks_as_nulls) - - new_normal_types.append(t) - new_normal_columns.append(c) - except InvalidValueForTypeException as e: - if not accumulate_errors: - raise - - errors[i] = e - - if errors: - raise InvalidValueForTypeListException(errors) - - return new_normal_types, new_normal_columns diff --git a/tests/test_typeinference.py b/tests/test_typeinference.py index 14686790d..c4acee326 100644 --- a/tests/test_typeinference.py +++ b/tests/test_typeinference.py @@ -187,68 +187,3 @@ def test_jeremy_singer_vine_datetimes(self): This obscure test named after Jeremy Singer-Vine, who discovered it. """ self.assertEqual((six.text_type, [u'P', u'H', u'H']), typeinference.normalize_column_type([u'P', u'H', u'H'])) - - def test_normalize_table(self): - expected_types = [six.text_type, int, float, NoneType] - data = [ - [u'a', u'1', u'2.1', u''], - [u'b', u'5', u'4.1'], - [u'c', u'100', u'100.9999', u''], - [u'd', u'2', u'5.3', u''] - ] - types, columns = typeinference.normalize_table(data) - - self.assertEqual(4, len(types)) - self.assertEqual(4, len(columns)) - - for i, tup in enumerate(zip(columns, types, expected_types)): - c, t, et = tup - self.assertEqual(et, t) - for row, normalized in zip(data, c): - if t is NoneType: - self.assertTrue(normalized is None) - else: - self.assertEqual(t(row[i]), normalized) - - def test_normalize_table_known_types(self): - normal_types = [six.text_type, int, float, NoneType] - data = [ - [u'a', u'1', u'2.1', u''], - [u'b', u'5', u'4.1'], - [u'c', u'100', u'100.9999', u''], - [u'd', u'2', u'5.3', u''] - ] - types, columns = typeinference.normalize_table(data, normal_types) - - self.assertEqual(4, len(types)) - self.assertEqual(4, len(columns)) - - for i, tup in enumerate(zip(columns, types, normal_types)): - c, t, et = tup - self.assertEqual(et, t) - for row, normalized in zip(data, c): - if t is NoneType: - self.assertTrue(normalized is None) - else: - self.assertEqual(t(row[i]), normalized) - - def test_normalize_table_known_types_invalid(self): - normal_types = [bool, int, int, NoneType] - data = [ - [u'a', u'1', u'2.1', u''], - [u'b', u'5', u'4.1'], - [u'c', u'100', u'100.9999', u''], - [u'd', u'2', u'5.3', u''] - ] - - try: - typeinference.normalize_table(data, normal_types, accumulate_errors=True) - self.assertEqual(True, False) - except InvalidValueForTypeListException as e: - self.assertEqual(len(e.errors), 2) - self.assertEqual(e.errors[0].index, 0) - self.assertEqual(e.errors[0].value, 'a') - self.assertEqual(e.errors[0].normal_type, bool) - self.assertEqual(e.errors[2].index, 0) - self.assertEqual(e.errors[2].value, '2.1') - self.assertEqual(e.errors[2].normal_type, int) From fe2b0926825f2275469b671b94efb020be500f3a Mon Sep 17 00:00:00 2001 From: James McKinney Date: Sat, 30 Jan 2016 16:05:16 -0500 Subject: [PATCH 3/4] Remove unused exceptions --- csvkit/exceptions.py | 19 ------------------- csvkit/typeinference.py | 2 +- tests/test_typeinference.py | 2 +- 3 files changed, 2 insertions(+), 21 deletions(-) diff --git a/csvkit/exceptions.py b/csvkit/exceptions.py index e26c5e6a2..dcf4ce3b2 100644 --- a/csvkit/exceptions.py +++ b/csvkit/exceptions.py @@ -49,13 +49,6 @@ def length(self): return len(self.row) -class CSVJSONException(CustomException): - """ - Exception raised when there is a problem converting data to CSV. - """ - pass - - class InvalidValueForTypeException(CustomException): """ Exception raised when a value can not be normalized to a specified type. @@ -69,18 +62,6 @@ def __init__(self, index, value, normal_type): super(InvalidValueForTypeException, self).__init__(msg) -class InvalidValueForTypeListException(CustomException): - """ - Exception raised when one or more InvalidValueForTypeException - has been raised while accumulating errors. - """ - - def __init__(self, errors): - self.errors = errors - msg = 'Encountered errors converting values in %i columns' % len(errors) - super(InvalidValueForTypeListException, self).__init__(msg) - - class RequiredHeaderError(CustomException): """ Exception raised when an operation requires a CSV file to have a header row. diff --git a/csvkit/typeinference.py b/csvkit/typeinference.py index 9d713fc57..798275c6e 100644 --- a/csvkit/typeinference.py +++ b/csvkit/typeinference.py @@ -5,7 +5,7 @@ from dateutil.parser import parse import six -from csvkit.exceptions import InvalidValueForTypeException, InvalidValueForTypeListException +from csvkit.exceptions import InvalidValueForTypeException NoneType = type(None) diff --git a/tests/test_typeinference.py b/tests/test_typeinference.py index c4acee326..b00095434 100644 --- a/tests/test_typeinference.py +++ b/tests/test_typeinference.py @@ -11,7 +11,7 @@ from csvkit import typeinference -from csvkit.exceptions import InvalidValueForTypeException, InvalidValueForTypeListException +from csvkit.exceptions import InvalidValueForTypeException NoneType = type(None) From 732e8a44a7bcfd74b0c4e3e813be97574d0c3261 Mon Sep 17 00:00:00 2001 From: James McKinney Date: Sat, 30 Jan 2016 16:08:06 -0500 Subject: [PATCH 4/4] Consolidate deprecated behavior --- csvkit/headers.py | 8 -------- csvkit/sniffer.py | 18 ------------------ csvkit/table.py | 29 +++++++++++++++++++++++++---- csvkit/utilities/csvcut.py | 2 +- csvkit/utilities/csvgrep.py | 2 +- csvkit/utilities/csvlook.py | 2 +- csvkit/utilities/csvstack.py | 2 +- tests/test_table.py | 2 -- 8 files changed, 29 insertions(+), 36 deletions(-) delete mode 100644 csvkit/headers.py delete mode 100644 csvkit/sniffer.py diff --git a/csvkit/headers.py b/csvkit/headers.py deleted file mode 100644 index 9f3066f57..000000000 --- a/csvkit/headers.py +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env python - - -def make_default_headers(n): - """ - Make a set of simple, default headers for files that are missing them. - """ - return ['column%i' % (i + 1) for i in range(n)] diff --git a/csvkit/sniffer.py b/csvkit/sniffer.py deleted file mode 100644 index 01947ab9f..000000000 --- a/csvkit/sniffer.py +++ /dev/null @@ -1,18 +0,0 @@ -#!/usr/bin/env python - -import csv - -POSSIBLE_DELIMITERS = [',', '\t', ';', ' ', ':', '|'] - - -def sniff_dialect(sample): - """ - A functional version of ``csv.Sniffer().sniff``, that extends the - list of possible delimiters to include some seen in the wild. - """ - try: - dialect = csv.Sniffer().sniff(sample, POSSIBLE_DELIMITERS) - except: - dialect = None - - return dialect diff --git a/csvkit/table.py b/csvkit/table.py index 1ade85fa3..cfc70d9dc 100644 --- a/csvkit/table.py +++ b/csvkit/table.py @@ -1,15 +1,36 @@ #!/usr/bin/env python +import csv import datetime import itertools import agate import six -from csvkit import sniffer from csvkit import typeinference from csvkit.cli import parse_column_identifiers -from csvkit.headers import make_default_headers + +POSSIBLE_DELIMITERS = [',', '\t', ';', ' ', ':', '|'] + + +def make_default_headers(n): + """ + Make a set of simple, default headers for files that are missing them. + """ + return ['column%i' % (i + 1) for i in range(n)] + + +def sniff_dialect(sample): + """ + A functional version of ``csv.Sniffer().sniff``, that extends the + list of possible delimiters to include some seen in the wild. + """ + try: + dialect = csv.Sniffer().sniff(sample, POSSIBLE_DELIMITERS) + except: + dialect = None + + return dialect class InvalidType(object): @@ -131,9 +152,9 @@ def from_csv(cls, f, name='from_csv_table', snifflimit=None, column_ids=None, bl # snifflimit == 0 means do not sniff if snifflimit is None: - kwargs['dialect'] = sniffer.sniff_dialect(contents) + kwargs['dialect'] = sniff_dialect(contents) elif snifflimit > 0: - kwargs['dialect'] = sniffer.sniff_dialect(contents[:snifflimit]) + kwargs['dialect'] = sniff_dialect(contents[:snifflimit]) f = six.StringIO(contents) rows = agate.reader(f, **kwargs) diff --git a/csvkit/utilities/csvcut.py b/csvkit/utilities/csvcut.py index 3330d2d9e..c6fda27d9 100644 --- a/csvkit/utilities/csvcut.py +++ b/csvkit/utilities/csvcut.py @@ -14,7 +14,7 @@ import agate from csvkit.cli import CSVKitUtility, parse_column_identifiers -from csvkit.headers import make_default_headers +from csvkit.table import make_default_headers class CSVCut(CSVKitUtility): diff --git a/csvkit/utilities/csvgrep.py b/csvkit/utilities/csvgrep.py index 0307460a5..fa4198660 100644 --- a/csvkit/utilities/csvgrep.py +++ b/csvkit/utilities/csvgrep.py @@ -8,7 +8,7 @@ from csvkit.cli import CSVKitUtility, parse_column_identifiers from csvkit.grep import FilteringCSVReader -from csvkit.headers import make_default_headers +from csvkit.table import make_default_headers class CSVGrep(CSVKitUtility): diff --git a/csvkit/utilities/csvlook.py b/csvkit/utilities/csvlook.py index 819af87b1..c1be44aef 100644 --- a/csvkit/utilities/csvlook.py +++ b/csvkit/utilities/csvlook.py @@ -6,7 +6,7 @@ import six from csvkit.cli import CSVKitUtility -from csvkit.headers import make_default_headers +from csvkit.table import make_default_headers class CSVLook(CSVKitUtility): diff --git a/csvkit/utilities/csvstack.py b/csvkit/utilities/csvstack.py index 1681b057d..c3e26f05a 100644 --- a/csvkit/utilities/csvstack.py +++ b/csvkit/utilities/csvstack.py @@ -5,7 +5,7 @@ import agate from csvkit.cli import CSVKitUtility -from csvkit.headers import make_default_headers +from csvkit.table import make_default_headers class CSVStack(CSVKitUtility): diff --git a/tests/test_table.py b/tests/test_table.py index 638b3d706..ac62a835b 100644 --- a/tests/test_table.py +++ b/tests/test_table.py @@ -6,10 +6,8 @@ try: import unittest2 as unittest - from unittest2.case import SkipTest except ImportError: import unittest - from unittest.case import SkipTest from csvkit import table