First pass at accumulating errors when normalizing a table (PANDA).
onyxfish committed Sep 30, 2011
1 parent b12f9dd commit 4b36724
Showing 3 changed files with 94 additions and 18 deletions.
11 changes: 11 additions & 0 deletions csvkit/exceptions.py
@@ -74,3 +74,14 @@ def __init__(self, index, value, normal_type):
self.normal_type = normal_type
msg = 'Unable to convert "%s" to type %s (at index %i)' % (value, normal_type, index)
super(InvalidValueForTypeException, self).__init__(msg)

class InvalidValueForTypeListException(CustomException):
"""
Exception raised when one or more InvalidValueForTypeExceptions
have been raised while accumulating errors.
"""
def __init__(self, errors):
self.errors = errors
msg = 'Encountered errors converting values in %i columns' % len(errors)
super(InvalidValueForTypeListException, self).__init__(msg)
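To illustrate the shape of the new exception, here is a minimal sketch (not part of the commit) that builds a hypothetical errors dict keyed by column index and inspects it the way the new tests below do; the sample values are made up and Python 2 syntax is used to match the codebase.

from csvkit.exceptions import (InvalidValueForTypeException,
                               InvalidValueForTypeListException)

# Hypothetical failures: column 0 rejected 'a' as a bool and column 2
# rejected '2.1' as an int, each at row index 0.
errors = {
    0: InvalidValueForTypeException(0, 'a', bool),
    2: InvalidValueForTypeException(0, '2.1', int),
}

try:
    raise InvalidValueForTypeListException(errors)
except InvalidValueForTypeListException, e:
    # e.errors maps column index -> the underlying exception.
    for column_index, error in sorted(e.errors.items()):
        print 'column %i: %r is not a valid %s' % (
            column_index, error.value, error.normal_type.__name__)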

45 changes: 35 additions & 10 deletions csvkit/typeinference.py
@@ -5,7 +5,7 @@

from dateutil.parser import parse

from exceptions import InvalidValueForTypeException
from exceptions import InvalidValueForTypeException, InvalidValueForTypeListException

NULL_VALUES = ('na', 'n/a', 'none', 'null', '.')
TRUE_VALUES = ('yes', 'y', 'true', 't')
@@ -171,23 +171,48 @@ def normalize_column_type(l, normal_type=None):
# Don't know what they are, so they must just be strings
return unicode, [x if x != '' else None for x in l]

def normalize_table(rows, column_count):
def normalize_table(rows, normal_types=None, accumulate_errors=False):
"""
Given a sequence of sequences, normalize the lot.
Optionally accepts a normal_types parameter which is a list of
types that the columns must normalize to.
"""
data_columns = [[] for x in range(column_count)]
data_columns = []
column_count = 0
row_count = 0

for row in rows:
while column_count < len(row):
data_columns.append([None] * row_count)
column_count += 1

for data_column, value in zip(data_columns, row):
data_column.append(value)

normal_types = []
normal_columns= []
row_count += 1

new_normal_types = []
new_normal_columns = []
errors = {}

for column in data_columns:
t, c = normalize_column_type(column)
normal_types.append(t)
normal_columns.append(c)
for i, column in enumerate(data_columns):
try:
if normal_types:
t, c = normalize_column_type(column, normal_types[i])
else:
t, c = normalize_column_type(column)

new_normal_types.append(t)
new_normal_columns.append(c)
except InvalidValueForTypeException, e:
if not accumulate_errors:
raise

errors[i] = e

return normal_types, normal_columns
if errors:
raise InvalidValueForTypeListException(errors)

return new_normal_types, new_normal_columns
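For context, a minimal sketch (not part of the commit) of how the reworked normalize_table might be called with the new accumulate_errors flag; the sample rows and the forced types are hypothetical, and Python 2 syntax is used to match the codebase.

from csvkit import typeinference
from csvkit.exceptions import InvalidValueForTypeListException

rows = [
    ['a', '1', '2.1'],
    ['b', 'x', '4.1'],  # 'x' cannot be coerced to int
]

try:
    types, columns = typeinference.normalize_table(
        rows, normal_types=[unicode, int, float], accumulate_errors=True)
except InvalidValueForTypeListException, e:
    # Every failing column is reported at once instead of only the first.
    print 'failed columns: %s' % sorted(e.errors.keys())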

56 changes: 48 additions & 8 deletions tests/test_typeinference.py
@@ -6,7 +6,7 @@

from csvkit import typeinference

from csvkit.exceptions import InvalidValueForTypeException
from csvkit.exceptions import InvalidValueForTypeException, InvalidValueForTypeListException

class TestNormalizeType(unittest.TestCase):
def test_nulls(self):
@@ -151,28 +151,68 @@ def test_datetimes_and_dates_coerce(self):
def test_datetimes_and_times(self):
self.assertEqual((unicode, ['Jan 1, 2008 at 4:40 AM', '2010-01-27T03:45:00', '16:14:45', None]), typeinference.normalize_column_type(['Jan 1, 2008 at 4:40 AM', '2010-01-27T03:45:00', '16:14:45', '']))


def test_normalize_table(self):
expected_types = [unicode, int, float, NoneType]
data = [
['a','1','2.1', ''],
['b', '5', '4.1', ''],
['b', '5', '4.1'],
['c', '100', '100.9999', ''],
['d', '2', '5.3', '']
]
column_count = len(expected_types)
types, columns = typeinference.normalize_table(data, column_count)
types, columns = typeinference.normalize_table(data)

self.assertEqual(column_count, len(types))
self.assertEqual(column_count, len(columns))
self.assertEqual(4, len(types))
self.assertEqual(4, len(columns))

for i, tup in enumerate(zip(columns, types, expected_types)):
c, t, et = tup
self.assertEqual(et, t)
for row, normalized in zip(data, c):
if t is NoneType:
self.assertTrue(normalized is None)
self.assertEqual('', row[i])
else:
self.assertEqual(t(row[i]), normalized)

def test_normalize_table_known_types(self):
normal_types = [unicode, int, float, NoneType]
data = [
['a','1','2.1', ''],
['b', '5', '4.1'],
['c', '100', '100.9999', ''],
['d', '2', '5.3', '']
]
types, columns = typeinference.normalize_table(data, normal_types)

self.assertEqual(4, len(types))
self.assertEqual(4, len(columns))

for i, tup in enumerate(zip(columns, types, normal_types)):
c, t, et = tup
self.assertEqual(et, t)
for row, normalized in zip(data, c):
if t is NoneType:
self.assertTrue(normalized is None)
else:
self.assertEqual(t(row[i]), normalized)

def test_normalize_table_known_types_invalid(self):
normal_types = [bool, int, int, NoneType]
data = [
['a','1','2.1', ''],
['b', '5', '4.1'],
['c', '100', '100.9999', ''],
['d', '2', '5.3', '']
]

try:
typeinference.normalize_table(data, normal_types, accumulate_errors=True)
self.assertEqual(True, False)
except InvalidValueForTypeListException, e:
self.assertEqual(len(e.errors), 2)
self.assertEqual(e.errors[0].index, 0)
self.assertEqual(e.errors[0].value, 'a')
self.assertEqual(e.errors[0].normal_type, bool)
self.assertEqual(e.errors[2].index, 0)
self.assertEqual(e.errors[2].value, '2.1')
self.assertEqual(e.errors[2].normal_type, int)
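As a counterpart to the accumulating test above, a minimal sketch (not part of the commit) of the default path: with accumulate_errors left at False, the first bad value still raises the original InvalidValueForTypeException. The sample rows and forced types are hypothetical, and Python 2 syntax matches the codebase.

from csvkit import typeinference
from csvkit.exceptions import InvalidValueForTypeException

rows = [['a', '1'], ['b', 'oops']]  # 'oops' cannot be coerced to int

try:
    typeinference.normalize_table(rows, normal_types=[unicode, int])
except InvalidValueForTypeException, e:
    print 'bad value %r at row index %i' % (e.value, e.index)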
