First pass at accumulating errors when normalizing a table (PANDA).
onyxfish committed Sep 30, 2011
1 parent b12f9dd commit 4b36724
Showing 3 changed files with 94 additions and 18 deletions.
11 changes: 11 additions & 0 deletions csvkit/exceptions.py
@@ -74,3 +74,14 @@ def __init__(self, index, value, normal_type):
self.normal_type = normal_type
msg = 'Unable to convert "%s" to type %s (at index %i)' % (value, normal_type, index)
super(InvalidValueForTypeException, self).__init__(msg)

class InvalidValueForTypeListException(CustomException):
"""
Exception raised when one or more InvalidValueForTypeExceptions
have been raised while accumulating errors.
"""
def __init__(self, errors):
self.errors = errors
msg = 'Encountered errors converting values in %i columns' % len(errors)
super(InvalidValueForTypeListException, self).__init__(msg)
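To illustrate the shape of the new exception, here is a minimal sketch (not part of the commit) that builds a hypothetical errors dict keyed by column index and inspects it the way the new tests below do; the sample values are made up and Python 2 syntax is used to match the codebase.

from csvkit.exceptions import (InvalidValueForTypeException,
                               InvalidValueForTypeListException)

# Hypothetical failures: column 0 rejected 'a' as a bool and column 2
# rejected '2.1' as an int, each at row index 0.
errors = {
    0: InvalidValueForTypeException(0, 'a', bool),
    2: InvalidValueForTypeException(0, '2.1', int),
}

try:
    raise InvalidValueForTypeListException(errors)
except InvalidValueForTypeListException, e:
    # e.errors maps column index -> the underlying exception.
    for column_index, error in sorted(e.errors.items()):
        print 'column %i: %r is not a valid %s' % (
            column_index, error.value, error.normal_type.__name__)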

45 changes: 35 additions & 10 deletions csvkit/typeinference.py
@@ -5,7 +5,7 @@

from dateutil.parser import parse

from exceptions import InvalidValueForTypeException
from exceptions import InvalidValueForTypeException, InvalidValueForTypeListException

NULL_VALUES = ('na', 'n/a', 'none', 'null', '.')
TRUE_VALUES = ('yes', 'y', 'true', 't')
@@ -171,23 +171,48 @@ def normalize_column_type(l, normal_type=None):
# Don't know what they are, so they must just be strings
return unicode, [x if x != '' else None for x in l]

def normalize_table(rows, column_count):
def normalize_table(rows, normal_types=None, accumulate_errors=False):
"""
Given a sequence of sequences, normalize the lot.
Optionally accepts a normal_types parameter which is a list of
types that the columns must normalize to.
"""
data_columns = [[] for x in range(column_count)]
data_columns = []
column_count = 0
row_count = 0

for row in rows:
while column_count < len(row):
data_columns.append([None] * row_count)
column_count += 1

for data_column, value in zip(data_columns, row):
data_column.append(value)

normal_types = []
normal_columns= []
row_count += 1

new_normal_types = []
new_normal_columns = []
errors = {}

for column in data_columns:
t, c = normalize_column_type(column)
normal_types.append(t)
normal_columns.append(c)
for i, column in enumerate(data_columns):
try:
if normal_types:
t, c = normalize_column_type(column, normal_types[i])
else:
t, c = normalize_column_type(column)

new_normal_types.append(t)
new_normal_columns.append(c)
except InvalidValueForTypeException, e:
if not accumulate_errors:
raise

errors[i] = e

return normal_types, normal_columns
if errors:
raise InvalidValueForTypeListException(errors)

return new_normal_types, new_normal_columns
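For context, a minimal sketch (not part of the commit) of how the reworked normalize_table might be called with the new accumulate_errors flag; the sample rows and the forced types are hypothetical, and Python 2 syntax is used to match the codebase.

from csvkit import typeinference
from csvkit.exceptions import InvalidValueForTypeListException

rows = [
    ['a', '1', '2.1'],
    ['b', 'x', '4.1'],  # 'x' cannot be coerced to int
]

try:
    types, columns = typeinference.normalize_table(
        rows, normal_types=[unicode, int, float], accumulate_errors=True)
except InvalidValueForTypeListException, e:
    # Every failing column is reported at once instead of only the first.
    print 'failed columns: %s' % sorted(e.errors.keys())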

56 changes: 48 additions & 8 deletions tests/test_typeinference.py
@@ -6,7 +6,7 @@

from csvkit import typeinference

from csvkit.exceptions import InvalidValueForTypeException
from csvkit.exceptions import InvalidValueForTypeException, InvalidValueForTypeListException

class TestNormalizeType(unittest.TestCase):
def test_nulls(self):
@@ -151,28 +151,68 @@ def test_datetimes_and_dates_coerce(self):
def test_datetimes_and_times(self):
self.assertEqual((unicode, ['Jan 1, 2008 at 4:40 AM', '2010-01-27T03:45:00', '16:14:45', None]), typeinference.normalize_column_type(['Jan 1, 2008 at 4:40 AM', '2010-01-27T03:45:00', '16:14:45', '']))


def test_normalize_table(self):
expected_types = [unicode, int, float, NoneType]
data = [
['a','1','2.1', ''],
['b', '5', '4.1', ''],
['b', '5', '4.1'],
['c', '100', '100.9999', ''],
['d', '2', '5.3', '']
]
column_count = len(expected_types)
types, columns = typeinference.normalize_table(data, column_count)
types, columns = typeinference.normalize_table(data)

self.assertEqual(column_count, len(types))
self.assertEqual(column_count, len(columns))
self.assertEqual(4, len(types))
self.assertEqual(4, len(columns))

for i, tup in enumerate(zip(columns, types, expected_types)):
c, t, et = tup
self.assertEqual(et, t)
for row, normalized in zip(data, c):
if t is NoneType:
self.assertTrue(normalized is None)
self.assertEqual('', row[i])
else:
self.assertEqual(t(row[i]), normalized)

def test_normalize_table_known_types(self):
normal_types = [unicode, int, float, NoneType]
data = [
['a','1','2.1', ''],
['b', '5', '4.1'],
['c', '100', '100.9999', ''],
['d', '2', '5.3', '']
]
types, columns = typeinference.normalize_table(data, normal_types)

self.assertEqual(4, len(types))
self.assertEqual(4, len(columns))

for i, tup in enumerate(zip(columns, types, normal_types)):
c, t, et = tup
self.assertEqual(et, t)
for row, normalized in zip(data, c):
if t is NoneType:
self.assertTrue(normalized is None)
else:
self.assertEqual(t(row[i]), normalized)

def test_normalize_table_known_types_invalid(self):
normal_types = [bool, int, int, NoneType]
data = [
['a','1','2.1', ''],
['b', '5', '4.1'],
['c', '100', '100.9999', ''],
['d', '2', '5.3', '']
]

try:
typeinference.normalize_table(data, normal_types, accumulate_errors=True)
self.assertEqual(True, False)
except InvalidValueForTypeListException, e:
self.assertEqual(len(e.errors), 2)
self.assertEqual(e.errors[0].index, 0)
self.assertEqual(e.errors[0].value, 'a')
self.assertEqual(e.errors[0].normal_type, bool)
self.assertEqual(e.errors[2].index, 0)
self.assertEqual(e.errors[2].value, '2.1')
self.assertEqual(e.errors[2].normal_type, int)
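As a counterpart to the accumulating test above, a minimal sketch (not part of the commit) of the default path: with accumulate_errors left at False, the first bad value still raises the original InvalidValueForTypeException. The sample rows and forced types are hypothetical, and Python 2 syntax matches the codebase.

from csvkit import typeinference
from csvkit.exceptions import InvalidValueForTypeException

rows = [['a', '1'], ['b', 'oops']]  # 'oops' cannot be coerced to int

try:
    typeinference.normalize_table(rows, normal_types=[unicode, int])
except InvalidValueForTypeException, e:
    print 'bad value %r at row index %i' % (e.value, e.index)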
