Skip to content

Commit

Permalink
Nested TableSet's and multi-dimensional aggregates. Closes #204.
Browse files Browse the repository at this point in the history
  • Loading branch information
onyxfish committed Sep 3, 2015
1 parent c8bb56b commit 2efc6aa
Show file tree
Hide file tree
Showing 6 changed files with 194 additions and 32 deletions.
1 change: 1 addition & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
0.7.0
-----

* Nested TableSet's and multi-dimensional aggregates. (#204)
* TableSet.aggregate will now use key_name as the group column name. (#203)
* Added key_name argument to TableSet and Table.group_by.
* Added Length aggregation and removed count from TableSet.aggregate output. (#203)
Expand Down
6 changes: 4 additions & 2 deletions agate/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def __init__(self, rows, column_info):

for i, row in enumerate(rows):
if len(row) != len_column_names:
raise ValueError('Row %i has length %i, but Table only has %i columns.' % (i, len(row), len_column_types))
raise ValueError('Row %i has length %i, but Table only has %i columns.' % (i, len(row), len_column_names))

# Forked tables can share data (because they are immutable)
# but original data should be buffered so it can't be changed
Expand Down Expand Up @@ -549,12 +549,14 @@ def group_by(self, key, key_name=None):
else:
group_name = six.text_type(row[i])

# print group_name

if group_name not in groups:
groups[group_name] = []

groups[group_name].append(row)

output = {}
output = OrderedDict()

for group, rows in groups.items():
output[group] = self._fork(rows)
Expand Down
81 changes: 54 additions & 27 deletions agate/tableset.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@
:meth:`TableSet.order_by` are used, the operation is applied to *each* table
in the set and the result is a new :class:`TableSet` instance made up of
entirely new :class:`.Table` instances.
:class:`TableSet` instances can also contain other TableSets. This means you
can chain calls to :class:`.Table.aggregate` and :class:`TableSet.aggregate`
and end up with data aggregated across multiple dimensions.
"""

from collections import Mapping
Expand Down Expand Up @@ -49,7 +53,7 @@ def __call__(self, *args, **kwargs):
for key, value in self.tableset._tables.items():
groups[key] = getattr(value, self.method_name)(*args, **kwargs)

return TableSet(groups)
return TableSet(groups, key_name=self.tableset._key_name)

class TableSet(Mapping):
"""
Expand All @@ -68,10 +72,11 @@ class TableSet(Mapping):
def __init__(self, group, key_name='group'):
self._key_name = key_name

self._sample_table = group.values()[0]
self._sample_table = self

while isinstance(self._sample_table, TableSet):
self._sample_table = self._sample_table.values()[0]
# Note: list call is a workaround for Python 3 "ValuesView"
self._sample_table = list(self._sample_table.values())[0]

self._column_types = self._sample_table.get_column_types()
self._column_names = self._sample_table.get_column_names()
Expand All @@ -96,7 +101,7 @@ def __init__(self, group, key_name='group'):
self.distinct = TableMethodProxy(self, 'distinct')
self.inner_join = TableMethodProxy(self, 'inner_join')
self.left_outer_join = TableMethodProxy(self, 'left_outer_join')
# self.group_by = TableMethodProxy(self, 'group_by')
self.group_by = TableMethodProxy(self, 'group_by')
self.compute = TableMethodProxy(self, 'compute')
self.percent_change = TableMethodProxy(self, 'percent_change')
self.rank = TableMethodProxy(self, 'rank')
Expand Down Expand Up @@ -181,15 +186,56 @@ def get_column_names(self):
"""
return self._column_names

def _aggregate(self, aggregations=[]):
"""
Recursive aggregation allowing for TableSet's to be nested inside
one another.
See :meth:`TableSet.aggregate` for the user-facing API.
"""
output = []

# Process nested TableSet's
if isinstance(list(self._tables.values())[0], TableSet):
for key, tableset in self._tables.items():
column_names, column_types, nested_output = tableset._aggregate(aggregations)

for row in nested_output:
row.insert(0, key)

output.append(row)

column_names.insert(0, self._key_name)
column_types.insert(0, TextType())
# Regular Tables
else:
column_names = [self._key_name]
column_types = [TextType()]

for column_name, aggregation, new_column_name in aggregations:
c = self._sample_table.columns[column_name]

column_names.append(new_column_name)
column_types.append(aggregation.get_aggregate_column_type(c))

for name, table in self._tables.items():
new_row = [name]

for column_name, aggregation, new_column_name in aggregations:
c = table.columns[column_name]

new_row.append(c.aggregate(aggregation))

output.append(new_row)

return column_names, column_types, output

def aggregate(self, aggregations=[]):
"""
Aggregate data from the tables in this set by performing some
set of column operations on the groups and coalescing the results into
a new :class:`.Table`.
:class:`group` and :class:`count` columns will always be included as at
the beginning of the output table, before the aggregated columns.
:code:`aggregations` must be a list of tuples, where each has three
parts: a :code:`column_name`, a :class:`.Aggregation` instance and a
:code:`new_column_name`.
Expand All @@ -198,25 +244,6 @@ def aggregate(self, aggregations=[]):
:code:`(column_name, aggregation, new_column_name)`.
:returns: A new :class:`.Table`.
"""
output = []

column_types = [TextType(), NumberType()]
column_names = [self._key_name]

for column_name, aggregation, new_column_name in aggregations:
c = self._sample_table.columns[column_name]

column_types.append(aggregation.get_aggregate_column_type(c))
column_names.append(new_column_name)

for name, table in self._tables.items():
new_row = [name]

for column_name, aggregation, new_column_name in aggregations:
c = table.columns[column_name]

new_row.append(c.aggregate(aggregation))

output.append(tuple(new_row))
column_names, column_types, output = self._aggregate(aggregations)

return self._sample_table._fork(output, zip(column_names, column_types))
64 changes: 62 additions & 2 deletions docs/tutorial.rst
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,7 @@ This is a much more complicated question that's going to pull together a lot of
state_totals = with_years_in_prison.group_by('state')
medians = state_totals.aggregate([
('years_in_prison', agate.Length(), 'count'),
('years_in_prison', agate.Median(), 'median_years_in_prison')
])
Expand All @@ -375,7 +376,7 @@ This is a much more complicated question that's going to pull together a lot of
::

|--------+-------+-------------------------|
| group | count | median_years_in_prison |
| state | count | median_years_in_prison |
|--------+-------+-------------------------|
| DC | 15 | 27 |
| NE | 9 | 20 |
Expand All @@ -389,9 +390,68 @@ DC? Nebraska? What accounts for these states having the longest times in prison

As with :meth:`.Table.aggregate` and :meth:`.Table.compute`, the :meth:`.TableSet.aggregate` method takes a list of aggregations to perform. You can aggregate as many columns as you like in a single step and they will all appear in the output table.

Multi-dimensional aggregations
==============================

Before we wrap up, let's try one more thing. I've already shown you that you can use :class:`.TableSet` to group instances of :class:`.Table`. However, you can also use a :class:`.TableSet` to group other instances of :class:`.TableSet`. To put that another way, instances of :class:`.TableSet` can be *nested*.

The key to nesting data in this way is to use :meth:`.TableSet.group_by`. Just as we used :meth:`.Table.group_by` to split data up into a group of tables, you can use :meth:`.TableSet.group_by` to further subdivide that data. Effectively this means you can create multi-dimensional groupings. Let's look at a concrete example.

Question: **Is there a collective relationship between race, age and time spent in prison prior to exoneration?**

I'm not going to explain every stage of this analysis as most of it uses features you've seen before. The key part to look for is the two separate calls to ``group_by``:

.. code-block:: python
# Filters rows without age data
only_with_age = data['with_years_in_prison'].where(
lambda r: r['age'] is not None
)
# Group by race
race_groups = only_with_age.group_by('race')
# Sub-group by age cohorts (20s, 30s, etc.)
race_and_age_groups = race_groups.group_by(
lambda r: '%i0s' % (r['age'] // 10),
key_name='age_group'
)
# Aggregate medians for each group
medians = race_and_age_groups.aggregate([
('years_in_prison', agate.Length(), 'count'),
('years_in_prison', agate.Median(), 'median_years_in_prison')
])
# Sort the results
sorted_groups = medians.order_by('median_years_in_prison', reverse=True)
# Print out the results
print(sorted_groups.format(max_rows=10))
::

|------------------+-----------+-------+-------------------------|
| race | age_group | count | median_years_in_prison |
|------------------+-----------+-------+-------------------------|
| Native American | 20s | 2 | 21.5 |
| | 20s | 1 | 19 |
| Native American | 10s | 2 | 15 |
| Native American | 30s | 2 | 14.5 |
| Black | 10s | 188 | 14 |
| Black | 20s | 358 | 13 |
| Asian | 20s | 4 | 12 |
| Black | 30s | 156 | 10 |
| Caucasian | 10s | 76 | 8 |
| Caucasian | 20s | 255 | 8 |
| ... | ... | ... | ... |
|------------------+-----------+-------+-------------------------|

Well, what are you waiting for? Get to reporting!

Where to go next
================

This tutorial only scratches the surface of agate's features. For many more ideas on how to apply agate, check out the :doc:`cookbook`, which includes dozens of examples showing how to substitute agate for common operations used in Excel, SQL, R and more.
This tutorial only scratches the surface of agate's features. For many more ideas on how to apply agate, check out the :doc:`cookbook`, which includes dozens of examples showing how to substitute agate for common patterns used in Excel, SQL, R and more.

Also, if you're going to be doing data processing in Python you really ought to check out `proof <http://proof.readthedocs.org/en/latest/>`_, a library for building data processing pipelines that are repeatable and self-documenting. It will make your code cleaner and save you tons of time.
30 changes: 29 additions & 1 deletion exonerations.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import csv

import agate
import proof

def load_data(data):
text_type = agate.TextType()
Expand Down Expand Up @@ -74,13 +75,40 @@ def states(data):

print(sorted_medians.format(max_rows=5))

def race_and_age(data):
# Filters rows without age data
only_with_age = data['with_years_in_prison'].where(
lambda r: r['age'] is not None
)

# Group by race
race_groups = only_with_age.group_by('race')

# Sub-group by age cohorts (20s, 30s, etc.)
race_and_age_groups = race_groups.group_by(
lambda r: '%i0s' % (r['age'] // 10),
key_name='age_group'
)

# Aggregate medians for each group
medians = race_and_age_groups.aggregate([
('years_in_prison', agate.Length(), 'count'),
('years_in_prison', agate.Median(), 'median_years_in_prison')
])

# Sort the results
sorted_groups = medians.order_by('median_years_in_prison', reverse=True)

# Print out the results
print(sorted_groups.format(max_rows=10))

analysis = agate.Analysis(load_data)
analysis = proof.Analysis(load_data)
analysis.then(confessions)
analysis.then(median_age)
analysis.then(youth)

years_analysis = analysis.then(years_in_prison)
years_analysis.then(states)
years_analysis.then(race_and_age)

analysis.run()
44 changes: 44 additions & 0 deletions tests/test_tableset.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,3 +238,47 @@ def test_aggregeate_bad_column(self):

with self.assertRaises(ColumnDoesNotExistError):
tableset.aggregate([('bad', Sum(), 'bad_sum')])

def test_nested(self):
tableset = TableSet(self.tables, key_name='test')

nested = tableset.group_by('letter')

self.assertIsInstance(nested, TableSet)
self.assertEqual(len(nested), 3)
self.assertSequenceEqual(nested._column_names, ('letter', 'number'))
self.assertSequenceEqual(nested._column_types, (self.text_type, self.number_type))

self.assertIsInstance(nested['table1'], TableSet)
self.assertEqual(len(nested['table1']), 2)
self.assertSequenceEqual(nested['table1']._column_names, ('letter', 'number'))
self.assertSequenceEqual(nested['table1']._column_types, (self.text_type, self.number_type))

self.assertIsInstance(nested['table1']['a'], Table)
self.assertEqual(len(nested['table1']['a'].columns), 2)
self.assertEqual(len(nested['table1']['a'].rows), 2)

def test_nested_aggregation(self):
tableset = TableSet(self.tables, key_name='test')

nested = tableset.group_by('letter')

results = nested.aggregate([
('letter', Length(), 'count'),
('number', Sum(), 'number_sum')
])

self.assertIsInstance(results, Table)
self.assertEqual(len(results.rows), 7)
self.assertEqual(len(results.columns), 4)
self.assertSequenceEqual(results._column_names, ('test', 'letter', 'count', 'number_sum'))

self.assertSequenceEqual(results.rows[0], ('table1', 'a', 2, 4))
self.assertSequenceEqual(results.rows[1], ('table1', 'b', 1, 2))

self.assertSequenceEqual(results.rows[2], ('table2', 'b', 1, 0))
self.assertSequenceEqual(results.rows[3], ('table2', 'a', 1, 2))
self.assertSequenceEqual(results.rows[4], ('table2', 'c', 1, 5))

self.assertSequenceEqual(results.rows[5], ('table3', 'a', 2, 3))
self.assertSequenceEqual(results.rows[6], ('table3', 'c', 1, 3))

0 comments on commit 2efc6aa

Please sign in to comment.