Make TableSet.aggregate's output more intuitive. Closes #203.

wireservice · Sep 3, 2015 · 1119b44 · 1119b44
1 parent 602e83e
commit 1119b44
Show file tree

Hide file tree

Showing 11 changed files with 127 additions and 43 deletions.
diff --git a/CHANGELOG b/CHANGELOG
@@ -1,6 +1,9 @@
 0.7.0
 -----
 
+* TableSet.aggregate will now use key_name as the group column name. (#203)
+* Added key_name argument to TableSet and Table.group_by.
+* Added Length aggregation and removed count from TableSet.aggregate output. (#203)
 * Fix error messages for RowDoesNotExistError and ColumnDoesNotExistError.
 
 0.6.0

diff --git a/agate/aggregations.py b/agate/aggregations.py
@@ -107,10 +107,28 @@ def run(self, column):
 
         return all(self._test(d) for d in data)
 
+class Length(Aggregation):
+    """
+    Count the total number of values in the column.
+
+    Equivalent to Python's :func:`len` function.
+    """
+    def get_aggregate_column_type(self, column):
+        return NumberType()
+
+    def run(self, column):
+        """
+        :returns: :class:`int`
+        """
+        return len(column)
+
 class Count(Aggregation):
     """
     Count the number of times a specific value occurs in a column.
 
+    If you want to count the total number of values in a column use
+    :class:`Length`.
+
     :param value: The value to be counted.
     """
     def __init__(self, value):

diff --git a/agate/table.py b/agate/table.py
@@ -509,7 +509,7 @@ def left_outer_join(self, left_key, table, right_key):
 
         return self._fork(rows, zip(column_names, column_types))
 
-    def group_by(self, key):
+    def group_by(self, key, key_name=None):
         """
         Create a new :class:`Table` for unique value and return them as a
         :class:`.TableSet`. The :code:`key` can be either a column name
@@ -521,14 +521,21 @@ def group_by(self, key):
         :param key: Either the name of a column from this table
             to group by, or a :class:`function` that takes a row and returns
             a value to group by.
+        :param key_name: A name that describes the grouped properties.
+            Defaults to the column name that was grouped on or "group" if
+            grouping with a key function. See :class:`.TableSet` for more.
         :returns: A :class:`.TableSet` mapping where the keys are unique
             values from the :code:`key` and the values are new :class:`Table`
             instances containing the grouped rows.
         :raises: :exc:`.ColumnDoesNotExistError`
         """
         key_is_row_function = hasattr(key, '__call__')
 
-        if not key_is_row_function:
+        if key_is_row_function:
+            key_name = key_name or 'group'
+        else:
+            key_name = key_name or key
+
             try:
                 i = self._column_names.index(key)
             except ValueError:
@@ -552,7 +559,7 @@ def group_by(self, key):
         for group, rows in groups.items():
             output[group] = self._fork(rows)
 
-        return TableSet(output)
+        return TableSet(output, key_name=key_name)
 
     def compute(self, computations):
         """

diff --git a/agate/tableset.py b/agate/tableset.py
@@ -61,8 +61,13 @@ class TableSet(Mapping):
     values.
 
     :param tables: A dictionary of string keys and :class:`Table` values.
+    :param key_name: A name that describes the grouping properties. Used as
+        the column header when the groups are aggregated. Defaults to
+        ``'group'``; :meth:`.Table.group_by` passes the grouped column name.
     """
-    def __init__(self, group):
+    def __init__(self, group, key_name='group'):
+        self._key_name = key_name
+
         self._first_table = list(group.values())[0]
         self._column_types = self._first_table.get_column_types()
         self._column_names = self._first_table.get_column_names()
@@ -192,7 +197,7 @@ def aggregate(self, aggregations=[]):
         output = []
 
         column_types = [TextType(), NumberType()]
-        column_names = ['group', 'count']
+        column_names = [self._key_name]
 
         for column_name, aggregation, new_column_name in aggregations:
             c = self._first_table.columns[column_name]
@@ -201,7 +206,7 @@ def aggregate(self, aggregations=[]):
             column_names.append(new_column_name)
 
         for name, table in self._tables.items():
-            new_row = [name, len(table.rows)]
+            new_row = [name]
 
             for column_name, aggregation, new_column_name in aggregations:
                 c = table.columns[column_name]

diff --git a/docs/cookbook/excel.rst b/docs/cookbook/excel.rst
@@ -99,10 +99,11 @@ You can emulate most of the functionality of Excel's pivot tables using the :met
 
 .. code-block:: python
 
-    professions = data.group_by('profession')
-    summary = professions.aggregate([
-        ('salary', 'mean'),
-        ('salary', 'median')
+    jobs = employees.group_by('job_title')
+    summary = jobs.aggregate([
+        ('salary', agate.Length(), 'employee_count'),
+        ('salary', agate.Mean(), 'salary_mean'),
+        ('salary', agate.Median(), 'salary_median')
     ])
 
-The resulting ``summary`` table will have four columns: ``group`` (the profession), ``count`` (the number of grouped rows), ``salary_mean`` and ``salary_median`` (the aggregates).
+The resulting ``summary`` table will have four columns: ``job_title``, ``employee_count``, ``salary_mean`` and ``salary_median``.
diff --git a/docs/cookbook/sql.rst b/docs/cookbook/sql.rst
@@ -142,16 +142,17 @@ SQL:
 
 .. code-block:: postgres
 
-    SELECT mean(age) FROM patient GROUP BY doctor;
+    SELECT mean(age), median(age) FROM patients GROUP BY doctor;
 
 agate:
 
 .. code-block:: python
 
     doctors = patients.group_by('doctor')
-    patient_ages = patient.aggregate([
-        ('age', 'mean'),
-        ('age', 'median')
+    patient_ages = doctors.aggregate([
+        ('age', agate.Length(), 'patient_count'),
+        ('age', agate.Mean(), 'age_mean'),
+        ('age', agate.Median(), 'age_median')
     ])
 
-The resulting table will have four columns: ``group`` (the doctor), ``count`` (the number of patients), ``age_mean`` and ``age_median`` (the aggregates).
+The resulting table will have four columns: ``doctor``, ``patient_count``, ``age_mean`` and ``age_median``.
diff --git a/docs/cookbook/statistics.rst b/docs/cookbook/statistics.rst
@@ -28,13 +28,14 @@ You can also generate aggregate statistics for subsets of data (sometimes colloq
 
 .. code-block:: python
 
-    professions = data.group_by('profession')
-    summary = professions.aggregate([
-        ('salary', 'mean'),
-        ('salary', 'median')
+    doctors = patients.group_by('doctor')
+    patient_ages = doctors.aggregate([
+        ('age', agate.Length(), 'patient_count'),
+        ('age', agate.Mean(), 'age_mean'),
+        ('age', agate.Median(), 'age_median')
     ])
 
-The ``summary`` table will have four columns: ``group`` (the profession), ``count`` (the number of grouped rows), ``salary_mean`` and ``salary_median`` (the aggregates).
+The resulting table will have four columns: ``doctor``, ``patient_count``, ``age_mean`` and ``age_median``.
 
 Identifying outliers
 ====================

diff --git a/docs/tutorial.rst b/docs/tutorial.rst
@@ -325,11 +325,13 @@ First, we use :meth:`.Table.group_by` to group the data by state.
 
     by_state = exonerations.group_by('state')
 
-This takes our original :class:`.Table` and groups it into a :class:`.TableSet`, which contains one table per county. Now we need to aggregate the total for each state. This works in a very similar way to how it did when we were aggregating columns of a single table.
+This takes our original :class:`.Table` and groups it into a :class:`.TableSet`, which contains one table per state. Now we need to aggregate the total for each state. This works in a very similar way to how it did when we were aggregating columns of a single table, except that we'll use the :class:`.Length` aggregation to count the total number of values in the column.
 
 .. code-block:: python
 
-    state_totals = by_state.aggregate()
+    state_totals = by_state.aggregate([
+        ('state', agate.Length(), 'count')
+    ])
 
     sorted_totals = state_totals.order_by('count', reverse=True)
 
@@ -338,7 +340,7 @@ This takes our original :class:`.Table` and groups it into a :class:`.TableSet`,
 ::
 
     |--------+--------|
-    |  group | count  |
+    |  state | count  |
     |--------+--------|
     |  TX    | 212    |
     |  NY    | 202    |
@@ -348,9 +350,7 @@ This takes our original :class:`.Table` and groups it into a :class:`.TableSet`,
     |  ...   | ...    |
     |--------+--------|
 
-Unsurpringly, the results appear roughly proportional to population.
-
-Because we passed no arguments, :meth:`.TableSet.aggregate` did nothing except group the data and count the elements in each group, but the possiblities are much bigger.
+You'll notice we pass a list of tuples to :meth:`.TableSet.aggregate`. Each one includes three elements. The first is the column name to aggregate. The second is an instance of some :class:`.Aggregation`. The third is the new column name. Unsurprisingly, in this case the results appear roughly proportional to population.
 
 Question: **What state has the longest median time in prison prior to exoneration?**
 

diff --git a/tests/test_aggregations.py b/tests/test_aggregations.py
@@ -42,6 +42,20 @@ def test_all(self):
         self.assertEqual(self.table.columns['one'].aggregate(All(lambda d: d != 5)), True)
         self.assertEqual(self.table.columns['one'].aggregate(All(lambda d: d == 2)), False)
 
+    def test_length(self):
+        rows = (
+            (1, 2, 'a'),
+            (2, 3, 'b'),
+            (None, 4, 'c'),
+            (1, 2, 'a'),
+            (1, 2, 'a')
+        )
+
+        table = Table(rows, self.columns)
+
+        self.assertEqual(table.columns['one'].aggregate(Length()), 5)
+        self.assertEqual(table.columns['two'].aggregate(Length()), 5)
+
     def test_count(self):
         rows = (
             (1, 2, 'a'),

diff --git a/tests/test_table.py b/tests/test_table.py
@@ -434,32 +434,50 @@ def setUp(self):
     def test_group_by(self):
         table = Table(self.rows, self.columns)
 
-        new_tables = table.group_by('one')
+        tableset = table.group_by('one')
 
-        self.assertIsInstance(new_tables, TableSet)
-        self.assertEqual(len(new_tables), 3)
+        self.assertIsInstance(tableset, TableSet)
+        self.assertEqual(len(tableset), 3)
+        self.assertEqual(tableset._key_name, 'one')
 
-        self.assertIn('a', new_tables.keys())
-        self.assertIn('b', new_tables.keys())
-        self.assertIn('None', new_tables.keys())
+        self.assertIn('a', tableset.keys())
+        self.assertIn('b', tableset.keys())
+        self.assertIn('None', tableset.keys())
 
-        self.assertSequenceEqual(new_tables['a'].columns['one'], ('a', 'a'))
-        self.assertSequenceEqual(new_tables['b'].columns['one'], ('b',))
-        self.assertSequenceEqual(new_tables['None'].columns['one'], (None,))
+        self.assertSequenceEqual(tableset['a'].columns['one'], ('a', 'a'))
+        self.assertSequenceEqual(tableset['b'].columns['one'], ('b',))
+        self.assertSequenceEqual(tableset['None'].columns['one'], (None,))
+
+    def test_group_by_group_name(self):
+        table = Table(self.rows, self.columns)
+
+        tableset = table.group_by('one', key_name='test')
+
+        self.assertIsInstance(tableset, TableSet)
+        self.assertEqual(tableset._key_name, 'test')
 
     def test_group_by_function(self):
         table = Table(self.rows, self.columns)
 
-        new_tables = table.group_by(lambda r: r['three'] < 5)
+        tableset = table.group_by(lambda r: r['three'] < 5)
 
-        self.assertIsInstance(new_tables, TableSet)
-        self.assertEqual(len(new_tables), 2)
+        self.assertIsInstance(tableset, TableSet)
+        self.assertEqual(len(tableset), 2)
+        self.assertEqual(tableset._key_name, 'group')
+
+        self.assertIn('True', tableset.keys())
+        self.assertIn('False', tableset.keys())
+
+        self.assertSequenceEqual(tableset['True'].columns['one'], ('a', 'a', 'b'))
+        self.assertSequenceEqual(tableset['False'].columns['one'], (None,))
+
+    def test_group_by_function_group_name(self):
+        table = Table(self.rows, self.columns)
 
-        self.assertIn('True', new_tables.keys())
-        self.assertIn('False', new_tables.keys())
+        tableset = table.group_by(lambda r: r['three'] < 5, key_name='test')
 
-        self.assertSequenceEqual(new_tables['True'].columns['one'], ('a', 'a', 'b'))
-        self.assertSequenceEqual(new_tables['False'].columns['one'], (None,))
+        self.assertIsInstance(tableset, TableSet)
+        self.assertEqual(tableset._key_name, 'test')
 
     def test_group_by_bad_column(self):
         table = Table(self.rows, self.columns)

diff --git a/tests/test_tableset.py b/tests/test_tableset.py
@@ -146,10 +146,23 @@ def test_compute(self):
             self.assertSequenceEqual(new_table._column_types, (self.number_type,))
             self.assertSequenceEqual(new_table._column_names, ('number',))
 
+    def test_aggregate_grouper_name(self):
+        tableset = TableSet(self.tables, key_name='test')
+
+        new_table = tableset.aggregate([
+            ('number', Length(), 'count')
+        ])
+
+        self.assertIsInstance(new_table, Table)
+        self.assertEqual(len(new_table.rows), 3)
+        self.assertEqual(len(new_table.columns), 2)
+        self.assertSequenceEqual(new_table._column_names, ('test', 'count'))
+
     def test_aggregate_sum(self):
         tableset = TableSet(self.tables)
 
         new_table = tableset.aggregate([
+            ('number', Length(), 'count'),
             ('number', Sum(), 'number_sum')
         ])
 
@@ -165,6 +178,7 @@ def test_aggregate_min(self):
         tableset = TableSet(self.tables)
 
         new_table = tableset.aggregate([
+            ('number', Length(), 'count'),
             ('number', Min(), 'number_min')
         ])
 
@@ -181,6 +195,7 @@ def test_aggregate_two_ops(self):
         tableset = TableSet(self.tables)
 
         new_table = tableset.aggregate([
+            ('number', Length(), 'count'),
             ('number', Sum(), 'number_sum'),
             ('number', Mean(), 'number_mean')
         ])
@@ -197,6 +212,7 @@ def test_aggregate_max_length(self):
         tableset = TableSet(self.tables)
 
         new_table = tableset.aggregate([
+            ('letter', Length(), 'count'),
             ('letter', MaxLength(), 'letter_max_length')
         ])