Skip to content

Commit

Permalink
Make TableSet.aggregate's output more intuitive. Closes #203.
Browse files Browse the repository at this point in the history
  • Loading branch information
onyxfish committed Sep 3, 2015
1 parent 602e83e commit 1119b44
Show file tree
Hide file tree
Showing 11 changed files with 127 additions and 43 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
0.7.0
-----

* TableSet.aggregate will now use key_name as the group column name. (#203)
* Added key_name argument to TableSet and Table.group_by.
* Added Length aggregation and removed count from TableSet.aggregate output. (#203)
* Fix error messages for RowDoesNotExistError and ColumnDoesNotExistError.

0.6.0
Expand Down
18 changes: 18 additions & 0 deletions agate/aggregations.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,28 @@ def run(self, column):

return all(self._test(d) for d in data)

class Length(Aggregation):
    """
    Aggregation yielding the number of values held by a column.

    This is the column-level counterpart of Python's :func:`len`.
    """
    def get_aggregate_column_type(self, column):
        """
        :returns: A new :class:`NumberType` instance describing the result.
        """
        result_type = NumberType()

        return result_type

    def run(self, column):
        """
        :returns: :class:`int`
        """
        value_count = len(column)

        return value_count

class Count(Aggregation):
"""
Count the number of times a specific value occurs in a column.
If you want to count the total number of values in a column use
:class:`Length`.
:param value: The value to be counted.
"""
def __init__(self, value):
Expand Down
13 changes: 10 additions & 3 deletions agate/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -509,7 +509,7 @@ def left_outer_join(self, left_key, table, right_key):

return self._fork(rows, zip(column_names, column_types))

def group_by(self, key):
def group_by(self, key, key_name=None):
"""
Create a new :class:`Table` for each unique value and return them as a
:class:`.TableSet`. The :code:`key` can be either a column name
Expand All @@ -521,14 +521,21 @@ def group_by(self, key):
:param key: Either the name of a column from this table
to group by, or a :class:`function` that takes a row and returns
a value to group by.
:param key_name: A name that describes the grouped properties.
Defaults to the column name that was grouped on or "group" if
grouping with a key function. See :class:`.TableSet` for more.
:returns: A :class:`.TableSet` mapping where the keys are unique
values from the :code:`key` and the values are new :class:`Table`
instances containing the grouped rows.
:raises: :exc:`.ColumnDoesNotExistError`
"""
key_is_row_function = hasattr(key, '__call__')

if not key_is_row_function:
if key_is_row_function:
key_name = key_name or 'group'
else:
key_name = key_name or key

try:
i = self._column_names.index(key)
except ValueError:
Expand All @@ -552,7 +559,7 @@ def group_by(self, key):
for group, rows in groups.items():
output[group] = self._fork(rows)

return TableSet(output)
return TableSet(output, key_name=key_name)

def compute(self, computations):
"""
Expand Down
11 changes: 8 additions & 3 deletions agate/tableset.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,13 @@ class TableSet(Mapping):
values.
:param tables: A dictionary of string keys and :class:`Table` values.
:param key_name: A name that describes the grouping properties. Used as
the column header when the groups are aggregated. Defaults to
:code:`'group'`.
"""
def __init__(self, group):
def __init__(self, group, key_name='group'):
self._key_name = key_name

self._first_table = list(group.values())[0]
self._column_types = self._first_table.get_column_types()
self._column_names = self._first_table.get_column_names()
Expand Down Expand Up @@ -192,7 +197,7 @@ def aggregate(self, aggregations=[]):
output = []

column_types = [TextType(), NumberType()]
column_names = ['group', 'count']
column_names = [self._key_name]

for column_name, aggregation, new_column_name in aggregations:
c = self._first_table.columns[column_name]
Expand All @@ -201,7 +206,7 @@ def aggregate(self, aggregations=[]):
column_names.append(new_column_name)

for name, table in self._tables.items():
new_row = [name, len(table.rows)]
new_row = [name]

for column_name, aggregation, new_column_name in aggregations:
c = table.columns[column_name]
Expand Down
11 changes: 6 additions & 5 deletions docs/cookbook/excel.rst
Original file line number Diff line number Diff line change
Expand Up @@ -99,10 +99,11 @@ You can emulate most of the functionality of Excel's pivot tables using the :met
.. code-block:: python
professions = data.group_by('profession')
summary = professions.aggregate([
('salary', 'mean'),
('salary', 'median')
jobs = employees.group_by('job_title')
summary = jobs.aggregate([
('salary', agate.Length(), 'employee_count'),
('salary', agate.Mean(), 'salary_mean'),
('salary', agate.Median(), 'salary_median')
])
The resulting ``summary`` table will have four columns: ``group`` (the profession), ``count`` (the number of grouped rows), ``salary_mean`` and ``salary_median`` (the aggregates).
The resulting ``summary`` table will have four columns: ``job_title``, ``employee_count``, ``salary_mean`` and ``salary_median``.
11 changes: 6 additions & 5 deletions docs/cookbook/sql.rst
Original file line number Diff line number Diff line change
Expand Up @@ -142,16 +142,17 @@ SQL:

.. code-block:: postgres
SELECT mean(age) FROM patient GROUP BY doctor;
SELECT mean(age), median(age) FROM patients GROUP BY doctor;
agate:

.. code-block:: python
doctors = patients.group_by('doctor')
patient_ages = patient.aggregate([
('age', 'mean'),
('age', 'median')
patient_ages = doctors.aggregate([
('age', agate.Length(), 'patient_count'),
('age', agate.Mean(), 'age_mean'),
('age', agate.Median(), 'age_median')
])
The resulting table will have four columns: ``group`` (the doctor), ``count`` (the number of patients), ``age_mean`` and ``age_median`` (the aggregates).
The resulting table will have four columns: ``doctor``, ``patient_count``, ``age_mean`` and ``age_median``.
11 changes: 6 additions & 5 deletions docs/cookbook/statistics.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,14 @@ You can also generate aggregate statistics for subsets of data (sometimes colloq

.. code-block:: python
professions = data.group_by('profession')
summary = professions.aggregate([
('salary', 'mean'),
('salary', 'median')
doctors = patients.group_by('doctor')
patient_ages = doctors.aggregate([
('age', agate.Length(), 'patient_count'),
('age', agate.Mean(), 'age_mean'),
('age', agate.Median(), 'age_median')
])
The ``summary`` table will have four columns: ``group`` (the profession), ``count`` (the number of grouped rows), ``salary_mean`` and ``salary_median`` (the aggregates).
The resulting table will have four columns: ``doctor``, ``patient_count``, ``age_mean`` and ``age_median``.

Identifying outliers
====================
Expand Down
12 changes: 6 additions & 6 deletions docs/tutorial.rst
Original file line number Diff line number Diff line change
Expand Up @@ -325,11 +325,13 @@ First, we use :meth:`.Table.group_by` to group the data by state.
by_state = exonerations.group_by('state')
This takes our original :class:`.Table` and groups it into a :class:`.TableSet`, which contains one table per county. Now we need to aggregate the total for each state. This works in a very similar way to how it did when we were aggregating columns of a single table.
This takes our original :class:`.Table` and groups it into a :class:`.TableSet`, which contains one table per state. Now we need to aggregate the total for each state. This works in a very similar way to how it did when we were aggregating columns of a single table, except that we'll use the :class:`.Length` aggregation to count the total number of values in the column.

.. code-block:: python
state_totals = by_state.aggregate()
state_totals = by_state.aggregate([
('state', agate.Length(), 'count')
])
sorted_totals = state_totals.order_by('count', reverse=True)
Expand All @@ -338,7 +340,7 @@ This takes our original :class:`.Table` and groups it into a :class:`.TableSet`,
::

|--------+--------|
| group | count |
| state | count |
|--------+--------|
| TX | 212 |
| NY | 202 |
Expand All @@ -348,9 +350,7 @@ This takes our original :class:`.Table` and groups it into a :class:`.TableSet`,
| ... | ... |
|--------+--------|

Unsurprisingly, the results appear roughly proportional to population.

Because we passed no arguments, :meth:`.TableSet.aggregate` did nothing except group the data and count the elements in each group, but the possibilities are much bigger.
You'll notice we pass a list of tuples to :meth:`.TableSet.aggregate`. Each one includes three elements. The first is the column name to aggregate. The second is an instance of some :class:`.Aggregation`. The third is the new column name. Unsurprisingly, in this case the results appear roughly proportional to population.

Question: **What state has the longest median time in prison prior to exoneration?**

Expand Down
14 changes: 14 additions & 0 deletions tests/test_aggregations.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,20 @@ def test_all(self):
self.assertEqual(self.table.columns['one'].aggregate(All(lambda d: d != 5)), True)
self.assertEqual(self.table.columns['one'].aggregate(All(lambda d: d == 2)), False)

def test_length(self):
    """Length reports the full number of values in a column, nulls included."""
    data = (
        (1, 2, 'a'),
        (2, 3, 'b'),
        (None, 4, 'c'),
        (1, 2, 'a'),
        (1, 2, 'a')
    )

    fixture = Table(data, self.columns)

    # Both columns hold five values; the null in 'one' still counts.
    for column_name in ('one', 'two'):
        self.assertEqual(fixture.columns[column_name].aggregate(Length()), 5)

def test_count(self):
rows = (
(1, 2, 'a'),
Expand Down
50 changes: 34 additions & 16 deletions tests/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -434,32 +434,50 @@ def setUp(self):
def test_group_by(self):
table = Table(self.rows, self.columns)

new_tables = table.group_by('one')
tableset = table.group_by('one')

self.assertIsInstance(new_tables, TableSet)
self.assertEqual(len(new_tables), 3)
self.assertIsInstance(tableset, TableSet)
self.assertEqual(len(tableset), 3)
self.assertEqual(tableset._key_name, 'one')

self.assertIn('a', new_tables.keys())
self.assertIn('b', new_tables.keys())
self.assertIn('None', new_tables.keys())
self.assertIn('a', tableset.keys())
self.assertIn('b', tableset.keys())
self.assertIn('None', tableset.keys())

self.assertSequenceEqual(new_tables['a'].columns['one'], ('a', 'a'))
self.assertSequenceEqual(new_tables['b'].columns['one'], ('b',))
self.assertSequenceEqual(new_tables['None'].columns['one'], (None,))
self.assertSequenceEqual(tableset['a'].columns['one'], ('a', 'a'))
self.assertSequenceEqual(tableset['b'].columns['one'], ('b',))
self.assertSequenceEqual(tableset['None'].columns['one'], (None,))

def test_group_by_group_name(self):
    """An explicit key_name is stored on the TableSet when grouping by column."""
    source = Table(self.rows, self.columns)
    grouped = source.group_by('one', key_name='test')

    self.assertIsInstance(grouped, TableSet)
    self.assertEqual(grouped._key_name, 'test')

def test_group_by_function(self):
table = Table(self.rows, self.columns)

new_tables = table.group_by(lambda r: r['three'] < 5)
tableset = table.group_by(lambda r: r['three'] < 5)

self.assertIsInstance(new_tables, TableSet)
self.assertEqual(len(new_tables), 2)
self.assertIsInstance(tableset, TableSet)
self.assertEqual(len(tableset), 2)
self.assertEqual(tableset._key_name, 'group')

self.assertIn('True', tableset.keys())
self.assertIn('False', tableset.keys())

self.assertSequenceEqual(tableset['True'].columns['one'], ('a', 'a', 'b'))
self.assertSequenceEqual(tableset['False'].columns['one'], (None,))

def test_group_by_function_group_name(self):
table = Table(self.rows, self.columns)

self.assertIn('True', new_tables.keys())
self.assertIn('False', new_tables.keys())
tableset = table.group_by(lambda r: r['three'] < 5, key_name='test')

self.assertSequenceEqual(new_tables['True'].columns['one'], ('a', 'a', 'b'))
self.assertSequenceEqual(new_tables['False'].columns['one'], (None,))
self.assertIsInstance(tableset, TableSet)
self.assertEqual(tableset._key_name, 'test')

def test_group_by_bad_column(self):
table = Table(self.rows, self.columns)
Expand Down
16 changes: 16 additions & 0 deletions tests/test_tableset.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,10 +146,23 @@ def test_compute(self):
self.assertSequenceEqual(new_table._column_types, (self.number_type,))
self.assertSequenceEqual(new_table._column_names, ('number',))

def test_aggregate_grouper_name(self):
    """The TableSet's key_name is used as the first column of the aggregate."""
    grouped = TableSet(self.tables, key_name='test')
    operations = [('number', Length(), 'count')]

    summary = grouped.aggregate(operations)

    self.assertIsInstance(summary, Table)
    self.assertEqual(len(summary.rows), 3)
    self.assertEqual(len(summary.columns), 2)
    self.assertSequenceEqual(summary._column_names, ('test', 'count'))

def test_aggregate_sum(self):
tableset = TableSet(self.tables)

new_table = tableset.aggregate([
('number', Length(), 'count'),
('number', Sum(), 'number_sum')
])

Expand All @@ -165,6 +178,7 @@ def test_aggregate_min(self):
tableset = TableSet(self.tables)

new_table = tableset.aggregate([
('number', Length(), 'count'),
('number', Min(), 'number_min')
])

Expand All @@ -181,6 +195,7 @@ def test_aggregate_two_ops(self):
tableset = TableSet(self.tables)

new_table = tableset.aggregate([
('number', Length(), 'count'),
('number', Sum(), 'number_sum'),
('number', Mean(), 'number_mean')
])
Expand All @@ -197,6 +212,7 @@ def test_aggregate_max_length(self):
tableset = TableSet(self.tables)

new_table = tableset.aggregate([
('letter', Length(), 'count'),
('letter', MaxLength(), 'letter_max_length')
])

Expand Down

0 comments on commit 1119b44

Please sign in to comment.