Skip to content

Commit

Permalink
Nested TableSet's and multi-dimensional aggregates. Closes #204.
Browse files Browse the repository at this point in the history
  • Loading branch information
onyxfish committed Sep 3, 2015
1 parent c8bb56b commit 2efc6aa
Show file tree
Hide file tree
Showing 6 changed files with 194 additions and 32 deletions.
1 change: 1 addition & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
0.7.0
-----

* Nested TableSet's and multi-dimensional aggregates. (#204)
* TableSet.aggregate will now use key_name as the group column name. (#203)
* Added key_name argument to TableSet and Table.group_by.
* Added Length aggregation and removed count from TableSet.aggregate output. (#203)
Expand Down
6 changes: 4 additions & 2 deletions agate/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def __init__(self, rows, column_info):

for i, row in enumerate(rows):
if len(row) != len_column_names:
raise ValueError('Row %i has length %i, but Table only has %i columns.' % (i, len(row), len_column_types))
raise ValueError('Row %i has length %i, but Table only has %i columns.' % (i, len(row), len_column_names))

# Forked tables can share data (because they are immutable)
# but original data should be buffered so it can't be changed
Expand Down Expand Up @@ -549,12 +549,14 @@ def group_by(self, key, key_name=None):
else:
group_name = six.text_type(row[i])

# print group_name

if group_name not in groups:
groups[group_name] = []

groups[group_name].append(row)

output = {}
output = OrderedDict()

for group, rows in groups.items():
output[group] = self._fork(rows)
Expand Down
81 changes: 54 additions & 27 deletions agate/tableset.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@
:meth:`TableSet.order_by` are used, the operation is applied to *each* table
in the set and the result is a new :class:`TableSet` instance made up of
entirely new :class:`.Table` instances.
:class:`TableSet` instances can also contain other TableSets. This means you
can chain calls to :class:`.Table.aggregate` and :class:`TableSet.aggregate`
and end up with data aggregated across multiple dimensions.
"""

from collections import Mapping
Expand Down Expand Up @@ -49,7 +53,7 @@ def __call__(self, *args, **kwargs):
for key, value in self.tableset._tables.items():
groups[key] = getattr(value, self.method_name)(*args, **kwargs)

return TableSet(groups)
return TableSet(groups, key_name=self.tableset._key_name)

class TableSet(Mapping):
"""
Expand All @@ -68,10 +72,11 @@ class TableSet(Mapping):
def __init__(self, group, key_name='group'):
self._key_name = key_name

self._sample_table = group.values()[0]
self._sample_table = self

while isinstance(self._sample_table, TableSet):
self._sample_table = self._sample_table.values()[0]
# Note: list call is a workaround for Python 3 "ValuesView"
self._sample_table = list(self._sample_table.values())[0]

self._column_types = self._sample_table.get_column_types()
self._column_names = self._sample_table.get_column_names()
Expand All @@ -96,7 +101,7 @@ def __init__(self, group, key_name='group'):
self.distinct = TableMethodProxy(self, 'distinct')
self.inner_join = TableMethodProxy(self, 'inner_join')
self.left_outer_join = TableMethodProxy(self, 'left_outer_join')
# self.group_by = TableMethodProxy(self, 'group_by')
self.group_by = TableMethodProxy(self, 'group_by')
self.compute = TableMethodProxy(self, 'compute')
self.percent_change = TableMethodProxy(self, 'percent_change')
self.rank = TableMethodProxy(self, 'rank')
Expand Down Expand Up @@ -181,15 +186,56 @@ def get_column_names(self):
"""
return self._column_names

def _aggregate(self, aggregations=[]):
"""
Recursive aggregation allowing for TableSet's to be nested inside
one another.
See :meth:`TableSet.aggregate` for the user-facing API.
"""
output = []

# Process nested TableSet's
if isinstance(list(self._tables.values())[0], TableSet):
for key, tableset in self._tables.items():
column_names, column_types, nested_output = tableset._aggregate(aggregations)

for row in nested_output:
row.insert(0, key)

output.append(row)

column_names.insert(0, self._key_name)
column_types.insert(0, TextType())
# Regular Tables
else:
column_names = [self._key_name]
column_types = [TextType()]

for column_name, aggregation, new_column_name in aggregations:
c = self._sample_table.columns[column_name]

column_names.append(new_column_name)
column_types.append(aggregation.get_aggregate_column_type(c))

for name, table in self._tables.items():
new_row = [name]

for column_name, aggregation, new_column_name in aggregations:
c = table.columns[column_name]

new_row.append(c.aggregate(aggregation))

output.append(new_row)

return column_names, column_types, output

def aggregate(self, aggregations=[]):
"""
Aggregate data from the tables in this set by performing some
set of column operations on the groups and coalescing the results into
a new :class:`.Table`.
:class:`group` and :class:`count` columns will always be included as at
the beginning of the output table, before the aggregated columns.
:code:`aggregations` must be a list of tuples, where each has three
parts: a :code:`column_name`, a :class:`.Aggregation` instance and a
:code:`new_column_name`.
Expand All @@ -198,25 +244,6 @@ def aggregate(self, aggregations=[]):
:code:`(column_name, aggregation, new_column_name)`.
:returns: A new :class:`.Table`.
"""
output = []

column_types = [TextType(), NumberType()]
column_names = [self._key_name]

for column_name, aggregation, new_column_name in aggregations:
c = self._sample_table.columns[column_name]

column_types.append(aggregation.get_aggregate_column_type(c))
column_names.append(new_column_name)

for name, table in self._tables.items():
new_row = [name]

for column_name, aggregation, new_column_name in aggregations:
c = table.columns[column_name]

new_row.append(c.aggregate(aggregation))

output.append(tuple(new_row))
column_names, column_types, output = self._aggregate(aggregations)

return self._sample_table._fork(output, zip(column_names, column_types))
64 changes: 62 additions & 2 deletions docs/tutorial.rst
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,7 @@ This is a much more complicated question that's going to pull together a lot of
state_totals = with_years_in_prison.group_by('state')
medians = state_totals.aggregate([
('years_in_prison', agate.Length(), 'count'),
('years_in_prison', agate.Median(), 'median_years_in_prison')
])
Expand All @@ -375,7 +376,7 @@ This is a much more complicated question that's going to pull together a lot of
::

|--------+-------+-------------------------|
| group | count | median_years_in_prison |
| state | count | median_years_in_prison |
|--------+-------+-------------------------|
| DC | 15 | 27 |
| NE | 9 | 20 |
Expand All @@ -389,9 +390,68 @@ DC? Nebraska? What accounts for these states having the longest times in prison

As with :meth:`.Table.aggregate` and :meth:`.Table.compute`, the :meth:`.TableSet.aggregate` method takes a list of aggregations to perform. You can aggregate as many columns as you like in a single step and they will all appear in the output table.

Multi-dimensional aggregations
==============================

Before we wrap up, let's try one more thing. I've already shown you that you can use :class:`.TableSet` to group instances of :class:`.Table`. However, you can also use a :class:`.TableSet` to group other instances of :class:`.TableSet`. To put that another way, instances of :class:`.TableSet` can be *nested*.

The key to nesting data in this way is to use :meth:`.TableSet.group_by`. Just as we used :meth:`.Table.group_by` to split data up into a group of tables, you can use :meth:`.TableSet.group_by` to further subdivide that data. Effectively this means you can create multi-dimensional groupings. Let's look at a concrete example.

Question: **Is there a collective relationship between race, age and time spent in prison prior to exoneration?**

I'm not going to explain every stage of this analysis as most of it uses features you've seen before. The key part to look for is the two separate calls to ``group_by``:

.. code-block:: python
# Filters rows without age data
only_with_age = data['with_years_in_prison'].where(
lambda r: r['age'] is not None
)
# Group by race
race_groups = only_with_age.group_by('race')
# Sub-group by age cohorts (20s, 30s, etc.)
race_and_age_groups = race_groups.group_by(
lambda r: '%i0s' % (r['age'] // 10),
key_name='age_group'
)
# Aggregate medians for each group
medians = race_and_age_groups.aggregate([
('years_in_prison', agate.Length(), 'count'),
('years_in_prison', agate.Median(), 'median_years_in_prison')
])
# Sort the results
sorted_groups = medians.order_by('median_years_in_prison', reverse=True)
# Print out the results
print(sorted_groups.format(max_rows=10))
::

|------------------+-----------+-------+-------------------------|
| race | age_group | count | median_years_in_prison |
|------------------+-----------+-------+-------------------------|
| Native American | 20s | 2 | 21.5 |
| | 20s | 1 | 19 |
| Native American | 10s | 2 | 15 |
| Native American | 30s | 2 | 14.5 |
| Black | 10s | 188 | 14 |
| Black | 20s | 358 | 13 |
| Asian | 20s | 4 | 12 |
| Black | 30s | 156 | 10 |
| Caucasian | 10s | 76 | 8 |
| Caucasian | 20s | 255 | 8 |
| ... | ... | ... | ... |
|------------------+-----------+-------+-------------------------|

Well, what are you waiting for? Get to reporting!

Where to go next
================

This tutorial only scratches the surface of agate's features. For many more ideas on how to apply agate, check out the :doc:`cookbook`, which includes dozens of examples showing how to substitute agate for common operations used in Excel, SQL, R and more.
This tutorial only scratches the surface of agate's features. For many more ideas on how to apply agate, check out the :doc:`cookbook`, which includes dozens of examples showing how to substitute agate for common patterns used in Excel, SQL, R and more.

Also, if you're going to be doing data processing in Python you really ought to check out `proof <http://proof.readthedocs.org/en/latest/>`_, a library for building data processing pipelines that are repeatable and self-documenting. It will make your code cleaner and save you tons of time.
30 changes: 29 additions & 1 deletion exonerations.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import csv

import agate
import proof

def load_data(data):
text_type = agate.TextType()
Expand Down Expand Up @@ -74,13 +75,40 @@ def states(data):

print(sorted_medians.format(max_rows=5))

def race_and_age(data):
# Filters rows without age data
only_with_age = data['with_years_in_prison'].where(
lambda r: r['age'] is not None
)

# Group by race
race_groups = only_with_age.group_by('race')

# Sub-group by age cohorts (20s, 30s, etc.)
race_and_age_groups = race_groups.group_by(
lambda r: '%i0s' % (r['age'] // 10),
key_name='age_group'
)

# Aggregate medians for each group
medians = race_and_age_groups.aggregate([
('years_in_prison', agate.Length(), 'count'),
('years_in_prison', agate.Median(), 'median_years_in_prison')
])

# Sort the results
sorted_groups = medians.order_by('median_years_in_prison', reverse=True)

# Print out the results
print(sorted_groups.format(max_rows=10))

analysis = agate.Analysis(load_data)
analysis = proof.Analysis(load_data)
analysis.then(confessions)
analysis.then(median_age)
analysis.then(youth)

years_analysis = analysis.then(years_in_prison)
years_analysis.then(states)
years_analysis.then(race_and_age)

analysis.run()
44 changes: 44 additions & 0 deletions tests/test_tableset.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,3 +238,47 @@ def test_aggregeate_bad_column(self):

with self.assertRaises(ColumnDoesNotExistError):
tableset.aggregate([('bad', Sum(), 'bad_sum')])

def test_nested(self):
tableset = TableSet(self.tables, key_name='test')

nested = tableset.group_by('letter')

self.assertIsInstance(nested, TableSet)
self.assertEqual(len(nested), 3)
self.assertSequenceEqual(nested._column_names, ('letter', 'number'))
self.assertSequenceEqual(nested._column_types, (self.text_type, self.number_type))

self.assertIsInstance(nested['table1'], TableSet)
self.assertEqual(len(nested['table1']), 2)
self.assertSequenceEqual(nested['table1']._column_names, ('letter', 'number'))
self.assertSequenceEqual(nested['table1']._column_types, (self.text_type, self.number_type))

self.assertIsInstance(nested['table1']['a'], Table)
self.assertEqual(len(nested['table1']['a'].columns), 2)
self.assertEqual(len(nested['table1']['a'].rows), 2)

def test_nested_aggregation(self):
tableset = TableSet(self.tables, key_name='test')

nested = tableset.group_by('letter')

results = nested.aggregate([
('letter', Length(), 'count'),
('number', Sum(), 'number_sum')
])

self.assertIsInstance(results, Table)
self.assertEqual(len(results.rows), 7)
self.assertEqual(len(results.columns), 4)
self.assertSequenceEqual(results._column_names, ('test', 'letter', 'count', 'number_sum'))

self.assertSequenceEqual(results.rows[0], ('table1', 'a', 2, 4))
self.assertSequenceEqual(results.rows[1], ('table1', 'b', 1, 2))

self.assertSequenceEqual(results.rows[2], ('table2', 'b', 1, 0))
self.assertSequenceEqual(results.rows[3], ('table2', 'a', 1, 2))
self.assertSequenceEqual(results.rows[4], ('table2', 'c', 1, 5))

self.assertSequenceEqual(results.rows[5], ('table3', 'a', 2, 3))
self.assertSequenceEqual(results.rows[6], ('table3', 'c', 1, 3))

0 comments on commit 2efc6aa

Please sign in to comment.