Skip to content

Commit

Permalink
flow.GroupBy allows a tuple argument *group_by* to combine several ha…
Browse files Browse the repository at this point in the history
…sh functions.
  • Loading branch information
ynikitenko committed Sep 22, 2023
1 parent b9fd312 commit 4cb845a
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 29 deletions.
83 changes: 57 additions & 26 deletions lena/flow/group_by.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Group data using :class:`.GroupBy` class."""
import lena.core
import lena.flow
from lena.core import LenaTypeError, LenaKeyError, LenaValueError
from lena.context import format_context
from lena.flow import get_context


class GroupBy(object):
Expand All @@ -18,43 +19,68 @@ def __init__(self, group_by):
It can be also a dot-separated formatting string.
In that case only the context part of the value is used
(see :func:`context.format_context <.format_context>`).
If *group_by* is not a callable or a string,
:exc:`.LenaTypeError` is raised.
*group_by* can be a tuple of strings or callables.
In that case the hash value will be combined from each
part of the tuple. A tuple may be used when not all parts
of context can be always rendered (that would lead to an error
or an empty string if they were combined
into one formatting string).
"""
self.groups = dict()
if callable(group_by):
# callable(value) is allowed for generality.
# I use group_by exclusively with context,
# and the only example I can imagine when it can probe value
# is histograms with same variables
# but with different ranges (one wouldn't be able
# to plot graphs with them without changing context though).
# This is a weak example, because this information
# could be added to context.
self._group_by = group_by
elif isinstance(group_by, str):
fc = lena.context.format_context(group_by)
self._group_by = lambda val: fc(lena.flow.get_context(val))
# since context is always coupled with data here
# (or if data would be solely used as a key),
# we allow a general callable.
def make_grpby(group_by):
    """Turn a single *group_by* specification into a key function.

    A callable is returned unchanged.
    A string is used as a formatting string:
    the returned function formats the context of a value with it.
    For any other type :exc:`.LenaTypeError` is raised.
    """
    if callable(group_by):
        return group_by
    if not isinstance(group_by, str):
        raise LenaTypeError(
            "group_by must be a callable or a string, "
            "{} provided".format(group_by)
        )

    # the formatter is created once and reused for every value
    formatter = format_context(group_by)

    def key_from_context(val):
        # only the context part of the value is formatted
        return formatter(get_context(val))

    return key_from_context

def make_tupbg(group_bys):
    """Combine several key functions into one that returns a tuple.

    *group_bys* is a sequence of callables, each accepting a value.
    The returned function calls every one of them on its argument
    and returns a tuple of the resulting keys.
    If a callable raises :exc:`.LenaKeyError`,
    an empty string is used for that part of the key.
    If every part is an empty string (no key could be produced),
    :exc:`.LenaValueError` is raised.
    """
    def tupgb(val):
        group = []
        for gb in group_bys:
            try:
                key = gb(val)
            except LenaKeyError:
                # this part could not be produced for val;
                # an empty string marks the missing part.
                key = ""
            group.append(key)
        # Compare with the "" placeholder explicitly:
        # a plain truthiness test would also reject
        # valid falsy keys like 0 or False.
        if all(key == "" for key in group):
            raise LenaValueError(
                "no key found for {}".format(val)
            )
        return tuple(group)
    return tupgb

if isinstance(group_by, tuple):
group_bys = [make_grpby(gb) for gb in group_by]
self._group_by = make_tupbg(group_bys)
else:
raise lena.core.LenaTypeError(
"group_by must be a callable or a string, "
"{} provided".format(group_by)
)
self._group_by = make_grpby(group_by)

# for equality testing
self._init_group_by = group_by

def update(self, val):
"""Find a group for *val* and add it there.
A group key is calculated by *group_by*.
If no such key exists, a new group is created.
If a formatting key was not found for *val*,
:exc:`~LenaValueError` is raised.
If a formatting key was not found for *val*
(or if no values for a tuple *group_by* could produce keys)
:exc:`.LenaValueError` is raised.
"""
try:
key = self._group_by(val)
except lena.core.LenaKeyError:
raise lena.core.LenaValueError(
except LenaKeyError:
raise LenaValueError(
"could not find a key for {}".format(val)
)

Expand All @@ -66,3 +92,8 @@ def update(self, val):
def clear(self):
    """Empty the groups dictionary in place."""
    groups = self.groups
    # the same dictionary object is kept, only its content is dropped
    groups.clear()

def __eq__(self, other):
    """Compare two :class:`GroupBy` by their initial *group_by*.

    Groups filled so far do not take part in the comparison.
    For *other* of a different type ``NotImplemented`` is returned.
    """
    if not isinstance(other, GroupBy):
        return NotImplemented
    return self._init_group_by == other._init_group_by

def __ne__(self, other):
    """Return the negation of :meth:`__eq__`.

    Python 2 does not derive ``!=`` from ``__eq__``,
    so it is defined explicitly (harmless in Python 3).
    """
    result = self.__eq__(other)
    if result is NotImplemented:
        return result
    return not result
20 changes: 17 additions & 3 deletions tests/flow/test_group_by.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pytest

import lena.core
from lena.core import LenaTypeError, LenaValueError
from lena.flow import GroupBy


Expand All @@ -21,7 +21,7 @@ def test_group_by():
assert len(g0.groups) == 0

## wrong initialization parameter raises
with pytest.raises(lena.core.LenaTypeError):
with pytest.raises(LenaTypeError):
GroupBy(1)

## context string works
Expand All @@ -48,7 +48,7 @@ def test_group_by():
)

# missing context raises
with pytest.raises(lena.core.LenaValueError):
with pytest.raises(LenaValueError):
GroupBy("{{non_existent}}").update(data2)

# several subcontexts work
Expand All @@ -61,3 +61,17 @@ def test_group_by():
'variable': {'name': 'mean'}})
]
}

# tuple works
g3 = GroupBy(("{{value.variable.name}}", "{{variable.name}}"))
assert g3._group_by(data2) == ('x', 'mean')
assert g3._group_by((None, {"variable": {"name": "mean"}})) == ('', 'mean')
# all empty keys raise
with pytest.raises(LenaValueError) as err:
g3._group_by(data1)
assert "no key found" in str(err.value)

# equality testing works
g4 = GroupBy(("{{value.variable.name}}", "{{variable.name}}"))
assert g4 == g3
assert g4 != g2

0 comments on commit 4cb845a

Please sign in to comment.