In [1]:
import numpy as np
import pandas as pd


#### Tablewise function application
- DataFrames and Series can be passed into functions. However, if the function needs to be called in a chain, consider using the pipe() method.

In [7]:
# First some setup:
def extract_city_name(df):
    """
    Add a ``city_name`` column holding the city part of ``city_and_code``.

    Example: "Chicago, IL" -> "Chicago".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column ``city_and_code``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra ``city_name`` column. ``df`` itself is
        left untouched, so re-running a cell or a ``pipe`` chain never
        changes the caller's data.
    """
    # Everything before the first comma is taken as the city name.
    city_name = df["city_and_code"].str.split(",").str.get(0)
    return df.assign(city_name=city_name)


def add_country_name(df, country_name=None):
    """
    Add a ``city_and_country`` column: ``city_name`` followed by ``country_name``.

    The two strings are concatenated without a separator, e.g. "Chicago"
    with ``country_name="US"`` -> "ChicagoUS".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column ``city_name``.
    country_name : str
        Suffix appended to every city name. The ``None`` default is only a
        placeholder; a real string must be supplied.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra ``city_and_country`` column. ``df``
        itself is not modified.

    Raises
    ------
    TypeError
        If ``country_name`` is left as ``None``.
    """
    if country_name is None:
        # Fail early with a clear message instead of an obscure error from
        # the string concatenation below.
        raise TypeError("country_name must be a string, not None")
    col = "city_name"
    return df.assign(city_and_country=df[col] + country_name)


# Single-row example frame fed to the nested-call and pipe() demonstrations.
df_p = pd.DataFrame(["Chicago, IL"], columns=["city_and_code"])

In [10]:
# extract_city_name and add_country_name both take and return a DataFrame,
# so they can be nested. Note that nested calls run inside-out:
# extract_city_name first, then add_country_name.
add_country_name(extract_city_name(df_p), country_name="US")

Unnamed: 0,city_and_code,city_name,city_and_country
0,"Chicago, IL",Chicago,ChicagoUS


In [15]:
# Is equivalent to the nested call, but pipe() lets the steps read
# left-to-right in the order they run. Extra arguments given to pipe()
# (here country_name) are passed on to the function:
df_p.pipe(extract_city_name).pipe(add_country_name, country_name="US")

Unnamed: 0,city_and_code,city_name,city_and_country
0,"Chicago, IL",Chicago,ChicagoUS


#### Aggregation API
- The aggregation API allows one to express possibly multiple aggregation operations in a single concise way. This API is similar across pandas objects; see the groupby API, the window API, and the resample API. The entry point for aggregation is DataFrame.aggregate(), or the alias DataFrame.agg().

In [17]:
# Starting frame for the aggregation examples: 10 daily rows of
# standard-normal draws in columns A, B and C. The generator is seeded so
# that Restart Kernel -> Run All reproduces the same numbers every time.
tsdf = pd.DataFrame(
    np.random.default_rng(seed=0).standard_normal((10, 3)),
    columns=["A", "B", "C"],
    index=pd.date_range("1/1/2000", periods=10),
)
tsdf

Unnamed: 0,A,B,C
2000-01-01,-0.095572,-0.588018,-0.877353
2000-01-02,-0.487322,0.582247,-0.496319
2000-01-03,-1.400778,-1.636948,0.520437
2000-01-04,-0.580063,1.202187,0.605456
2000-01-05,0.958874,0.28442,0.714112
2000-01-06,-0.57054,0.905098,-0.015607
2000-01-07,-0.429969,-1.486204,0.229031
2000-01-08,-0.86379,0.156388,0.680966
2000-01-09,0.013736,0.192393,-0.88724
2000-01-10,-0.300766,0.985832,-0.792547


In [18]:
# Using a single function is equivalent to apply(). You can also pass the
# name of a method as a string. Either way the result is a Series holding
# one aggregated value per column:
tsdf.agg("sum")

A   -3.756189
B    0.597394
C   -0.319063
dtype: float64

- Aggregating with multiple functions


In [19]:
# A list of functions - even a one-element list - yields a DataFrame with
# one row per function rather than a Series:
tsdf.agg(["sum"])

Unnamed: 0,A,B,C
sum,-3.756189,0.597394,-0.319063


In [20]:
# Multiple functions yield multiple rows. The functions are given by name:
# newer pandas releases warn when NumPy callables (np.sum, np.mean, np.var)
# are passed to agg(), because those callables will be used directly in the
# future. The string "var" keeps the pandas sample variance (ddof=1).
tsdf.agg(["sum", "mean", "var"])

Unnamed: 0,A,B,C
sum,-3.756189,0.597394,-0.319063
mean,-0.375619,0.059739,-0.031906
var,0.377887,0.990686,0.453892


In [22]:
# Passing a lambda function will yield a <lambda> named row, since an
# anonymous function has no name of its own to use as the row label:
tsdf.agg(["sum", lambda x: x.mean()])

Unnamed: 0,A,B,C
sum,-3.756189,0.597394,-0.319063
<lambda>,-0.375619,0.059739,-0.031906


In [23]:
# Passing a named function will yield that name for the row:
def mymean(x):
    """Return the mean of ``x`` by calling its ``mean()`` method."""
    return x.mean()


# Aggregating a single column (a Series) with a list of functions gives a
# Series labelled by function name.
tsdf["A"].agg(["sum", mymean])

sum      -3.756189
mymean   -0.375619
Name: A, dtype: float64

#### Aggregating with a dict
- Passing a dictionary that maps column names to a single function or a list of functions to DataFrame.agg allows you to customize which functions are applied to which columns. Note that the results are not guaranteed to be in any particular order; you can use an OrderedDict instead to guarantee ordering.

In [27]:
# One aggregation per column, each given by name. The string "min" is used
# instead of np.min because newer pandas releases warn when NumPy callables
# are passed to agg().
tsdf.agg({"A": "mean", "B": "sum", "C": "min"})


A   -0.375619
B    0.597394
C   -0.887240
dtype: float64

- Passing a list-like will generate a DataFrame output. You will get a matrix-like output of all of the aggregators. The output will consist of all unique functions. Those that are not noted for a particular column will be NaN:

In [28]:
# Column "A" gets two aggregations and column "B" one; every
# function/column combination that was not requested is filled with NaN.
tsdf.agg({"A": ["mean", "min"], "B": "sum"})

Unnamed: 0,A,B
mean,-0.375619,
min,-1.400778,
sum,,0.597394


#### Mixed dtypes

In [29]:
# Three-row frame with one column per kind of data: integers (A),
# floats (B), strings (C) and timestamps (D).
mdf = pd.DataFrame(
    dict(
        A=[1, 2, 3],
        B=[1.0, 2.0, 3.0],
        C=["foo", "bar", "baz"],
        D=pd.date_range("20130101", periods=3),
    )
)


In [32]:
# When presented with mixed dtypes that cannot aggregate, .agg will only take the valid aggregations. This is similar to how .groupby.agg works.
# NOTE(review): the saved output shows NaT for the "sum" of the datetime
# column D, preceded by an echoed source line that looks like the tail of a
# warning. Newer pandas releases may raise here instead - confirm against
# the installed version.
mdf.agg(["min", "sum"])


  mdf.agg(["min", "sum"])


Unnamed: 0,A,B,C,D
min,1,1.0,bar,2013-01-01
sum,6,6.0,foobarbaz,NaT


#### Custom describe
- With .agg() it is possible to easily create a custom describe function, similar to the built-in describe function.

In [33]:
from functools import partial

# Quantile aggregators with q fixed at the lower and upper quartile.
q_25 = partial(pd.Series.quantile, q=0.25)
q_75 = partial(pd.Series.quantile, q=0.75)

# A partial object has no __name__ of its own; the one assigned here is
# what shows up as the row label in the aggregated result.
q_25.__name__ = "25%"
q_75.__name__ = "75%"

tsdf.agg(
    [
        "count",
        "mean",
        "std",
        "min",
        q_25,
        "median",
        q_75,
        "max",
    ]
)

Unnamed: 0,A,B,C
count,10.0,10.0,10.0
mean,-0.375619,0.059739,-0.031906
std,0.614725,0.995332,0.673715
min,-1.400778,-1.636948,-0.88724
25%,-0.577682,-0.401917,-0.71849
median,-0.458645,0.238407,0.106712
75%,-0.146871,0.824385,0.584202
max,0.958874,1.202187,0.714112
