## Examples of aggregating and grouping data

In [1]:
import pandas as pd
from pathlib import Path

In [2]:
input_file = Path.cwd() / 'data' / 'raw' / 'sample_sales.xlsx'

In [3]:
df = pd.read_excel(input_file)

In [4]:
df.head()

Unnamed: 0,invoice,company,purchase_date,product,quantity,price,extended amount
0,ZN-870-29,Realcube,2019-03-05,shirt,19,17,323
1,JQ-501-63,Zooxo,2019-07-09,book,30,14,420
2,FI-165-58,Dabtype,2019-08-12,poster,7,23,161
3,XP-005-55,Skipfire,2019-11-18,pen,7,29,203
4,NB-917-18,Bluezoom,2019-04-18,poster,36,19,684


In [5]:
df['price'].agg(['mean', 'std', 'min', 'max'])

mean    22.816000
std      7.537039
min     10.000000
max     35.000000
Name: price, dtype: float64

In [6]:
# Updated from recording due to changes to Pandas API in 2.0
# See this post for details: https://levelup.gitconnected.com/welcoming-pandas-2-0-194094e4275b
# df.agg(['mean', 'max'])
df[['price', 'quantity', 'extended amount']].agg(['mean', 'max'])

Unnamed: 0,price,quantity,extended amount
mean,22.816,22.421,510.27
max,35.0,50.0,1715.0


In [7]:
agg_cols = {'quantity': 'sum',
            'price': ['mean', 'std'],
            'invoice': 'count',
            'extended amount': 'sum'}
df.agg(agg_cols).fillna(0)


Unnamed: 0,quantity,price,invoice,extended amount
sum,22421.0,0.0,0.0,510270.0
mean,0.0,22.816,0.0,0.0
std,0.0,7.537039,0.0,0.0
count,0.0,0.0,1000.0,0.0


In [8]:
df.groupby(['product']).sum()

TypeError: datetime64 type does not support sum operations

In [None]:
df.groupby(['product'])['quantity'].sum()

In [None]:
prod_cols = {'quantity': 'sum'}
df.groupby(['product']).agg(prod_cols)

In [None]:
prod_cols = {'quantity': ['sum', 'mean', 'std', 'max']}
df.groupby(['product']).agg(prod_cols)

In [None]:
df.groupby(['company', 'product']).agg(prod_cols).fillna(0)

In [None]:
df.groupby(['company', 'product']).agg(prod_cols).reset_index()

In [None]:
df.groupby(['company']).agg(invoice_total=('invoice', 'count'),
                           max_purchase=('extended amount', 'max'))

In [None]:
df.groupby(['company']).agg({'invoice': 'count',
                             'extended amount': 'max'})

## Pivot table and crosstab

In [None]:
pd.pivot_table(df, index=['company'], columns=['product'],
              values=['extended amount'],
              aggfunc='sum',
              margins=True,
              fill_value=0)

In [None]:
pd.pivot_table(df, index=['company'], columns=['product'],
              values=['extended amount'],
              aggfunc=['sum', 'mean', 'max'],
              margins=True,
              fill_value=0)

In [None]:
pd.pivot_table(df, index=['company', 'product'],
              values=['extended amount'],
              aggfunc=['sum'],
              margins=True,
              fill_value=0)

In [None]:
pd.crosstab(df['company'], df['product'])

In [None]:
pd.crosstab(df['company'], df['product'], values=df['extended amount'], aggfunc='sum', normalize='index')