## Examples of aggregating and grouping data

In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Location of the sample workbook, resolved against the current working directory.
input_file = Path.cwd().joinpath('data', 'raw', 'sample_sales.xlsx')

In [3]:
# Read the workbook at input_file into a DataFrame.
df = pd.read_excel(io=input_file)

In [4]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,invoice,company,purchase_date,product,quantity,price,extended amount
0,ZN-870-29,Realcube,2019-03-05,shirt,19,17,323
1,JQ-501-63,Zooxo,2019-07-09,book,30,14,420
2,FI-165-58,Dabtype,2019-08-12,poster,7,23,161
3,XP-005-55,Skipfire,2019-11-18,pen,7,29,203
4,NB-917-18,Bluezoom,2019-04-18,poster,36,19,684


In [5]:
# Several summary statistics for a single column, requested in one call.
df['price'].aggregate(['mean', 'std', 'min', 'max'])

mean    22.816000
std      7.537039
min     10.000000
max     35.000000
Name: price, dtype: float64

In [6]:
# This cell differs from the recording because of changes to the Pandas API in 2.0.
# Details: https://levelup.gitconnected.com/welcoming-pandas-2-0-194094e4275b
#
# Call used in the recording, kept for reference:
# df.agg(['mean', 'max'])

# Updated call: aggregate only the explicitly listed columns.
df.loc[:, ['price', 'quantity', 'extended amount']].aggregate(['mean', 'max'])

Unnamed: 0,price,quantity,extended amount
mean,22.816,22.421,510.27
max,35.0,50.0,1715.0


In [7]:
# Per-column aggregation spec: each key is a column name, each value is the
# function (or list of functions) applied to that column.
agg_cols = {
    'quantity': 'sum',
    'price': ['mean', 'std'],
    'invoice': 'count',
    'extended amount': 'sum',
}

# Function/column pairs that were not requested are shown as 0.
df.aggregate(agg_cols).fillna(value=0)



Unnamed: 0,quantity,price,invoice,extended amount
sum,22421.0,0.0,0.0,510270.0
mean,0.0,22.816,0.0,0.0
std,0.0,7.537039,0.0,0.0
count,0.0,0.0,1000.0,0.0


In [8]:
# The call below is no longer valid as of the Pandas 2.0 release.
# Details: https://levelup.gitconnected.com/welcoming-pandas-2-0-194094e4275b
#
# Pre-2.0 call, kept for reference:
# df.groupby(['product']).sum()

# Pandas 2 version: name the column to aggregate explicitly.
prod_cols = dict(quantity='sum')
df.groupby(by=['product']).aggregate(prod_cols)

Unnamed: 0_level_0,quantity
product,Unnamed: 1_level_1
book,5340
pen,5005
poster,5827
shirt,6249


In [9]:
# Total quantity per product, returned as a Series.
df.groupby(by=['product'])['quantity'].agg('sum')

product
book      5340
pen       5005
poster    5827
shirt     6249
Name: quantity, dtype: int64

In [10]:
# Dict-based spec: column name -> aggregation function.
prod_cols = dict(quantity='sum')
df.groupby(by=['product']).aggregate(prod_cols)

Unnamed: 0_level_0,quantity
product,Unnamed: 1_level_1
book,5340
pen,5005
poster,5827
shirt,6249


In [11]:
# A list of functions for one column yields one output column per function.
prod_cols = {
    'quantity': ['sum', 'mean', 'std', 'max'],
}
df.groupby(by=['product']).aggregate(prod_cols)

Unnamed: 0_level_0,quantity,quantity,quantity,quantity
Unnamed: 0_level_1,sum,mean,std,max
product,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
book,5340,22.820513,15.472315,50
pen,5005,22.146018,15.840059,50
poster,5827,21.66171,16.427386,50
shirt,6249,23.059041,17.085521,50


In [12]:
# Group on two keys, reusing prod_cols; missing statistics are shown as 0.
(
    df.groupby(by=['company', 'product'])
      .aggregate(prod_cols)
      .fillna(value=0)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,quantity,quantity,quantity,quantity
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,std,max
company,product,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Abatz,book,64,21.333333,25.501634,47
Abatz,pen,7,7.000000,0.000000,7
Abatz,poster,39,39.000000,0.000000,39
Agivu,book,11,11.000000,0.000000,11
Agivu,shirt,20,20.000000,0.000000,20
...,...,...,...,...,...
Zooxo,book,30,30.000000,0.000000,30
Zooxo,shirt,85,42.500000,2.121320,44
Zoozzy,pen,31,31.000000,0.000000,31
Zoozzy,poster,31,15.500000,21.920310,31


In [13]:
# Same two-key grouping, with the group keys moved back into regular columns.
(
    df.groupby(by=['company', 'product'])
      .aggregate(prod_cols)
      .reset_index()
)

Unnamed: 0_level_0,company,product,quantity,quantity,quantity,quantity
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sum,mean,std,max
0,Abatz,book,64,21.333333,25.501634,47
1,Abatz,pen,7,7.000000,,7
2,Abatz,poster,39,39.000000,,39
3,Agivu,book,11,11.000000,,11
4,Agivu,shirt,20,20.000000,,20
...,...,...,...,...,...,...
726,Zooxo,book,30,30.000000,,30
727,Zooxo,shirt,85,42.500000,2.121320,44
728,Zoozzy,pen,31,31.000000,,31
729,Zoozzy,poster,31,15.500000,21.920310,31


In [14]:
# Named aggregation: each keyword becomes an output column built from
# the given source column and function.
df.groupby(by=['company']).aggregate(
    invoice_total=pd.NamedAgg(column='invoice', aggfunc='count'),
    max_purchase=pd.NamedAgg(column='extended amount', aggfunc='max'),
)

Unnamed: 0_level_0,invoice_total,max_purchase
company,Unnamed: 1_level_1,Unnamed: 2_level_1
Abatz,5,1410
Agivu,2,700
Aibox,2,828
Ailane,3,400
Aimbo,3,570
...,...,...
Zoonoodle,3,644
Zooveo,4,609
Zoovu,2,165
Zooxo,3,968


In [15]:
# Dict-based spec: output columns keep the source column names.
df.groupby(by=['company']).aggregate(
    {
        'invoice': 'count',
        'extended amount': 'max',
    }
)

Unnamed: 0_level_0,invoice,extended amount
company,Unnamed: 1_level_1,Unnamed: 2_level_1
Abatz,5,1410
Agivu,2,700
Aibox,2,828
Ailane,3,400
Aimbo,3,570
...,...,...
Zoonoodle,3,644
Zooveo,4,609
Zoovu,2,165
Zooxo,3,968


## Pivot table and crosstab

In [16]:
# Companies down the rows, products across the columns, summed extended amount
# in the cells; margins adds the "All" totals and empty cells are shown as 0.
df.pivot_table(
    values=['extended amount'],
    index=['company'],
    columns=['product'],
    aggfunc='sum',
    fill_value=0,
    margins=True,
)

Unnamed: 0_level_0,extended amount,extended amount,extended amount,extended amount,extended amount
product,book,pen,poster,shirt,All
company,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Abatz,2063,140,1248,0,3451
Agivu,385,0,0,700,1085
Aibox,30,828,0,0,858
Ailane,400,-105,0,0,295
Aimbo,0,438,0,-165,273
...,...,...,...,...,...
Zooveo,0,113,609,377,1099
Zoovu,165,0,0,-56,109
Zooxo,420,0,0,1378,1798
Zoozzy,0,527,620,437,1584


In [17]:
# Same company-by-product layout, with three aggregations side by side.
pd.pivot_table(
    data=df,
    values=['extended amount'],
    index=['company'],
    columns=['product'],
    aggfunc=['sum', 'mean', 'max'],
    fill_value=0,
    margins=True,
)

Unnamed: 0_level_0,sum,sum,sum,sum,sum,mean,mean,mean,mean,mean,max,max,max,max,max
Unnamed: 0_level_1,extended amount,extended amount,extended amount,extended amount,extended amount,extended amount,extended amount,extended amount,extended amount,extended amount,extended amount,extended amount,extended amount,extended amount,extended amount
product,book,pen,poster,shirt,All,book,pen,poster,shirt,All,book,pen,poster,shirt,All
company,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3
Abatz,2063,140,1248,0,3451,687.666667,140.000000,1248.000000,0.000000,690.200000,1410,140,1248,0,1410
Agivu,385,0,0,700,1085,385.000000,0.000000,0.000000,700.000000,542.500000,385,0,0,700,700
Aibox,30,828,0,0,858,30.000000,828.000000,0.000000,0.000000,429.000000,30,828,0,0,828
Ailane,400,-105,0,0,295,400.000000,-105.000000,0.000000,0.000000,98.333333,400,-105,0,0,400
Aimbo,0,438,0,-165,273,0.000000,219.000000,0.000000,-165.000000,91.000000,0,570,0,-165,570
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zooveo,0,113,609,377,1099,0.000000,56.500000,609.000000,377.000000,274.750000,0,143,609,377,609
Zoovu,165,0,0,-56,109,165.000000,0.000000,0.000000,-56.000000,54.500000,165,0,0,-56,165
Zooxo,420,0,0,1378,1798,420.000000,0.000000,0.000000,689.000000,599.333333,420,0,0,968,968
Zoozzy,0,527,620,437,1584,0.000000,527.000000,310.000000,437.000000,396.000000,0,527,620,437,620


In [18]:
# Company and product both on the row index, so the result is long rather than wide.
df.pivot_table(
    values=['extended amount'],
    index=['company', 'product'],
    aggfunc=['sum'],
    fill_value=0,
    margins=True,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,sum
Unnamed: 0_level_1,Unnamed: 1_level_1,extended amount
company,product,Unnamed: 2_level_2
Abatz,book,2063
Abatz,pen,140
Abatz,poster,1248
Agivu,book,385
Agivu,shirt,700
...,...,...
Zooxo,shirt,1378
Zoozzy,pen,527
Zoozzy,poster,620
Zoozzy,shirt,437


In [19]:
# Count of rows for each company/product pair.
pd.crosstab(index=df['company'], columns=df['product'])

product,book,pen,poster,shirt
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Abatz,3,1,1,0
Agivu,1,0,0,1
Aibox,1,1,0,0
Ailane,1,1,1,0
Aimbo,0,2,0,1
...,...,...,...,...
Zoonoodle,1,1,0,1
Zooveo,0,2,1,1
Zoovu,1,0,0,1
Zooxo,1,0,0,2


In [20]:
# Share of each company's summed extended amount by product (each row is
# normalized by its own total). Negative amounts in the data can push
# individual shares outside the 0-1 range, as some rows of the output show.
pd.crosstab(
    index=df['company'],
    columns=df['product'],
    values=df['extended amount'],
    aggfunc='sum',
    normalize='index',
)

product,book,pen,poster,shirt
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Abatz,0.597798,0.040568,0.361634,0.000000
Agivu,0.354839,0.000000,0.000000,0.645161
Aibox,0.034965,0.965035,0.000000,0.000000
Ailane,1.355932,-0.355932,0.000000,0.000000
Aimbo,0.000000,1.604396,0.000000,-0.604396
...,...,...,...,...
Zoonoodle,0.356322,0.435429,0.000000,0.208249
Zooveo,0.000000,0.102821,0.554140,0.343039
Zoovu,1.513761,0.000000,0.000000,-0.513761
Zooxo,0.233593,0.000000,0.000000,0.766407
