In [1]:
import pandas as pd
import numpy as np

In [2]:
#load the sample sales data (the path is relative to the notebook's working directory)
SALES_CSV = 'sample_sets/sample-sales.csv'
df = pd.read_csv(SALES_CSV)

In [3]:
#convert the date column to pandas datetimes
df['date'] = df['date'].pipe(pd.to_datetime)

In [6]:
#totals of the ext price and quantity columns, plus the average unit price
#computed with plain sum/mean calls, i.e. without agg
totals = df[["ext price", "quantity"]].sum()
avg_unit_price = df["unit price"].mean()
print(totals)
print(avg_unit_price)

ext price    2018784.32
quantity       36463.00
dtype: float64
55.0075266667


In [7]:
#the same summary via agg: one call applies both sum and mean to every selected column
summary_cols = ["ext price", "quantity", "unit price"]
df[summary_cols].agg(['sum', 'mean'])

Unnamed: 0,ext price,quantity,unit price
sum,2018784.0,36463.0,82511.29
mean,1345.856,24.308667,55.007527


In [8]:
#the sum of the unit price is not a useful number
#agg also accepts a dict that maps each column to the list of functions to apply to it
df.agg({
    'ext price': ['sum', 'mean'],
    'quantity': ['sum', 'mean'],
    'unit price': ['mean'],
})

Unnamed: 0,unit price,ext price,quantity
mean,55.007527,1345.856,24.308667
sum,,2018784.0,36463.0


In [9]:
#peek at the first row to see which columns are available
df.head(n=1)

Unnamed: 0,account number,name,sku,quantity,unit price,ext price,date
0,740150,Barton LLC,B1-20000,39,86.69,3380.91,2014-01-01 07:21:51


In [14]:
#value_counts sorts by count (descending by default), so the first index label is the most frequent sku
sku_counts = df['sku'].value_counts()
sku_counts.index[0]

'S2-77896'

In [15]:
#agg also accepts your own functions
#the helper passed to agg later on: returns the most frequent value of a Series (NaN is counted as a value)
get_max = lambda series: series.value_counts(dropna=False).index[0]

#step-by-step walkthrough on the sku column
sku_counts = df['sku'].value_counts()
#number of times each sku appears (first three entries shown)
print(sku_counts[0:3])
#the highest number of appearances
print(sku_counts.max())
#value_counts sorts in descending order by default, so the first index label is the most frequent sku
print(sku_counts.index[0])

S2-77896    73
S1-82801    60
S2-10342    59
Name: sku, dtype: int64
73
S2-77896


In [16]:
#the custom function can sit next to the built-in aggregations in the same summary
df.agg({
    'ext price': ['sum', 'mean'],
    'quantity': ['sum', 'mean'],
    'unit price': ['mean'],
    'sku': [get_max],
})

Unnamed: 0,unit price,sku,ext price,quantity
<lambda>,,S2-77896,,
mean,55.007527,,1345.856,24.308667
sum,,,2018784.0,36463.0


In [17]:
#a row labelled <lambda> in the summary table doesn't explain what that data point is telling you
#to change this, you can explicitly rename the function by assigning to its __name__ attribute
get_max.__name__ = "most frequent"

In [18]:
#re-run the summary: the custom function's row is now labelled with its new __name__
df.agg({
    'ext price': ['sum', 'mean'],
    'quantity': ['sum', 'mean'],
    'unit price': ['mean'],
    'sku': [get_max],
})

Unnamed: 0,unit price,sku,ext price,quantity
mean,55.007527,,1345.856,24.308667
most frequent,,S2-77896,,
sum,,,2018784.0,36463.0


In [19]:
#one drawback of agg with a plain dict is that the column order of the result isn't preserved
#an OrderedDict from collections keeps the columns in the order they are added
#NOTE(review): on Python 3.7+ plain dicts keep insertion order, so this may be unnecessary - confirm against the pandas version in use
import collections
f = collections.OrderedDict()
f['ext price'] = ['sum', 'mean']
f['quantity'] = ['sum', 'mean']
f['sku'] = [get_max]
df.agg(f)

Unnamed: 0,ext price,quantity,sku
mean,1345.856,24.308667,
most frequent,,,S2-77896
sum,2018784.0,36463.0,
