In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Location of the retail exercise data.
# The path is relative: the notebook is run with the Scripts folder as its base path.
exercise_path = "../Python/Exercise Files/Pandas Course Resources/Pandas Course Resources/retail/"

### Aggregation
`df_data.groupby('col1')[['do_something_col']].mean()` - selecting with double brackets `[[ ]]` returns a DataFrame; selecting with single brackets `[ ]` returns a Series

In [3]:
# Read the transactions CSV from the exercise folder, parsing `date` as datetime
transactions = pd.read_csv(
    os.path.join(exercise_path, "transactions.csv"),
    parse_dates=["date"],
)

In [4]:
# Total transactions per store, largest first; show the ten highest totals
(
    transactions
    .groupby("store_nbr")[["transactions"]]  # double brackets keep the result a DataFrame
    .sum()
    .sort_values(by="transactions", ascending=False)
    .head(10)
)

Unnamed: 0_level_0,transactions
store_nbr,Unnamed: 1_level_1
44,7273093
47,6535810
45,6201115
46,5990113
3,5366350
48,5107785
8,4637971
49,4574103
50,4384444
11,3972488


In [5]:
# Preview the first five rows of the loaded data
transactions.head(n=5)

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922


In [6]:
# Derive the calendar month number from the parsed `date` column.
# `.assign` builds a new frame rather than writing into the frame loaded
# earlier; the in-place equivalent would be
# `transactions["month"] = transactions["date"].dt.month`.
transactions = transactions.assign(month=transactions["date"].dt.month)

transactions.head()

Unnamed: 0,date,store_nbr,transactions,month
0,2013-01-01,25,770,1
1,2013-01-02,1,2111,1
2,2013-01-02,2,2358,1
3,2013-01-02,3,3487,1
4,2013-01-02,4,1922,1


In [7]:
# Wrapping the chain in parentheses lets every method call sit on its own line
(
    transactions
    .groupby(["store_nbr", "month"])[["transactions"]]
    .sum()
    # Within each month, list the stores from highest to lowest total
    .sort_values(by=["month", "transactions"], ascending=[True, False])
    # Move the store_nbr / month index levels back into ordinary columns
    .reset_index()
    .head(10)
)

Unnamed: 0,store_nbr,month,transactions
0,44,1,628438
1,47,1,568824
2,45,1,538370
3,46,1,522763
4,3,1,463260
5,48,1,439045
6,8,1,404463
7,49,1,386589
8,50,1,372093
9,11,1,336187


In [8]:

# Per store and month: total and average of `transactions`.
# Passing a dict of lists to .agg produces two-level column labels,
# ("transactions", "sum") and ("transactions", "mean"), so the sum column
# has to be referenced by its full tuple when sorting.
grouped = (
    transactions
    .groupby(["store_nbr", "month"])
    .agg({"transactions": ["sum", "mean"]})
    .sort_values(
        by=["month", ("transactions", "sum")],
        ascending=[True, False],
    )
)

In [9]:
# Display the summary: rows are indexed by (store_nbr, month), columns are two-level
grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,transactions,transactions
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean
store_nbr,month,Unnamed: 2_level_2,Unnamed: 3_level_2
44,1,628438,4246.202703
47,1,568824,3843.405405
45,1,538370,3637.635135
46,1,522763,3532.182432
3,1,463260,3151.428571
...,...,...,...
32,12,86167,718.058333
21,12,84128,1402.133333
42,12,76741,1279.016667
29,12,76627,1277.116667


In [10]:
# Turn the store_nbr / month index levels into columns; the column header stays two-level
grouped.reset_index(drop=False)

Unnamed: 0_level_0,store_nbr,month,transactions,transactions
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sum,mean
0,44,1,628438,4246.202703
1,47,1,568824,3843.405405
2,45,1,538370,3637.635135
3,46,1,522763,3532.182432
4,3,1,463260,3151.428571
...,...,...,...,...
636,32,12,86167,718.058333
637,21,12,84128,1402.133333
638,42,12,76741,1279.016667
639,29,12,76627,1277.116667


In [11]:
# Label-based lookup: the row for store_nbr 3, month 1 (all columns)
grouped.loc[(3, 1), :]

transactions  sum     463260.000000
              mean      3151.428571
Name: (3, 1), dtype: float64

In [12]:
# Position-based lookup: the fifth row of the sorted frame (all columns)
grouped.iloc[4, :]

transactions  sum     463260.000000
              mean      3151.428571
Name: (3, 1), dtype: float64

In [13]:
# Drop the outer "transactions" column level so only sum / mean remain, then flatten the index
grouped.droplevel(level=0, axis="columns").reset_index().head(10)

Unnamed: 0,store_nbr,month,sum,mean
0,44,1,628438,4246.202703
1,47,1,568824,3843.405405
2,45,1,538370,3637.635135
3,46,1,522763,3532.182432
4,3,1,463260,3151.428571
5,48,1,439045,2966.52027
6,8,1,404463,2751.44898
7,49,1,386589,2612.087838
8,50,1,372093,2514.141892
9,11,1,336187,2286.986395


In [14]:
# NOTE(review): this repeats the droplevel / reset_index display from the cell before it
(
    grouped
    .droplevel(level=0, axis="columns")  # keep only the inner sum / mean labels
    .reset_index()
    .head(10)
)

Unnamed: 0,store_nbr,month,sum,mean
0,44,1,628438,4246.202703
1,47,1,568824,3843.405405
2,45,1,538370,3637.635135
3,46,1,522763,3532.182432
4,3,1,463260,3151.428571
5,48,1,439045,2966.52027
6,8,1,404463,2751.44898
7,49,1,386589,2612.087838
8,50,1,372093,2514.141892
9,11,1,336187,2286.986395


### .agg() Method

In [15]:

# Recreate table from section 3
# Each row's transaction count is compared against a fixed target, and a flat
# bonus is recorded for every row that reaches it.
TRANSACTION_TARGET = 2500  # transactions a row needs in order to meet the target
BONUS_PER_TARGET_MET = 100  # bonus recorded for each row that meets the target

transactions = transactions.assign(
    # Share of the target achieved (1.0 means exactly on target)
    target_pct=transactions["transactions"] / TRANSACTION_TARGET,
    # Callables receive the frame including the columns assigned above,
    # so the ratio is computed once and reused.
    met_target=lambda df: df["target_pct"] >= 1,
    # Boolean * int gives BONUS_PER_TARGET_MET where the target was met, else 0
    bonus_payable=lambda df: df["met_target"] * BONUS_PER_TARGET_MET,
    month=transactions["date"].dt.month,
    # pandas numbers weekdays from Monday=0 through Sunday=6
    day_of_week=transactions["date"].dt.dayofweek,
)

transactions.head()


Unnamed: 0,date,store_nbr,transactions,month,target_pct,met_target,bonus_payable,day_of_week
0,2013-01-01,25,770,1,0.308,False,0,1
1,2013-01-02,1,2111,1,0.8444,False,0,2
2,2013-01-02,2,2358,1,0.9432,False,0,2
3,2013-01-02,3,3487,1,1.3948,True,100,2
4,2013-01-02,4,1922,1,0.7688,False,0,2


In [16]:
# Target summary per store:
#   met_target    -> share of the store's rows (0-1) on which the target was met
#   bonus_payable -> total bonus accumulated by the store
# Ordered by highest total bonus; ties are broken by the lower share first.
(
    transactions
    .groupby(["store_nbr"])
    .agg(
        met_target=pd.NamedAgg(column="met_target", aggfunc="mean"),
        bonus_payable=pd.NamedAgg(column="bonus_payable", aggfunc="sum"),
    )
    .reset_index()
    .sort_values(by=["bonus_payable", "met_target"], ascending=[False, True])
)


Unnamed: 0,store_nbr,met_target,bonus_payable
46,47,0.999404,167600
43,44,0.998807,167500
44,45,0.997615,167300
2,3,0.99821,167300
45,46,0.989267,165900
7,8,0.888425,148900
47,48,0.690519,115800
48,49,0.637448,106900
49,50,0.45319,76000
10,11,0.296539,49700


In [17]:
# Target summary per calendar month (all stores combined):
#   met_target    -> share of the month's rows (0-1) on which the target was met
#   bonus_payable -> total bonus accumulated in that month
# Ordered by highest total bonus; ties are broken by the lower share first.
(
    transactions
    .groupby(["month"])
    .agg(
        met_target=("met_target", "mean"),
        bonus_payable=("bonus_payable", "sum"),
    )
    .reset_index()
    .sort_values(by=["bonus_payable", "met_target"], ascending=[False, True])
)

Unnamed: 0,month,met_target,bonus_payable
11,12,0.25564,154100
4,5,0.170792,131800
2,3,0.169461,130400
3,4,0.174469,129700
6,7,0.162486,126300
5,6,0.161706,121700
1,2,0.17423,121700
7,8,0.174189,120800
0,1,0.163723,119600
10,11,0.163943,98300


In [18]:
# Target summary per day of week (0 = Monday ... 6 = Sunday):
#   met_target    -> share of that weekday's rows (0-1) on which the target was met
#   bonus_payable -> total bonus accumulated on that weekday
# Ordered by highest total bonus; ties are broken by the lower share first.
(
    transactions
    .groupby(["day_of_week"])
    .agg(
        met_target=("met_target", "mean"),
        bonus_payable=("bonus_payable", "sum"),
    )
    .reset_index()
    .sort_values(by=["bonus_payable", "met_target"], ascending=[False, True])
)

Unnamed: 0,day_of_week,met_target,bonus_payable
5,5,0.222204,266400
6,6,0.204001,241700
4,4,0.179007,213000
0,0,0.160214,191600
2,2,0.160572,191000
1,1,0.146299,175500
3,3,0.142077,169100


### Pivoting & Melting

In [25]:
# Pivot and melting practice
# Restrict to stores 1-11 and to rows that earned a bonus, then total the
# bonus for each store (rows) and day of week (columns).
# Store / day combinations with no qualifying rows come out as NaN.
trans_pivot = (
    transactions
    .loc[transactions["store_nbr"].isin(range(1, 12))]  # range(1, 12) covers 1 through 11
    .query("bonus_payable > 0")
    .pivot_table(
        values="bonus_payable",
        index="store_nbr",
        columns="day_of_week",
        aggfunc="sum",
    )
)
# Colour each row independently (axis=1) with the red-yellow-green colormap
trans_pivot.style.background_gradient(axis=1, cmap="RdYlGn")

day_of_week,0,1,2,3,4,5,6
store_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,200.0,300.0,300.0,200.0,100.0,,
2,300.0,600.0,500.0,400.0,400.0,500.0,200.0
3,24000.0,23900.0,23900.0,23900.0,23900.0,24000.0,23700.0
4,200.0,300.0,300.0,200.0,100.0,200.0,
5,200.0,300.0,300.0,100.0,100.0,100.0,
6,400.0,500.0,500.0,300.0,200.0,900.0,300.0
7,200.0,300.0,300.0,200.0,100.0,100.0,
8,22000.0,18800.0,23800.0,18000.0,22900.0,23400.0,20000.0
9,1200.0,800.0,800.0,700.0,400.0,7900.0,5100.0
11,3500.0,4800.0,3200.0,3000.0,2000.0,15600.0,17600.0


In [28]:
# Unpivot back to long form: one row per (store_nbr, day_of_week) pair,
# with the summed bonus in the generic `value` column
pd.melt(trans_pivot.reset_index(), id_vars="store_nbr")


Unnamed: 0,store_nbr,day_of_week,value
0,1,0,200.0
1,2,0,300.0
2,3,0,24000.0
3,4,0,200.0
4,5,0,200.0
...,...,...,...
65,6,6,300.0
66,7,6,
67,8,6,20000.0
68,9,6,5100.0
