In [141]:
import pandas as pd
import numpy as np

In [142]:
# Display configuration: with "last_expr_or_assign", a cell that ends in an
# assignment (e.g. `n_date = 30`) displays the assigned value, in addition to
# the usual display of a trailing expression.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "last_expr_or_assign"

# Rolling mean by Pandas DataFrame Groups

Let's say we have a Pandas DataFrame with a date column, several categorical columns and a target column.
We would like to compute a rolling mean of the target, for one or two categories, and for a specific date window.

First we will create some sample data, then we review the normal Pandas DataFrame rolling method, and finally we will try to do rolling by groups.

## Sample data

In [143]:
# Size of the synthetic dataset
n_row = 1_000  # number of rows (observations) in the sample DataFrame
n_date = 30  # number of consecutive days in the full date range

30

In [144]:
# Seed NumPy's global random generator so that the sampled dates (and the
# random draws made by later cells that use np.random) are the same on every
# Restart & Run All.
np.random.seed(42)

# Set the dates of interest
start_date = pd.Timestamp(year=2022, month=1, day=1)
one_day = pd.Timedelta('1D')  # not used by the visible cells; kept in case other cells rely on it

# Every calendar day of the period ...
all_dates_full = pd.date_range(start=start_date, periods=n_date)

# ... and a sorted random subset of 4/5 of those days (drawn without
# replacement), so that some days carry no observations at all.
all_dates = np.sort(np.random.choice(all_dates_full, size=n_date*4//5, replace=False))
pd.Series(all_dates)

0    2022-01-01
1    2022-01-03
2    2022-01-04
3    2022-01-05
4    2022-01-06
5    2022-01-07
6    2022-01-08
7    2022-01-10
8    2022-01-11
9    2022-01-12
10   2022-01-13
11   2022-01-15
12   2022-01-16
13   2022-01-18
14   2022-01-20
15   2022-01-21
16   2022-01-22
17   2022-01-23
18   2022-01-24
19   2022-01-25
20   2022-01-26
21   2022-01-27
22   2022-01-29
23   2022-01-30
dtype: datetime64[ns]

In [145]:
# The categorical column: each row is assigned one of these group labels
groups = ["A", "B", "C"]

['A', 'B', 'C']

In [146]:
# The target: the possible values of the binary column `y`
# (one of them is picked at random for every row of the sample data)
targets = [0, 1]

[0, 1]

In [147]:
# Draw n_row random observations (date, group, target), then order the rows
# by date and, within a date, by group.
df = pd.DataFrame({
    "ts": pd.Series(np.random.choice(all_dates, size=n_row)),
    "gr": pd.Series(np.random.choice(groups, size=n_row)),
    "y": pd.Series(np.random.choice(targets, size=n_row)),
}).sort_values(by=["ts", "gr"])

Unnamed: 0,ts,gr,y
4,2022-01-01,A,1
108,2022-01-01,A,1
117,2022-01-01,A,1
143,2022-01-01,A,1
291,2022-01-01,A,1
...,...,...,...
680,2022-01-30,C,1
766,2022-01-30,C,0
859,2022-01-30,C,1
875,2022-01-30,C,0


## Normal Pandas rolling

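Now let's look at how Pandas computes a rolling mean over a time window. Note that this plain rolling mean pools the rows of all groups together.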

In [148]:
# 2-day time-based rolling mean of y over all rows, regardless of group
df.set_index("ts")["y"].rolling("2D").mean()

ts
2022-01-01    1.000000
2022-01-01    1.000000
2022-01-01    1.000000
2022-01-01    1.000000
2022-01-01    1.000000
                ...   
2022-01-30    0.555556
2022-01-30    0.548780
2022-01-30    0.554217
2022-01-30    0.547619
2022-01-30    0.541176
Name: y, Length: 1000, dtype: float64

In [149]:
# Per (group, day): the sum of y and the number of rows
df_gr = (
    df
    .groupby(by=["gr", "ts"])["y"]
    .agg(["sum", "count"])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,sum,count
gr,ts,Unnamed: 2_level_1,Unnamed: 3_level_1
A,2022-01-01,11,13
A,2022-01-03,5,11
A,2022-01-04,4,13
A,2022-01-05,2,10
A,2022-01-06,7,14
...,...,...,...
C,2022-01-25,8,18
C,2022-01-26,3,13
C,2022-01-27,10,21
C,2022-01-29,8,15


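As you can see above, it is easy and fast to do normal groupby aggregation.
Now, to compute the rolling mean for group A over a time window (the cells below use a 2-day window), we have to consider both the sum for each day and the corresponding count: the rolling mean is the rolling sum divided by the rolling count, not the average of the daily means.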

In [150]:
# Daily sum and count for group A only (the "gr" index level is dropped)
df_gr.xs("A", level="gr")

Unnamed: 0_level_0,sum,count
ts,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-01-01,11,13
2022-01-03,5,11
2022-01-04,4,13
2022-01-05,2,10
2022-01-06,7,14
2022-01-07,6,12
2022-01-08,12,17
2022-01-10,6,14
2022-01-11,6,17
2022-01-12,9,21


In [151]:
# Group A on the full calendar: days without any observation get sum = count = 0
dfa = (
    df_gr
    .xs("A", level="gr")
    .reindex(index=all_dates_full)
    .fillna(value=0)
)

Unnamed: 0,sum,count
2022-01-01,11.0,13.0
2022-01-02,0.0,0.0
2022-01-03,5.0,11.0
2022-01-04,4.0,13.0
2022-01-05,2.0,10.0
2022-01-06,7.0,14.0
2022-01-07,6.0,12.0
2022-01-08,12.0,17.0
2022-01-09,0.0,0.0
2022-01-10,6.0,14.0


In [152]:
# Put the 2-day rolling sums of both columns next to the daily values;
# the rolling columns are named sum_roll and count_roll.
dfa_roll = pd.concat(
    [dfa, dfa.rolling("2D").sum().add_suffix("_roll")],
    axis="columns",
)


Unnamed: 0,sum,count,sum_roll,count_roll
2022-01-01,11.0,13.0,11.0,13.0
2022-01-02,0.0,0.0,11.0,13.0
2022-01-03,5.0,11.0,5.0,11.0
2022-01-04,4.0,13.0,9.0,24.0
2022-01-05,2.0,10.0,6.0,23.0
2022-01-06,7.0,14.0,9.0,24.0
2022-01-07,6.0,12.0,13.0,26.0
2022-01-08,12.0,17.0,18.0,29.0
2022-01-09,0.0,0.0,12.0,17.0
2022-01-10,6.0,14.0,6.0,14.0


In [153]:
# Rolling mean = rolling sum / rolling count.
# A window without any observation gives 0/0 = NaN, which is replaced by 0.
dfa_roll["mean_roll"] = dfa_roll["sum_roll"].div(dfa_roll["count_roll"]).fillna(0)

In [154]:
# Show the frame again, now including the mean_roll column
dfa_roll

Unnamed: 0,sum,count,sum_roll,count_roll,mean_roll
2022-01-01,11.0,13.0,11.0,13.0,0.846154
2022-01-02,0.0,0.0,11.0,13.0,0.846154
2022-01-03,5.0,11.0,5.0,11.0,0.454545
2022-01-04,4.0,13.0,9.0,24.0,0.375
2022-01-05,2.0,10.0,6.0,23.0,0.26087
2022-01-06,7.0,14.0,9.0,24.0,0.375
2022-01-07,6.0,12.0,13.0,26.0,0.5
2022-01-08,12.0,17.0,18.0,29.0,0.62069
2022-01-09,0.0,0.0,12.0,17.0,0.705882
2022-01-10,6.0,14.0,6.0,14.0,0.428571


In [156]:
# Lag the rolling mean by one row (one day on this daily index), so each date
# carries the value computed for the previous day; the first date has no
# predecessor and is filled with 0.
dfa_roll.loc[:, "mean_roll"].shift(1, fill_value=0)

2022-01-01    0.000000
2022-01-02    0.846154
2022-01-03    0.846154
2022-01-04    0.454545
2022-01-05    0.375000
2022-01-06    0.260870
2022-01-07    0.375000
2022-01-08    0.500000
2022-01-09    0.620690
2022-01-10    0.705882
2022-01-11    0.428571
2022-01-12    0.387097
2022-01-13    0.394737
2022-01-14    0.484848
2022-01-15    0.583333
2022-01-16    0.545455
2022-01-17    0.560000
2022-01-18    0.571429
2022-01-19    0.636364
2022-01-20    0.636364
2022-01-21    0.666667
2022-01-22    0.560000
2022-01-23    0.484848
2022-01-24    0.576923
2022-01-25    0.608696
2022-01-26    0.466667
2022-01-27    0.548387
2022-01-28    0.565217
2022-01-29    0.375000
2022-01-30    0.562500
Freq: D, Name: mean_roll, dtype: float64