In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Build a 6x3 frame of standard-normal draws. The rows carry a two-level index
# pairing each lev_0 value (10, 20, 30) with each lev_1 value (100, 200); the
# column axis itself is named 'col_0'.
df = pd.DataFrame(
    np.random.randn(6, 3),
    index=pd.MultiIndex.from_product(
        [[10, 20, 30], [100, 200]], names=['lev_0', 'lev_1']
    ),
    columns=pd.Index(['A', 'B', 'C'], name='col_0'),
)

In [3]:
# Display the whole frame (it has only 6 rows) to show the two-level row index.
df

Unnamed: 0_level_0,col_0,A,B,C
lev_0,lev_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10,100,0.335925,-0.662933,0.159418
10,200,-0.344063,0.063867,-1.684276
20,100,-0.193312,-0.631889,0.523957
20,200,-1.610473,-0.20607,0.281418
30,100,0.797771,1.376906,0.351149
30,200,0.256967,-0.403382,0.510767


In [4]:
# Sum every column within each lev_0 group. 'lev_0' is a level of the row
# index, not a column, so it is selected explicitly with level=.
df.groupby(level='lev_0').sum()

col_0,A,B,C
lev_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10,-0.008137,-0.599066,-1.524857
20,-1.803785,-0.837959,0.805375
30,1.054738,0.973524,0.861916


In [5]:
# Sign flag for column A: 1 where A >= 0, otherwise -1. NaN compares False
# against 0 and therefore also maps to -1. np.where evaluates the whole column
# in one vectorised pass instead of calling a Python function once per row.
df['AA'] = np.where(df['A'] >= 0, 1, -1)

In [6]:
# Sum the remaining columns separately for each distinct value of the AA
# sign flag; AA becomes the index of the result.
df.groupby(by='AA').agg('sum')

col_0,A,B,C
AA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,-2.147848,-0.774092,-0.8789
1,1.390663,0.310591,1.021334


### Transformation

In [11]:
# Simulate 1100 consecutive daily observations starting on 1 October 1999,
# drawn from a normal distribution with mean 0.5 and standard deviation 2.
nrows = 1100
index = pd.date_range(start='1999-10-01', periods=nrows)

# Smooth with a 100-observation rolling mean. min_periods equals the window,
# so the first 99 positions are NaN and dropna() discards them.
ts = (
    pd.Series(np.random.normal(loc=0.5, scale=2, size=nrows), index=index)
    .rolling(window=100, min_periods=100)
    .mean()
    .dropna()
)

# Show both ends of the smoothed series; the extra newlines in `end` leave
# blank lines between the two printouts.
print(ts.head(), end='\n\n\n')
print(ts.tail())

2000-01-08    0.590763
2000-01-09    0.577147
2000-01-10    0.555882
2000-01-11    0.537729
2000-01-12    0.480037
Freq: D, dtype: float64


2002-09-30    0.248637
2002-10-01    0.289681
2002-10-02    0.268917
2002-10-03    0.303958
2002-10-04    0.314647
Freq: D, dtype: float64


In [16]:
def key(x):
    """Return the ``year`` attribute of an index label; used as the group key."""
    return x.year


# Bucket ts by the year of each index label, then report the mean and the
# standard deviation of every yearly bucket.
grouped = ts.groupby(key)
print(grouped.mean(), end='\n\n\n')
print(grouped.std())

2000    0.515240
2001    0.411192
2002    0.411169
dtype: float64


2000    0.107156
2001    0.194740
2002    0.110188
dtype: float64


In [17]:
def zscore(x):
    """Standardise x: subtract its mean, then divide by its standard deviation."""
    return (x - x.mean()) / x.std()


# Standardise within each year: transform() applies zscore to every yearly
# group and returns a result indexed like ts.
transformed = ts.groupby(key).transform(zscore)
print(transformed.head(), end='\n\n\n')
print(transformed.tail())

2000-01-08    0.704791
2000-01-09    0.577724
2000-01-10    0.379281
2000-01-11    0.209874
2000-01-12   -0.328515
Freq: D, dtype: float64


2002-09-30   -1.475040
2002-10-01   -1.102553
2002-10-02   -1.290989
2002-10-03   -0.972981
2002-10-04   -0.875978
Freq: D, dtype: float64


In [18]:
# Sanity check on the standardised values: regroup them by year and print each
# year's mean (expected ~0, up to floating-point error) and standard
# deviation (expected 1).
grouped_trans = transformed.groupby(key)
print(grouped_trans.mean(), end='\n\n\n')
print(grouped_trans.std())

2000   -2.003968e-16
2001   -2.372531e-16
2002    1.935876e-16
dtype: float64


2000    1.0
2001    1.0
2002    1.0
dtype: float64


### Filtration (like the HAVING clause in SQL)