In [1]:
# 分组级运算和转换
import numpy as np
import pandas as pd
import statsmodels.api as sm
from pandas import DataFrame, Series

  from pandas.core import datetools


In [2]:
# Small example frame: two string key columns plus two columns of random
# normal draws (no seed is set, so the numbers differ between runs).
df = DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
                'key2' : ['one', 'two', 'one', 'two', 'one'],
                'data1' : np.random.randn(5),
                'data2' : np.random.randn(5)})
df

Unnamed: 0,data1,data2,key1,key2
0,0.307825,0.217729,a,one
1,0.623577,-0.90272,a,two
2,-0.038196,-1.206849,b,one
3,0.736848,0.69069,b,two
4,-0.267622,-0.203043,a,one


In [3]:
# Per-key1 means of the two data columns, renamed to mean_data1 / mean_data2.
# The data columns are selected explicitly because key2 holds strings and has
# no mean; recent pandas versions raise instead of silently dropping it.
k1_means = df.groupby('key1')[['data1', 'data2']].mean().add_prefix('mean_')
k1_means

Unnamed: 0_level_0,mean_data1,mean_data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.22126,-0.296011
b,0.349326,-0.258079


In [4]:
# Attach the group means to every row: df's key1 column is matched against
# the index of k1_means.
pd.merge(df, k1_means, left_on='key1', right_index=True)

Unnamed: 0,data1,data2,key1,key2,mean_data1,mean_data2
0,0.307825,0.217729,a,one,0.22126,-0.296011
1,0.623577,-0.90272,a,two,0.22126,-0.296011
4,-0.267622,-0.203043,a,one,0.22126,-0.296011
2,-0.038196,-1.206849,b,one,0.349326,-0.258079
3,0.736848,0.69069,b,two,0.349326,-0.258079


In [5]:
# 5x5 frame of random normal draws, rows labelled by person and columns a-e.
people = DataFrame(np.random.randn(5, 5),
                   columns=['a', 'b', 'c', 'd', 'e'],
                   index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people

Unnamed: 0,a,b,c,d,e
Joe,-0.575573,-0.264966,0.402191,-0.626705,2.266715
Steve,-2.293446,0.989424,0.447412,0.779285,0.527718
Wes,-1.609951,0.807645,1.383784,-0.402235,1.071397
Jim,0.401508,0.044202,-0.00076,-0.452882,-1.29574
Travis,-0.097253,0.896917,0.176152,0.507412,1.666385


In [6]:
key = ['one', 'two', 'one', 'two', 'one'] # group label for each row of people, in row order
people.groupby(key).mean()

Unnamed: 0,a,b,c,d,e
one,-0.760926,0.479865,0.654042,-0.173843,1.668165
two,-0.945969,0.516813,0.223326,0.163202,-0.384011


In [7]:
# transform returns a result with the original shape, where every row holds
# the aggregate of its group. The aggregation is named by string ('mean') so
# pandas runs its own grouped mean instead of receiving a NumPy callable.
people.groupby(key).transform('mean')

Unnamed: 0,a,b,c,d,e
Joe,-0.760926,0.479865,0.654042,-0.173843,1.668165
Steve,-0.945969,0.516813,0.223326,0.163202,-0.384011
Wes,-0.760926,0.479865,0.654042,-0.173843,1.668165
Jim,-0.945969,0.516813,0.223326,0.163202,-0.384011
Travis,-0.760926,0.479865,0.654042,-0.173843,1.668165


In [8]:
def demean(arr):
    """Return `arr` shifted so that its mean becomes zero."""
    center = arr.mean()
    return arr - center
demeaned = people.groupby(key).transform(demean) # subtract from every value the mean of its group
demeaned

Unnamed: 0,a,b,c,d,e
Joe,0.185353,-0.744831,-0.251851,-0.452862,0.598549
Steve,-1.347477,0.472611,0.224086,0.616084,0.911729
Wes,-0.849026,0.32778,0.729741,-0.228392,-0.596769
Jim,1.347477,-0.472611,-0.224086,-0.616084,-0.911729
Travis,0.663673,0.417051,-0.47789,0.681254,-0.001781


In [9]:
demeaned.groupby(key).mean() # every value had its group mean subtracted, so these should be 0; the tiny non-zero values are floating-point rounding error

Unnamed: 0,a,b,c,d,e
one,-3.700743e-17,3.700743e-17,7.401487e-17,0.0,-1.480297e-16
two,0.0,2.775558e-17,0.0,0.0,0.0


In [10]:
# apply：一般性的“拆分－应用－合并”

In [11]:
tips = pd.read_csv('../data/tips.csv')
tips['tip_pct'] = tips['tip'] / tips['total_bill'] # new column: tip as a fraction of the total bill
tips.head()

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
0,16.99,1.01,No,Sun,Dinner,2,0.059447
1,10.34,1.66,No,Sun,Dinner,3,0.160542
2,21.01,3.5,No,Sun,Dinner,3,0.166587
3,23.68,3.31,No,Sun,Dinner,2,0.13978
4,24.59,3.61,No,Sun,Dinner,4,0.146808


In [12]:
def top(df, n=5, column='tip_pct'):
    """Return the `n` rows of `df` with the largest values in `column`.

    Parameters
    ----------
    df : DataFrame to select rows from.
    n : number of rows to keep (default 5).
    column : label of the column to rank by (default 'tip_pct').

    Returns
    -------
    The selected rows in ascending order of `column`, i.e. the largest last.
    """
    # .tail(n) rather than [-n:]: the slice [-0:] is the whole frame, so the
    # slice form would return every row when n == 0 instead of none.
    return df.sort_values(by=column).tail(n)
top(tips, n=3)

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


In [13]:
tips.groupby('smoker').apply(top, n=2, column='tip_pct') # group by smoker first, then take the highest tip_pct rows within each group

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,149,7.51,2.0,No,Thur,Lunch,2,0.266312
No,232,11.61,3.39,No,Sat,Dinner,2,0.29199
Yes,178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
Yes,172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


In [14]:
# Row with the largest total_bill for every (smoker, day) combination.
tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,total_bill,tip,smoker,day,time,size,tip_pct
smoker,day,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
No,Fri,94,22.75,3.25,No,Fri,Dinner,2,0.142857
No,Sat,212,48.33,9.0,No,Sat,Dinner,4,0.18622
No,Sun,156,48.17,5.0,No,Sun,Dinner,6,0.103799
No,Thur,142,41.19,5.0,No,Thur,Lunch,5,0.121389
Yes,Fri,95,40.17,4.73,Yes,Fri,Dinner,4,0.11775
Yes,Sat,170,50.81,10.0,Yes,Sat,Dinner,3,0.196812
Yes,Sun,182,45.35,3.5,Yes,Sun,Dinner,3,0.077178
Yes,Thur,197,43.11,5.0,Yes,Thur,Lunch,4,0.115982


In [15]:
# Summary statistics of tip_pct, computed separately for each smoker group.
result = tips.groupby('smoker')['tip_pct'].describe()
result

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
smoker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
No,151.0,0.159328,0.03991,0.056797,0.136906,0.155625,0.185014,0.29199
Yes,93.0,0.163196,0.085119,0.035638,0.106771,0.153846,0.195059,0.710345


In [16]:
result.unstack('smoker') # unstacking the only index level gives a Series indexed by (statistic, smoker)

       smoker
count  No        151.000000
       Yes        93.000000
mean   No          0.159328
       Yes         0.163196
std    No          0.039910
       Yes         0.085119
min    No          0.056797
       Yes         0.035638
25%    No          0.136906
       Yes         0.106771
50%    No          0.155625
       Yes         0.153846
75%    No          0.185014
       Yes         0.195059
max    No          0.291990
       Yes         0.710345
dtype: float64

In [17]:
result.apply(lambda x: x.describe()) # describe each column; presumably equivalent to result.describe()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
count,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
mean,122.0,0.161262,0.062514,0.046217,0.121838,0.154735,0.190036,0.501167
std,41.012193,0.002735,0.031968,0.014961,0.021309,0.001258,0.007103,0.295822
min,93.0,0.159328,0.03991,0.035638,0.106771,0.153846,0.185014,0.29199
25%,107.5,0.160295,0.051212,0.040928,0.114305,0.154291,0.187525,0.396578
50%,122.0,0.161262,0.062514,0.046217,0.121838,0.154735,0.190036,0.501167
75%,136.5,0.162229,0.073817,0.051507,0.129372,0.15518,0.192547,0.605756
max,151.0,0.163196,0.085119,0.056797,0.136906,0.155625,0.195059,0.710345


In [18]:
# 禁止分组键

In [19]:
tips.groupby('smoker', group_keys=False).apply(top) # group_keys=False: do not build a MultiIndex from the group keys

Unnamed: 0,total_bill,tip,smoker,day,time,size,tip_pct
88,24.71,5.85,No,Thur,Lunch,2,0.236746
185,20.69,5.0,No,Sun,Dinner,5,0.241663
51,10.29,2.6,No,Sun,Dinner,2,0.252672
149,7.51,2.0,No,Thur,Lunch,2,0.266312
232,11.61,3.39,No,Sat,Dinner,2,0.29199
109,14.31,4.0,Yes,Sat,Dinner,2,0.279525
183,23.17,6.5,Yes,Sun,Dinner,4,0.280535
67,3.07,1.0,Yes,Sat,Dinner,1,0.325733
178,9.6,4.0,Yes,Sun,Dinner,2,0.416667
172,7.25,5.15,Yes,Sun,Dinner,2,0.710345


In [20]:
# 分位数和桶分析

In [21]:
frame = DataFrame({'data1': np.random.randn(1000),
                   'data2': np.random.randn(1000)})
factor = pd.cut(frame.data1, 4) # cut data1 into 4 bins
factor[:5] # bins that the first 5 elements fall into

0    (-1.98, -0.461]
1    (-1.98, -0.461]
2    (-1.98, -0.461]
3    (-1.98, -0.461]
4    (-1.98, -0.461]
Name: data1, dtype: category
Categories (4, interval[float64]): [(-3.505, -1.98] < (-1.98, -0.461] < (-0.461, 1.057] < (1.057, 2.576]]

In [22]:
def get_stats(group):
    """Summarise `group` as a dict holding its min, max, count and mean."""
    wanted = ('min', 'max', 'count', 'mean')
    # Each statistic is the result of calling the method of that name on group.
    return {stat: getattr(group, stat)() for stat in wanted}
grouped = frame.data2.groupby(factor) # group data2 by the data1 bin each row falls into
result = grouped.apply(get_stats)
print(result)
print(result.unstack())

data1                 
(-3.505, -1.98]  count     28.000000
                 max        1.874309
                 mean      -0.053760
                 min       -2.035926
(-1.98, -0.461]  count    302.000000
                 max        2.475313
                 mean      -0.009367
                 min       -3.135844
(-0.461, 1.057]  count    542.000000
                 max        3.907300
                 mean       0.043797
                 min       -2.585936
(1.057, 2.576]   count    128.000000
                 max        2.491250
                 mean       0.036609
                 min       -2.149983
Name: data2, dtype: float64
                 count       max      mean       min
data1                                               
(-3.505, -1.98]   28.0  1.874309 -0.053760 -2.035926
(-1.98, -0.461]  302.0  2.475313 -0.009367 -3.135844
(-0.461, 1.057]  542.0  3.907300  0.043797 -2.585936
(1.057, 2.576]   128.0  2.491250  0.036609 -2.149983


In [23]:
grouping = pd.qcut(frame.data1, 10, labels=list('ABCDEFGHIJ')) # with labels=False the buckets would get the default integer codes instead
grouping.head() # bucket label of each element

0    B
1    C
2    B
3    D
4    C
Name: data1, dtype: category
Categories (10, object): [A < B < C < D ... G < H < I < J]

In [24]:
# Same statistics of data2, now per data1 quantile bucket (A-J).
grouped = frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()

Unnamed: 0_level_0,count,max,mean,min
data1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,100.0,2.403635,-0.093973,-2.072895
B,100.0,2.300851,0.075512,-3.135844
C,100.0,2.475313,-0.052444,-2.59966
D,100.0,2.269685,0.113117,-2.014964
E,100.0,2.800909,-0.04581,-1.898793
F,100.0,3.263921,-0.000757,-2.03642
G,100.0,3.9073,0.188097,-2.585936
H,100.0,2.893293,-0.021765,-2.335397
I,100.0,2.49125,0.095277,-1.713401
J,100.0,2.291429,-0.016358,-2.149983


In [25]:
# 示例：用特定于分组的值填充缺失值

In [26]:
s = Series(np.random.randn(6))
s[::2] = np.nan # blank out every other element
s.fillna(s.mean()) # fill the missing values with the mean

0    0.217070
1   -0.489271
2    0.217070
3   -0.738455
4    0.217070
5    1.878936
dtype: float64

In [27]:
states = ['Ohio', 'New York', 'Vermont', 'Florida', 'Oregon', 'Nevada', 'California', 'Idaho']
group_key = ['East'] * 4 + ['West'] * 4 # the first 4 states are eastern, the last 4 western
data = Series(np.random.randn(8), index=states)
data[['Vermont', 'Nevada', 'Idaho']] = np.nan
data

Ohio         -1.622946
New York     -0.393290
Vermont            NaN
Florida      -0.704855
Oregon        0.249753
Nevada             NaN
California    0.074931
Idaho              NaN
dtype: float64

In [28]:
data.groupby(group_key).mean() # mean of the non-NA values in each group

East   -0.907030
West    0.162342
dtype: float64

In [29]:
def fill_mean(g):
    """Fill the missing values of `g` with the mean of `g`."""
    group_mean = g.mean()
    return g.fillna(group_mean)
# Fill each group's missing values with that group's own mean.
# group_keys=False keeps the original state labels as the result index;
# newer pandas versions otherwise prepend the group key as an extra level.
data.groupby(group_key, group_keys=False).apply(fill_mean)

Ohio         -1.622946
New York     -0.393290
Vermont      -0.907030
Florida      -0.704855
Oregon        0.249753
Nevada        0.162342
California    0.074931
Idaho         0.162342
dtype: float64

In [30]:
fill_values = dict(East=0.5, West=-1)  # fixed fill value for each group


def fill_func(g):
    """Fill the missing values of `g` with the value registered for `g.name`."""
    replacement = fill_values[g.name]
    return g.fillna(replacement)
# Fill each group's missing values with the constant chosen for that group.
# group_keys=False keeps the original state labels as the result index;
# newer pandas versions otherwise prepend the group key as an extra level.
data.groupby(group_key, group_keys=False).apply(fill_func)

Ohio         -1.622946
New York     -0.393290
Vermont       0.500000
Florida      -0.704855
Oregon        0.249753
Nevada       -1.000000
California    0.074931
Idaho        -1.000000
dtype: float64

In [31]:
# 示例：随机采样和排列

In [32]:
# Hearts, Spades, Clubs, Diamonds
suits = ['H', 'S', 'C', 'D']
# Values for one suit (1-10, then three cards worth 10), repeated for 4 suits.
# In Python 3 range() returns a lazy range object, not a list, so it has to be
# expanded with list() before it can be concatenated with a list.
card_val = (list(range(1, 11)) + [10] * 3) * 4
base_names = ['A'] + list(range(2, 11)) + ['J', 'K', 'Q']
cards = []
for suit in suits:  # iterate over `suits` instead of repeating the literal list
    cards.extend(str(num) + suit for num in base_names)
deck = Series(card_val, index=cards)
deck.head() # first few cards with their values

AH    1
2H    2
3H    3
4H    4
5H    5
dtype: int64

In [33]:
def draw(deck, n=5):
    """Pick up to `n` entries of `deck` at random, without repeats."""
    shuffled = np.random.permutation(len(deck))  # random order of the positions
    picked = shuffled[:n]
    return deck.take(picked)
draw(deck)

7C     7
3D     3
KH    10
9D     9
2C     2
dtype: int64

In [34]:
# Draw two random cards from each suit.
def get_suit(card):
    """Return the last character of `card`, which is its suit letter."""
    return card[-1]
# get_suit is called on each index label, whose last character is the suit;
# the group keys come back sorted by default.
deck.groupby(get_suit).apply(draw, n=2)

C  JC    10
   9C     9
D  4D     4
   JD    10
H  5H     5
   3H     3
S  3S     3
   8S     8
dtype: int64

In [35]:
deck.groupby(get_suit, group_keys=False).apply(draw, n=2) # same kind of draw, but without the suit as an extra index level

JC     10
10C    10
JD     10
4D      4
9H      9
10H    10
AS      1
10S    10
dtype: int64

In [36]:
# 示例：分组加权平均数和相关系数

In [37]:
# Two categories of four rows each, with a random value and a random weight
# per row (note that this rebinds the name df used earlier in the notebook).
df = DataFrame({'category': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'],
                'data': np.random.randn(8),
                'weights': np.random.rand(8)})
df

Unnamed: 0,category,data,weights
0,a,1.592743,0.639899
1,a,0.046043,0.084863
2,a,-0.723238,0.214012
3,a,-0.738769,0.64312
4,b,-0.149265,0.167864
5,b,1.246441,0.884834
6,b,0.559904,0.594575
7,b,0.101432,0.923524


In [38]:
grouped = df.groupby('category')

In [39]:
def get_wavg(g):
    """Weighted average of g['data'], weighted by g['weights']."""
    values, weights = g['data'], g['weights']
    # np.average divides by the sum of the weights, so they need not sum to 1.
    return np.average(values, weights=weights)
grouped.apply(get_wavg) # one weighted average per group

category
a    0.248565
b    0.585195
dtype: float64

In [40]:
# Load the price table with the first CSV column as the index, parsed as dates.
close_px = pd.read_csv('../data/stock_px.csv', parse_dates=True, index_col=0)
close_px.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5472 entries, 1990-02-01 to 2011-10-14
Data columns (total 9 columns):
AA      5472 non-null float64
AAPL    5472 non-null float64
GE      5472 non-null float64
IBM     5472 non-null float64
JNJ     5472 non-null float64
MSFT    5472 non-null float64
PEP     5471 non-null float64
SPX     5472 non-null float64
XOM     5472 non-null float64
dtypes: float64(9)
memory usage: 427.5 KB


In [41]:
close_px.head()

Unnamed: 0,AA,AAPL,GE,IBM,JNJ,MSFT,PEP,SPX,XOM
1990-02-01,4.98,7.86,2.87,16.79,4.27,0.51,6.04,328.79,6.12
1990-02-02,5.04,8.0,2.87,16.89,4.37,0.51,6.09,330.92,6.24
1990-02-05,5.07,8.18,2.87,17.32,4.34,0.51,6.05,331.85,6.25
1990-02-06,5.01,8.12,2.88,17.56,4.32,0.51,6.15,329.66,6.23
1990-02-07,5.04,7.77,2.91,17.93,4.38,0.51,6.17,333.75,6.33


In [42]:
rets = close_px.pct_change().dropna() # drop the rows that contain missing values
def spx_corr(x):
    """Correlate every column of `x` with its 'SPX' column."""
    benchmark = x['SPX']
    return x.corrwith(benchmark)
by_year = rets.groupby(lambda x: x.year) # the function is called on each index label and its result (the year) is the group key
by_year.apply(spx_corr) # group by year and compute each column's correlation with SPX

Unnamed: 0,AA,AAPL,GE,IBM,JNJ,MSFT,PEP,SPX,XOM
1990,0.595024,0.545067,0.752187,0.738361,0.801145,0.586691,0.783168,1.0,0.517586
1991,0.453574,0.365315,0.759607,0.557046,0.646401,0.524225,0.641775,1.0,0.569335
1992,0.39818,0.498732,0.632685,0.262232,0.51574,0.492345,0.473871,1.0,0.318408
1993,0.259069,0.238578,0.447257,0.211269,0.451503,0.425377,0.385089,1.0,0.318952
1994,0.428549,0.26842,0.572996,0.385162,0.372962,0.436585,0.450516,1.0,0.395078
1995,0.291532,0.161829,0.519126,0.41639,0.315733,0.45366,0.413144,1.0,0.368752
1996,0.292344,0.191482,0.750724,0.388497,0.569232,0.564015,0.421477,1.0,0.538736
1997,0.564427,0.211435,0.827512,0.646823,0.703538,0.606171,0.509344,1.0,0.695653
1998,0.533802,0.379883,0.815243,0.623982,0.591988,0.698773,0.494213,1.0,0.369264
1999,0.099033,0.425584,0.710928,0.486167,0.517061,0.631315,0.336593,1.0,0.315383


In [43]:
by_year.apply(lambda g: g['AAPL'].corr(g['MSFT'])) # per-year correlation between the AAPL and MSFT columns

1990    0.408271
1991    0.266807
1992    0.450592
1993    0.236917
1994    0.361638
1995    0.258642
1996    0.147539
1997    0.196144
1998    0.364106
1999    0.329484
2000    0.275298
2001    0.563156
2002    0.571095
2003    0.486262
2004    0.259024
2005    0.300093
2006    0.161735
2007    0.417738
2008    0.611901
2009    0.432738
2010    0.571946
2011    0.581987
dtype: float64

In [44]:
# 示例：面向分组的线性回归

In [45]:
def regress(data, yvar, xvars):
    """Fit an OLS regression of data[yvar] on data[xvars] plus an intercept.

    Parameters
    ----------
    data : frame holding both the dependent and the explanatory columns.
    yvar : label of the dependent-variable column.
    xvars : list of labels of the explanatory columns.

    Returns
    -------
    The `params` of the fitted model: one coefficient per explanatory column
    plus one for the added 'intercept' column.
    """
    y = data[yvar]
    # Copy the selection before adding the constant column: assigning into the
    # bare data[xvars] selection can trigger pandas' SettingWithCopyWarning.
    x = data[xvars].copy()
    x['intercept'] = 1.
    result = sm.OLS(y, x).fit()
    return result.params

by_year.apply(regress, 'AAPL', ['SPX'])

Unnamed: 0,SPX,intercept
1990,1.512772,0.001395
1991,1.187351,0.000396
1992,1.832427,0.000164
1993,1.39047,-0.002657
1994,1.190277,0.001617
1995,0.858818,-0.001423
1996,0.829389,-0.001791
1997,0.749928,-0.001901
1998,1.164582,0.004075
1999,1.384989,0.003273
