In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

  data_klasses = (pandas.Series, pandas.DataFrame, pandas.Panel)


## 示例1：使用指定分组值填充缺失值

In [3]:
# Six uniform random draws, then blank out every other position (0, 2, 4)
# so there is something to fill in.
s = pd.Series(np.random.rand(6))
s.iloc[::2] = np.nan
s

0         NaN
1    0.321692
2         NaN
3    0.179424
4         NaN
5    0.762851
dtype: float64

In [4]:
# Mean of the non-missing entries only (the NaN values are skipped).
s.mean()

0.4213223001833382

In [5]:
# Replace every NaN with the overall mean computed above.
s.fillna(s.mean())

0    0.421322
1    0.321692
2    0.421322
3    0.179424
4    0.421322
5    0.762851
dtype: float64

In [9]:
# Region label for each state, in index order. It is defined in this cell so
# the cell also runs on a fresh kernel: it is displayed before the cell that
# builds `data`, which would otherwise be the first place `group_key` exists.
group_key = ['East'] * 4 + ['West'] * 4
group_key

['East', 'East', 'East', 'East', 'West', 'West', 'West', 'West']

In [6]:
# Eight states: the first four are labelled 'East', the last four 'West'.
states = [
    'Ohio', 'New York', 'Vermont', 'Florida',
    'Oregon', 'Nevada', 'California', 'Idaho',
]
group_key = [region for region in ('East', 'West') for _ in range(4)]
# One standard-normal draw per state, indexed by state name.
data = pd.Series(np.random.randn(8), index=states)
data

Ohio          0.968576
New York      1.804447
Vermont       0.408057
Florida       2.338827
Oregon        0.967104
Nevada        1.205268
California    1.019146
Idaho         1.587301
dtype: float64

In [7]:
# Knock out one East state and two West states so each region has gaps to fill.
data.loc[['Vermont', 'Nevada', 'Idaho']] = np.nan
data
# For reference, the per-region means: data.groupby(group_key).mean()

Ohio          0.968576
New York      1.804447
Vermont            NaN
Florida       2.338827
Oregon        0.967104
Nevada             NaN
California    1.019146
Idaho              NaN
dtype: float64

In [11]:
# Walk through the groups one at a time to see what a per-group mean fill does.
for region, values in data.groupby(group_key):
    filled = values.fillna(values.mean())
    print(region, filled, sep='\n')

East
Ohio        0.968576
New York    1.804447
Vermont     1.703950
Florida     2.338827
dtype: float64
West
Oregon        0.967104
Nevada        0.993125
California    1.019146
Idaho         0.993125
dtype: float64


In [12]:
def fill_mean(ser):
    """Return ``ser`` with its NaN entries replaced by the mean of the rest."""
    return ser.fillna(ser.mean())


# transform() returns a result aligned to the original index. With apply(),
# recent pandas versions prepend the group key and produce a
# (region, state) MultiIndex instead of the flat per-state index shown below.
data.groupby(group_key).transform(fill_mean)

Ohio          0.968576
New York      1.804447
Vermont       1.703950
Florida       2.338827
Oregon        0.967104
Nevada        0.993125
California    1.019146
Idaho         0.993125
dtype: float64

## 示例2：分组加权平均和相关性

In [13]:
# Two categories of four rows each, with a random value and a random weight
# in [0, 1) per row.
df = pd.DataFrame({
    'category': ['a'] * 4 + ['b'] * 4,
    'data': np.random.randn(8),
    'weights': np.random.rand(8),
})
df

Unnamed: 0,category,data,weights
0,a,-0.538105,0.384966
1,a,-1.286816,0.691222
2,a,-0.685585,0.290874
3,a,0.360765,0.623987
4,b,0.461005,0.227476
5,b,-0.256503,0.763004
6,b,0.773426,0.188341
7,b,0.635781,0.428336


In [16]:
# Sanity check on a single group: compute the weighted average for category
# 'a' by hand. The expression is evaluated directly; a bare `lambda` on the
# last line would only display a function object, not the number.
x = df.loc[df['category'] == 'a', :]
np.average(x['data'], weights=x['weights'])

-0.53787426791093

In [17]:
def get_wavg(x):
    """Average of the 'data' column of ``x``, weighted by its 'weights' column."""
    return np.average(x['data'], weights=x['weights'])


# Apply the same weighted average to every category at once.
grouped = df.groupby('category')
grouped.apply(get_wavg)

category
a   -0.537874
b    0.203559
dtype: float64

In [2]:
# Load the closing prices; the first CSV column is used as the index and
# parsed as dates.
close_px = pd.read_csv(
    '../datas/stock_px_2.csv',
    index_col=0,
    parse_dates=True,
)
# close_px.info() gives a compact summary if needed.
close_px

Unnamed: 0,AAPL,MSFT,XOM,SPX
2003-01-02,7.40,21.11,29.22,909.03
2003-01-03,7.45,21.14,29.24,908.59
2003-01-06,7.45,21.52,29.96,929.01
2003-01-07,7.43,21.93,28.95,922.93
2003-01-08,7.28,21.31,28.83,909.93
...,...,...,...,...
2011-10-10,388.81,26.94,76.28,1194.89
2011-10-11,400.29,27.00,76.27,1195.54
2011-10-12,402.19,26.96,77.16,1207.25
2011-10-13,408.43,27.18,76.37,1203.66


In [3]:
# Row-over-row percentage change of each price column, i.e. daily returns.
# The first row has no predecessor, so it is NaN.
rtn = close_px.pct_change()

In [4]:
# Group the returns by calendar year.
grouped = rtn.groupby(close_px.index.year)

In [5]:
def spx_corr(x):
    """Correlate every column of ``x`` with its 'SPX' column."""
    return x.corrwith(x['SPX'])


# A single year's slice, handy for trying spx_corr(x) on one group by hand.
x = rtn.loc['2003', :]

In [6]:
# Per-year correlation of every column with SPX (SPX against itself is 1.0).
grouped.apply(spx_corr)

Unnamed: 0,AAPL,MSFT,XOM,SPX
2003,0.541124,0.745174,0.661265,1.0
2004,0.374283,0.588531,0.557742,1.0
2005,0.46754,0.562374,0.63101,1.0
2006,0.428267,0.406126,0.518514,1.0
2007,0.508118,0.65877,0.786264,1.0
2008,0.681434,0.804626,0.828303,1.0
2009,0.707103,0.654902,0.797921,1.0
2010,0.710105,0.730118,0.839057,1.0
2011,0.691931,0.800996,0.859975,1.0


In [7]:
# Per-year correlation between the AAPL and MSFT return columns.
grouped.apply(lambda g: g['AAPL'].corr(g['MSFT']))

2003    0.480868
2004    0.259024
2005    0.300093
2006    0.161735
2007    0.417738
2008    0.611901
2009    0.432738
2010    0.571946
2011    0.581987
dtype: float64

## 示例3：逐组线性回归

In [13]:
def regress(data, yvar, xvars):
    """Fit an OLS regression of ``yvar`` on ``xvars`` plus an intercept.

    Parameters
    ----------
    data : pandas.DataFrame
        Observations. Rows containing any NaN are dropped before fitting.
    yvar : str
        Name of the dependent-variable column.
    xvars : str or list of str
        Name(s) of the explanatory column(s).

    Returns
    -------
    The ``params`` of the fitted ``sm.OLS`` result: one coefficient per
    explanatory column, followed by ``'intercept'``.
    """
    # dropna() returns a new frame. Do not switch to dropna(inplace=True):
    # as noted by the original author, mutating the frame handed in by
    # groupby.apply causes problems with the data of the subsequent groups.
    data = data.dropna()
    Y = data[yvar]
    # Accept a single column name as well as a list, so X is always a DataFrame.
    columns = [xvars] if isinstance(xvars, str) else list(xvars)
    # assign() builds a fresh DataFrame, so the constant column is never
    # written into a selection of `data` (avoids SettingWithCopyWarning).
    X = data[columns].assign(intercept=1.)
    result = sm.OLS(Y, X).fit()
    return result.params

In [40]:
# Returns for 2003 only, i.e. what `regress` receives for the first group.
# The very first row is NaN.
x = rtn.loc['2003', ]
x

Unnamed: 0,AAPL,MSFT,XOM,SPX
2003-01-02,,,,
2003-01-03,0.006757,0.001421,0.000684,-0.000484
2003-01-06,0.000000,0.017975,0.024624,0.022474
2003-01-07,-0.002685,0.019052,-0.033712,-0.006545
2003-01-08,-0.020188,-0.028272,-0.004145,-0.014086
...,...,...,...,...
2003-12-24,0.030303,-0.003717,0.002080,-0.001807
2003-12-26,0.018627,0.006063,0.005336,0.001691
2003-12-29,0.017324,0.009272,0.013270,0.012401
2003-12-30,0.006623,0.002297,0.002619,0.000144


In [14]:
# Regress AAPL returns on SPX returns (plus an intercept) separately for each
# year; the result has one row of coefficients per year.
grouped.apply(regress, yvar='AAPL', xvars=['SPX'])

                AAPL      MSFT       XOM       SPX
2003-01-03  0.006757  0.001421  0.000684 -0.000484
2003-01-06  0.000000  0.017975  0.024624  0.022474
2003-01-07 -0.002685  0.019052 -0.033712 -0.006545
2003-01-08 -0.020188 -0.028272 -0.004145 -0.014086
2003-01-09  0.008242  0.029094  0.021159  0.019386
...              ...       ...       ...       ...
2003-12-24  0.030303 -0.003717  0.002080 -0.001807
2003-12-26  0.018627  0.006063  0.005336  0.001691
2003-12-29  0.017324  0.009272  0.013270  0.012401
2003-12-30  0.006623  0.002297  0.002619  0.000144
2003-12-31  0.004699 -0.005500  0.007837  0.002055

[251 rows x 4 columns]
                AAPL      MSFT       XOM       SPX
2004-01-02 -0.004677  0.002765 -0.008929 -0.003094
2004-01-05  0.042293  0.025276  0.023249  0.012395
2004-01-06 -0.003607  0.003586 -0.006816  0.001292
2004-01-07  0.022624 -0.001340 -0.007149  0.002367
2004-01-08  0.033628 -0.001342 -0.002592  0.004963
...              ...       ...       ...       ...
2004-12

Unnamed: 0,SPX,intercept
2003,1.195406,0.00071
2004,1.363463,0.004201
2005,1.766415,0.003246
2006,1.645496,8e-05
2007,1.198761,0.003438
2008,0.968016,-0.00111
2009,0.879103,0.002954
2010,1.052608,0.001261
2011,0.806605,0.001514
