# 前言

本篇将学习 Pandas 下篇(包括数据表的获取、合并和连接、重塑和透视)

# 4. 数据表的合并和连接

+ merge函数: 键合并
+ concat函数: 轴合并

## 4.1 合并

按[键]分成[单键合并]和[多键合并]

### 单键合并

pd.merge(df1,df2,how=s,on=c)
+ c是df1,df2共有的一栏
how的四合并方式
+ left join: 合并之后显示 df1 的所有行
+ right join: 合并之后显示 df2 的所有行
+ outer join: 合并 df1,df2 共有的所有行
+ inner join: 合并所有行(默认)

In [1]:
import pandas as pd

#创建 4 天价格
df_price = pd.DataFrame({'Date':pd.date_range('2019-1-1',periods=4), 'Adj Close':[24.42,25.00,25.25,25.64]})
df_price

Unnamed: 0,Date,Adj Close
0,2019-01-01,24.42
1,2019-01-02,25.0
2,2019-01-03,25.25
3,2019-01-04,25.64


In [3]:
#创建 5 天交易量
df_volume = pd.DataFrame({'Date':pd.date_range('2019-1-2',periods=5),'Volume':[56081400,9945500,83028700,100234000,73829000]})
df_volume

Unnamed: 0,Date,Volume
0,2019-01-02,56081400
1,2019-01-03,9945500
2,2019-01-04,83028700
3,2019-01-05,100234000
4,2019-01-06,73829000


### left join

In [4]:
pd.merge(df_price, df_volume, how='left')

Unnamed: 0,Date,Adj Close,Volume
0,2019-01-01,24.42,
1,2019-01-02,25.0,56081400.0
2,2019-01-03,25.25,9945500.0
3,2019-01-04,25.64,83028700.0


### right join

In [5]:
pd.merge(df_price, df_volume, how='right')

Unnamed: 0,Date,Adj Close,Volume
0,2019-01-02,25.0,56081400
1,2019-01-03,25.25,9945500
2,2019-01-04,25.64,83028700
3,2019-01-05,,100234000
4,2019-01-06,,73829000


### outer join

In [6]:
pd.merge(df_price, df_volume, how='outer')

Unnamed: 0,Date,Adj Close,Volume
0,2019-01-01,24.42,
1,2019-01-02,25.0,56081400.0
2,2019-01-03,25.25,9945500.0
3,2019-01-04,25.64,83028700.0
4,2019-01-05,,100234000.0
5,2019-01-06,,73829000.0


### inner join

In [7]:
pd.merge(df_price, df_volume, how='inner')

Unnamed: 0,Date,Adj Close,Volume
0,2019-01-02,25.0,56081400
1,2019-01-03,25.25,9945500
2,2019-01-04,25.64,83028700


### 多键合并

In [8]:
porfolio1 = pd.DataFrame({'Asset':['FX','FX','IR'],
                         'Instrument':['Option','Swap','Option'],
                         'Number':[1,2,3]})
porfolio1

Unnamed: 0,Asset,Instrument,Number
0,FX,Option,1
1,FX,Swap,2
2,IR,Option,3


In [15]:
porfolio2 = pd.DataFrame({'Asset':['FX','FX','FX','IR'],
                        'Instrument':['Option','Option','Swap','Swap'],
                        'Number':[4,5,6,7]})
porfolio2

Unnamed: 0,Asset,Instrument,Number
0,FX,Option,4
1,FX,Option,5
2,FX,Swap,6
3,IR,Swap,7


In [16]:
# 'Asset' and 'Instrument'
pd.merge(porfolio1, porfolio2,
        on=['Asset','Instrument'],
        how='outer')

Unnamed: 0,Asset,Instrument,Number_x,Number_y
0,FX,Option,1.0,4.0
1,FX,Option,1.0,5.0
2,FX,Swap,2.0,6.0
3,IR,Option,3.0,
4,IR,Swap,,7.0


In [17]:
pd.merge(porfolio1, porfolio2,
        on='Asset')

Unnamed: 0,Asset,Instrument_x,Number_x,Instrument_y,Number_y
0,FX,Option,1,Option,4
1,FX,Option,1,Option,5
2,FX,Option,1,Swap,6
3,FX,Swap,2,Option,4
4,FX,Swap,2,Option,5
5,FX,Swap,2,Swap,6
6,IR,Option,3,Swap,7


## 4.2 连接

+ np.concat
+ pd.concat

### overlapping index

In [19]:
s1 = pd.Series([0,1], index=['a','b'])
s2 = pd.Series([2,3,4], index=['c','d','e'])
s3 = pd.Series([5,6], index=['f','g'])

In [20]:
#按 axis=0 合并
pd.concat([s1,s2,s3])

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

In [21]:
#axis=1 合并
pd.concat([s1,s2,s3], axis=1)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


### non-overlapping index

In [23]:
s4 = pd.concat([s1,s3])
s4

a    0
b    1
f    5
g    6
dtype: int64

In [24]:
pd.concat([s1,s4], axis=1, join='inner')

Unnamed: 0,0,1
a,0,0
b,1,1


### hierachical index

In [25]:
pd.concat([s1,s1,s3],keys=['ones','two','three'])

ones   a    0
       b    1
two    a    0
       b    1
three  f    5
       g    6
dtype: int64

### 连接 DataFrame

### axis=0 连接

In [29]:
import numpy as np

df1 = pd.DataFrame(np.arange(12).reshape(3,4),columns=['a','b','c','d'])
df1

Unnamed: 0,a,b,c,d
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [30]:
df2 = pd.DataFrame(np.arange(6).reshape(2,3),columns=['b','d','a'])
df2

Unnamed: 0,b,d,a
0,0,1,2
1,3,4,5


In [31]:
pd.concat([df1,df2])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,a,b,c,d
0,0,1,2.0,3
1,4,5,6.0,7
2,8,9,10.0,11
0,2,0,,1
1,5,3,,4


In [32]:
pd.concat([df1, df2], ignore_index=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,a,b,c,d
0,0,1,2.0,3
1,4,5,6.0,7
2,8,9,10.0,11
3,2,0,,1
4,5,3,,4


### axis=1


In [34]:
df1 = pd.DataFrame(np.arange(6).reshape(3,2),
                  index=['a','b','c'],
                  columns=['one','two'])
df1

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5


In [36]:
df2 = pd.DataFrame(5 + np.arange(4).reshape(2,2),
                  index=['a','c'],
                  columns=['three','four'])
df2

Unnamed: 0,three,four
a,5,6
c,7,8


In [37]:
pd.concat([df1,df2], axis=1)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


# 5. 数据表的重塑和透视

+ 重塑: stack and unstack(互为逆转操作)
+ 透视:pivot and melt(互为逆转操作)

## 5.1 重塑

+ stack函数: 列索引 --> 行索引
+ unstack函数: 行索引 --> 列索引

### 单层 DataFrame


In [39]:
#创建 DataFrame df(1 层行索引, 1 层列索引)
symbol = ['JD','AAPL']
data = {'行业':['电商','科技'],
       '价格':[25.95,172.97],
       '交易量':[27113291, 18913154]}
df = pd.DataFrame(data, index=symbol)
df.columns.name = '特征'
df.index.name = '代号'
df

特征,行业,价格,交易量
代号,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
JD,电商,25.95,27113291
AAPL,科技,172.97,18913154


### stack: 列索引到行索引

In [40]:
c2i_Series = df.stack()
c2i_Series

代号    特征 
JD    行业           电商
      价格        25.95
      交易量    27113291
AAPL  行业           科技
      价格       172.97
      交易量    18913154
dtype: object

### unstack: 行索引到列索引

In [41]:
i2c_Series = df.unstack()
i2c_Series

特征   代号  
行业   JD            电商
     AAPL          科技
价格   JD         25.95
     AAPL      172.97
交易量  JD      27113291
     AAPL    18913154
dtype: object

In [42]:
df.index

Index(['JD', 'AAPL'], dtype='object', name='代号')

In [43]:
df.columns

Index(['行业', '价格', '交易量'], dtype='object', name='特征')

In [44]:
c2i_Series.index

MultiIndex(levels=[['JD', 'AAPL'], ['行业', '价格', '交易量']],
           labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]],
           names=['代号', '特征'])

### 基于层和名称 unstack

+ 基于层 
+ 基于名称

In [45]:
c2i_Series.unstack()#某认最后一层

特征,行业,价格,交易量
代号,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
JD,电商,25.95,27113291
AAPL,科技,172.97,18913154


In [47]:
#第一层
c2i_Series.unstack(0)

代号,JD,AAPL
特征,Unnamed: 1_level_1,Unnamed: 2_level_1
行业,电商,科技
价格,25.95,172.97
交易量,27113291,18913154


In [48]:
#基于名称
c2i_Series.unstack('代号')

代号,JD,AAPL
特征,Unnamed: 1_level_1,Unnamed: 2_level_1
行业,电商,科技
价格,25.95,172.97
交易量,27113291,18913154


### 多层 DataFrame

In [51]:
data = [['电商',101550,176.92],
       ['电商',175336,25.95],
       ['金融',60348,41.79],
       ['金融',36600,196.00]]

midx = pd.MultiIndex(levels=[['中国','美国'],
                            ['BABA','JD','GS','MS']],
                    labels=[[0,0,1,1],[0,1,2,3]],
                    names=['地区','代号'])

mcol = pd.Index(['行业','雇员','价格'], name='特征')

df = pd.DataFrame(data, index=midx, columns=mcol)
df

Unnamed: 0_level_0,特征,行业,雇员,价格
地区,代号,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
中国,BABA,电商,101550,176.92
中国,JD,电商,175336,25.95
美国,GS,金融,60348,41.79
美国,MS,金融,36600,196.0


In [57]:
df.index

MultiIndex(levels=[['中国', '美国'], ['BABA', 'JD', 'GS', 'MS']],
           labels=[[0, 1, 0, 1], [0, 1, 2, 3]],
           names=['地区', '代号'])

In [58]:
df.columns

Index(['行业', '雇员', '价格'], dtype='object', name='特征')

In [59]:
#第一层行索引到列索引
df.unstack(0)

特征,行业,行业,雇员,雇员,价格,价格
地区,中国,美国,中国,美国,中国,美国
代号,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
BABA,电商,,101550.0,,176.92,
JD,,电商,,175336.0,,25.95
GS,金融,,60348.0,,41.79,
MS,,金融,,36600.0,,196.0


In [60]:
#第二层行索引到列索引
df.unstack(1)

特征,行业,行业,行业,行业,雇员,雇员,雇员,雇员,价格,价格,价格,价格
代号,BABA,JD,GS,MS,BABA,JD,GS,MS,BABA,JD,GS,MS
地区,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
中国,电商,,金融,,101550.0,,60348.0,,176.92,,41.79,
美国,,电商,,金融,,175336.0,,36600.0,,25.95,,196.0


In [61]:
#先 unstack(0) 再 stack(0)
df.unstack(0).stack(0)

Unnamed: 0_level_0,地区,中国,美国
代号,特征,Unnamed: 2_level_1,Unnamed: 3_level_1
BABA,价格,176.92,
BABA,行业,电商,
BABA,雇员,101550,
JD,价格,,25.95
JD,行业,,电商
JD,雇员,,175336
GS,价格,41.79,
GS,行业,金融,
GS,雇员,60348,
MS,价格,,196


In [62]:
df.unstack(0).stack(1)

Unnamed: 0_level_0,特征,行业,雇员,价格
代号,地区,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BABA,中国,电商,101550.0,176.92
JD,美国,电商,175336.0,25.95
GS,中国,金融,60348.0,41.79
MS,美国,金融,36600.0,196.0


In [63]:
df.unstack(1).stack(0)

Unnamed: 0_level_0,代号,BABA,GS,JD,MS
地区,特征,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
中国,价格,176.92,41.79,,
中国,行业,电商,金融,,
中国,雇员,101550,60348,,
美国,价格,,,25.95,196
美国,行业,,,电商,金融
美国,雇员,,,175336,36600


In [64]:
df.unstack(1).stack(1)

Unnamed: 0_level_0,特征,行业,雇员,价格
地区,代号,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
中国,BABA,电商,101550.0,176.92
中国,GS,金融,60348.0,41.79
美国,JD,电商,175336.0,25.95
美国,MS,金融,36600.0,196.0


In [65]:
df.stack()

地区  代号    特征
中国  BABA  行业        电商
          雇员    101550
          价格    176.92
美国  JD    行业        电商
          雇员    175336
          价格     25.95
中国  GS    行业        金融
          雇员     60348
          价格     41.79
美国  MS    行业        金融
          雇员     36600
          价格       196
dtype: object

In [66]:
df.unstack().stack()

Unnamed: 0_level_0,特征,行业,雇员,价格
地区,代号,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
中国,BABA,电商,101550.0,176.92
中国,GS,金融,60348.0,41.79
美国,JD,电商,175336.0,25.95
美国,MS,金融,36600.0,196.0


## 5.2 透视

分组。

+ pivot: 将一张长表变成多张宽表
+ melt: 将多张宽表变成一张长表

In [67]:
data = pd.read_csv('stock data/Stock.csv', parse_dates=[0], dayfirst=True)
data

Unnamed: 0,Date,Symbol,Open,High,Low,Close,Adj Close,Volume
0,2019-02-21,AAPL,171.800003,172.369995,170.300003,171.059998,171.059998,17249700
1,2019-02-21,JD,24.82,24.879999,24.01,24.27,24.27,13542600
2,2019-02-21,BABA,171.0,171.779999,169.800003,171.660004,171.660004,8434800
3,2019-02-21,GS,198.970001,199.449997,195.050003,196.360001,196.360001,2785900
4,2019-02-21,FB,161.929993,162.240005,159.589996,160.039993,160.039993,15607800
5,2019-02-22,AAPL,171.580002,173.0,171.380005,172.970001,172.970001,18913200
6,2019-02-22,JD,24.549999,25.959999,24.48,25.950001,25.950001,27113300
7,2019-02-22,BABA,172.800003,177.020004,172.520004,176.919998,176.919998,16175600
8,2019-02-22,GS,196.600006,197.75,195.199997,196.0,196.0,2626600
9,2019-02-22,FB,160.580002,162.410004,160.309998,161.889999,161.889999,15858500


### 从长到宽 pivot

In [68]:
close_price = data.pivot(index='Date',
                        columns='Symbol',
                        values='Adj Close')
close_price

Symbol,AAPL,BABA,FB,GS,JD
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-02-21,171.059998,171.660004,160.039993,196.360001,24.27
2019-02-22,172.970001,176.919998,161.889999,196.0,25.950001
2019-02-25,174.229996,183.25,164.619995,198.649994,26.190001
2019-02-26,174.330002,183.539993,164.130005,198.899994,26.59


In [69]:
data.pivot(index='Date',
          columns='Symbol',
          values=['Adj Close','Volume'])

Unnamed: 0_level_0,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Volume,Volume,Volume,Volume,Volume
Symbol,AAPL,BABA,FB,GS,JD,AAPL,BABA,FB,GS,JD
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
2019-02-21,171.059998,171.660004,160.039993,196.360001,24.27,17249700.0,8434800.0,15607800.0,2785900.0,13542600.0
2019-02-22,172.970001,176.919998,161.889999,196.0,25.950001,18913200.0,16175600.0,15858500.0,2626600.0,27113300.0
2019-02-25,174.229996,183.25,164.619995,198.649994,26.190001,21873400.0,22831800.0,18737100.0,3032200.0,29338500.0
2019-02-26,174.330002,183.539993,164.130005,198.899994,26.59,17006000.0,13857900.0,13645200.0,2498000.0,20264100.0


In [70]:
all_pivot = data.pivot(index='Date', columns='Symbol')
all_pivot

Unnamed: 0_level_0,Open,Open,Open,Open,Open,High,High,High,High,High,...,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Volume,Volume,Volume,Volume,Volume
Symbol,AAPL,BABA,FB,GS,JD,AAPL,BABA,FB,GS,JD,...,AAPL,BABA,FB,GS,JD,AAPL,BABA,FB,GS,JD
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2019-02-21,171.800003,171.0,161.929993,198.970001,24.82,172.369995,171.779999,162.240005,199.449997,24.879999,...,171.059998,171.660004,160.039993,196.360001,24.27,17249700,8434800,15607800,2785900,13542600
2019-02-22,171.580002,172.800003,160.580002,196.600006,24.549999,173.0,177.020004,162.410004,197.75,25.959999,...,172.970001,176.919998,161.889999,196.0,25.950001,18913200,16175600,15858500,2626600,27113300
2019-02-25,174.160004,181.259995,163.070007,198.0,27.110001,175.869995,183.720001,166.070007,201.5,27.379999,...,174.229996,183.25,164.619995,198.649994,26.190001,21873400,22831800,18737100,3032200,29338500
2019-02-26,173.710007,179.789993,164.339996,198.470001,25.98,175.300003,184.350006,166.240005,200.559998,26.82,...,174.330002,183.539993,164.130005,198.899994,26.59,17006000,13857900,13645200,2498000,20264100


In [71]:
all_pivot['Open'].iloc[2:,1:3]

Symbol,BABA,FB
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-02-25,181.259995,163.070007
2019-02-26,179.789993,164.339996


### 从宽到长 melt

In [72]:
melted_data = pd.melt(data, id_vars=['Date','Symbol'])
melted_data.head(5).append(melted_data.tail(5))

Unnamed: 0,Date,Symbol,variable,value
0,2019-02-21,AAPL,Open,171.8
1,2019-02-21,JD,Open,24.82
2,2019-02-21,BABA,Open,171.0
3,2019-02-21,GS,Open,198.97
4,2019-02-21,FB,Open,161.93
115,2019-02-26,AAPL,Volume,17006000.0
116,2019-02-26,JD,Volume,20264100.0
117,2019-02-26,BABA,Volume,13857900.0
118,2019-02-26,GS,Volume,2498000.0
119,2019-02-26,FB,Volume,13645200.0


In [74]:
#筛选信息
melted_data[lambda x: (x.Date=='25/02/2019')
           & ((x.Symbol=='BABA')|(x.Symbol=='FB'))]
#调用函数

Unnamed: 0,Date,Symbol,variable,value
12,2019-02-25,BABA,Open,181.26
14,2019-02-25,FB,Open,163.07
32,2019-02-25,BABA,High,183.72
34,2019-02-25,FB,High,166.07
52,2019-02-25,BABA,Low,180.73
54,2019-02-25,FB,Low,162.9
72,2019-02-25,BABA,Close,183.25
74,2019-02-25,FB,Close,164.62
92,2019-02-25,BABA,Adj Close,183.25
94,2019-02-25,FB,Adj Close,164.62


# 6. 数据表的分组和整合

split-apply-combine

## 6.1 数据准备

In [77]:
data = pd.read_csv('stock data/1Y_Stock_Data.csv', parse_dates=[0],dayfirst=True)
data.head(3).append(data.tail(3))

Unnamed: 0,Date,Symbol,Open,High,Low,Close,Adj Close,Volume
0,2018-02-26,AAPL,176.350006,179.389999,176.210007,178.970001,176.285675,38162200
1,2018-02-27,AAPL,179.100006,180.479996,178.160004,178.389999,175.714386,38928100
2,2018-02-28,AAPL,179.259995,180.619995,178.050003,178.119995,175.44841,37782100
1257,2019-02-22,GS,196.600006,197.75,195.199997,196.0,196.0,2626600
1258,2019-02-25,GS,198.0,201.5,197.710007,198.649994,198.649994,3032200
1259,2019-02-26,GS,198.470001,200.559998,196.550003,198.899994,198.899994,2498000


In [79]:
data1 = data[['Date','Symbol','Adj Close']]
data1.insert(1, 'Year', pd.DatetimeIndex(data1['Date']).year)
data1.insert(2, 'Month', pd.DatetimeIndex(data1['Date']).month)
data1.head(3).append(data1.tail(3))

Unnamed: 0,Date,Year,Month,Symbol,Adj Close
0,2018-02-26,2018,2,AAPL,176.285675
1,2018-02-27,2018,2,AAPL,175.714386
2,2018-02-28,2018,2,AAPL,175.44841
1257,2019-02-22,2019,2,GS,196.0
1258,2019-02-25,2019,2,GS,198.649994
1259,2019-02-26,2019,2,GS,198.899994


## 6.2 分组

data.gruopby(label)

### 单标签分组

In [80]:
#按照 gruopby 分组
grouped = data1.groupby('Symbol')
grouped

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x000001F4D1CD25F8>

In [82]:
dir(grouped)

['Date',
 'Month',
 'Symbol',
 'Year',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '__weakref__',
 '_accessors',
 '_add_numeric_operations',
 '_agg_doc',
 '_aggregate',
 '_aggregate_generic',
 '_aggregate_item_by_item',
 '_aggregate_multiple_funcs',
 '_apply_filter',
 '_apply_to_column_groupbys',
 '_apply_whitelist',
 '_assure_grouper',
 '_block_agg_axis',
 '_bool_agg',
 '_builtin_table',
 '_choose_path',
 '_concat_objects',
 '_constructor',
 '_cumcount_array',
 '_cython_agg_blocks',
 '_cython_agg_general',
 '_cython_table',
 '_cython_transform',
 '_decide_output_index',
 '_def_str',
 '_define_p

In [83]:
grouped.ngroups

5

In [84]:
grouped.size()

Symbol
AAPL    252
BABA    252
FB      252
GS      252
JD      252
dtype: int64

In [85]:
grouped.groups

{'AAPL': Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
             ...
             242, 243, 244, 245, 246, 247, 248, 249, 250, 251],
            dtype='int64', length=252),
 'BABA': Int64Index([252, 253, 254, 255, 256, 257, 258, 259, 260, 261,
             ...
             494, 495, 496, 497, 498, 499, 500, 501, 502, 503],
            dtype='int64', length=252),
 'FB': Int64Index([ 756,  757,  758,  759,  760,  761,  762,  763,  764,  765,
             ...
              998,  999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007],
            dtype='int64', length=252),
 'GS': Int64Index([1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017,
             ...
             1250, 1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258, 1259],
            dtype='int64', length=252),
 'JD': Int64Index([504, 505, 506, 507, 508, 509, 510, 511, 512, 513,
             ...
             746, 747, 748, 749, 750, 751, 752, 753, 754, 755],
            dtype='int64', length=252)}

In [86]:
#GS 组数据前 5 行
grouped.get_group('GS').head()

Unnamed: 0,Date,Year,Month,Symbol,Adj Close
1008,2018-02-26,2018,2,GS,267.574249
1009,2018-02-27,2018,2,GS,264.289459
1010,2018-02-28,2018,2,GS,260.085419
1011,2018-03-01,2018,3,GS,254.001984
1012,2018-03-02,2018,3,GS,255.327515


In [87]:
def print_groups(group_obj):
    #打印组的名字以及前 5 行信息
    for name, group in group_obj:
        print(name)
        print(group.head())

In [88]:
print_groups(grouped)

AAPL
        Date  Year  Month Symbol   Adj Close
0 2018-02-26  2018      2   AAPL  176.285675
1 2018-02-27  2018      2   AAPL  175.714386
2 2018-02-28  2018      2   AAPL  175.448410
3 2018-03-01  2018      3   AAPL  172.375214
4 2018-03-02  2018      3   AAPL  173.567078
BABA
          Date  Year  Month Symbol   Adj Close
252 2018-02-26  2018      2   BABA  194.190002
253 2018-02-27  2018      2   BABA  188.259995
254 2018-02-28  2018      2   BABA  186.139999
255 2018-03-01  2018      3   BABA  181.990005
256 2018-03-02  2018      3   BABA  179.759995
FB
          Date  Year  Month Symbol   Adj Close
756 2018-02-26  2018      2     FB  184.929993
757 2018-02-27  2018      2     FB  181.460007
758 2018-02-28  2018      2     FB  178.320007
759 2018-03-01  2018      3     FB  175.940002
760 2018-03-02  2018      3     FB  176.619995
GS
           Date  Year  Month Symbol   Adj Close
1008 2018-02-26  2018      2     GS  267.574249
1009 2018-02-27  2018      2     GS  264.289459
1010 2

In [89]:
print(grouped)

<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x000001F4D1CD25F8>


### 多标签分组

In [91]:
grouped2 = data1.groupby(['Symbol', 'Year', 'Month'])
print_groups(grouped2)

('AAPL', 2018, 2)
        Date  Year  Month Symbol   Adj Close
0 2018-02-26  2018      2   AAPL  176.285675
1 2018-02-27  2018      2   AAPL  175.714386
2 2018-02-28  2018      2   AAPL  175.448410
('AAPL', 2018, 3)
        Date  Year  Month Symbol   Adj Close
3 2018-03-01  2018      3   AAPL  172.375214
4 2018-03-02  2018      3   AAPL  173.567078
5 2018-03-05  2018      3   AAPL  174.167923
6 2018-03-06  2018      3   AAPL  174.020172
7 2018-03-07  2018      3   AAPL  172.404770
('AAPL', 2018, 4)
         Date  Year  Month Symbol   Adj Close
24 2018-04-02  2018      4   AAPL  164.180008
25 2018-04-03  2018      4   AAPL  165.864349
26 2018-04-04  2018      4   AAPL  169.036072
27 2018-04-05  2018      4   AAPL  170.208206
28 2018-04-06  2018      4   AAPL  165.854523
('AAPL', 2018, 5)
         Date  Year  Month Symbol   Adj Close
45 2018-05-01  2018      5   AAPL  166.563705
46 2018-05-02  2018      5   AAPL  173.921677
47 2018-05-03  2018      5   AAPL  174.236862
48 2018-05-04  201

In [92]:
data2 = data1.set_index(['Symbol', 'Year', 'Month'])
data2.head().append(data2.tail())

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Date,Adj Close
Symbol,Year,Month,Unnamed: 3_level_1,Unnamed: 4_level_1
AAPL,2018,2,2018-02-26,176.285675
AAPL,2018,2,2018-02-27,175.714386
AAPL,2018,2,2018-02-28,175.44841
AAPL,2018,3,2018-03-01,172.375214
AAPL,2018,3,2018-03-02,173.567078
GS,2019,2,2019-02-20,198.600006
GS,2019,2,2019-02-21,196.360001
GS,2019,2,2019-02-22,196.0
GS,2019,2,2019-02-25,198.649994
GS,2019,2,2019-02-26,198.899994


In [93]:
grouped3 = data2.groupby(level=1)
print_groups(grouped3)

2018
                        Date   Adj Close
Symbol Year Month                       
AAPL   2018 2     2018-02-26  176.285675
            2     2018-02-27  175.714386
            2     2018-02-28  175.448410
            3     2018-03-01  172.375214
            3     2018-03-02  173.567078
2019
                        Date   Adj Close
Symbol Year Month                       
AAPL   2019 1     2019-01-02  157.245605
            1     2019-01-03  141.582779
            1     2019-01-04  147.626846
            1     2019-01-07  147.298264
            1     2019-01-08  150.106216


In [96]:
grouped4 = data2.groupby(level=[0, 2])
print_groups(grouped4)

('AAPL', 1)
                        Date   Adj Close
Symbol Year Month                       
AAPL   2019 1     2019-01-02  157.245605
            1     2019-01-03  141.582779
            1     2019-01-04  147.626846
            1     2019-01-07  147.298264
            1     2019-01-08  150.106216
('AAPL', 2)
                        Date   Adj Close
Symbol Year Month                       
AAPL   2018 2     2018-02-26  176.285675
            2     2018-02-27  175.714386
            2     2018-02-28  175.448410
       2019 2     2019-02-01  165.808884
            2     2019-02-04  170.518677
('AAPL', 3)
                        Date   Adj Close
Symbol Year Month                       
AAPL   2018 3     2018-03-01  172.375214
            3     2018-03-02  173.567078
            3     2018-03-05  174.167923
            3     2018-03-06  174.020172
            3     2018-03-07  172.404770
('AAPL', 4)
                        Date   Adj Close
Symbol Year Month                       
AAPL   20

## 6.3 整合 aggregating

In [97]:
grouped.size()

Symbol
AAPL    252
BABA    252
FB      252
GS      252
JD      252
dtype: int64

In [98]:
grouped.mean()

Unnamed: 0_level_0,Year,Month,Adj Close
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,2018.150794,6.488095,186.022309
BABA,2018.150794,6.488095,171.780992
FB,2018.150794,6.488095,167.244841
GS,2018.150794,6.488095,221.593412
JD,2018.150794,6.488095,31.340754


In [99]:
result = grouped4.agg(np.mean)
result.head().append(result.tail())

Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Close
Symbol,Month,Unnamed: 2_level_1
AAPL,1,153.507824
AAPL,2,171.929904
AAPL,3,171.878962
AAPL,4,167.286981
AAPL,5,183.207503
JD,8,33.673478
JD,9,26.507369
JD,10,23.570435
JD,11,22.139048
JD,12,21.186842


In [101]:
result = grouped4.agg([np.mean, np.std])
result.head().append(result.tail())

Unnamed: 0_level_0,Unnamed: 1_level_0,Adj Close,Adj Close
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std
Symbol,Month,Unnamed: 2_level_2,Unnamed: 3_level_2
AAPL,1,153.507824,5.318589
AAPL,2,171.929904,2.54405
AAPL,3,171.878962,4.734514
AAPL,4,167.286981,4.921071
AAPL,5,183.207503,5.115191
JD,8,33.673478,2.014952
JD,9,26.507369,1.185467
JD,10,23.570435,0.984542
JD,11,22.139048,1.690337
JD,12,21.186842,0.764831


In [102]:
result = grouped.agg(lambda x: np.max(x) - np.min(x))
result.head().append(result.tail())

Unnamed: 0_level_0,Date,Year,Month,Adj Close
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AAPL,365 days,1,11,88.692703
BABA,365 days,1,11,80.259995
FB,365 days,1,11,93.440002
GS,365 days,1,11,114.072418
JD,365 days,1,11,29.529999
AAPL,365 days,1,11,88.692703
BABA,365 days,1,11,80.259995
FB,365 days,1,11,93.440002
GS,365 days,1,11,114.072418
JD,365 days,1,11,29.529999


## 6.4 split-apply-combine

In [105]:
def top(df, n=5, column='Volume'):
    return df.sort_values(by=column)[-n:]

In [106]:
top(data)

Unnamed: 0,Date,Symbol,Open,High,Low,Close,Adj Close,Volume
145,2018-09-21,AAPL,220.779999,221.360001,217.289993,217.660004,215.976913,96246700
773,2018-03-21,FB,164.800003,173.399994,163.300003,169.389999,169.389999,105920200
776,2018-03-26,FB,160.820007,161.100006,149.020004,160.059998,160.059998,126116600
772,2018-03-20,FB,167.470001,170.199997,161.949997,168.149994,168.149994,129851800
861,2018-07-26,FB,174.889999,180.130005,173.75,176.259995,176.259995,169803700


### apply 函数

In [107]:
data.groupby('Symbol').apply(top) #apply 高阶函数

Unnamed: 0_level_0,Unnamed: 1_level_0,Date,Symbol,Open,High,Low,Close,Adj Close,Volume
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AAPL,109,2018-08-01,AAPL,199.130005,201.759995,197.309998,201.5,199.243088,67935700
AAPL,215,2019-01-03,AAPL,143.979996,145.720001,142.0,142.190002,141.582779,91312200
AAPL,175,2018-11-02,AAPL,209.550003,213.649994,205.429993,207.479996,205.87561,91328700
AAPL,208,2018-12-21,AAPL,156.860001,158.160004,149.630005,150.729996,150.086304,95744600
AAPL,145,2018-09-21,AAPL,220.779999,221.360001,217.289993,217.660004,215.976913,96246700
BABA,427,2018-11-02,BABA,152.559998,154.360001,146.279999,147.589996,147.589996,45985800
BABA,426,2018-11-01,BABA,144.979996,152.317993,138.619995,151.25,151.25,47039300
BABA,410,2018-10-10,BABA,142.5,144.0,137.919998,138.289993,138.289993,55828800
BABA,300,2018-05-04,BABA,180.399994,190.600006,178.619995,188.889999,188.889999,57788300
BABA,377,2018-08-23,BABA,184.970001,186.5,171.910004,172.229996,172.229996,78843400


In [108]:
data1.groupby(['Symbol','Year']).apply(top, n=1, column='Adj Close')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Date,Year,Month,Symbol,Adj Close
Symbol,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AAPL,2018,153,2018-10-03,2018,10,AAPL,230.275482
AAPL,2019,251,2019-02-26,2019,2,AAPL,174.330002
BABA,2018,328,2018-06-14,2018,6,BABA,210.860001
BABA,2019,503,2019-02-26,2019,2,BABA,183.539993
FB,2018,860,2018-07-25,2018,7,FB,217.5
FB,2019,993,2019-02-05,2019,2,FB,171.160004
GS,2018,1018,2018-03-12,2018,3,GS,270.422424
GS,2019,1234,2019-01-18,2019,1,GS,202.539993
JD,2018,504,2018-02-26,2018,2,JD,48.799999
JD,2019,755,2019-02-26,2019,2,JD,26.59
