# 8 数据规整：连接、联合与重塑
重点关注数据联合、连接以及重排

## 8.1 分层索引

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Series indexed by two levels: an outer letter key and an inner integer key.
data = pd.Series(
    np.random.randn(9),
    index=[list('aaabbccdd'), [1, 2, 3, 1, 3, 1, 2, 2, 3]],
)
data

a  1   -0.313520
   2    0.897684
   3   -0.165706
b  1   -1.827779
   3   -0.075693
c  1    0.734299
   2   -1.341649
d  2    0.659254
   3   -1.119590
dtype: float64

In [3]:
data.index

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 2),
            ('d', 3)],
           )

In [4]:
# 使用分层索引可以简洁地选择出数据的子集
data['b']

1   -1.827779
3   -0.075693
dtype: float64

In [5]:
data['b':'c']

b  1   -1.827779
   3   -0.075693
c  1    0.734299
   2   -1.341649
dtype: float64

In [6]:
data.loc[['b', 'd']]

b  1   -1.827779
   3   -0.075693
d  2    0.659254
   3   -1.119590
dtype: float64

In [7]:
# 内部层级中也可以选择，比如选择内层中索引为2的
data.loc[:, 2]

a    0.897684
c   -1.341649
d    0.659254
dtype: float64

In [8]:
# 重塑数据
data.unstack()

Unnamed: 0,1,2,3
a,-0.31352,0.897684,-0.165706
b,-1.827779,,-0.075693
c,0.734299,-1.341649,
d,,0.659254,-1.11959


In [9]:
# In a DataFrame, either axis can carry a hierarchical index.
# The level names for both axes are attached in one step with rename_axis.
df = pd.DataFrame(
    np.arange(12).reshape(4, 3),
    index=[list('aabb'), [1, 2, 1, 2]],
    columns=[['Ohio', 'Ohio', 'California'], ['Green', 'Red', 'Green']],
).rename_axis(index=['key1', 'key2'], columns=['State', 'Color'])
df

Unnamed: 0_level_0,State,Ohio,Ohio,California
Unnamed: 0_level_1,Color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [10]:
# 筛选出数据
df['Ohio']

Unnamed: 0_level_0,Color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [11]:
df.loc['a']

State,Ohio,Ohio,California
Color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,0,1,2
2,3,4,5


In [12]:
from pandas import MultiIndex

# A MultiIndex can be built on its own and reused across frames.
# Column labels are given as (State, Color) pairs.
columns = MultiIndex.from_tuples(
    [('Ohio', 'Green'), ('Ohio', 'Red'), ('California', 'Green')],
    names=['State', 'Color'],
)
# Row labels are every combination of key1 in {'a', 'b'} and key2 in {1, 2}.
indexes = MultiIndex.from_product([['a', 'b'], [1, 2]], names=['key1', 'key2'])

In [13]:
df2 = pd.DataFrame(np.arange(12).reshape(4, 3),
                   columns=columns,
                   index=indexes)

In [14]:
df2

Unnamed: 0_level_0,State,Ohio,Ohio,California
Unnamed: 0_level_1,Color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


### 8.1.1 重排序和层级排序

In [15]:
# 重新排列轴上的层级顺序,
# swaplevel接收两个层级序号或层级名称，返回一个层级变更的新对象
df2.swaplevel('key1', 'key2')

Unnamed: 0_level_0,State,Ohio,Ohio,California
Unnamed: 0_level_1,Color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [16]:
# level=0表示第一层(key1)，level=1表示第二层(key2)
# 如果索引按照字典顺序从最外层开始排序，那么数据选择性能会更好
df2.sort_index(level=0, ascending=False)

Unnamed: 0_level_0,State,Ohio,Ohio,California
Unnamed: 0_level_1,Color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
b,2,9,10,11
b,1,6,7,8
a,2,3,4,5
a,1,0,1,2


### 8.1.2 按层级进行汇总统计
Series和DataFrame上的很多描述性统计和汇总统计都支持按索引层级聚合：通过level指定在某个轴的特定层级上进行聚合（新版pandas中统计方法的level参数已被移除，应如下所示改用groupby(level=...)）

In [17]:
df2.groupby(level='key1').sum()

State,Ohio,Ohio,California
Color,Green,Red,Green
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
a,3,5,7
b,15,17,19


In [18]:
# Sum the columns that share the same 'Color' label.
# groupby(axis=1) is deprecated since pandas 2.1, so transpose, group on the
# row axis by the 'Color' level, and transpose back to the original layout.
df2.T.groupby(level='Color').sum().T

Unnamed: 0_level_0,Color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


### 8.1.3 使用DataFrame的列进行索引

In [19]:
# Small frame whose 'c' and 'd' columns are used as index keys in the cells below.
df3 = pd.DataFrame({
    'a': range(7),
    'b': range(7, 0, -1),
    'c': ['one'] * 3 + ['two'] * 4,
    'd': [*range(3), *range(4)],
})
df3

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [20]:
df3.set_index(['c', 'd'])

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [21]:
# 默认情况下，设置为index的列会从DataFrame中移除，也可以保留
df3.set_index(['c', 'd'], drop=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


In [22]:
# reset_index是set_index的反向操作
df4 = df3.set_index(['c', 'd'])
df4.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


## 8.2 联合与合并数据集

In [23]:
# Database-style DataFrame joins: two frames that share a 'key' column.
df1 = pd.DataFrame({'key': list('bbacaab'), 'data1': range(7)})
df2 = pd.DataFrame({'key': list('abd'), 'data2': range(3)})
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [24]:
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


In [25]:
# 使用pd.merge连接
pd.merge(df1, df2, on='key')

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


In [26]:
# When the key columns are named differently, name each side's key explicitly.
df3 = pd.DataFrame({'lkey': list('bbacaab'), 'data1': range(7)})
df4 = pd.DataFrame({'rkey': list('abd'), 'data2': range(3)})
df3.merge(df4, left_on='lkey', right_on='rkey')

Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,1,b,1
2,b,6,b,1
3,a,2,a,0
4,a,4,a,0
5,a,5,a,0


In [27]:
# pd.merge默认情况下是内连接，取交集
# 可以指定连接方式 outer为外连接，取并集
pd.merge(df3, df4, how='outer', left_on='lkey', right_on='rkey')

Unnamed: 0,lkey,data1,rkey,data2
0,b,0.0,b,1.0
1,b,1.0,b,1.0
2,b,6.0,b,1.0
3,a,2.0,a,0.0
4,a,4.0,a,0.0
5,a,5.0,a,0.0
6,c,3.0,,
7,,,d,2.0


In [28]:
# Merging on several keys: pass a list of column names to `on`.
# Both frames carry an 'lval' column, so the merged result distinguishes them
# with the default _x / _y suffixes.
left = pd.DataFrame(
    [('foo', 'one', 1), ('foo', 'two', 2), ('bar', 'one', 3)],
    columns=['key1', 'key2', 'lval'],
)
right = pd.DataFrame(
    [('foo', 'one', 4), ('foo', 'one', 5), ('bar', 'one', 6), ('bar', 'two', 7)],
    columns=['key1', 'key2', 'lval'],
)
left.merge(right, on=['key1', 'key2'], how='outer')

Unnamed: 0,key1,key2,lval_x,lval_y
0,foo,one,1.0,4.0
1,foo,one,1.0,5.0
2,foo,two,2.0,
3,bar,one,3.0,6.0
4,bar,two,,7.0


In [29]:
# 处理重叠的列名，使用suffixes参数
pd.merge(left, right, on='key1', suffixes=('_left', '_right'))

Unnamed: 0,key1,key2_left,lval_left,key2_right,lval_right
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


### 8.2.2 根据索引合并

In [30]:
# Join a key column on the left against the row index on the right.
left1 = pd.DataFrame({'key': list('abaabc'), 'value': range(6)})
right1 = pd.DataFrame({'group_val': [3.5, 7]}, index=list('ab'))
# left_index=True / right_index=True tell merge to use that frame's index as its join key.
left1.merge(right1, left_on='key', right_index=True)

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0


In [31]:
pd.merge(left1, right1, left_on='key', right_index=True, how='outer')

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0
5,c,5,


In [32]:
# With a hierarchical index things get more involved: joining on the index
# is implicitly a multi-key merge.
df1 = pd.DataFrame({
    'key1': ['Ohio'] * 3 + ['Nevada'] * 2,
    'key2': [2000, 2001, 2002, 2001, 2002],
    'data': np.arange(5, dtype=float),
})
df2 = pd.DataFrame(
    np.arange(12).reshape(6, 2),
    index=[['Nevada'] * 2 + ['Ohio'] * 4,
           [2001, 2000, 2000, 2000, 2001, 2002]],
)
df1

Unnamed: 0,key1,key2,data
0,Ohio,2000,0.0
1,Ohio,2001,1.0
2,Ohio,2002,2.0
3,Nevada,2001,3.0
4,Nevada,2002,4.0


In [33]:
df2

Unnamed: 0,Unnamed: 1,0,1
Nevada,2001,0,1
Nevada,2000,2,3
Ohio,2000,4,5
Ohio,2000,6,7
Ohio,2001,8,9
Ohio,2002,10,11


In [36]:
# 对于多重索引合并的情况，需要以列表的形式指明合并所需的多个列
pd.merge(df1, df2, left_on=['key1', 'key2'], right_index=True, how='outer')

Unnamed: 0,key1,key2,data,0,1
0,Ohio,2000,0.0,4.0,5.0
0,Ohio,2000,0.0,6.0,7.0
1,Ohio,2001,1.0,8.0,9.0
2,Ohio,2002,2.0,10.0,11.0
3,Nevada,2001,3.0,0.0,1.0
4,Nevada,2002,4.0,,
4,Nevada,2000,,2.0,3.0


In [37]:
# Merging on the index of both sides is possible as well.
df3 = pd.DataFrame({'Ohio': [1, 3, 5], 'Nevada': [2, 4, 6]},
                   index=list('ace'))
df4 = pd.DataFrame({'Missouri': [7, 9, 11, 13], 'Alabama': [8, 10, 12, 14]},
                   index=list('bcde'))
df3

Unnamed: 0,Ohio,Nevada
a,1,2
c,3,4
e,5,6


In [38]:
df4

Unnamed: 0,Missouri,Alabama
b,7,8
c,9,10
d,11,12
e,13,14


In [39]:
pd.merge(df3, df4, how='outer', left_index=True, right_index=True)

Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


In [40]:
# 也可以使用join方法
df3.join(df4, how='outer')

Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


In [41]:
df3.join(df4, how='left')

Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1,2,,
c,3,4,9.0,10.0
e,5,6,13.0,14.0


In [42]:
# A third frame, used in the cells below to join several frames at once.
another = pd.DataFrame({'New York': [7, 9, 11, 16], 'Oregon': [8, 10, 12, 17]},
                       index=list('acef'))
another

Unnamed: 0,New York,Oregon
a,7,8
c,9,10
e,11,12
f,16,17


In [43]:
df3.join([df4, another])

Unnamed: 0,Ohio,Nevada,Missouri,Alabama,New York,Oregon
a,1.0,2.0,,,7.0,8.0
c,3.0,4.0,9.0,10.0,9.0,10.0
e,5.0,6.0,13.0,14.0,11.0,12.0


In [45]:
df3.join([df4, another], how='outer')

Unnamed: 0,Ohio,Nevada,Missouri,Alabama,New York,Oregon
a,1.0,2.0,,,7.0,8.0
c,3.0,4.0,9.0,10.0,9.0,10.0
e,5.0,6.0,13.0,14.0,11.0,12.0
b,,,7.0,8.0,,
d,,,11.0,12.0,,
f,,,,,16.0,17.0


In [55]:
pd.merge(df3, df4, left_index=True, right_index=True)

Unnamed: 0,Ohio,Nevada,Missouri,Alabama
c,3,4,9,10
e,5,6,13,14


### 8.2.3 沿轴向连接

In [56]:
# Concatenating, binding, or stacking: start from a 4x3 array of 0..11.
arr = np.arange(12).reshape(-1, 3)
arr

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]])

In [60]:
# Numpy的拼接方法接收一个数组组成的列表
np.concatenate([arr, arr], axis=1)

array([[ 0,  1,  2,  0,  1,  2],
       [ 3,  4,  5,  3,  4,  5],
       [ 6,  7,  8,  6,  7,  8],
       [ 9, 10, 11,  9, 10, 11]])

In [61]:
# pandas' concat function, shown on three Series with disjoint indexes.
s1 = pd.Series([0, 1], index=list('ab'))
s2 = pd.Series([2, 3, 4], index=list('cde'))
s3 = pd.Series([5, 7], index=list('fg'))
# concat accepts any iterable of objects and joins along axis 0 by default.
pd.concat([s1, s2, s3])

a    0
b    1
c    2
d    3
e    4
f    5
g    7
dtype: int64

In [63]:
pd.concat((s1, s2, s3), axis=1)

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,7.0


In [64]:
# 假如想在连接上创建多层索引
# 添加的外层轴索引是和s1,s2,s3相关联的
pd.concat([s1, s2, s3], keys=['one', 'two', 'three'])

one    a    0
       b    1
two    c    2
       d    3
       e    4
three  f    5
       g    7
dtype: int64

In [65]:
# 当沿着axis=1上添加时，就成了DataFrame的列头
pd.concat([s1, s2, s3], keys=['one', 'two', 'three'], axis=1)

Unnamed: 0,one,two,three
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,7.0


In [68]:
# The same logic carries over to DataFrame objects.
df1 = pd.DataFrame(
    np.arange(6).reshape(3, 2),
    index=['a', 'b', 'c'],
    columns=['one', 'two'],
)
df2 = pd.DataFrame(
    np.arange(5, 9).reshape(2, 2),
    index=['a', 'c'],
    columns=['three', 'four'],
)
df1

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5


In [69]:
df2

Unnamed: 0,three,four
a,5,6
c,7,8


In [73]:
pd.concat([df1, df2], axis=1, keys=['l1', 'l2'])

Unnamed: 0_level_0,l1,l1,l2,l2
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [74]:
# 如果传递的对象是字典而不是列表，则字典的键会用于keys选项
pd.concat({'l1': df1, 'l2': df2}, axis=1)

Unnamed: 0_level_0,l1,l1,l2,l2
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [75]:
# 还有一些额外的参数负责多层索引生成
pd.concat([df1, df2], axis=1, keys=['l1', 'l2'], names=['upper', 'lower'])

upper,l1,l1,l2,l2
lower,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [76]:
# Last case: frames whose row index carries no meaningful information.
df1 = pd.DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])
# ignore_index=True drops the original row labels and renumbers the stacked rows.
pd.concat([df1, df2], ignore_index=True)

Unnamed: 0,a,b,c,d
0,-0.603532,0.13916,0.606285,0.412658
1,-0.338079,0.54586,0.377498,-2.335955
2,-0.172943,-0.585915,-0.624138,1.110642
3,0.575167,0.532482,,0.452961
4,0.07925,-0.824,,-0.037771


### 8.2.4 联合重叠数据

In [79]:
# Two Series over the same labels (in opposite order), each with its own gaps.
a = pd.Series([np.nan, 2.5, 0.0, 3.5, 4.5, np.nan],
              index=list(reversed('abcdef')))
b = pd.Series([0.0, np.nan, 2.0, np.nan, np.nan, 5.0],
              index=list('abcdef'))
a

f    NaN
e    2.5
d    0.0
c    3.5
b    4.5
a    NaN
dtype: float64

In [80]:
b

a    0.0
b    NaN
c    2.0
d    NaN
e    NaN
f    5.0
dtype: float64

In [81]:
# Series的combine_first：优先取调用者(b)的值，其缺失值(NaN)用参数(a)中相同索引处的值填补
b.combine_first(a)

a    0.0
b    4.5
c    2.0
d    0.0
e    2.5
f    5.0
dtype: float64