数据规整：连接、联合和重塑

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_note_interactivity = "all"
%matplotlib inline

import os
import sys
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt

1. 分层索引

————重排序及层级排序

————按层级进行汇总统计

————使用DF的列进行索引

2. 联合与合并数据集

————数据库风格的DF连接

————根据索引合并

————沿轴向连接

————联合重叠数据

3. 重塑和透视

————使用多层索引进行重塑

————将“长”透视为“宽”

————将“宽”透视为“长”

1. 分层索引

In [3]:
#分层索引提供了一种在耕地维度的形式中处理更高维度数据的方法。
data = pd.Series(np.random.randn(9),
                 index=[['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],
                        [1, 2, 3, 1, 3, 1, 2, 2, 3]])

In [5]:
data

a  1    1.456137
   2    0.204480
   3    0.913091
b  1    0.460049
   3   -1.326830
c  1    0.398192
   2    0.206917
d  2    0.126377
   3   -1.134895
dtype: float64

In [6]:
data.index

MultiIndex(levels=[['a', 'b', 'c', 'd'], [1, 2, 3]],
           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1, 1, 2]])

In [7]:
data['b']

1    0.460049
3   -1.326830
dtype: float64

In [8]:
data['b':'c']

b  1    0.460049
   3   -1.326830
c  1    0.398192
   2    0.206917
dtype: float64

In [9]:
data.loc[['b','d']]

b  1    0.460049
   3   -1.326830
d  2    0.126377
   3   -1.134895
dtype: float64

In [10]:
data.loc[:, 2]

a    0.204480
c    0.206917
d    0.126377
dtype: float64

In [15]:
data.loc[:,1]

a    1.456137
b    0.460049
c    0.398192
dtype: float64

In [16]:
data

a  1    1.456137
   2    0.204480
   3    0.913091
b  1    0.460049
   3   -1.326830
c  1    0.398192
   2    0.206917
d  2    0.126377
   3   -1.134895
dtype: float64

In [17]:
#使用unstack方法将数据在DF中进行重新排列

In [19]:
data.unstack()

Unnamed: 0,1,2,3
a,1.456137,0.20448,0.913091
b,0.460049,,-1.32683
c,0.398192,0.206917,
d,,0.126377,-1.134895


In [20]:
data.unstack().stack()

a  1    1.456137
   2    0.204480
   3    0.913091
b  1    0.460049
   3   -1.326830
c  1    0.398192
   2    0.206917
d  2    0.126377
   3   -1.134895
dtype: float64

In [22]:
#在DF中每一个轴都可以拥有分层索引

frame = pd.DataFrame(np.arange(12).reshape(4, 3),
                     index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
                            columns=[['Ohio', 'Ohio', 'Colorado'],
                                     ['Green', 'Red', 'Green']])

In [23]:
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [24]:
#分层的层级可以有名字

frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [25]:
frame['Ohio']

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [26]:
frame['Ohio']['Green']

key1  key2
a     1       0
      2       3
b     1       6
      2       9
Name: Green, dtype: int32

In [29]:
frame['Ohio']['Green']['a']

key2
1    0
2    3
Name: Green, dtype: int32

In [30]:
frame['Ohio']['Green'][:,1]

key1
a    0
b    6
Name: Green, dtype: int32

In [35]:
#自行构建MultiIndex对象
pd.MultiIndex.from_arrays([['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],
                       names=['state', 'color'])

MultiIndex(levels=[['Colorado', 'Ohio'], ['Green', 'Red']],
           labels=[[1, 1, 0], [0, 1, 0]],
           names=['state', 'color'])

重排序和层级排序

In [38]:
#swaplevel接收层级序号或者名称
frame.swaplevel('key1', 'key2')

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [37]:
frame.swaplevel(0, 1)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [39]:
#sort_index被用于层级排序，但只能在单一层级上的数据进行排序。
frame.sort_index(level=1) #level=1指key2索引

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [40]:
frame.sort_index(level=0) #level=0指key1索引

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [41]:
frame.swaplevel(0, 1).sort_index(level=0)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


按照层级进行汇总统计

In [42]:
frame.sum(level='key2')

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [44]:
frame.sum(level='color', axis=1) #横向的汇总

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


使用DF的列进行索引

In [48]:
frame = pd.DataFrame({'a': range(7), 'b': range(7, 0, -1),
                      'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
                      'd': [0, 1, 2, 0, 1, 2, 3]})

In [47]:
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,onw,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [49]:
#使用set_index函数生成一个新的DF，并指定一个或多个列为列索引

In [50]:
frame2 = frame.set_index(['c', 'd'])
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [51]:
frame.set_index(['c', 'd'], drop=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


In [52]:
#reset_index为set_index的反操作
frame2.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


联合与合并数据集

类似于数据库连接操作——————pandas.merge

数据表轴向堆叠——————pandas.concat

将两个实例数据拼接，用其中一个对象的值填充另一个对象的缺失值——————pandas.combine_first

In [53]:
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],'data1': range(7)})

In [54]:
df2 = pd.DataFrame({'key': ['a', 'b', 'd'],
                    'data2': range(3)})

In [55]:
df1

Unnamed: 0,data1,key
0,0,b
1,1,b
2,2,a
3,3,c
4,4,a
5,5,a
6,6,b


In [56]:
df2

Unnamed: 0,data2,key
0,0,a
1,1,b
2,2,d


In [57]:
pd.merge(df1, df2)

Unnamed: 0,data1,key,data2
0,0,b,1
1,1,b,1
2,6,b,1
3,2,a,0
4,4,a,0
5,5,a,0


In [58]:
#尽量显式的指定连接键
pd.merge(df1, df2, on='key')

Unnamed: 0,data1,key,data2
0,0,b,1
1,1,b,1
2,6,b,1
3,2,a,0
4,4,a,0
5,5,a,0


In [59]:
df3 = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                    'data1': range(7)})

In [60]:
df4 = pd.DataFrame({'rkey': ['a', 'b', 'd'],
                    'data2': range(3)})

In [61]:
pd.merge(df3, df4, left_on='lkey', right_on='rkey')

Unnamed: 0,data1,lkey,data2,rkey
0,0,b,1,b
1,1,b,1,b
2,6,b,1,b
3,2,a,0,a
4,4,a,0,a
5,5,a,0,a


In [65]:
# 左连接left，右连接right，外链接outer,默认是内连接inner

pd.merge(df1, df2, how='left')

Unnamed: 0,data1,key,data2
0,0,b,1.0
1,1,b,1.0
2,2,a,0.0
3,3,c,
4,4,a,0.0
5,5,a,0.0
6,6,b,1.0


In [66]:
pd.merge(df1, df2, how='inner')

Unnamed: 0,data1,key,data2
0,0,b,1
1,1,b,1
2,6,b,1
3,2,a,0
4,4,a,0
5,5,a,0


In [67]:
#合理地处理重叠列名，suffixes参数
left = pd.DataFrame({'key1': ['foo', 'foo', 'bar'],
                     'key2': ['one', 'two', 'one'],
                     'lval': [1, 2, 3]})
right = pd.DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
                      'key2': ['one', 'one', 'one', 'two'],
                      'rval': [4, 5, 6, 7]})

In [68]:
pd.merge(left, right, on=['key1', 'key2'], how='outer')

Unnamed: 0,key1,key2,lval,rval
0,foo,one,1.0,4.0
1,foo,one,1.0,5.0
2,foo,two,2.0,
3,bar,one,3.0,6.0
4,bar,two,,7.0


In [69]:
pd.merge(left, right, on='key1')

Unnamed: 0,key1,key2_x,lval,key2_y,rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


In [70]:
pd.merge(left, right, on='key1', suffixes=('_left', '_right'))

Unnamed: 0,key1,key2_left,lval,key2_right,rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


merge函数的参数：

left，right，how，on，left_on，right_on，left_index（使用索引作为连接键），right_index，sort，suffixes，copy，indicator

In [71]:
#根据索引合并

In [74]:
left1 = pd.DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'],
                      'value': range(6)})
right1 = pd.DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])

In [75]:
left1

Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5


In [76]:
right1

Unnamed: 0,group_val
a,3.5
b,7.0


In [77]:
pd.merge(left1, right1, left_on='key', right_index=True)

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0


In [78]:
pd.merge(left1, right1, left_on='key', right_index=True)

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0


In [79]:
pd.merge(left1, right1, left_on='key', right_index=True, how='outer')

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0
5,c,5,


In [82]:
lefth = pd.DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
                      'key2': [2000, 2001, 2002, 2001, 2002],
                      'data': np.arange(5.)})
righth = pd.DataFrame(np.arange(12).reshape(6, 2),
                      index=[['Nevada', 'Nevada', 'Ohio', 'Ohio', 'Ohio', 'Ohio'],
                            [2001, 2000, 2000, 2000, 2001, 2002]],
                      columns=['event1', 'event2'])

In [83]:
lefth

Unnamed: 0,data,key1,key2
0,0.0,Ohio,2000
1,1.0,Ohio,2001
2,2.0,Ohio,2002
3,3.0,Nevada,2001
4,4.0,Nevada,2002


In [84]:
righth

Unnamed: 0,Unnamed: 1,event1,event2
Nevada,2001,0,1
Nevada,2000,2,3
Ohio,2000,4,5
Ohio,2000,6,7
Ohio,2001,8,9
Ohio,2002,10,11


In [85]:
pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True)

Unnamed: 0,data,key1,key2,event1,event2
0,0.0,Ohio,2000,4,5
0,0.0,Ohio,2000,6,7
1,1.0,Ohio,2001,8,9
2,2.0,Ohio,2002,10,11
3,3.0,Nevada,2001,0,1


In [86]:
pd.merge(lefth, righth, left_on=['key1', 'key2'],
         right_index=True, how='outer')

Unnamed: 0,data,key1,key2,event1,event2
0,0.0,Ohio,2000,4.0,5.0
0,0.0,Ohio,2000,6.0,7.0
1,1.0,Ohio,2001,8.0,9.0
2,2.0,Ohio,2002,10.0,11.0
3,3.0,Nevada,2001,0.0,1.0
4,4.0,Nevada,2002,,
4,,Nevada,2000,2.0,3.0


In [87]:
#使用两边的索引进行合并也是ok的

left2 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]],
                     index=['a', 'c', 'e'],
                     columns=['Ohio', 'Nevada'])

In [88]:
right2 = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],
                      index=['b', 'c', 'd', 'e'],
                      columns=['Missouri', 'Alabama'])

In [89]:
left2

Unnamed: 0,Ohio,Nevada
a,1.0,2.0
c,3.0,4.0
e,5.0,6.0


In [90]:
right2

Unnamed: 0,Missouri,Alabama
b,7.0,8.0
c,9.0,10.0
d,11.0,12.0
e,13.0,14.0


In [91]:
pd.merge(left2, right2, how='outer', left_index=True, right_index=True)

Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


join实例方法用起来更为方便，用于按照索引合并。也可以用于合并多个索引相同或相似但没有重叠列的DF对象。

字符串操作也有join函数，别混淆了

In [92]:
left2.join(right2, how='outer')

Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


In [96]:
#复习一下str.join([])
str1='asdf'
str2='asdfasfd'
'|||'.join([str1,str2])

'asdf|||asdfasfd'

In [95]:
left1.join(right1, on='key')

Unnamed: 0,key,value,group_val
0,a,0,3.5
1,b,1,7.0
2,a,2,3.5
3,a,3,3.5
4,b,4,7.0
5,c,5,


In [97]:
#join的另一个应用是，对于一些简单的索引，用户可以向join方法传入一个DF列表，该方法可以简单替代concat用于表的堆叠

In [98]:
another = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],
                       index=['a', 'c', 'e', 'f'],
                       columns=['New York', 'Oregon'])

In [99]:
another

Unnamed: 0,New York,Oregon
a,7.0,8.0
c,9.0,10.0
e,11.0,12.0
f,16.0,17.0


In [100]:
left2.join([right2, another])

Unnamed: 0,Ohio,Nevada,Missouri,Alabama,New York,Oregon
a,1.0,2.0,,,7.0,8.0
c,3.0,4.0,9.0,10.0,9.0,10.0
e,5.0,6.0,13.0,14.0,11.0,12.0


In [101]:
left2.join([right2, another], how='outer')

Unnamed: 0,Ohio,Nevada,Missouri,Alabama,New York,Oregon
a,1.0,2.0,,,7.0,8.0
b,,,7.0,8.0,,
c,3.0,4.0,9.0,10.0,9.0,10.0
d,,,11.0,12.0,,
e,5.0,6.0,13.0,14.0,11.0,12.0
f,,,,,16.0,17.0


沿轴向连接

或者称为拼接、绑定和堆叠，NumPy的concatenate实现的是数组运算，concat实现的是DF表和Series的运算。

In [106]:
arr = np.arange(12).reshape(3, 4)
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [107]:
np.concatenate([arr, arr], axis=1)

array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])

In [105]:
np.concatenate([arr, arr])

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [108]:
s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = pd.Series([5, 6], index=['f', 'g'])

In [112]:
pd.concat([s1, s2, s3]) #concat默认是进行纵向拼接

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

In [113]:
pd.concat([s1, s2, s3]
          , axis=1)

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [114]:
s4 = pd.concat([s1, s3])

In [115]:
s4

a    0
b    1
f    5
g    6
dtype: int64

In [116]:
pd.concat([s1, s4], axis =1)

Unnamed: 0,0,1
a,0.0,0
b,1.0,1
f,,5
g,,6


In [117]:
pd.concat([s1, s4], axis=1, join='inner')

Unnamed: 0,0,1
a,0,0
b,1,1


In [121]:
pd.concat([s1, s4], axis=1, join_axes=[['a', 'c', 'e']]) 

Unnamed: 0,0,1
a,0.0,0.0
c,,
e,,


In [129]:
#用keys来区分拼接后的结果，辨别原始数据属于哪一部分

result = pd.concat([s1, s1, s3], keys = ['one', 'two', 'three'])
result

one    a    0
       b    1
two    a    0
       b    1
three  f    5
       g    6
dtype: int64

In [161]:
res_df = result.unstack()

print(res_df.shape)
res_df

(3, 4)


C,a,b,f,g
one,0.0,1.0,,
two,0.0,1.0,,
three,,,5.0,6.0


In [136]:
help(result.swapaxes)

Help on method swapaxes in module pandas.core.generic:

swapaxes(axis1, axis2, copy=True) method of pandas.core.series.Series instance
    Interchange axes and swap values axes appropriately
    
    Returns
    -------
    y : same as input



In [149]:
res_df.columns.name = 'C'
res_df.index.name = 'I'

In [150]:
res_df

C,a,b,f,g
I,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
one,0.0,1.0,,
two,0.0,1.0,,
three,,,5.0,6.0


In [158]:
res_df.swapaxes('I', 'C') #???????????

ValueError: No axis named I for object type <class 'pandas.core.frame.DataFrame'>

In [163]:
#跳回到result
#横向堆积时，keys中的参数会变成列名
pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three'])

Unnamed: 0,one,two,three
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [165]:
df1 = pd.DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'],
                   columns=['one', 'two'])
df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'],
                   columns=['three', 'four'])

In [166]:
#同理concat使用与DF
pd.concat([df1, df2], keys = ['first', 'second'], axis=1)

Unnamed: 0_level_0,first,first,second,second
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [167]:
pd.concat({'level1': df1, 'level2': df2}, axis=1)

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [169]:
#用names来生成轴层级
pd.concat([df1, df2], axis=1, keys=['level1', 'level2'],
          names=['upper', 'lower'])
#相当于直接初始化了df.index.names和df.columns.names

upper,level1,level1,level2,level2
lower,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [170]:
#如果DF表中没被设置index怎么办

df1 = pd.DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])

In [171]:
df1

Unnamed: 0,a,b,c,d
0,0.956231,-1.230964,-0.115247,1.238308
1,0.951895,0.355957,0.016743,0.399602
2,0.967731,-0.663155,0.360556,0.860356


In [172]:
df2

Unnamed: 0,b,d,a
0,-0.31652,1.079503,-0.696327
1,-0.185494,-0.464592,1.623828


In [173]:
pd.concat([df1, df2], ignore_index=True)

Unnamed: 0,a,b,c,d
0,0.956231,-1.230964,-0.115247,1.238308
1,0.951895,0.355957,0.016743,0.399602
2,0.967731,-0.663155,0.360556,0.860356
3,-0.696327,-0.31652,,1.079503
4,1.623828,-0.185494,,-0.464592


In [175]:
pd.concat([df1, df2], ignore_index=True, axis=1)#列轴居然会自动去重，形成新的数字序列标定列名，funny

Unnamed: 0,0,1,2,3,4,5,6
0,0.956231,-1.230964,-0.115247,1.238308,-0.31652,1.079503,-0.696327
1,0.951895,0.355957,0.016743,0.399602,-0.185494,-0.464592,1.623828
2,0.967731,-0.663155,0.360556,0.860356,,,


In [177]:
pd.concat([df1, df2], axis=1)

Unnamed: 0,a,b,c,d,b.1,d.1,a.1
0,0.956231,-1.230964,-0.115247,1.238308,-0.31652,1.079503,-0.696327
1,0.951895,0.355957,0.016743,0.399602,-0.185494,-0.464592,1.623828
2,0.967731,-0.663155,0.360556,0.860356,,,


总结一下concat函数的参数

objs——需要连接的pandas对象列表或字典，必选

axis——连接轴，默认0为行方向

join——inner、outer

join_axes——用于指定其他n-1轴的特定索引，可以替代内/外连接的逻辑

keys——与要连接的对象关联的值，沿着连接轴形成分层索引

levels——在键值传递时，该参数用于指定多层索引的层级

names——如果传入了keys/levels参数，该参数用于多层索引的层级名称

verify_integrity——检查对象中的新轴是否重复，如果是，则引发异常，默认是False

ignore_index——不沿着连接轴保留索引，而产生一段新的（长度为total_length）的索引

联合重叠数据

In [178]:
a = pd.Series([np.nan, 2.5, 0., 3.5, 4.5, np.nan],
              index=['f', 'e', 'd', 'c', 'b', 'a'])
b = pd.Series([0., np.nan, 2., np.nan, np.nan, 5.],
              index=['a', 'b', 'c', 'd', 'e', 'f'])

In [179]:
a

f    NaN
e    2.5
d    0.0
c    3.5
b    4.5
a    NaN
dtype: float64

In [180]:
b

a    0.0
b    NaN
c    2.0
d    NaN
e    NaN
f    5.0
dtype: float64

In [181]:
np.where(pd.isnull(a), b, a)

array([0. , 2.5, 0. , 3.5, 4.5, 5. ])

In [182]:
#Series有一个combine_first函数与之很像

b.combine_first(a)

a    0.0
b    4.5
c    2.0
d    0.0
e    2.5
f    5.0
dtype: float64

In [184]:
#在DF中，combine_first函数会逐个列的做出相同的操作，可以理解为根据传入的数据来“填补”调用对象的缺失值
df1

Unnamed: 0,a,b,c,d
0,0.956231,-1.230964,-0.115247,1.238308
1,0.951895,0.355957,0.016743,0.399602
2,0.967731,-0.663155,0.360556,0.860356


In [185]:
df2

Unnamed: 0,b,d,a
0,-0.31652,1.079503,-0.696327
1,-0.185494,-0.464592,1.623828


In [186]:
df1.combine_first(df2)

Unnamed: 0,a,b,c,d
0,0.956231,-1.230964,-0.115247,1.238308
1,0.951895,0.355957,0.016743,0.399602
2,0.967731,-0.663155,0.360556,0.860356


3. 重塑与透视————重点

重排表格型数据有多种基础操作，该操作被称为重塑或者透视

第一种：使用多层索引进行重塑

stack（堆叠）：该操作会“旋转”或将列中的数据透视到行

unstack（拆堆）：该操作会将行中的数据透视到列

In [192]:
data = pd.DataFrame(np.arange(6).reshape(2, 3),
                    index=pd.Index(['Ohio', 'Colorado'], name='state'),
                    columns=pd.Index(['one', 'two', 'three'],
                    name='number'))

In [193]:
data

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [194]:
result = data. stack()

In [195]:
result

state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32

In [196]:
result.unstack()

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [197]:
result

state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32

In [198]:
#指定拆堆的多层轴是哪一个，0在这里指state，1指number
result.unstack(0)

state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [199]:
result.unstack('state')

state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [200]:
result.unstack('number')

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [201]:
s1 = pd.Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([4, 5, 6], index=['c', 'd', 'e'])
data2 = pd.concat([s1, s2], keys =['one', 'two'])

In [202]:
data2

one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: int64

In [203]:
data2.unstack()

Unnamed: 0,a,b,c,d,e
one,0.0,1.0,2.0,3.0,
two,,,4.0,5.0,6.0


In [205]:
#堆叠和拆堆的过程是可逆的
#堆叠操作默认会过滤掉NA值，NA = np.nan
data2.unstack().stack()

one  a    0.0
     b    1.0
     c    2.0
     d    3.0
two  c    4.0
     d    5.0
     e    6.0
dtype: float64

In [209]:
#当在DF中拆堆时，别拆堆的层级会变为结果中最低级的层级
df = pd.DataFrame({'left': result,'right': result + 5},
                  columns=pd.Index(['left', 'right'], name = 'side'))

In [210]:
df

Unnamed: 0_level_0,side,left,right
state,number,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,one,0,5
Ohio,two,1,6
Ohio,three,2,7
Colorado,one,3,8
Colorado,two,4,9
Colorado,three,5,10


In [211]:
df.unstack('state')

side,left,left,right,right
state,Ohio,Colorado,Ohio,Colorado
number,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,0,3,5,8
two,1,4,6,9
three,2,5,7,10


In [218]:
#在调用stack方法时，可以指明需要堆叠的轴向名称
df.unstack('state')

side,left,left,right,right
state,Ohio,Colorado,Ohio,Colorado
number,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,0,3,5,8
two,1,4,6,9
three,2,5,7,10


In [219]:
df.unstack('state').stack('side')

Unnamed: 0_level_0,state,Ohio,Colorado
number,side,Unnamed: 2_level_1,Unnamed: 3_level_1
one,left,0,3
one,right,5,8
two,left,1,4
two,right,6,9
three,left,2,5
three,right,7,10


将长透视为宽

In [222]:
data = pd.read_csv('code/examples/macrodata.csv')
data.head()

Unnamed: 0,year,quarter,realgdp,realcons,realinv,realgovt,realdpi,cpi,m1,tbilrate,unemp,pop,infl,realint
0,1959.0,1.0,2710.349,1707.4,286.898,470.045,1886.9,28.98,139.7,2.82,5.8,177.146,0.0,0.0
1,1959.0,2.0,2778.801,1733.7,310.859,481.301,1919.7,29.15,141.7,3.08,5.1,177.83,2.34,0.74
2,1959.0,3.0,2775.488,1751.8,289.226,491.26,1916.4,29.35,140.5,3.82,5.3,178.657,2.74,1.09
3,1959.0,4.0,2785.204,1753.7,299.356,484.052,1931.3,29.37,140.0,4.33,5.6,179.386,0.27,4.06
4,1960.0,1.0,2847.699,1770.5,331.722,462.199,1955.5,29.54,139.6,3.5,5.2,180.007,2.31,1.19


In [223]:
periods = pd.PeriodIndex(year=data.year, quarter=data.quarter, name='date')

In [224]:
columns = pd.Index(['relgdp', 'infl', 'unemp'], name='item')

In [225]:
data = data.reindex(columns=columns)

In [226]:
data.index = periods.to_timestamp('D', 'end')

In [227]:
ldata = data.stack().reset_index().rename(columns={0: 'value'})

In [228]:
#长类型，代表了每一个时间点对应单个观测值
ldata[:10]

Unnamed: 0,date,item,value
0,1959-03-31,infl,0.0
1,1959-03-31,unemp,5.8
2,1959-06-30,infl,2.34
3,1959-06-30,unemp,5.1
4,1959-09-30,infl,2.74
5,1959-09-30,unemp,5.3
6,1959-12-31,infl,0.27
7,1959-12-31,unemp,5.6
8,1960-03-31,infl,2.31
9,1960-03-31,unemp,5.2


In [229]:
#pivot函数
pivoted = ldata.pivot('date', 'item', 'value') #头两个参数为行和列，最后一个为数据项
pivoted


item,infl,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1
1959-03-31,0.00,5.8
1959-06-30,2.34,5.1
1959-09-30,2.74,5.3
1959-12-31,0.27,5.6
1960-03-31,2.31,5.2
1960-06-30,0.14,5.2
1960-09-30,2.70,5.6
1960-12-31,1.21,6.3
1961-03-31,-0.40,6.8
1961-06-30,1.47,7.0


In [230]:
ldata['value2'] = np.random.randn(len(ldata))

In [231]:
ldata[:10]

Unnamed: 0,date,item,value,value2
0,1959-03-31,infl,0.0,1.118074
1,1959-03-31,unemp,5.8,1.405919
2,1959-06-30,infl,2.34,0.179848
3,1959-06-30,unemp,5.1,0.283357
4,1959-09-30,infl,2.74,-0.268157
5,1959-09-30,unemp,5.3,1.054954
6,1959-12-31,infl,0.27,0.788304
7,1959-12-31,unemp,5.6,1.440579
8,1960-03-31,infl,2.31,-0.092466
9,1960-03-31,unemp,5.2,-0.164614


In [232]:
pivoted = ldata.pivot('date', 'item')
pivoted[:5]

Unnamed: 0_level_0,value,value,value2,value2
item,infl,unemp,infl,unemp
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1959-03-31,0.0,5.8,1.118074,1.405919
1959-06-30,2.34,5.1,0.179848,0.283357
1959-09-30,2.74,5.3,-0.268157,1.054954
1959-12-31,0.27,5.6,0.788304,1.440579
1960-03-31,2.31,5.2,-0.092466,-0.164614


In [234]:
pivoted['value'][:5]

item,infl,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1
1959-03-31,0.0,5.8
1959-06-30,2.34,5.1
1959-09-30,2.74,5.3
1959-12-31,0.27,5.6
1960-03-31,2.31,5.2


注意：pivot方法等价于使用set_index创建分层索引后，再调用unstack进行拆堆

In [235]:
unstacked = ldata.set_index(['date', 'item']).unstack('item')

In [237]:
unstacked[:7]

Unnamed: 0_level_0,value,value,value2,value2
item,infl,unemp,infl,unemp
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1959-03-31,0.0,5.8,1.118074,1.405919
1959-06-30,2.34,5.1,0.179848,0.283357
1959-09-30,2.74,5.3,-0.268157,1.054954
1959-12-31,0.27,5.6,0.788304,1.440579
1960-03-31,2.31,5.2,-0.092466,-0.164614
1960-06-30,0.14,5.2,-0.972041,-1.055571
1960-09-30,2.7,5.6,-0.071312,1.87896


In [240]:
unstacked.stack('item').stack().unstack('item')[:7]

Unnamed: 0_level_0,item,infl,unemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1959-03-31,value,0.0,5.8
1959-03-31,value2,1.118074,1.405919
1959-06-30,value,2.34,5.1
1959-06-30,value2,0.179848,0.283357
1959-09-30,value,2.74,5.3
1959-09-30,value2,-0.268157,1.054954
1959-12-31,value,0.27,5.6


将宽透视为长

pivot的反操作是pandas.melt，作用是将多个列合并为一列，产生一个新的DF

In [241]:
df = pd.DataFrame({'key': ['foo', 'bar', 'baz'],
                   'A': [1, 2, 3],
                   'B': [4, 5, 6],
                   'C': [7, 8, 9]})

In [242]:
df

Unnamed: 0,A,B,C,key
0,1,4,7,foo
1,2,5,8,bar
2,3,6,9,baz


In [243]:
#使用melt必须指明那一列为分组指标，key参数
melted = pd.melt(df, ['key'])

In [244]:
melted

Unnamed: 0,key,variable,value
0,foo,A,1
1,bar,A,2
2,baz,A,3
3,foo,B,4
4,bar,B,5
5,baz,B,6
6,foo,C,7
7,bar,C,8
8,baz,C,9


In [251]:
reshaped = melted.pivot('key', 'variable', 'value')

In [256]:

reshaped

variable,A,B,C
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,2,5,8
baz,3,6,9
foo,1,4,7


In [257]:
reshaped.reset_index()

variable,key,A,B,C
0,bar,2,5,8
1,baz,3,6,9
2,foo,1,4,7


In [258]:
#可以指定列的子集作为值列
pd.melt(df, id_vars=['key'], value_vars=['A', 'B'])

Unnamed: 0,key,variable,value
0,foo,A,1
1,bar,A,2
2,baz,A,3
3,foo,B,4
4,bar,B,5
5,baz,B,6


In [259]:
pd.melt(df, value_vars=['A', 'B', 'C'])

Unnamed: 0,variable,value
0,A,1
1,A,2
2,A,3
3,B,4
4,B,5
5,B,6
6,C,7
7,C,8
8,C,9


In [260]:
pd.melt(df, value_vars=['key', 'A', 'B'])

Unnamed: 0,variable,value
0,key,foo
1,key,bar
2,key,baz
3,A,1
4,A,2
5,A,3
6,B,4
7,B,5
8,B,6
