In [1]:
import pandas as pd
import numpy as np
import datetime

### 合并客户表与订单表

In [9]:
customers = {'CustomerID': [10, 11],
             'Name': ['Mike', 'Marcia'],
             'Address': ['Address for Mike','Address for Marcia']}
customers = pd.DataFrame(customers)
customers

Unnamed: 0,Address,CustomerID,Name
0,Address for Mike,10,Mike
1,Address for Marcia,11,Marcia


In [10]:
orders = {'CustomerID': [10, 11, 10],
          'OrderDate': [datetime.date(2016, 12, 1),
                        datetime.date(2016, 12, 1),
                        datetime.date(2016, 12, 2)]}
orders = pd.DataFrame(orders)
orders

Unnamed: 0,CustomerID,OrderDate
0,10,2016-12-01
1,11,2016-12-01
2,10,2016-12-02


In [11]:
customers.merge(orders)

Unnamed: 0,Address,CustomerID,Name,OrderDate
0,Address for Mike,10,Mike,2016-12-01
1,Address for Mike,10,Mike,2016-12-02
2,Address for Marcia,11,Marcia,2016-12-01


In [18]:
customers.merge?

In [13]:
left_data = {'key1': ['a', 'b', 'c'],
             'key2': ['x', 'y', 'z'],
             'lval1': [ 0, 1, 2]}
right_data = {'key1': ['a', 'b', 'c'],
              'key2': ['x', 'a', 'z'],
              'rval1': [ 6, 7, 8 ]}
left = pd.DataFrame(left_data, index=[0, 1, 2])
right = pd.DataFrame(right_data, index=[1, 2, 3])
left

Unnamed: 0,key1,key2,lval1
0,a,x,0
1,b,y,1
2,c,z,2


In [14]:
right

Unnamed: 0,key1,key2,rval1
1,a,x,6
2,b,a,7
3,c,z,8


In [15]:
left.merge(right)

Unnamed: 0,key1,key2,lval1,rval1
0,a,x,0,6
1,c,z,2,8


In [16]:
left.merge(right,on='key1')

Unnamed: 0,key1,key2_x,lval1,key2_y,rval1
0,a,x,0,x,6
1,b,y,1,a,7
2,c,z,2,z,8


In [17]:
left.merge(right, on=['key1', 'key2'])

Unnamed: 0,key1,key2,lval1,rval1
0,a,x,0,6
1,c,z,2,8


In [19]:
#按照index来merge
pd.merge(left, right, left_index=True, right_index=True)

Unnamed: 0,key1_x,key2_x,lval1,key1_y,key2_y,rval1
1,b,y,1,a,x,6
2,c,z,2,b,a,7


### merge进阶

+ inner :两个DataFrame的key交集
+ outer：两个DataFrame的key并集 
+ left： 只使用左边的DataFrame的key
+ right : 只使用右边的DataFrame的key

In [19]:
left

Unnamed: 0,key1,key2,lval1
0,a,x,0
1,b,y,1
2,c,z,2


In [20]:
right

Unnamed: 0,key1,key2,rval1
1,a,x,6
2,b,a,7
3,c,z,8


In [21]:
left.merge(right,how='outer')

Unnamed: 0,key1,key2,lval1,rval1
0,a,x,0.0,6.0
1,b,y,1.0,
2,c,z,2.0,8.0
3,b,a,,7.0


In [22]:
left.merge(right,how='left')

Unnamed: 0,key1,key2,lval1,rval1
0,a,x,0,6.0
1,b,y,1,
2,c,z,2,8.0


In [24]:
left.merge(right,how='right')

Unnamed: 0,key1,key2,lval1,rval1
0,a,x,0.0,6
1,c,z,2.0,8
2,b,a,,7


### join

使用index label

In [25]:
left.join?

In [23]:
left.join(right, lsuffix='_left', rsuffix='_right')  #how='left',这里如果有相同列需要给出后缀

Unnamed: 0,key1_left,key2_left,lval1,key1_right,key2_right,rval1
0,a,x,0,,,
1,b,y,1,a,x,6.0
2,c,z,2,b,a,7.0


In [26]:
left.join(right,how='outer', lsuffix='_left', rsuffix='_right') 

Unnamed: 0,key1_left,key2_left,lval1,key1_right,key2_right,rval1
0,a,x,0.0,,,
1,b,y,1.0,a,x,6.0
2,c,z,2.0,b,a,7.0
3,,,,c,z,8.0


In [26]:
left.join?

In [27]:
left.join(right, lsuffix='_left', rsuffix='_right', how='inner')

Unnamed: 0,key1_left,key2_left,lval1,key1_right,key2_right,rval1
1,b,y,1,a,x,6
2,c,z,2,b,a,7


### concat

In [28]:
df1 = pd.DataFrame(np.arange(9).reshape(3, 3),columns=['a', 'b', 'c'])
df2 = pd.DataFrame(np.arange(9, 18).reshape(3, 3),columns=['a', 'b', 'c'])

In [29]:
df1

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8


In [30]:
df2

Unnamed: 0,a,b,c
0,9,10,11
1,12,13,14
2,15,16,17


In [31]:
pd.concat([df1, df2])

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8
0,9,10,11
1,12,13,14
2,15,16,17


In [32]:
df1 = pd.DataFrame(np.arange(9).reshape(3, 3),columns=['a', 'b', 'c'])
df2 = pd.DataFrame(np.arange(9, 18).reshape(3, 3),columns=['a', 'c', 'd'])

In [33]:
df1

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8


In [34]:
df2

Unnamed: 0,a,c,d
0,9,10,11
1,12,13,14
2,15,16,17


In [35]:
pd.concat([df1, df2])

Unnamed: 0,a,b,c,d
0,0,1.0,2,
1,3,4.0,5,
2,6,7.0,8,
0,9,,10,11.0
1,12,,13,14.0
2,15,,16,17.0


In [33]:
c = pd.concat([df1, df2], keys=['df1', 'df2'])
c

Unnamed: 0,Unnamed: 1,a,b,c,d
df1,0,0,1.0,2,
df1,1,3,4.0,5,
df1,2,6,7.0,8,
df2,0,9,,10,11.0
df2,1,12,,13,14.0
df2,2,15,,16,17.0


In [39]:
c.ix['df1']

a    0.0
b    1.0
c    2.0
d    NaN
Name: 0, dtype: float64

In [39]:
pd.concat([df1, df2], axis=1)

Unnamed: 0,a,b,c,a.1,c.1,d
0,0,1,2,9,10,11
1,3,4,5,12,13,14
2,6,7,8,15,16,17
