In [10]:
import pandas as pd
import numpy as np

### Database-style DataFrame Merges

In [4]:
# Left table: the join key repeats, so several rows share the same 'key' value.
df1 = pd.DataFrame({
    'key': list('bbaaccb'),
    'data1': range(7),
})
# Right table: every key appears once; 'd' does not occur in df1 and 'c' does not occur here.
df2 = pd.DataFrame({
    'key': list('abd'),
    'data2': range(3),
})
# Display df1 with an explicit column order; reindex returns a new frame and leaves df1 untouched.
df1.reindex(columns=['key', 'data1'])

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,a,3
4,c,4
5,c,5
6,b,6


In [5]:
# Display df2 with 'key' first and 'data2' second (a new frame; df2 itself is not modified).
df2.reindex(columns=['key', 'data2'])

Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


In [10]:
# Outer join on the shared 'key' column: keys from either frame are kept, and the
# side that has no matching row is filled with NaN.
# - If the join columns are named differently in each frame, pass left_on=... and
#   right_on=... instead of on=...
# - To join on the index rather than a column, pass left_index=True and/or
#   right_index=True.
df1.merge(df2, on='key', how='outer')

Unnamed: 0,data1,key,data2
0,0.0,b,1.0
1,1.0,b,1.0
2,6.0,b,1.0
3,2.0,a,0.0
4,3.0,a,0.0
5,4.0,c,
6,5.0,c,
7,,d,2.0


### Concatenating Along an Axis

In [2]:
# Three Series whose index labels do not overlap ('a'-'b', 'c'-'e', 'f'-'i').
s1 = pd.Series([0, 1], index=list('ab'))
s2 = pd.Series([2, 3, 4], index=list('cde'))
s3 = pd.Series([5, 6, 7, 8], index=list('fgh') + ['i'])
# With the default axis, concat glues the values and index labels together end to end.
pd.concat([s1, s2, s3])

a    0
b    1
c    2
d    3
e    4
f    5
g    6
h    7
i    8
dtype: int64

In [3]:
# axis=1 puts each Series in its own column. The three inputs share no index
# labels, so every row has a value in exactly one column and NaN in the others.
pd.concat([s1, s2, s3], axis=1)

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0
h,,,7.0
i,,,8.0


In [4]:
# Scale s1 element-wise by 5; the result keeps s1's index labels ('a', 'b').
s4 = s1.mul(5)
s4

a    0
b    5
dtype: int64

In [6]:
# Append s4 after s3. Because s4 carries the labels 'a' and 'b', s5 now overlaps
# with s1 on exactly those two labels.
s5 = pd.concat([s3, s4])
s5

f    5
g    6
h    7
i    8
a    0
b    5
dtype: int64

In [9]:
# join='inner' keeps only the index labels present in both inputs ('a' and 'b'),
# so the column-wise result contains no NaN.
pd.concat([s1, s5], axis=1, join='inner')

Unnamed: 0,0,1
a,0,0
b,1,5


In [12]:
# Seed NumPy's global RNG so this cell yields the same two frames on every run
# (Restart Kernel -> Run All reproduces the tables). 42 is an arbitrary fixed seed.
np.random.seed(42)
# NOTE: df1 and df2 are rebound here; they no longer refer to the 'key'-based
# frames built for the merge example earlier in the notebook.
df1 = pd.DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
# df2 has one row fewer than df1 and no 'c' column, so the two frames do not
# share the same set of column labels.
df2 = pd.DataFrame(np.random.randn(2, 3), columns=['a', 'b', 'd'])
df1

Unnamed: 0,a,b,c,d
0,-0.337732,0.025025,-1.389683,0.037737
1,-0.898943,0.01654,1.760913,-1.222883
2,0.051743,-0.765794,-0.433283,-0.853851


In [13]:
# Display the second random frame (columns 'a', 'b', 'd'; there is no 'c').
df2

Unnamed: 0,a,b,d
0,-1.330108,0.605168,0.946056
1,1.659237,-0.544651,-0.487908


In [14]:
# Stack df2's rows under df1's. The result has the union of both column sets;
# df2 has no 'c', so its rows get NaN there. The original row labels are kept,
# which is why 0 and 1 each appear twice in the result's index.
pd.concat([df1, df2])

Unnamed: 0,a,b,c,d
0,-0.337732,0.025025,-1.389683,0.037737
1,-0.898943,0.01654,1.760913,-1.222883
2,0.051743,-0.765794,-0.433283,-0.853851
0,-1.330108,0.605168,,0.946056
1,1.659237,-0.544651,,-0.487908


In [15]:
# Same row-wise stack, but ignore_index=True discards the original row labels
# and numbers the result 0..4, so there are no duplicate index values.
pd.concat([df1, df2], ignore_index=True)

Unnamed: 0,a,b,c,d
0,-0.337732,0.025025,-1.389683,0.037737
1,-0.898943,0.01654,1.760913,-1.222883
2,0.051743,-0.765794,-0.433283,-0.853851
3,-1.330108,0.605168,,0.946056
4,1.659237,-0.544651,,-0.487908


### Combining Data with Overlap

In [16]:
# Four-row frame with gaps: 'a' is missing on rows 1 and 3, 'b' is missing on
# rows 0 and 2, and 'c' is fully populated (2, 6, 10, 14).
df3 = pd.DataFrame(
    {
        'a': [1, np.nan, 5, np.nan],
        'b': [np.nan, 2, np.nan, 6],
        'c': range(2, 18, 4),
    }
)
df3

Unnamed: 0,a,b,c
0,1.0,,2
1,,2.0,6
2,5.0,,10
3,,6.0,14


In [17]:
# Five-row frame (one more row than df3) with only columns 'a' and 'b';
# 'a' is missing on row 2 and 'b' is missing on row 0.
df4 = pd.DataFrame(
    {
        'a': [5, 4, np.nan, 3, 7],
        'b': [np.nan, 3, 4, 6, 8],
    }
)
df4

Unnamed: 0,a,b
0,5.0,
1,4.0,3.0
2,,4.0
3,3.0,6.0
4,7.0,8.0


In [18]:
# Patch the holes in df3 with values from df4, aligned on row and column labels:
# - where df3 has a value, that value is kept (df3 wins even if df4 also has one);
# - where df3 is null, the value at the same position in df4 is used;
# - if both are null (row 0, column 'b'), the result stays null.
# The result covers the union of both frames' rows and columns, so row 4 comes
# entirely from df4 and has no value for 'c'.
df3.combine_first(df4)

Unnamed: 0,a,b,c
0,1.0,,2.0
1,4.0,2.0,6.0
2,5.0,4.0,10.0
3,3.0,6.0,14.0
4,7.0,8.0,
