# Concatenation

http://pandas.pydata.org/pandas-docs/stable/merging.html

In [1]:
import pandas as pd
import numpy as np

## 1. Combine data frames along the rows

In [2]:
# First example frame: rows labelled 0-3, columns A-D.
# Each cell is the column letter followed by the row label.
df1 = pd.DataFrame(
    {
        'A': ['A0', 'A1', 'A2', 'A3'],
        'B': ['B0', 'B1', 'B2', 'B3'],
        'C': ['C0', 'C1', 'C2', 'C3'],
        'D': ['D0', 'D1', 'D2', 'D3'],
    },
    index=[0, 1, 2, 3],
)

In [3]:
 df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
   ...:                     'B': ['B4', 'B5', 'B6', 'B7'],
   ...:                     'C': ['C4', 'C5', 'C6', 'C7'],
   ...:                     'D': ['D4', 'D5', 'D6', 'D7']},
   ...:                      index=[4, 5, 6, 7])

In [4]:
# Third example frame: same columns as df1, rows labelled 8-11.
df3 = pd.DataFrame(
    {
        'A': ['A8', 'A9', 'A10', 'A11'],
        'B': ['B8', 'B9', 'B10', 'B11'],
        'C': ['C8', 'C9', 'C10', 'C11'],
        'D': ['D8', 'D9', 'D10', 'D11'],
    },
    index=[8, 9, 10, 11],
)

In [5]:
df1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [6]:
df2

Unnamed: 0,A,B,C,D
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [7]:
df3

Unnamed: 0,A,B,C,D
8,A8,B8,C8,D8
9,A9,B9,C9,D9
10,A10,B10,C10,D10
11,A11,B11,C11,D11


In [10]:
frames = [df1, df2, df3]
pd.concat(frames)

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7
8,A8,B8,C8,D8
9,A9,B9,C9,D9
10,A10,B10,C10,D10
11,A11,B11,C11,D11


In [12]:
# Columns are matched by label, not by position, so the reordered df2
# still lines up under df1's A, B, C, D.
pd.concat([df1, df2.loc[:, ['D','C','B','A']]])

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [14]:
df2.loc[:, ['D','C','B','A']]

Unnamed: 0,D,C,B,A
4,D4,C4,B4,A4
5,D5,C5,B5,A5
6,D6,C6,B6,A6
7,D7,C7,B7,A7


## 2. Combine data frames along the columns

In [15]:
# Frame that only partly overlaps df1: it shares columns B and D and row
# labels 2 and 3, and adds column F and row labels 6 and 7.
df4 = pd.DataFrame(
    {
        'B': ['B2', 'B3', 'B6', 'B7'],
        'D': ['D2', 'D3', 'D6', 'D7'],
        'F': ['F2', 'F3', 'F6', 'F7'],
    },
    index=[2, 3, 6, 7],
)
df4

Unnamed: 0,B,D,F
2,B2,D2,F2
3,B3,D3,F3
6,B6,D6,F6
7,B7,D7,F7


In [16]:
df1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [17]:
# use axis=1, default is outer join
pd.concat([df1, df4], axis=1)

Unnamed: 0,A,B,C,D,B.1,D.1,F
0,A0,B0,C0,D0,,,
1,A1,B1,C1,D1,,,
2,A2,B2,C2,D2,B2,D2,F2
3,A3,B3,C3,D3,B3,D3,F3
6,,,,,B6,D6,F6
7,,,,,B7,D7,F7


In [18]:
# inner join on common index (row labels)
pd.concat([df1, df4], axis=1, join='inner')

Unnamed: 0,A,B,C,D,B.1,D.1,F
2,A2,B2,C2,D2,B2,D2,F2
3,A3,B3,C3,D3,B3,D3,F3


In [19]:
# left join: keep exactly df1's row labels.
# The join_axes argument was removed in pandas 1.0; reindexing the
# outer-joined result on df1.index produces the same table.
pd.concat([df1, df4], axis=1).reindex(df1.index)

Unnamed: 0,A,B,C,D,B.1,D.1,F
0,A0,B0,C0,D0,,,
1,A1,B1,C1,D1,,,
2,A2,B2,C2,D2,B2,D2,F2
3,A3,B3,C3,D3,B3,D3,F3


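## 3. Concatenating along rows using append

Note: `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0. On current pandas, use `pd.concat` to get the same results.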

In [20]:
df1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [21]:
df2

Unnamed: 0,A,B,C,D
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [22]:
# append one
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows
# of df2 under df1 in the same way.
pd.concat([df1, df2])

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [23]:
# append a list of data frames
# DataFrame.append was removed in pandas 2.0; pd.concat takes the whole
# list at once. Columns missing from a frame are filled with NaN.
pd.concat([df1, df2, df4])

Unnamed: 0,A,B,C,D,F
0,A0,B0,C0,D0,
1,A1,B1,C1,D1,
2,A2,B2,C2,D2,
3,A3,B3,C3,D3,
4,A4,B4,C4,D4,
5,A5,B5,C5,D5,
6,A6,B6,C6,D6,
7,A7,B7,C7,D7,
2,,B2,,D2,F2
3,,B3,,D3,F3
6,,B6,,D6,F6
7,,B7,,D7,F7


In [40]:
# ignoring index: the original row labels are dropped and the result is
# renumbered from 0.
# DataFrame.append was removed in pandas 2.0; pd.concat accepts the same
# ignore_index flag.
pd.concat([df1, df2, df4], ignore_index=True)

Unnamed: 0,A,B,C,D,F
0,A0,B0,C0,D0,
1,A1,B1,C1,D1,
2,A2,B2,C2,D2,
3,A3,B3,C3,D3,
4,A4,B4,C4,D4,
5,A5,B5,C5,D5,
6,A6,B6,C6,D6,
7,A7,B7,C7,D7,
8,,B2,,D2,F2
9,,B3,,D3,F3
10,,B6,,D6,F6
11,,B7,,D7,F7


## 4. Concat data frames with series

### 4.1 Add as a new column

In [24]:
s1 = pd.Series(['X0', 'X1', 'X2', 'X3'], name='X')
s1

0    X0
1    X1
2    X2
3    X3
Name: X, dtype: object

In [25]:
# Along the columns, works fine
pd.concat([df1, s1], axis=1)

Unnamed: 0,A,B,C,D,X
0,A0,B0,C0,D0,X0
1,A1,B1,C1,D1,X1
2,A2,B2,C2,D2,X2
3,A3,B3,C3,D3,X3


### 4.2 Add as a new row

In [29]:
# along the rows, not working properly
# series is treated as a column vector
pd.concat([df1, s1])

Unnamed: 0,A,B,C,D,0
0,A0,B0,C0,D0,
1,A1,B1,C1,D1,
2,A2,B2,C2,D2,
3,A3,B3,C3,D3,
0,,,,,X0
1,,,,,X1
2,,,,,X2
3,,,,,X3


In [28]:
df1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3


In [31]:
# How about appending the series as a single row instead?
# DataFrame.append was removed in pandas 2.0; turning the series into a
# one-row frame and concatenating it is the equivalent operation.
# The series index (0-3) does not match df1's column labels, so its values
# land in new columns 0-3 rather than under A-D.
pd.concat([df1, s1.to_frame().T])

Unnamed: 0,A,B,C,D,0,1,2,3
0,A0,B0,C0,D0,,,,
1,A1,B1,C1,D1,,,,
2,A2,B2,C2,D2,,,,
3,A3,B3,C3,D3,,,,
X,,,,,X0,X1,X2,X3


In [32]:
s1

0    X0
1    X1
2    X2
3    X3
Name: X, dtype: object

In [36]:
# Relabel the series so its index matches df1's columns; the values then
# fall under A-D when the series is added as a row.
# (This reassigns s1.index in place.)
s1.index = df1.columns
# DataFrame.append was removed in pandas 2.0; concatenate the series as a
# one-row frame instead.
pd.concat([df1, s1.to_frame().T])

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
X,X0,X1,X2,X3


In [37]:
s1

A    X0
B    X1
C    X2
D    X3
Name: X, dtype: object

### 4.3 Alternative ways to add columns

In [44]:
# Is there a function dedicated to adding new columns? Yes, DataFrame.assign:
# each keyword argument becomes a column named after the keyword.
df1.assign(E=list('XYZW'))

Unnamed: 0,A,B,C,D,E
0,A0,B0,C0,D0,X
1,A1,B1,C1,D1,Y
2,A2,B2,C2,D2,Z
3,A3,B3,C3,D3,W


In [45]:
df1.assign?

In [46]:
df1.assign(E=list('XYZW'), F=range(4))

Unnamed: 0,A,B,C,D,E,F
0,A0,B0,C0,D0,X,0
1,A1,B1,C1,D1,Y,1
2,A2,B2,C2,D2,Z,2
3,A3,B3,C3,D3,W,3


In [47]:
# an even easier way (for columns only)
# Item assignment modifies df1 in place, so column G also shows up in
# every later output that is built from df1.
df1['G'] = None
df1

Unnamed: 0,A,B,C,D,G
0,A0,B0,C0,D0,
1,A1,B1,C1,D1,
2,A2,B2,C2,D2,
3,A3,B3,C3,D3,


## 5. Concat with group keys

### 5.1 Along the rows

In [49]:
result = pd.concat([df1, df2, df3], keys=list('xyz'))
result

Unnamed: 0,Unnamed: 1,A,B,C,D,G
x,0,A0,B0,C0,D0,
x,1,A1,B1,C1,D1,
x,2,A2,B2,C2,D2,
x,3,A3,B3,C3,D3,
y,4,A4,B4,C4,D4,
y,5,A5,B5,C5,D5,
y,6,A6,B6,C6,D6,
y,7,A7,B7,C7,D7,
z,8,A8,B8,C8,D8,
z,9,A9,B9,C9,D9,
z,10,A10,B10,C10,D10,
z,11,A11,B11,C11,D11,


In [50]:
result.index

MultiIndex(levels=[['x', 'y', 'z'], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]],
           labels=[[0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]])

In [51]:
result.loc['x']

Unnamed: 0,A,B,C,D,G
0,A0,B0,C0,D0,
1,A1,B1,C1,D1,
2,A2,B2,C2,D2,
3,A3,B3,C3,D3,


In [52]:
result.loc['x'].loc[[0,3],list('AC')]

Unnamed: 0,A,C
0,A0,C0
3,A3,C3


In [84]:
# An alternative
pd.concat({'x': df1, 'y': df2, 'z': df3})

Unnamed: 0,Unnamed: 1,A,B,C,D
x,0,A0,B0,C0,D0
x,1,A1,B1,C1,D1
x,2,A2,B2,C2,D2
x,3,A3,B3,C3,D3
y,4,A4,B4,C4,D4
y,5,A5,B5,C5,D5
y,6,A6,B6,C6,D6
y,7,A7,B7,C7,D7
z,8,A8,B8,C8,D8
z,9,A9,B9,C9,D9
z,10,A10,B10,C10,D10
z,11,A11,B11,C11,D11


In [85]:
# subsetting using keys: only the dict entries named in keys are
# concatenated, so the 'z' frame is left out of the result.
pd.concat({'x': df1, 'y': df2, 'z': df3}, keys=list('xy'))

Unnamed: 0,Unnamed: 1,A,B,C,D
x,0,A0,B0,C0,D0
x,1,A1,B1,C1,D1
x,2,A2,B2,C2,D2
x,3,A3,B3,C3,D3
y,4,A4,B4,C4,D4
y,5,A5,B5,C5,D5
y,6,A6,B6,C6,D6
y,7,A7,B7,C7,D7


### 5.2 Along the columns

In [80]:
# s3 is named, s4 and s5 are not. Along the columns the named series keeps
# its name as the column label; the unnamed ones are labelled 0 and 1.
s3 = pd.Series([0, 1, 2, 3], name='foo')
s4 = pd.Series([0, 1, 2, 3])
s5 = pd.Series([0, 1, 4, 5])
pd.concat([s3, s4, s5], axis=1)

Unnamed: 0,foo,0,1
0,0,0,0
1,1,1,1
2,2,2,4
3,3,3,5


In [81]:
 pd.concat([s3, s4, s5], axis=1, keys=['red','blue','yellow'])

Unnamed: 0,red,blue,yellow
0,0,0,0
1,1,1,1
2,2,2,4
3,3,3,5


In [None]:
# try it when s3, s4, s5 all have two columns
# also try using a dictionary as input