In [1]:
import sys

import numpy as np
import pandas as pd

In [None]:
# Catenating datasets

In [7]:
# Helper function to create DataFrames

def makedf(cols, ind):
    """Build a small labelled DataFrame for the concatenation examples.

    Each cell holds the string made of its column label followed by its
    row label, so the origin of every value stays visible after frames
    are combined.

    Parameters
    ----------
    cols : iterable
        Column labels; a string such as "AB" yields one column per character.
    ind : iterable
        Row labels, also used as the index of the result.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``ind`` with one column per entry of ``cols``.
    """
    columns = {}
    for label in cols:
        columns[label] = [str(label) + str(row) for row in ind]
    return pd.DataFrame(columns, index=ind)

In [8]:
a=makedf("AB", [0,1])
a

Unnamed: 0,A,B
0,A0,B0
1,A1,B1


In [9]:
b = makedf("AB", [2,3])
b

Unnamed: 0,A,B
2,A2,B2
3,A3,B3


In [10]:
c = makedf("CD", [0,1])
c

Unnamed: 0,C,D
0,C0,D0
1,C1,D1


In [12]:
d = makedf("BC", [2,3])
d

Unnamed: 0,B,C
2,B2,C2
3,B3,C3


In [14]:
# Stack the rows of a and b on top of each other (concat's default direction);
# the original row labels 0-3 are kept.
pd.concat([a,b])

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3


In [17]:
# creates duplicate indices
r = pd.concat([a,a])
r

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
0,A0,B0
1,A1,B1


In [19]:
# Three solutions to this:

# Firstly, deny creation of duplicated indices by giving the verify_integrity parameter 
# to the concat function:

# With verify_integrity=True, concat raises ValueError instead of silently
# producing duplicate row labels. The exception is caught so the notebook
# keeps running, and its message is written to stderr.
try:
    pd.concat([a,a], verify_integrity=True)
except ValueError as e:
    print(e, file=sys.stderr)

Indexes have overlapping values: Int64Index([0, 1], dtype='int64')


In [20]:
# Secondly, we can ask for automatic renumbering of rows:

pd.concat([a,a], ignore_index=True)

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A0,B0
3,A1,B1


In [21]:
"""Thirdly, we can ask for hierarchical indexing. The indices can contain multiple levels, 
but on this course we don’t consider hierarchical indices in detail. Hierarchical indices 
can make a two-dimensional array work like a higher-dimensional array."""

r2=pd.concat([a,a], keys=['first', 'second'])
r2

Unnamed: 0,Unnamed: 1,A,B
first,0,A0,B0
first,1,A1,B1
second,0,A0,B0
second,1,A1,B1


In [22]:
# Pick row ("first", 0) and column "A" with a single .loc lookup on the
# hierarchical index instead of chaining three separate [] selections.
r2.loc[("first", 0), "A"]

'A0'

In [24]:
# catenating horizontally

pd.concat([a,c], axis=1)

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1


In [25]:
# sort option is used to silence a deprecation message

pd.concat([a,d], sort=False)

Unnamed: 0,A,B,C
0,A0,B0,
1,A1,B1,
2,,B2,C2
3,,B3,C3


In [None]:
"""The cells that have no value in one of the DataFrames were filled with NaNs. 
This method is called an outer join, which forms the union of the columns of the two DataFrames."""

In [26]:
# The alternative is inner join, which forms the intersection of columns:

pd.concat([a,d], join="inner")

Unnamed: 0,B
0,B0
1,B1
2,B2
3,B3
