In [19]:
# Combining Datasets: concat and append
# Combining data from different sources.
# Note: DataFrame.append was removed in pandas 2.0; use pd.concat instead.

import pandas as pd 
import numpy as np 
# Separator string. It is not referenced by any of the cells visible here;
# presumably meant for printed output -- TODO confirm it is still needed.
endl = "-"

In [2]:
# create a testing DataFrame 
def make_df(cols, ind):
    """Build a small demo DataFrame for the concatenation examples.

    Every cell holds the string "<column><row>", e.g. the cell in
    column "A" at index 0 contains "A0".

    Parameters
    ----------
    cols : iterable
        Column labels (iterating a string yields one column per character).
    ind : iterable
        Row labels. Iterated once per column and also used as the index,
        so it must be re-iterable (list, range, ...).

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``ind`` with one column per entry of ``cols``.
    """
    columns = {}
    for label in cols:
        columns[label] = [str(label) + str(row) for row in ind]
    return pd.DataFrame(columns, index=ind)

# Quick check of the helper: columns A-C, rows 0-4
make_df(cols="ABC", ind=range(5))

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2
3,A3,B3,C3
4,A4,B4,C4


In [3]:
# Recall: concatenation of NumPy arrays.
# np.concatenate takes a sequence of array-likes and joins them end to end.
x, y, z = [1, 2, 3], [4, 5, 6], [7, 8, 9]
np.concatenate((x, y, z))


array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [4]:
# Two-dimensional input: axis=1 joins the rows side by side (column-wise)
x = [[1, 2], [3, 4]]
np.concatenate((x, x), axis=1)

array([[1, 2, 1, 2],
       [3, 4, 3, 4]])

In [5]:
# Simple concatenation with pd.concat:
# two Series with disjoint index labels are stacked one after the other.
ser1 = pd.Series(list("ABC"), index=[1, 2, 3])
ser2 = pd.Series(list("DEF"), index=[4, 5, 6])
pd.concat([ser1, ser2])

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [10]:
# Concatenating higher-dimensional objects (DataFrames).
# The empty strings only act as visual separators in the displayed tuple.
df1 = make_df("AB", [1, 2])
df2 = make_df("AB", [3, 4])
(
    df1,
    "",
    df2,
    "",
    # default axis=0: rows of df2 are appended below df1
    pd.concat([df1, df2]),
    "",
    # axis="columns" aligns on the index; df1 and df2 share no index
    # labels, so every row has NaN for the other frame's columns
    pd.concat([df1, df2], axis="columns"),
)

(    A   B
 1  A1  B1
 2  A2  B2,
 '',
     A   B
 3  A3  B3
 4  A4  B4,
 '',
     A   B
 1  A1  B1
 2  A2  B2
 3  A3  B3
 4  A4  B4,
 '',
      A    B    A    B
 1   A1   B1  NaN  NaN
 2   A2   B2  NaN  NaN
 3  NaN  NaN   A3   B3
 4  NaN  NaN   A4   B4)

In [11]:
# By default concatenation is row-wise (axis=0); here we concatenate
# column-wise instead. Unlike the previous example, df3 and df4 share the
# same index but have different column names, so they line up side by side.
df3 = make_df("AB", [0, 1])
df4 = make_df("CD", [0, 1])
(
    df3,
    "",
    df4,
    "",
    pd.concat([df3, df4], axis="columns"),
)

(    A   B
 0  A0  B0
 1  A1  B1,
 '',
     C   D
 0  C0  D0
 1  C1  D1,
 '',
     A   B   C   D
 0  A0  B0  C0  D0
 1  A1  B1  C1  D1)

In [21]:
# Duplicate indices
# pd.concat keeps the index labels of each input as they are, so the
# result may contain repeated labels.
x = make_df("AB", [0, 1])
y = make_df("AB", [2, 3])
y.index = x.index  # make both frames use the same labels (0 and 1)
(x, y, pd.concat([x, y]))

(    A   B
 0  A0  B0
 1  A1  B1,
     A   B
 0  A2  B2
 1  A3  B3,
     A   B
 0  A0  B0
 1  A1  B1
 0  A2  B2
 1  A3  B3)

In [22]:
# Treating repeated indices as an error:
# verify_integrity=True makes pd.concat raise ValueError on overlapping labels.
try:
    pd.concat([x, y], verify_integrity=True)
except ValueError as err:
    print("ValueError", err)
    

ValueError Indexes have overlapping values: Index([0, 1], dtype='int64')


In [23]:
# Ignoring the index:
# ignore_index=True discards the input labels and numbers the result 0..n-1.
(x, y, pd.concat([x, y], ignore_index=True))

(    A   B
 0  A0  B0
 1  A1  B1,
     A   B
 0  A2  B2
 1  A3  B3,
     A   B
 0  A0  B0
 1  A1  B1
 2  A2  B2
 3  A3  B3)

In [24]:
# Adding MultiIndex keys:
# each input is labelled with its key ("x" / "y") as an outer index level,
# so the combined row labels no longer overlap.
(x, y, pd.concat([x, y], keys=["x", "y"]))

(    A   B
 0  A0  B0
 1  A1  B1,
     A   B
 0  A2  B2
 1  A3  B3,
       A   B
 x 0  A0  B0
   1  A1  B1
 y 0  A2  B2
   1  A3  B3)

In [25]:
# Concatenation with joins
# df5 and df6 only share columns B and C. With the default join="outer"
# the result has the union of the columns (similar to an SQL outer join)
# and cells with no source data are NaN.
df5 = make_df("ABC", [1, 2])
df6 = make_df("BCD", [3, 4])
(df5, df6, pd.concat([df5, df6]))

(    A   B   C
 1  A1  B1  C1
 2  A2  B2  C2,
     B   C   D
 3  B3  C3  D3
 4  B4  C4  D4,
      A   B   C    D
 1   A1  B1  C1  NaN
 2   A2  B2  C2  NaN
 3  NaN  B3  C3   D3
 4  NaN  B4  C4   D4)

In [26]:
# join="inner" -> intersection: keep only the columns present in every input
# join="outer" -> union: keep all columns, filling the gaps with NaN

# inner join: only the shared columns B and C survive
pd.concat([df5, df6], join="inner")

Unnamed: 0,B,C
1,B1,C1
2,B2,C2
3,B3,C3
4,B4,C4


In [27]:
# Finer control over which columns are kept

# Reindexing df6 to df5's columns first restricts the result to exactly
# those columns (A, B, C); column D is dropped and A is NaN for df6's rows.
pd.concat([df5, df6.reindex(columns=df5.columns)])

Unnamed: 0,A,B,C
1,A1,B1,C1
2,A2,B2,C2
3,,B3,C3
4,,B4,C4


In [30]:
# df6 on its own after conforming it to df5's columns
df6.reindex(columns=df5.columns)

Unnamed: 0,A,B,C
3,,B3,C3
4,,B4,C4
