# Combining Datasets

In [11]:
A = {'Key1' : [1, 2, 3]}
print(A)
pd.DataFrame(A)

{'Key1': [1, 2, 3]}


Unnamed: 0,Key1
0,1
1,2
2,3


In [12]:
def make_df(cols, ind) :
    data = { c: [str(c) + str(i) for i in ind] for c in cols }
    return pd.DataFrame(data, ind)
make_df('ABC', range(3))

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


## 1. concat

In [17]:
# 기본 식
pd.concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
          
keys=None, levels=None, names=None, verify_integrity=False,

copy=True)

    # verify_integrity = verify that the indices do not overlap
    # ignore_index = just make new integer index

NameError: name 'objs' is not defined

In [18]:
# concat 1
ser1 = pd.Series(['A', 'B', 'C'], index=[1, 2, 3])
ser2 = pd.Series(['D', 'E', 'F'], index=[4, 5, 6])
pd.concat([ser1, ser2])

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [19]:
# concat 2
df1 = make_df('AB', [1, 2])
df2 = make_df('AB', [3, 4])
print(df1); print(df2); print(pd.concat([df1, df2]))

    A   B
1  A1  B1
2  A2  B2
    A   B
3  A3  B3
4  A4  B4
    A   B
1  A1  B1
2  A2  B2
3  A3  B3
4  A4  B4


In [20]:
# concat with axis
df3 = make_df('AB', [0, 1])
df4 = make_df('CD', [0, 1])
print(df3); print(df4); print(pd.concat([df3, df4], axis=1)) #axis 'col'

    A   B
0  A0  B0
1  A1  B1
    C   D
0  C0  D0
1  C1  D1
    A   B   C   D
0  A0  B0  C0  D0
1  A1  B1  C1  D1


In [21]:
# concat with different indices
df5 = make_df('ABC', [1, 2])
df6 = make_df('BCD', [3, 4])
print(df5); print(df6); print(pd.concat([df5, df6]))

    A   B   C
1  A1  B1  C1
2  A2  B2  C2
    B   C   D
3  B3  C3  D3
4  B4  C4  D4
     A   B   C    D
1   A1  B1  C1  NaN
2   A2  B2  C2  NaN
3  NaN  B3  C3   D3
4  NaN  B4  C4   D4


In [22]:
# inner concat
pd.concat([df5, df6], join='inner')  # outer = union, inner = intersection

Unnamed: 0,B,C
1,B1,C1
2,B2,C2
3,B3,C3
4,B4,C4


In [23]:
# join specification
pd.concat([df5, df6], join_axes=[df5.columns])

Unnamed: 0,A,B,C
1,A1,B1,C1
2,A2,B2,C2
3,,B3,C3
4,,B4,C4


In [24]:
# append() method
df1.append(df2) # same as pd.concat([df1, df2])

# it creates a new object with the combined data. 

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
3,A3,B3
4,A4,B4


## 2. Merge and Join

**Relational Algebra**
<br>pd.merge() is a subset of what is known as relational algebra, which is a formal set of rules for manipulating relational data, and forms the conceptual foundation of operations available in most databases.

#### 1. Categories of Joins
pd.merge() function implements a number of types of joins : one-to-one, many-to-one, many-to-many joins.

##### 1) One-to-one joins
Perhaps the simplest type of merge expression, and similar to the column-wise concatenation. 

In [25]:
df1 = pd.DataFrame( {'employee' : ['Bob', 'Jake', 'Lisa', 'Sue'],
                    'group' : ['Accounting', 'Engineering', 'Engineering', 'HR']})
df2 = pd.DataFrame( {'employee' : ['Lisa', 'Bob', 'Jake', 'Sue'],
                    'hire_data' : [2004, 2008, 2012, 2014]})
print(df1); print(df2)

  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR
  employee  hire_data
0     Lisa       2004
1      Bob       2008
2     Jake       2012
3      Sue       2014


In [26]:
df3 = pd.merge(df1, df2)
df3

Unnamed: 0,employee,group,hire_data
0,Bob,Accounting,2008
1,Jake,Engineering,2012
2,Lisa,Engineering,2004
3,Sue,HR,2014


##### 2) Many-to-one joins
One of the two key columns duplicate entries. For this case, the resulting DataFrame will preserve those duplicate entries as appropriate.

In [27]:
df4 = pd.DataFrame({'group': ['Accounting', 'Engineering', 'HR'],
                   'supervisor' : ['Carly', 'Guido', 'Steve']})
print(df3); print(df4); print(pd.merge(df3, df4))

  employee        group  hire_data
0      Bob   Accounting       2008
1     Jake  Engineering       2012
2     Lisa  Engineering       2004
3      Sue           HR       2014
         group supervisor
0   Accounting      Carly
1  Engineering      Guido
2           HR      Steve
  employee        group  hire_data supervisor
0      Bob   Accounting       2008      Carly
1     Jake  Engineering       2012      Guido
2     Lisa  Engineering       2004      Guido
3      Sue           HR       2014      Steve


##### 3) Many-to-many joins
If the key column in both the left and right array contains duplicates, then the result is a many-to-many merge.

In [28]:
df5 = pd.DataFrame({'group': ['Accounting', 'Accounting', 'Engineering', 'Engineering', 'HR', 'HR'],
                   'skills' : ['math', 'spreadsheets', 'coding', 'linux', 'spreadsheets', 'organization']})
print(df1,"\n"); print(df5)

  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR 

         group        skills
0   Accounting          math
1   Accounting  spreadsheets
2  Engineering        coding
3  Engineering         linux
4           HR  spreadsheets
5           HR  organization


In [29]:
print(pd.merge(df1, df5))

  employee        group        skills
0      Bob   Accounting          math
1      Bob   Accounting  spreadsheets
2     Jake  Engineering        coding
3     Jake  Engineering         linux
4     Lisa  Engineering        coding
5     Lisa  Engineering         linux
6      Sue           HR  spreadsheets
7      Sue           HR  organization


#### 2. Specification of the Merge Key
Often, the column names will not match so nicely, and pd.merge() provides a variety of options for handling this.

##### 1) The 'on' keyword
You can explicitly specify the name of the key column using the on keyword, which takes a column name or a list of column names :


In [30]:
print(df1); print(df2)
print(pd.merge(df1, df2, on='employee'))

  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR
  employee  hire_data
0     Lisa       2004
1      Bob       2008
2     Jake       2012
3      Sue       2014
  employee        group  hire_data
0      Bob   Accounting       2008
1     Jake  Engineering       2012
2     Lisa  Engineering       2004
3      Sue           HR       2014


##### 2) The left_on and right_on keywords
You may wish to mergo two datasets with different names :

In [31]:
df3 = pd.DataFrame( {'name':['Bob', 'Jake', 'Lisa', 'Sue'],
                    'salary' : [70000, 80000, 120000, 90000]})
print(df1); print(df3)
print(pd.merge(df1, df3, left_on="employee", right_on="name"))

# the result has redundant column (name, employee), and we can drop it

  employee        group
0      Bob   Accounting
1     Jake  Engineering
2     Lisa  Engineering
3      Sue           HR
   name  salary
0   Bob   70000
1  Jake   80000
2  Lisa  120000
3   Sue   90000
  employee        group  name  salary
0      Bob   Accounting   Bob   70000
1     Jake  Engineering  Jake   80000
2     Lisa  Engineering  Lisa  120000
3      Sue           HR   Sue   90000


In [32]:
pd.merge(df1, df3, left_on="employee", right_on="name").drop('name', axis=1)

Unnamed: 0,employee,group,salary
0,Bob,Accounting,70000
1,Jake,Engineering,80000
2,Lisa,Engineering,120000
3,Sue,HR,90000


##### 3) The left_index and right_index keywords
Sometimes, you would like to merge on an index

In [33]:
df1

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR


In [34]:
df1a = df1.set_index('employee')
df2a = df2.set_index('employee')
print(df1a); print(df2a)
print(pd.merge(df1a, df2a, left_index=True, right_index=True))

                group
employee             
Bob        Accounting
Jake      Engineering
Lisa      Engineering
Sue                HR
          hire_data
employee           
Lisa           2004
Bob            2008
Jake           2012
Sue            2014
                group  hire_data
employee                        
Bob        Accounting       2008
Jake      Engineering       2012
Lisa      Engineering       2004
Sue                HR       2014


In [35]:
# same as
print(df1a.join(df2a))

                group  hire_data
employee                        
Bob        Accounting       2008
Jake      Engineering       2012
Lisa      Engineering       2004
Sue                HR       2014


##### 4) Combination of index and column

In [36]:
print(df1a); print(df3)
print(pd.merge(df1a, df3, left_index=True, right_on='name'))

                group
employee             
Bob        Accounting
Jake      Engineering
Lisa      Engineering
Sue                HR
   name  salary
0   Bob   70000
1  Jake   80000
2  Lisa  120000
3   Sue   90000
         group  name  salary
0   Accounting   Bob   70000
1  Engineering  Jake   80000
2  Engineering  Lisa  120000
3           HR   Sue   90000


#### 3. Specifying Set Arithmetic for Joins

##### -innerjoin (default)

In [46]:
# default : innerjoin
# This is "inner join" : the result contains the intersection of the two sets.
df6 = pd.DataFrame({'name' : ['Peter', 'Paul', 'Mary'],
                   'food' : ['fish', 'beans', 'bread']},
                   columns= ['name' ,'food'])
df7 = pd.DataFrame({'name' : ['Mary', 'Joseph'],
                   'drink' : ['wine' ,'beer']})
print(df6); print(df7)
print(pd.merge(df6, df7))

    name   food
0  Peter   fish
1   Paul  beans
2   Mary  bread
  drink    name
0  wine    Mary
1  beer  Joseph
   name   food drink
0  Mary  bread  wine


In [38]:
# same as
print(pd.merge(df6, df7, how='inner'))

   name   food drink
0  Mary  bread  wine


##### -outer join
returns a join over the *union* of the input columns, and fills in all missing values with NAs.

In [39]:
print(pd.merge(df6, df7, how='outer'))

     name   food drink
0   Peter   fish   NaN
1    Paul  beans   NaN
2    Mary  bread  wine
3  Joseph    NaN  beer


##### -left & Right Join
returns a join over the left entries and right entries, respectively

In [40]:
print(pd.merge(df6, df7, how='left'))

    name   food drink
0  Peter   fish   NaN
1   Paul  beans   NaN
2   Mary  bread  wine


if joins have same columns with overlapping name, they add suffix
<br>and we can specify suffix as well.

In [41]:
df8 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
'rank': [1, 2, 3, 4]})
df9 = pd.DataFrame({'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
'rank': [3, 1, 4, 2]})
print(df8); print(df9); print(pd.merge(df8, df9, on="name"))

   name  rank
0   Bob     1
1  Jake     2
2  Lisa     3
3   Sue     4
   name  rank
0   Bob     3
1  Jake     1
2  Lisa     4
3   Sue     2
   name  rank_x  rank_y
0   Bob       1       3
1  Jake       2       1
2  Lisa       3       4
3   Sue       4       2


In [42]:
pd.merge(df8, df9, on="name", suffixes=["_L", "_R"])

Unnamed: 0,name,rank_L,rank_R
0,Bob,1,3
1,Jake,2,1
2,Lisa,3,4
3,Sue,4,2
