In [1]:
import pandas as pd

In [2]:
# Load the first sample table (Employee ID, Name, Played By) from CSV.
# NOTE(review): this absolute path only resolves on the original author's
# machine; point it at your own copy of 100.csv before re-running.
df1 = pd.read_csv(
    filepath_or_buffer=r'/Users/timothypark/Documents/portfolios/timpark99.github.io/Merging Dataframes in Pandas/100.csv',
)
df1  # bare last expression so the notebook renders the frame

Unnamed: 0,Employee ID,Name,Played By
0,1001,Gale Cleven,Austiin Butler
1,1002,John Egan,Callum Turner
2,1003,Harry Crosby,Anthony Boyle
3,1004,Robert Rosenthal,Nate Mann


In [3]:
# Load the second sample table (Employee ID, Name, Age) from CSV.
# NOTE(review): this absolute path only resolves on the original author's
# machine; point it at your own copy of "100 v2.csv" before re-running.
df2 = pd.read_csv(
    filepath_or_buffer=r'/Users/timothypark/Documents/portfolios/timpark99.github.io/Merging Dataframes in Pandas/100 v2.csv',
)
df2  # bare last expression so the notebook renders the frame

Unnamed: 0,Employee ID,Name,Age
0,1001,Gale Cleven,32
1,1002,John Egan,34
2,1006,Curtis Biddick,31
3,1007,Harold Huglin,47
4,1008,Marvin Bowman,44


In [8]:
# pd.merge takes the left dataframe first and the right dataframe second.
# how='inner' is also merge's default: only Employee IDs present in both
# frames survive.
# Name exists in both frames but is not a join key, so pandas keeps both
# copies and tells them apart with the _x (left) and _y (right) suffixes.

pd.merge(df1, df2, on='Employee ID', how='inner')

Unnamed: 0,Employee ID,Name_x,Played By,Name_y,Age
0,1001,Gale Cleven,Austiin Butler,Gale Cleven,32
1,1002,John Egan,Callum Turner,John Egan,34


In [9]:
# Passing a list to `on` joins on several columns at once. With Name part
# of the key it shows up as a single column instead of being split into
# Name_x / Name_y.

pd.merge(left=df1, right=df2, how='inner', on=['Employee ID', 'Name'])

Unnamed: 0,Employee ID,Name,Played By,Age
0,1001,Gale Cleven,Austiin Butler,32
1,1002,John Egan,Callum Turner,34


In [10]:
# Outer join: keep every row from both dataframes.
# No `on` is given, so merge joins on the column names the two frames share.
# NaN marks a value that is missing because that row had no match in the
# other dataframe.

pd.merge(df1, df2, how='outer')

Unnamed: 0,Employee ID,Name,Played By,Age
0,1001,Gale Cleven,Austiin Butler,32.0
1,1002,John Egan,Callum Turner,34.0
2,1003,Harry Crosby,Anthony Boyle,
3,1004,Robert Rosenthal,Nate Mann,
4,1006,Curtis Biddick,,31.0
5,1007,Harold Huglin,,47.0
6,1008,Marvin Bowman,,44.0


In [11]:
# Left join: keep every row of the left dataframe (df1) and pull in df2's
# columns wherever the shared columns match; unmatched rows get NaN.

pd.merge(df1, df2, how='left')

Unnamed: 0,Employee ID,Name,Played By,Age
0,1001,Gale Cleven,Austiin Butler,32.0
1,1002,John Egan,Callum Turner,34.0
2,1003,Harry Crosby,Anthony Boyle,
3,1004,Robert Rosenthal,Nate Mann,


In [12]:
# Right join: keep every row of the right dataframe (df2) and pull in df1's
# columns wherever the shared columns match; unmatched rows get NaN.

pd.merge(df1, df2, how='right')

Unnamed: 0,Employee ID,Name,Played By,Age
0,1001,Gale Cleven,Austiin Butler,32
1,1002,John Egan,Callum Turner,34
2,1006,Curtis Biddick,,31
3,1007,Harold Huglin,,47
4,1008,Marvin Bowman,,44


In [13]:
# Cross join: pair each row of the left dataframe with every row of the
# right dataframe (a Cartesian product). No key is compared, which is why
# both Employee ID and Name come back with _x / _y suffixes.
# There are very few situations where a cross join is what you want.

pd.merge(df1, df2, how='cross')

Unnamed: 0,Employee ID_x,Name_x,Played By,Employee ID_y,Name_y,Age
0,1001,Gale Cleven,Austiin Butler,1001,Gale Cleven,32
1,1001,Gale Cleven,Austiin Butler,1002,John Egan,34
2,1001,Gale Cleven,Austiin Butler,1006,Curtis Biddick,31
3,1001,Gale Cleven,Austiin Butler,1007,Harold Huglin,47
4,1001,Gale Cleven,Austiin Butler,1008,Marvin Bowman,44
5,1002,John Egan,Callum Turner,1001,Gale Cleven,32
6,1002,John Egan,Callum Turner,1002,John Egan,34
7,1002,John Egan,Callum Turner,1006,Curtis Biddick,31
8,1002,John Egan,Callum Turner,1007,Harold Huglin,47
9,1002,John Egan,Callum Turner,1008,Marvin Bowman,44


In [16]:
# DataFrame.join is index-oriented: `on` names a column of the calling
# frame (df1) and matches it against the *index* of the other frame (df2),
# not against df2's own Employee ID column. df2 still has its default
# 0..4 index here, so no Employee ID lines up and the outer join returns
# the rows of both frames unmatched.
# join adds no suffixes by default, so lsuffix / rsuffix have to be given
# whenever the two frames have column names in common.

df1.join(other=df2, how='outer', on='Employee ID', lsuffix='_Left', rsuffix='_Right')

Unnamed: 0,Employee ID,Employee ID_Left,Name_Left,Played By,Employee ID_Right,Name_Right,Age
0.0,1001,1001.0,Gale Cleven,Austiin Butler,,,
1.0,1002,1002.0,John Egan,Callum Turner,,,
2.0,1003,1003.0,Harry Crosby,Anthony Boyle,,,
3.0,1004,1004.0,Robert Rosenthal,Nate Mann,,,
,0,,,,1001.0,Gale Cleven,32.0
,1,,,,1002.0,John Egan,34.0
,2,,,,1006.0,Curtis Biddick,31.0
,3,,,,1007.0,Harold Huglin,47.0
,4,,,,1008.0,Marvin Bowman,44.0


In [18]:
# Join on the index: promote Employee ID to the index of both frames so
# join can align rows on it. Name is in both frames, so the suffixes keep
# the two copies apart; how='outer' retains IDs found in either frame.

df4 = (
    df1.set_index('Employee ID')
    .join(
        df2.set_index('Employee ID'),
        how='outer',
        lsuffix='_Left',
        rsuffix='_Right',
    )
)
df4

Unnamed: 0_level_0,Name_Left,Played By,Name_Right,Age
Employee ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1001,Gale Cleven,Austiin Butler,Gale Cleven,32.0
1002,John Egan,Callum Turner,John Egan,34.0
1003,Harry Crosby,Anthony Boyle,,
1004,Robert Rosenthal,Nate Mann,,
1006,,,Curtis Biddick,31.0
1007,,,Harold Huglin,47.0
1008,,,Marvin Bowman,44.0


In [28]:
# pd.concat glues dataframes together along one axis:
#   axis=0 / 'index' (the default) stacks them one on top of the other
#   axis=1 / 'columns' places them side by side, lining rows up by index label
# `join` controls the labels on the other axis:
#   'outer' keeps the union of labels (gaps become NaN)
#   'inner' keeps only the labels the frames have in common

pd.concat(objs=[df1, df2], axis='columns', join='outer')

Unnamed: 0,Employee ID,Name,Played By,Employee ID.1,Name.1,Age
0,1001.0,Gale Cleven,Austiin Butler,1001,Gale Cleven,32
1,1002.0,John Egan,Callum Turner,1002,John Egan,34
2,1003.0,Harry Crosby,Anthony Boyle,1006,Curtis Biddick,31
3,1004.0,Robert Rosenthal,Nate Mann,1007,Harold Huglin,47
4,,,,1008,Marvin Bowman,44
