In [1]:
import pandas as pd
import numpy as np
# turn jupyter notebook warnings off
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Working example 1 - the reconciliation key ('tradeid') holds no duplicates in either dataframe
df1 = pd.DataFrame({
    'tradeid': range(5),
    'profit': range(1000, 2000, 200),
})
df2 = pd.DataFrame({
    'tradeid': range(2, 7),
    'stock': ['APL', 'MST', 'JNJ', 'TSL', 'BAB'],
})
# Render both inputs so the overlapping ids (tradeid 2-4) are visible before reconciling.
display(df1, df2)

Unnamed: 0,tradeid,profit
0,0,1000
1,1,1200
2,2,1400
3,3,1600
4,4,1800


Unnamed: 0,tradeid,stock
0,2,APL
1,3,MST
2,4,JNJ
3,5,TSL
4,6,BAB


In [3]:
# Solution 1
# 'tradeid' is unique within each dataframe, so once the two frames are stacked any id
# that occurs twice must exist on both sides. keep=False discards every copy of such
# ids, leaving only the "exclusive-or" rows.

(
    pd.concat([df1, df2])
    .drop_duplicates(subset='tradeid', keep=False)
)

Unnamed: 0,profit,stock,tradeid
0,1000.0,,0
1,1200.0,,1
3,,TSL,5
4,,BAB,6


In [4]:
# Solution 2
# Since 'tradeid' is always unique, the symmetric difference of the two id sets
# gives exactly the ids that appear in only one of the dataframes.

diff = set(df1.tradeid) ^ set(df2.tradeid)

# .isin() accepts a set directly (its `values` argument is "set or list-like"),
# so the set can be used as-is to select the matching rows from each dataframe.

pd.concat([df1.loc[df1['tradeid'].isin(diff), :], df2.loc[df2['tradeid'].isin(diff), :]])

Unnamed: 0,profit,stock,tradeid
0,1000.0,,0
1,1200.0,,1
3,,TSL,5
4,,BAB,6


In [5]:
# Solution 3
# Same idea as solution 2, but the symmetric difference is computed with NumPy:
# np.setxor1d returns the sorted, unique values found in exactly one of its inputs.

diff = np.setxor1d(df1['tradeid'].values, df2['tradeid'].values)

# Keep only the rows of each dataframe whose id is on one side only, then stack them.
only_in_df1 = df1[df1['tradeid'].isin(diff)]
only_in_df2 = df2[df2['tradeid'].isin(diff)]
pd.concat([only_in_df1, only_in_df2])

Unnamed: 0,profit,stock,tradeid
0,1000.0,,0
1,1200.0,,1
3,,TSL,5
4,,BAB,6


In [6]:
# Working example 2 - this time the reconciliation key repeats inside each dataframe
# (tradeid 2 appears twice in df1, tradeid 5 appears twice in df2)
df1 = pd.DataFrame({
    'tradeid': [0, 2, 2, 3, 4],
    'profit': range(1000, 2000, 200),
})
df2 = pd.DataFrame({
    'tradeid': [2, 3, 4, 5, 5],
    'stock': ['APL', 'MST', 'JNJ', 'TSL', 'BAB'],
})
display(df1, df2)

Unnamed: 0,tradeid,profit
0,0,1000
1,2,1200
2,2,1400
3,3,1600
4,4,1800


Unnamed: 0,tradeid,stock
0,2,APL
1,3,MST
2,4,JNJ
3,5,TSL
4,5,BAB


In [7]:
# Solution 1 - outer-merge on the key and let indicator=True tag where each row came from
df3 = df1.merge(df2, how='outer', on='tradeid', indicator=True)
display(df3)

# Rows tagged 'both' exist on each side; everything else is a reconciliation break.
# The helper '_merge' column is dropped in the same step so df3 holds only the final result.
df3 = df3[df3['_merge'] != 'both'].drop('_merge', axis=1)
df3

Unnamed: 0,tradeid,profit,stock,_merge
0,0,1000.0,,left_only
1,2,1200.0,APL,both
2,2,1400.0,APL,both
3,3,1600.0,MST,both
4,4,1800.0,JNJ,both
5,5,,TSL,right_only
6,5,,BAB,right_only


Unnamed: 0,tradeid,profit,stock
0,0,1000.0,
5,5,,TSL
6,5,,BAB


In [9]:
# Working example 3 - Finding symmetric differences for 2 lists containing duplicates
list1 = [0,1,1,2,3]
list2 = [0,1,3,4,4]

# Build lookup sets once so each membership test is a hash lookup rather than a scan
# of the other list; iterating the original lists preserves order and duplicates.
set1, set2 = set(list1), set(list2)
list3 = [x for x in list1 if x not in set2]
list4 = [x for x in list2 if x not in set1]
list3 + list4

# p.s. [list1.remove(x) for x in list1 if x in list2] does not work: it removes items
# from list1 while iterating over it, so the iterator skips elements (and the
# comprehension itself only collects the None returned by list.remove).

[2, 4, 4]

In [14]:
# Working example 4 - Combining data with overlaps
d1 = pd.DataFrame({
    'tradeid': [0, 1, 2],
    'profit': [1000, np.nan, 2000],
    'stock': ['APL', 'JNJ', np.nan],
})
d2 = pd.DataFrame({
    'tradeid': [3, 1, 2],
    'profit': [1000, 1500, np.nan],
    'stock': ['APL', np.nan, np.nan],
})
display(d1, d2)

# d2 (the later dataset) takes priority; d1 only fills the cells where d2 is null.
# NOTE(review): combine_first lines rows up by index label, not by 'tradeid', so
# row 0 keeps d2's tradeid 3 and d1's trade 0 does not appear in the result -
# set 'tradeid' as the index first if a key-based combine is intended.
d2.combine_first(d1)

Unnamed: 0,tradeid,profit,stock
0,0,1000.0,APL
1,1,,JNJ
2,2,2000.0,


Unnamed: 0,tradeid,profit,stock
0,3,1000.0,APL
1,1,1500.0,
2,2,,


Unnamed: 0,tradeid,profit,stock
0,3,1000.0,APL
1,1,1500.0,JNJ
2,2,2000.0,
