In [1]:
import pandas as pd

In [2]:
import numpy as np

In [3]:
# Load the sales table; the CSV column pandas labelled 'Unnamed: 0' becomes the row index.
sales1 = pd.read_csv('sales.csv', index_col='Unnamed: 0')

In [4]:
# Show the baseline frame before anything is modified.
sales1

Unnamed: 0,Mon,Tue,Wed,Thu,Fri
Steven,34,27,15,23,33
Mike,45,9,74,87,12
Andi,17,33,54,8,29
Paul,87,67,27,45,7


In [5]:
# Work on an independent copy so the edits made to sales2 leave sales1 untouched.
sales2 = sales1.copy()

In [6]:
# Overwrite two cells by (row, column) position so the frames differ in known places.
sales2.iloc[0, 1] = 100 # Steven / Tue: 27 -> 100
sales2.iloc[3, 2] = 200 # Paul / Wed: 27 -> 200

In [7]:
# Show the modified copy; only the two cells written above should differ from sales1.
sales2

Unnamed: 0,Mon,Tue,Wed,Thu,Fri
Steven,34,100,15,23,33
Mike,45,9,74,87,12
Andi,17,33,54,8,29
Paul,87,67,200,45,7


Comparing the two DataFrames with `==` returns a boolean DataFrame of the same shape, with `False` wherever the paired values differ.

In [8]:
# Element-wise equality: True where the paired cells match, False where they differ.
sales1 == sales2

Unnamed: 0,Mon,Tue,Wed,Thu,Fri
Steven,True,False,True,True,True
Mike,True,True,True,True,True
Andi,True,True,True,True,True
Paul,True,True,False,True,True


Or we can use `.where()` combined with `~` on the above comparison to show the values of either DataFrame where they differ. The result shows the values from whichever DataFrame we call the method on; cells that match become `NaN`.

In [9]:
# Keep sales1's values only where the frames disagree; matching cells are masked to NaN.
sales1.where(~(sales1 == sales2))

Unnamed: 0,Mon,Tue,Wed,Thu,Fri
Steven,,27.0,,,
Mike,,,,,
Andi,,,,,
Paul,,,27.0,,


In [10]:
# Same mask, but showing sales2's side of each differing cell.
sales2.where(~(sales1 == sales2))

Unnamed: 0,Mon,Tue,Wed,Thu,Fri
Steven,,100.0,,,
Mike,,,,,
Andi,,,,,
Paul,,,200.0,,


#### We can also concatenate the DataFrames side by side and use a Python function to highlight the cells where they differ.

In [11]:
# Put both frames side by side; each one gets its own top-level column key,
# so the result has (key, weekday) MultiIndex columns.
sales_comp = pd.concat(
    [sales1, sales2],
    axis='columns',
    keys=['Day1', 'Day2'],
)
sales_comp

Unnamed: 0_level_0,Day1,Day1,Day1,Day1,Day1,Day2,Day2,Day2,Day2,Day2
Unnamed: 0_level_1,Mon,Tue,Wed,Thu,Fri,Mon,Tue,Wed,Thu,Fri
Steven,34,27,15,23,33,34,100,15,23,33
Mike,45,9,74,87,12,45,9,74,87,12
Andi,17,33,54,8,29,17,33,54,8,29
Paul,87,67,27,45,7,87,67,200,45,7


In [12]:
def highlight_diff(data, color='yellow', baseline='Day1'):
    """Build a grid of CSS strings marking cells that differ from a baseline block.

    Meant to be passed to ``Styler.apply(..., axis=None)`` on a frame whose
    columns are a two-level MultiIndex of ``(key, column)``, such as the result
    of ``pd.concat([...], axis=1, keys=[...])``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``(key, column)`` MultiIndex columns.
    color : str, default 'yellow'
        CSS colour used as the background of differing cells.
    baseline : hashable, default 'Day1'
        Key of the column block that every block is compared against.

    Returns
    -------
    pandas.DataFrame
        Same index and columns as ``data``; holds
        ``'background-color: <color>'`` where a cell differs from the matching
        baseline cell and ``''`` everywhere else.

    Raises
    ------
    KeyError
        Raised by ``DataFrame.xs`` when ``baseline`` is not one of the keys.

    Notes
    -----
    The comparison uses ``!=`` semantics, so a cell that is NaN is reported as
    different even when the baseline cell is NaN as well.
    """
    css = 'background-color: {}'.format(color)
    # Pull out the baseline block; xs drops the key level, leaving the plain
    # inner column labels.
    reference = data.xs(baseline, axis='columns', level=-2)
    # level=1 broadcasts the single-level reference across every key by
    # matching on the inner column labels.
    differs = data.ne(reference, level=1)
    return pd.DataFrame(np.where(differs, css, ''),
                        index=data.index, columns=data.columns)

In [13]:
# axis=None hands the whole frame to highlight_diff in a single call, which is
# what lets it return a same-shaped grid of CSS strings.
sales_comp.style.apply(highlight_diff, axis=None)

Unnamed: 0_level_0,Day1,Day1,Day1,Day1,Day1,Day2,Day2,Day2,Day2,Day2
Unnamed: 0_level_1,Mon,Tue,Wed,Thu,Fri,Mon,Tue,Wed,Thu,Fri
Steven,34,27,15,23,33,34,100,15,23,33
Mike,45,9,74,87,12,45,9,74,87,12
Andi,17,33,54,8,29,17,33,54,8,29
Paul,87,67,27,45,7,87,67,200,45,7


In [14]:
def diff_pd(df1, df2):
    """Identify differences between two pandas DataFrames.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames to compare. Their column labels must be equal and in the same
        order.

    Returns
    -------
    pandas.DataFrame or None
        ``None`` when ``df1.equals(df2)`` (after any dtype conversion).
        Otherwise a frame with one row per differing cell, indexed by a
        MultiIndex named ``('id', 'col')`` holding the row and column label,
        with the ``df1`` value in ``'from'`` and the ``df2`` value in ``'to'``.
        Cells that are missing in both frames are not reported.

    Raises
    ------
    AssertionError
        If the column labels of the two frames differ.

    Warns
    -----
    UserWarning
        If the column dtypes differ; ``df2`` is then cast to the dtypes of
        ``df1`` before comparing.
    """
    import warnings  # local import keeps this cell runnable on its own

    # Index.equals copes with differing column counts, where an element-wise
    # `==` would raise an unrelated length error. The explicit raise keeps the
    # check alive under `python -O`.
    if not df1.columns.equals(df2.columns):
        raise AssertionError("DataFrame column names are different")

    if (df1.dtypes != df2.dtypes).any():
        warnings.warn("Data Types are different, trying to convert",
                      stacklevel=2)
        df2 = df2.astype(df1.dtypes)

    if df1.equals(df2):
        return None

    # NaN != NaN evaluates to True, so drop cells that are missing on both sides.
    diff_mask = (df1 != df2) & ~(df1.isnull() & df2.isnull())

    # Row-major positions of the differing cells; the labels are built from
    # them directly instead of stacking a full-size boolean Series.
    rows, cols = np.where(diff_mask.to_numpy())
    locations = pd.MultiIndex.from_arrays(
        [df1.index[rows], df1.columns[cols]], names=['id', 'col'])

    return pd.DataFrame({'from': df1.to_numpy()[rows, cols],
                         'to': df2.to_numpy()[rows, cols]},
                        index=locations)

In [15]:
# Summarise the cell-level changes from sales1 to sales2 as a from/to table keyed by (row, column).
diff_pd(sales1, sales2)

Unnamed: 0_level_0,Unnamed: 1_level_0,from,to
id,col,Unnamed: 2_level_1,Unnamed: 3_level_1
Steven,Tue,27,100
Paul,Wed,27,200
