In [None]:
# default_exp core

# deltaframe

> Builds the delta between two or more pandas dataframes

In [None]:
#hide
from nbdev.imports import *
from nbdev.showdoc import *

In [None]:
#export
import pandas as pd
import numpy as np

In [None]:
#export
def get_added_rows(df_old, df_new, unique_id, trans_col="transaction", trans_vaL="added"):
    """Return the rows of `df_new` whose `unique_id` does not occur in `df_old`.

    Parameters
    ----------
    df_old : pandas.DataFrame
        Previous snapshot. Only its `unique_id` column(s) and its column
        order are used.
    df_new : pandas.DataFrame
        Current snapshot. The returned rows are taken from this frame.
    unique_id : label or list of labels
        Column(s) identifying a row; must be present in both frames.
    trans_col : str, default "transaction"
        Name of the column that receives the transaction marker.
    trans_vaL : object, default "added"
        Marker written to `trans_col`. The spelling (capital "L") is kept
        because callers pass this argument by keyword.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of `df_old` (columns that `df_new` lacks
        are filled with NaN) followed by `trans_col`, in `df_new` row order
        and with a fresh 0..n-1 index. Neither input frame is modified.
    """
    cols = list(df_old.columns)
    keys = list(unique_id) if pd.api.types.is_list_like(unique_id) else [unique_id]
    # Anti-join with `df_new` on the left: a left merge never pads the left
    # frame's columns with NaN, so integer columns keep their dtype (an outer
    # merge would turn them into floats). De-duplicating the key frame keeps
    # the merge from multiplying rows.
    known_keys = df_old[keys].drop_duplicates()
    flagged = df_new.merge(known_keys, how="left", on=keys, indicator=True)
    # reindex/reset_index return a new frame, so the assignment below cannot
    # write into a view of the merge result.
    new_rows = (flagged[flagged["_merge"] == "left_only"]
                .reindex(columns=cols)
                .reset_index(drop=True))
    new_rows[trans_col] = trans_vaL
    return new_rows

In [None]:
# Example snapshots used by the checks below, written one record per line.
# df1 is the old state, df2 the new one: "002" and "004" changed, "003" is gone,
# "005" is new and "001" appears a second time with a later date.
snapshot_columns = ["date", "id", "quantity", "color"]
df1 = pd.DataFrame(
    [
        ("2020-11-24", "001", 22, "Yellow"),
        ("2020-11-24", "002", 8, "Orange"),
        ("2020-11-24", "003", 7, "Red"),
        ("2020-11-24", "004", 10, "Yellow"),
    ],
    columns=snapshot_columns,
)
df2 = pd.DataFrame(
    [
        ("2020-11-24", "001", 22, "Yellow"),
        ("2020-11-25", "002", 6, "Orange"),
        ("2020-11-24", "004", 5, "Red"),
        ("2020-11-24", "005", 10, "Pink"),
        ("2020-11-25", "001", 22, "Yellow"),
    ],
    columns=snapshot_columns,
)

In [None]:
# Show the old snapshot (passed as `df_old` in the cells below).
df1

Unnamed: 0,date,id,quantity,color
0,2020-11-24,1,22,Yellow
1,2020-11-24,2,8,Orange
2,2020-11-24,3,7,Red
3,2020-11-24,4,10,Yellow


In [None]:
# Show the new snapshot (passed as `df_new` in the cells below).
df2

Unnamed: 0,date,id,quantity,color
0,2020-11-24,1,22,Yellow
1,2020-11-25,2,6,Orange
2,2020-11-24,4,5,Red
3,2020-11-24,5,10,Pink
4,2020-11-25,1,22,Yellow


In [None]:
added_rows = get_added_rows(df_old=df1, df_new=df2, unique_id="id")
# Only id "005" exists in df2 but not in df1; the expected date is the one used in the df2 fixture.
df_added = pd.DataFrame({"date":["2020-11-24"],"id":["005"],"quantity":[10],"color":["Pink"],"transaction":["added"]})
# Compare the cell values; the index and the int/float dtype of `quantity` are not part of the check.
pd.testing.assert_frame_equal(added_rows.reset_index(drop=True), df_added, check_dtype=False)

In [None]:
#export
def get_removed_rows(df_old, df_new, unique_id, trans_col="transaction", trans_val="removed"):
    """Return the rows of `df_old` whose `unique_id` no longer occurs in `df_new`.

    Parameters
    ----------
    df_old : pandas.DataFrame
        Previous snapshot. The returned rows are taken from this frame.
    df_new : pandas.DataFrame
        Current snapshot. Only its `unique_id` column(s) are used.
    unique_id : label or list of labels
        Column(s) identifying a row; must be present in both frames.
    trans_col : str, default "transaction"
        Name of the column that receives the transaction marker.
    trans_val : object, default "removed"
        Marker written to `trans_col`.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of `df_old` followed by `trans_col`, in
        `df_old` row order and with a fresh 0..n-1 index. Neither input frame
        is modified.
    """
    cols = list(df_old.columns)
    keys = list(unique_id) if pd.api.types.is_list_like(unique_id) else [unique_id]
    # Anti-join with `df_old` on the left: a left merge never pads the left
    # frame's columns with NaN, so integer columns keep their dtype (an outer
    # merge would turn them into floats). De-duplicating the key frame keeps
    # the merge from multiplying rows.
    surviving_keys = df_new[keys].drop_duplicates()
    flagged = df_old.merge(surviving_keys, how="left", on=keys, indicator=True)
    # reindex/reset_index return a new frame, so the assignment below cannot
    # write into a view of the merge result.
    removed_rows = (flagged[flagged["_merge"] == "left_only"]
                    .reindex(columns=cols)
                    .reset_index(drop=True))
    removed_rows[trans_col] = trans_val
    return removed_rows

In [None]:
removed_rows = get_removed_rows(df_old=df1, df_new=df2, unique_id="id")
# Only id "003" exists in df1 but not in df2; the expected date is the one used in the df1 fixture.
df_removed = pd.DataFrame({"date":["2020-11-24"],"id":["003"],"quantity":[7],"color":["Red"],"transaction":["removed"]})
# Compare the cell values; the index and the int/float dtype of `quantity` are not part of the check.
pd.testing.assert_frame_equal(removed_rows.reset_index(drop=True), df_removed, check_dtype=False)


In [None]:
#export 
def get_modified_rows(df_old, df_new, unique_id, added_rows=None, trans_col="transaction", trans_val="modified"):
    """Return the rows of `df_new` that have no identical row in `df_old`.

    Two rows are identical when they agree on every column the two frames
    have in common. Unless `added_rows` is given, rows with a brand-new
    `unique_id` are reported as well, because they too have no counterpart
    in `df_old`.

    Parameters
    ----------
    df_old : pandas.DataFrame
        Previous snapshot.
    df_new : pandas.DataFrame
        Current snapshot. The returned rows are taken from this frame.
    unique_id : label or list of labels
        Column(s) identifying a row. Only used together with `added_rows`.
    added_rows : pandas.DataFrame, optional
        Rows already classified as added (e.g. the result of
        `get_added_rows`). Rows of `df_new` whose `unique_id` occurs in this
        frame are excluded from the result.
    trans_col : str, default "transaction"
        Name of the column that receives the transaction marker.
    trans_val : object, default "modified"
        Marker written to `trans_col`.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns of `df_new` followed by `trans_col`, in
        `df_new` row order and with a fresh 0..n-1 index. Neither input frame
        is modified.
    """
    cols = list(df_new.columns)
    if added_rows is not None and len(added_rows) > 0:
        keys = list(unique_id) if pd.api.types.is_list_like(unique_id) else [unique_id]
        # Exclude added rows by comparing the key column(s) only. Testing every
        # cell of the frame against the ids would also discard rows in which
        # some other column merely happens to hold the same value as an id.
        added_keys = added_rows[keys].drop_duplicates()
        flagged = df_new.merge(added_keys, how="left", on=keys, indicator=True)
        df_new = flagged.loc[flagged["_merge"] == "left_only", cols]
    # Anti-join on all shared columns with `df_new` on the left, so its row
    # order and dtypes are preserved. With no explicit `on`, merge uses the
    # columns both frames have in common.
    shared = [col for col in cols if col in df_old.columns]
    old_values = df_old[shared].drop_duplicates()
    flagged = df_new.merge(old_values, how="left", indicator=True)
    # reindex/reset_index return a new frame, so the assignment below cannot
    # write into a view of the merge result.
    modified_rows = (flagged[flagged["_merge"] == "left_only"]
                     .reindex(columns=cols)
                     .reset_index(drop=True))
    modified_rows[trans_col] = trans_val
    return modified_rows

In [None]:
modified_rows = get_modified_rows(df_old=df1, df_new=df2, unique_id="id")

# Without `added_rows`, every df2 row that has no identical row in df1 is reported:
# the changed "002" and "004", the brand-new "005" and the second "001" row dated 2020-11-25.
df_modified = pd.DataFrame({"date":["2020-11-24", "2020-11-24", "2020-11-25", "2020-11-25"],"id":["004", "005", "001", "002"],"quantity":[5,10,22,6],"color":["Red", "Pink", "Yellow", "Orange"],"transaction":["modified", "modified", "modified", "modified"]})

# Row order is not what is being checked, so sort both sides the same way and compare the cell values.
pd.testing.assert_frame_equal(modified_rows.sort_values(["date", "id"]).reset_index(drop=True), df_modified, check_dtype=False)

In [None]:
modified_rows = get_modified_rows(df_old=df1, df_new=df2, unique_id="id", added_rows=added_rows)

# With `added_rows`, the brand-new "005" is left out; the second "001" row dated 2020-11-25 still counts as modified.
df_modified = pd.DataFrame({"date":["2020-11-24", "2020-11-25", "2020-11-25"],"id":["004", "001", "002"],"quantity":[5,22,6],"color":["Red", "Yellow", "Orange"],"transaction":["modified", "modified", "modified"]})

# Row order is not what is being checked, so sort both sides the same way and compare the cell values.
pd.testing.assert_frame_equal(modified_rows.sort_values(["date", "id"]).reset_index(drop=True), df_modified, check_dtype=False)


In [None]:
#export 
def logging(df_log, df_old, df_new, unique_id, trans_col="transaction", trans_val_added="added", trans_val_removed="removed", trans_val_modified="modified", sort_by=None):
    """Create or extend a transaction log describing how `df_new` differs from `df_old`.

    Parameters
    ----------
    df_log : pandas.DataFrame or None
        Existing log, which must contain `trans_col`. With ``None`` a new log
        is started: a copy of `df_old` with every row marked
        `trans_val_added` (`df_new` is not used in that case).
    df_old : pandas.DataFrame
        Previous snapshot.
    df_new : pandas.DataFrame
        Current snapshot.
    unique_id : label or list of labels
        Column(s) identifying a row; forwarded to the row-classification helpers.
    trans_col : str, default "transaction"
        Name of the column holding the transaction marker.
    trans_val_added, trans_val_removed, trans_val_modified : object
        Markers written to `trans_col` for added, removed and modified rows.
    sort_by : label or list of labels, optional
        When truthy, the log is sorted by these column(s) before it is returned.

    Returns
    -------
    pandas.DataFrame
        The new or extended log. The input frames are not modified.

    Raises
    ------
    ValueError
        If `df_log` is given but has no `trans_col` column.

    Notes
    -----
    The function name shadows the standard-library ``logging`` module in any
    namespace that star-imports it; it is kept for backward compatibility.
    """
    if df_log is None:
        # Copy first: adding the marker column to `df_old` itself would
        # silently change the caller's frame.
        df_log = df_old.copy()
        df_log[trans_col] = trans_val_added
    else:
        if trans_col not in df_log.columns:
            raise ValueError(f"df_log has no {trans_col!r} column")
        # Duplicates are detected on the data columns only, ignoring the marker.
        subset = [col for col in df_log.columns if col != trans_col]
        added_rows = get_added_rows(df_old=df_old, df_new=df_new, unique_id=unique_id, trans_col=trans_col, trans_vaL=trans_val_added)
        removed_rows = get_removed_rows(df_old=df_old, df_new=df_new, unique_id=unique_id, trans_col=trans_col, trans_val=trans_val_removed)
        modified_rows = get_modified_rows(df_new=df_new, df_old=df_old, unique_id=unique_id, added_rows=added_rows, trans_col=trans_col, trans_val=trans_val_modified)
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        df_log = pd.concat([df_log, modified_rows], ignore_index=True)
        # De-duplicate before the added/removed rows go in: a modified row
        # whose data is already in the log is not logged twice, while added
        # and removed rows are always appended (a removed row carries the same
        # data as an earlier entry for it and would otherwise be dropped here).
        df_log = df_log.drop_duplicates(subset=subset, keep="first")
        df_log = pd.concat([df_log, added_rows, removed_rows], ignore_index=True)
    if sort_by:
        df_log = df_log.sort_values(by=sort_by)
    return df_log

In [None]:
df_log = logging(df_log=None, df_old=df1, df_new=df2, unique_id="id")

# A fresh log is the old snapshot with every row marked "added", so the dates are those of the df1 fixture.
logged_df = pd.DataFrame({"date":["2020-11-24", "2020-11-24", "2020-11-24", "2020-11-24"],"id":["001", "002", "003", "004"],"quantity":[22,8,7,10],"color":["Yellow", "Orange", "Red", "Yellow"],"transaction":["added", "added", "added", "added"]})

# Compare the cell values of the log with the expected frame.
pd.testing.assert_frame_equal(df_log.reset_index(drop=True), logged_df, check_dtype=False)

In [None]:
# Second pass: extend the existing log with the df1 -> df2 differences.
# NOTE: this cell rebinds `df_log` to a value derived from itself, so re-running it appends to the log again.
df_log = logging(df_log=df_log, df_old=df1, df_new=df2, unique_id="id")
df_log

Unnamed: 0,date,id,quantity,color,transaction
0,2013-11-24,1,22.0,Yellow,added
1,2013-11-24,2,8.0,Orange,added
2,2013-11-24,3,7.0,Red,added
3,2013-11-24,4,10.0,Yellow,added
4,2013-11-25,2,6.0,Orange,modified
5,2013-11-24,4,5.0,Red,modified
6,2013-11-24,5,10.0,Pink,added
7,2013-11-24,3,7.0,Red,removed


In [None]:
# Same call again, this time sorted by the "date" column.
# NOTE: this cell rebinds `df_log` to a value derived from itself, so re-running it appends to the log again.
df_log = logging(df_log=df_log, df_old=df1, df_new=df2, unique_id="id", sort_by=["date"])
df_log

Unnamed: 0,date,id,quantity,color,transaction
0,2013-11-24,1,22.0,Yellow,added
1,2013-11-24,2,8.0,Orange,added
2,2013-11-24,3,7.0,Red,added
3,2013-11-24,4,10.0,Yellow,added
5,2013-11-24,4,5.0,Red,modified
6,2013-11-24,5,10.0,Pink,added
7,2013-11-24,5,10.0,Pink,added
8,2013-11-24,3,7.0,Red,removed
4,2013-11-25,2,6.0,Orange,modified


In [None]:
# Run nbdev's notebook2script; the output below lists the notebooks it converted.
from nbdev.export import notebook2script; notebook2script()

Converted 00_core.ipynb.
Converted index.ipynb.
