Skip to content

Commit

Permalink
feat: add data_profile to diff objects
Browse files Browse the repository at this point in the history
  • Loading branch information
pckhoi committed Jan 17, 2022
1 parent ccde39f commit 13458a4
Show file tree
Hide file tree
Showing 7 changed files with 42 additions and 5 deletions.
2 changes: 1 addition & 1 deletion doc/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
author = 'Khoi Pham'

# The full version, including alpha/beta/rc tags
release = '0.7.6.1'
release = '0.8.3'


# -- General configuration ---------------------------------------------------
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = wrgl
version = 0.7.6.1
version = 0.8.3
author = Khoi Pham
author_email = pckhoi@gmail.com
description = Data matching utilities
Expand Down
30 changes: 30 additions & 0 deletions wrgl/diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,34 @@ class RowDiff(object):
off2: int


@attr.s(
    auto_attribs=True,
    field_transformer=field_transformer(globals()),
)
class ColumnProfileDiff(object):
    """Changes in the profile of a single column.

    :ivar str name: column name
    :ivar bool new_addition: whether this is a newly added column
    :ivar bool removed: whether this is a removed column
    :ivar list[dict] stats: list of changes in statistics
    """

    # Keep the declaration order: with ``auto_attribs=True`` attrs builds the
    # generated ``__init__`` signature from the order of these annotations.
    name: str
    new_addition: bool
    removed: bool
    stats: typing.List[typing.Dict]


@attr.s(
    auto_attribs=True,
    field_transformer=field_transformer(globals()),
)
class TableProfileDiff(object):
    """Changes in the profile of a table.

    :ivar int old_rows_count: number of rows in the old table
    :ivar int new_rows_count: number of rows in the new table
    :ivar list[ColumnProfileDiff] columns: list of changes in column profile
    """

    # Keep the declaration order: with ``auto_attribs=True`` attrs builds the
    # generated ``__init__`` signature from the order of these annotations.
    old_rows_count: int
    new_rows_count: int
    columns: typing.List[ColumnProfileDiff]


@attr.s(auto_attribs=True, field_transformer=field_transformer(globals()))
class DiffResult(object):
"""Diff result. Learn more at `diff endpoint`_
Expand All @@ -29,6 +57,7 @@ class DiffResult(object):
:ivar list[str] columns: list of column names of the first table
:ivar list[str] old_columns: list of column names of the second table
:ivar list[RowDiff] row_diff: list of rows that changed
:ivar TableProfileDiff data_profile: changes in data profile
"""
table_sum: str
old_table_sum: str
Expand All @@ -37,6 +66,7 @@ class DiffResult(object):
old_columns: typing.List[str]
columns: typing.List[str]
row_diff: typing.List[RowDiff]
data_profile: TableProfileDiff

@property
def primary_key(self) -> typing.List[str]:
Expand Down
5 changes: 4 additions & 1 deletion wrgl/diffreader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from wrgl import repository
from wrgl.commit import Table
from wrgl.diff import DiffResult
from wrgl.diff import DiffResult, TableProfileDiff
from wrgl.coldiff import ColDiff


Expand Down Expand Up @@ -208,13 +208,15 @@ class DiffReader(object):
:var RowIterator added_rows: iterator for added rows
:var RowIterator removed_rows: iterator for removed rows
:var ModifiedRowIterator modified_rows: iterator for modified rows
:var TableProfileDiff data_profile: changes in data profile
"""

column_changes: ColumnChanges
pk_changes: ColumnChanges
added_rows: RowIterator or None = None
removed_rows: RowIterator or None = None
modified_rows: ModifiedRowIterator or None = None
data_profile: TableProfileDiff or None = None

def __init__(self, repo: "repository.Repository", com_sum1: str, com_sum2: str, fetch_size: int = 100) -> None:
"""
Expand All @@ -223,6 +225,7 @@ def __init__(self, repo: "repository.Repository", com_sum1: str, com_sum2: str,
:param str com_sum2: checksum of the second (older) commit
"""
dr = repo.diff(com_sum1, com_sum2)
self.data_profile = dr.data_profile
old_tbl = Table(columns=dr.old_columns, pk=dr.old_pk)
new_tbl = Table(columns=dr.columns, pk=dr.pk)
cd = ColDiff(old_tbl, new_tbl)
Expand Down
4 changes: 4 additions & 0 deletions wrgl/repository_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,10 @@ def test_diff_reader(self):
[('1', '1'), ('q', 'q'), ('u', 'w'), (None, 'e'), ('r', None)],
[('2', '2'), ('a', 'a'), ('s', 's'), (None, 'd'), ('f', None)]
])
self.assertIsNotNone(dr.data_profile)
self.assertEqual(dr.data_profile.old_rows_count, 3)
self.assertEqual(dr.data_profile.new_rows_count, 3)
self.assertEqual(len(dr.data_profile.columns), 5)

def test_diff_reader_no_changes(self):
with self.commit("main", "initial commit", ["a"]) as writer:
Expand Down
2 changes: 1 addition & 1 deletion wrgl/serialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def _deserialize(data, serializer_cls):
]
data_stack = [
(kwargs, k, fields_dict[k], v)
for k, v in data if k != 'meta'
for k, v in data if k in fields_dict and k != 'meta'
]
while len(data_stack) > 0:
parent, name, field, value = data_stack.pop()
Expand Down
2 changes: 1 addition & 1 deletion wrgl/serialize_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def test_loads_simple(self):
def test_loads_with_null(self):
self.assertEqual(
json_loads(
'{"user": null, "remotes": null, "branch": null}', Config
'{"user": null, "remotes": null, "branch": null, "nonExistentKey": null}', Config
),
Config()
)
Expand Down

0 comments on commit 13458a4

Please sign in to comment.