From 0f223708e9386a696925bc5f9a4fc11b0bd10dc1 Mon Sep 17 00:00:00 2001 From: David de la Iglesia Castro Date: Mon, 29 Nov 2021 22:08:44 +0100 Subject: [PATCH 1/2] TabularData: Fix `drop_duplicates` for rich_text NaN. When filling missing values with `ui.rich_text` (i.e. in experiments show CMD), those values were not being correctly matched against `self._fill_value`) --- dvc/compare.py | 4 +++- tests/unit/test_tabular_data.py | 40 +++++++++++++++++++++++++++------ 2 files changed, 36 insertions(+), 8 deletions(-) diff --git a/dvc/compare.py b/dvc/compare.py index 3fd782b223..a02f6e1347 100644 --- a/dvc/compare.py +++ b/dvc/compare.py @@ -260,7 +260,9 @@ def drop_duplicates(self, axis: str = "rows"): cols_to_drop: List[str] = [] for n_col, col in enumerate(self.columns): # Cast to str because Text is not hashable error - unique_vals = {str(x) for x in col if x != self._fill_value} + unique_vals = { + str(x) for x in col if str(x) != self._fill_value + } if len(unique_vals) == 1: cols_to_drop.append(self.keys()[n_col]) self.drop(*cols_to_drop) diff --git a/tests/unit/test_tabular_data.py b/tests/unit/test_tabular_data.py index 1963e1edb0..22c8372bad 100644 --- a/tests/unit/test_tabular_data.py +++ b/tests/unit/test_tabular_data.py @@ -226,24 +226,24 @@ def test_dropna(axis, how, data, expected): ( "rows", [ - ["foo", "", ""], - ["foo", "foo", ""], + ["foo", "-", "-"], + ["foo", "foo", "-"], ["foo", "bar", "foobar"], ], ), - ("cols", [[""], ["foo"], ["foo"], ["bar"]]), + ("cols", [["-"], ["foo"], ["foo"], ["bar"]]), ], ) def test_drop_duplicates(axis, expected): - td = TabularData(["col-1", "col-2", "col-3"]) + td = TabularData(["col-1", "col-2", "col-3"], fill_value="-") td.extend( [["foo"], ["foo", "foo"], ["foo", "foo"], ["foo", "bar", "foobar"]] ) assert list(td) == [ - ["foo", "", ""], - ["foo", "foo", ""], - ["foo", "foo", ""], + ["foo", "-", "-"], + ["foo", "foo", "-"], + ["foo", "foo", "-"], ["foo", "bar", "foobar"], ] @@ -252,6 +252,32 @@ def test_drop_duplicates(axis, expected): assert list(td) == expected +def test_drop_duplicates_rich_text(): + from dvc.ui import ui + + td = TabularData(["col-1", "col-2", "col-3"], fill_value="-") + + td.extend( + [ + ["foo", None, ui.rich_text("-")], + ["foo", "foo"], + ["foo", "foo"], + ["foo", "bar", "foobar"], + ] + ) + + assert list(td) == [ + ["foo", "-", ui.rich_text("-")], + ["foo", "foo", "-"], + ["foo", "foo", "-"], + ["foo", "bar", "foobar"], + ] + + td.drop_duplicates("cols") + + assert list(td) == [["-"], ["foo"], ["foo"], ["bar"]] + + def test_dropna_invalid_axis(): td = TabularData(["col-1", "col-2", "col-3"]) From 251b9317e68250a371ce59417f1afc56932291e6 Mon Sep 17 00:00:00 2001 From: David de la Iglesia Castro Date: Tue, 7 Dec 2021 20:50:11 +0100 Subject: [PATCH 2/2] TabularData: Add `ignore_empty` flag to `drop_duplicates`. Configures whether to consider missing values as relevant or not. --- dvc/command/experiments/show.py | 2 +- dvc/compare.py | 8 +++---- tests/unit/test_tabular_data.py | 42 +++++++++++++++++++++++++++++---- 3 files changed, 43 insertions(+), 9 deletions(-) diff --git a/dvc/command/experiments/show.py b/dvc/command/experiments/show.py index 3ed78b5a4a..c189065499 100644 --- a/dvc/command/experiments/show.py +++ b/dvc/command/experiments/show.py @@ -473,7 +473,7 @@ def show_experiments( ) if kwargs.get("only_changed", False) or html: - td.drop_duplicates("cols") + td.drop_duplicates("cols", ignore_empty=False) html_args = {} if html: diff --git a/dvc/compare.py b/dvc/compare.py index a02f6e1347..59deb1ff7f 100644 --- a/dvc/compare.py +++ b/dvc/compare.py @@ -249,7 +249,7 @@ def dropna(self, axis: str = "rows", how="any"): else: self.drop(*to_drop) - def drop_duplicates(self, axis: str = "rows"): + def drop_duplicates(self, axis: str = "rows", ignore_empty: bool = True): if axis not in ["rows", "cols"]: raise ValueError( f"Invalid 'axis' value {axis}." @@ -260,9 +260,9 @@ def drop_duplicates(self, axis: str = "rows"): cols_to_drop: List[str] = [] for n_col, col in enumerate(self.columns): # Cast to str because Text is not hashable error - unique_vals = { - str(x) for x in col if str(x) != self._fill_value - } + unique_vals = {str(x) for x in col} + if ignore_empty and self._fill_value in unique_vals: + unique_vals -= {self._fill_value} if len(unique_vals) == 1: cols_to_drop.append(self.keys()[n_col]) self.drop(*cols_to_drop) diff --git a/tests/unit/test_tabular_data.py b/tests/unit/test_tabular_data.py index 22c8372bad..93984b9159 100644 --- a/tests/unit/test_tabular_data.py +++ b/tests/unit/test_tabular_data.py @@ -221,7 +221,7 @@ def test_dropna(axis, how, data, expected): @pytest.mark.parametrize( - "axis,expected", + "axis,expected,ignore_empty", [ ( "rows", @@ -230,11 +230,22 @@ def test_dropna(axis, how, data, expected): ["foo", "foo", "-"], ["foo", "bar", "foobar"], ], + True, + ), + ("cols", [["-"], ["foo"], ["foo"], ["bar"]], True), + ( + "cols", + [ + ["-", "-"], + ["foo", "-"], + ["foo", "-"], + ["bar", "foobar"], + ], + False, ), - ("cols", [["-"], ["foo"], ["foo"], ["bar"]]), ], ) -def test_drop_duplicates(axis, expected): +def test_drop_duplicates(axis, expected, ignore_empty): td = TabularData(["col-1", "col-2", "col-3"], fill_value="-") td.extend( [["foo"], ["foo", "foo"], ["foo", "foo"], ["foo", "bar", "foobar"]] @@ -247,11 +258,34 @@ def test_drop_duplicates(axis, expected): ["foo", "bar", "foobar"], ] - td.drop_duplicates(axis) + td.drop_duplicates(axis, ignore_empty=ignore_empty) assert list(td) == expected +def test_drop_duplicates_ignore_empty(): + td = TabularData(["col-1", "col-2", "col-3"], fill_value="-") + td.extend( + [["foo"], ["foo", "foo"], ["foo", "foo"], ["foo", "bar", "foobar"]] + ) + + assert list(td) == [ + ["foo", "-", "-"], + ["foo", "foo", "-"], + ["foo", "foo", "-"], + ["foo", "bar", "foobar"], + ] + + td.drop_duplicates("cols", ignore_empty=False) + + assert list(td) == [ + ["-", "-"], + ["foo", "-"], + ["foo", "-"], + ["bar", "foobar"], + ] + + def test_drop_duplicates_rich_text(): from dvc.ui import ui