From 0f223708e9386a696925bc5f9a4fc11b0bd10dc1 Mon Sep 17 00:00:00 2001
From: David de la Iglesia Castro <daviddelaiglesiacastro@gmail.com>
Date: Mon, 29 Nov 2021 22:08:44 +0100
Subject: [PATCH 1/2] TabularData: Fix `drop_duplicates` for rich_text NaN.

When filling missing values with `ui.rich_text` (i.e. in experiments show CMD), those values were not being correctly matched against `self._fill_value`)
---
 dvc/compare.py                  |  4 +++-
 tests/unit/test_tabular_data.py | 40 +++++++++++++++++++++++++++------
 2 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/dvc/compare.py b/dvc/compare.py
index 3fd782b223..a02f6e1347 100644
--- a/dvc/compare.py
+++ b/dvc/compare.py
@@ -260,7 +260,9 @@ def drop_duplicates(self, axis: str = "rows"):
             cols_to_drop: List[str] = []
             for n_col, col in enumerate(self.columns):
                 # Cast to str because Text is not hashable error
-                unique_vals = {str(x) for x in col if x != self._fill_value}
+                unique_vals = {
+                    str(x) for x in col if str(x) != self._fill_value
+                }
                 if len(unique_vals) == 1:
                     cols_to_drop.append(self.keys()[n_col])
             self.drop(*cols_to_drop)
diff --git a/tests/unit/test_tabular_data.py b/tests/unit/test_tabular_data.py
index 1963e1edb0..22c8372bad 100644
--- a/tests/unit/test_tabular_data.py
+++ b/tests/unit/test_tabular_data.py
@@ -226,24 +226,24 @@ def test_dropna(axis, how, data, expected):
         (
             "rows",
             [
-                ["foo", "", ""],
-                ["foo", "foo", ""],
+                ["foo", "-", "-"],
+                ["foo", "foo", "-"],
                 ["foo", "bar", "foobar"],
             ],
         ),
-        ("cols", [[""], ["foo"], ["foo"], ["bar"]]),
+        ("cols", [["-"], ["foo"], ["foo"], ["bar"]]),
     ],
 )
 def test_drop_duplicates(axis, expected):
-    td = TabularData(["col-1", "col-2", "col-3"])
+    td = TabularData(["col-1", "col-2", "col-3"], fill_value="-")
     td.extend(
         [["foo"], ["foo", "foo"], ["foo", "foo"], ["foo", "bar", "foobar"]]
     )
 
     assert list(td) == [
-        ["foo", "", ""],
-        ["foo", "foo", ""],
-        ["foo", "foo", ""],
+        ["foo", "-", "-"],
+        ["foo", "foo", "-"],
+        ["foo", "foo", "-"],
         ["foo", "bar", "foobar"],
     ]
 
@@ -252,6 +252,32 @@ def test_drop_duplicates(axis, expected):
     assert list(td) == expected
 
 
+def test_drop_duplicates_rich_text():
+    from dvc.ui import ui
+
+    td = TabularData(["col-1", "col-2", "col-3"], fill_value="-")
+
+    td.extend(
+        [
+            ["foo", None, ui.rich_text("-")],
+            ["foo", "foo"],
+            ["foo", "foo"],
+            ["foo", "bar", "foobar"],
+        ]
+    )
+
+    assert list(td) == [
+        ["foo", "-", ui.rich_text("-")],
+        ["foo", "foo", "-"],
+        ["foo", "foo", "-"],
+        ["foo", "bar", "foobar"],
+    ]
+
+    td.drop_duplicates("cols")
+
+    assert list(td) == [["-"], ["foo"], ["foo"], ["bar"]]
+
+
 def test_dropna_invalid_axis():
     td = TabularData(["col-1", "col-2", "col-3"])
 

From 251b9317e68250a371ce59417f1afc56932291e6 Mon Sep 17 00:00:00 2001
From: David de la Iglesia Castro <daviddelaiglesiacastro@gmail.com>
Date: Tue, 7 Dec 2021 20:50:11 +0100
Subject: [PATCH 2/2] TabularData: Add `ignore_empty` flag to
 `drop_duplicates`.

Configures whether to consider missing values as relevant or not.
---
 dvc/command/experiments/show.py |  2 +-
 dvc/compare.py                  |  8 +++----
 tests/unit/test_tabular_data.py | 42 +++++++++++++++++++++++++++++----
 3 files changed, 43 insertions(+), 9 deletions(-)

diff --git a/dvc/command/experiments/show.py b/dvc/command/experiments/show.py
index 3ed78b5a4a..c189065499 100644
--- a/dvc/command/experiments/show.py
+++ b/dvc/command/experiments/show.py
@@ -473,7 +473,7 @@ def show_experiments(
     )
 
     if kwargs.get("only_changed", False) or html:
-        td.drop_duplicates("cols")
+        td.drop_duplicates("cols", ignore_empty=False)
 
     html_args = {}
     if html:
diff --git a/dvc/compare.py b/dvc/compare.py
index a02f6e1347..59deb1ff7f 100644
--- a/dvc/compare.py
+++ b/dvc/compare.py
@@ -249,7 +249,7 @@ def dropna(self, axis: str = "rows", how="any"):
         else:
             self.drop(*to_drop)
 
-    def drop_duplicates(self, axis: str = "rows"):
+    def drop_duplicates(self, axis: str = "rows", ignore_empty: bool = True):
         if axis not in ["rows", "cols"]:
             raise ValueError(
                 f"Invalid 'axis' value {axis}."
@@ -260,9 +260,9 @@ def drop_duplicates(self, axis: str = "rows"):
             cols_to_drop: List[str] = []
             for n_col, col in enumerate(self.columns):
                 # Cast to str because Text is not hashable error
-                unique_vals = {
-                    str(x) for x in col if str(x) != self._fill_value
-                }
+                unique_vals = {str(x) for x in col}
+                if ignore_empty and self._fill_value in unique_vals:
+                    unique_vals -= {self._fill_value}
                 if len(unique_vals) == 1:
                     cols_to_drop.append(self.keys()[n_col])
             self.drop(*cols_to_drop)
diff --git a/tests/unit/test_tabular_data.py b/tests/unit/test_tabular_data.py
index 22c8372bad..93984b9159 100644
--- a/tests/unit/test_tabular_data.py
+++ b/tests/unit/test_tabular_data.py
@@ -221,7 +221,7 @@ def test_dropna(axis, how, data, expected):
 
 
 @pytest.mark.parametrize(
-    "axis,expected",
+    "axis,expected,ignore_empty",
     [
         (
             "rows",
@@ -230,11 +230,22 @@ def test_dropna(axis, how, data, expected):
                 ["foo", "foo", "-"],
                 ["foo", "bar", "foobar"],
             ],
+            True,
+        ),
+        ("cols", [["-"], ["foo"], ["foo"], ["bar"]], True),
+        (
+            "cols",
+            [
+                ["-", "-"],
+                ["foo", "-"],
+                ["foo", "-"],
+                ["bar", "foobar"],
+            ],
+            False,
         ),
-        ("cols", [["-"], ["foo"], ["foo"], ["bar"]]),
     ],
 )
-def test_drop_duplicates(axis, expected):
+def test_drop_duplicates(axis, expected, ignore_empty):
     td = TabularData(["col-1", "col-2", "col-3"], fill_value="-")
     td.extend(
         [["foo"], ["foo", "foo"], ["foo", "foo"], ["foo", "bar", "foobar"]]
@@ -247,11 +258,34 @@ def test_drop_duplicates(axis, expected):
         ["foo", "bar", "foobar"],
     ]
 
-    td.drop_duplicates(axis)
+    td.drop_duplicates(axis, ignore_empty=ignore_empty)
 
     assert list(td) == expected
 
 
+def test_drop_duplicates_ignore_empty():
+    td = TabularData(["col-1", "col-2", "col-3"], fill_value="-")
+    td.extend(
+        [["foo"], ["foo", "foo"], ["foo", "foo"], ["foo", "bar", "foobar"]]
+    )
+
+    assert list(td) == [
+        ["foo", "-", "-"],
+        ["foo", "foo", "-"],
+        ["foo", "foo", "-"],
+        ["foo", "bar", "foobar"],
+    ]
+
+    td.drop_duplicates("cols", ignore_empty=False)
+
+    assert list(td) == [
+        ["-", "-"],
+        ["foo", "-"],
+        ["foo", "-"],
+        ["bar", "foobar"],
+    ]
+
+
 def test_drop_duplicates_rich_text():
     from dvc.ui import ui