diff --git a/tests/benchmarks/test_analogy.py b/tests/benchmarks/test_analogy.py
index 38af7f36..78bee6e6 100644
--- a/tests/benchmarks/test_analogy.py
+++ b/tests/benchmarks/test_analogy.py
@@ -92,5 +92,5 @@ def test_cli(self):
 
         # TODO: suppress concatenating timestamp or aggregate multiple runs
         from matplotlib import pyplot as plt
-        visualize.plot_accuracy("/tmp/vecto/benchmarks/analogical_reasoning")
+        visualize.plot_accuracy("/tmp/vecto/benchmarks/word_analogy")
         plt.savefig("/tmp/vecto/benchmarks/analogy.pdf", bbox_inches="tight")
diff --git a/tests/benchmarks/test_sequence_labeling.py b/tests/benchmarks/test_sequence_labeling.py
index 8f062409..9f5c71d1 100644
--- a/tests/benchmarks/test_sequence_labeling.py
+++ b/tests/benchmarks/test_sequence_labeling.py
@@ -58,5 +58,5 @@ def test_cli(self):
 
         from matplotlib import pyplot as plt
         # here the visualization only for the ner sub task.
-        visualize.plot_accuracy("/tmp/vecto/benchmarks/ner", key_secondary="experiment_setup.dataset")
+        visualize.plot_accuracy("/tmp/vecto/benchmarks/sequence_labeling/ner", key_secondary="experiment_setup.dataset")
         plt.savefig("/tmp/vecto/benchmarks/sequence_labeling.pdf", bbox_inches="tight")
diff --git a/vecto/benchmarks/base.py b/vecto/benchmarks/base.py
index 5b60fde5..5e1f43e5 100644
--- a/vecto/benchmarks/base.py
+++ b/vecto/benchmarks/base.py
@@ -25,7 +25,9 @@ def run_with_args(self, args):
             if os.path.isdir(args.path_out) or args.path_out.endswith("/"):
                 dataset = os.path.basename(os.path.normpath(args.dataset))
                 timestamp = get_time_str()
+                task = results[0]["experiment_setup"]["task"]
                 name_file_out = os.path.join(args.path_out,
+                                             task,
                                              dataset,
                                              timestamp,
                                              "results.json")