new cli for sequence labelling

vecto-ai · Jan 15, 2019 · bc98647 · bc98647
1 parent 21d0927
commit bc98647
Show file tree

Hide file tree

Showing 6 changed files with 59 additions and 48 deletions.
diff --git a/tests/benchmarks/test_sequence_labeling.py b/tests/benchmarks/test_sequence_labeling.py
@@ -9,6 +9,7 @@
 from vecto.embeddings import load_from_dir
 from tests.test_setup import run_module
 
+
 path_sequence_labeling_dataset = path.join('.', 'tests', 'data', 'benchmarks', 'sequence_labeling')
 path_sequence_labeling_dataset_ner = path.join('.', 'tests', 'data', 'benchmarks', 'sequence_labeling', 'ner') # sequence labeling needs a sub-task to be specified (pos, chunk, or ner)
 path_emb = path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')
@@ -28,25 +29,32 @@ def test_api(self):
     def test_cli(self):
         sio = io.StringIO()
         with contextlib.redirect_stdout(sio):
-            run_module("vecto.benchmarks.sequence_labeling",
+            run_module("vecto",
+                       "benchmark",
+                       "sequence_labelling",
                        path_emb,
                        path_sequence_labeling_dataset_ner,
                        "--path_out", "/tmp/vecto/benchmarks/")
 
         sio = io.StringIO()
         with contextlib.redirect_stdout(sio):
-            run_module("vecto.benchmarks.sequence_labeling",
+            run_module("vecto",
+                       "benchmark",
+                       "sequence_labelling",
                        path_emb,
                        path_sequence_labeling_dataset_ner,
                        "--path_out", "/tmp/vecto/benchmarks/")
 
         with self.assertRaises(FileNotFoundError):
             sio = io.StringIO()
             with contextlib.redirect_stdout(sio):
-                run_module("vecto.benchmarks.sequence_labeling",
+                run_module("vecto",
+                           "benchmark",
+                           "sequence_labelling",
                            path_emb + "NONEXISTING",
                            path_sequence_labeling_dataset_ner,
-                           "--path_out", "/tmp/vecto/benchmarks/")
+                           "--path_out",
+                           "/tmp/vecto/benchmarks/")
 
         from matplotlib import pyplot as plt
         # here the visualization only for the ner sub task.

diff --git a/tests/benchmarks/test_similarity.py b/tests/benchmarks/test_similarity.py
@@ -9,6 +9,7 @@
 from vecto.embeddings import load_from_dir
 from tests.test_setup import run_module
 
+
 path_similarity_dataset = path.join('.', 'tests', 'data', 'benchmarks', 'similarity')
 path_emb = path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')
 
@@ -35,22 +36,28 @@ def test_api(self):
     def test_cli(self):
         sio = io.StringIO()
         with contextlib.redirect_stdout(sio):
-            run_module("vecto.benchmarks.similarity",
+            run_module("vecto",
+                       "benchmark",
+                       "similarity",
                        path_emb,
                        path_similarity_dataset,
                        "--path_out", "/tmp/vecto/benchmarks/")
 
         sio = io.StringIO()
         with contextlib.redirect_stdout(sio):
-            run_module("vecto.benchmarks.similarity",
+            run_module("vecto",
+                       "benchmark",
+                       "similarity",
                        path_emb,
                        path_similarity_dataset,
                        "--path_out", "/tmp/vecto/benchmarks/tmp")
 
         with self.assertRaises(FileNotFoundError):
             sio = io.StringIO()
             with contextlib.redirect_stdout(sio):
-                run_module("vecto.benchmarks.similarity",
+                run_module("vecto",
+                           "benchmark",
+                           "similarity",
                            path_emb + "NONEXISTING",
                            path_similarity_dataset,
                            "--path_out", "/tmp/vecto/benchmarks/")

diff --git a/vecto/benchmarks/__init__.py b/vecto/benchmarks/__init__.py
@@ -44,8 +44,14 @@ def _run(args=None):
         print("running similarity")
         from .similarity import run
         run(options, unknownargs)
+    elif args.name == "sequence_labelling":
+        print("running sequence labelling")
+        from .sequence_labeling import run
+        run(options, unknownargs)
     else:
+        print("unknown benchmark name")
         list_benhcmarks()
+        exit(-1)
     # check if all is specified - then run all
     # if benchmark name matches - run corresponding module
     # list all available benchmarks
diff --git a/vecto/benchmarks/sequence_labeling/__init__.py b/vecto/benchmarks/sequence_labeling/__init__.py
@@ -1 +1,31 @@
-from .sequence_labeling import Sequence_labeling
+import argparse
+import os
+from vecto.embeddings import load_from_dir
+from vecto.utils.data import save_json, print_json
+from .sequence_labeling import Sequence_labeling
+
+
+def run(options, extra_args):  # `options` is accepted but not referenced in this body
+    """Parse ``extra_args``, run the sequence labeling benchmark, then save or print the results."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("embeddings")
+    parser.add_argument("dataset")
+    parser.add_argument("--window_size", default=5, type=int)
+    parser.add_argument("--method", default='lr', choices=['lr', '2FFNN'],
+                        help='name of method')
+    parser.add_argument('--normalize', dest='normalize', action='store_true')
+    parser.add_argument("--path_out", default=False, help="destination folder to save results")
+    args = parser.parse_args(extra_args)  # parse the list handed in by the caller, not sys.argv
+    embeddings = load_from_dir(args.embeddings)
+    # print("embeddings:", embeddings)
+    sequence_labeling = Sequence_labeling(normalize=args.normalize, method=args.method, window_size=args.window_size)
+    results = sequence_labeling.get_result(embeddings, args.dataset)
+    if args.path_out:  # --path_out defaults to False, so results are printed when it is omitted
+        if os.path.isdir(args.path_out) or args.path_out.endswith("/"):  # path_out is treated as a directory
+            dataset = os.path.basename(os.path.normpath(args.dataset))  # last component of the dataset path
+            name_file_out = os.path.join(args.path_out, dataset, "results.json")
+            save_json(results, name_file_out)
+        else:  # otherwise path_out is handed to save_json unchanged
+            save_json(results, args.path_out)
+    else:
+        print_json(results)
diff --git a/vecto/benchmarks/sequence_labeling/__main__.py b/vecto/benchmarks/sequence_labeling/__main__.py
@@ -1,45 +1,6 @@
-import argparse
-import json
-import logging
-import os
 
 from vecto.utils.data import save_json
 from vecto.benchmarks.sequence_labeling import Sequence_labeling
 from vecto.embeddings import load_from_dir
 
-logging.basicConfig(level=logging.DEBUG)
 
-
-def print_json(data):
-    print(json.dumps(data, ensure_ascii=False, indent=4, sort_keys=False))
-
-
-def main():
-    # config = load_config()
-    # print(config)
-    parser = argparse.ArgumentParser()
-    parser.add_argument("embeddings")
-    parser.add_argument("dataset")
-    parser.add_argument("--window_size", default=5, type=int)
-    parser.add_argument("--method", default='lr', choices=['lr', '2FFNN'],
-                        help='name of method')
-    parser.add_argument('--normalize', dest='normalize', action='store_true')
-    parser.add_argument("--path_out", default=False, help="destination folder to save results")
-    args = parser.parse_args()
-    embeddings = load_from_dir(args.embeddings)
-    # print("embeddings", embeddings)
-    sequence_labeling = Sequence_labeling(normalize=args.normalize, method=args.method, window_size=args.window_size)
-    results = sequence_labeling.get_result(embeddings, args.dataset)
-    if args.path_out:
-        if os.path.isdir(args.path_out) or args.path_out.endswith("/"):
-            dataset = os.path.basename(os.path.normpath(args.dataset))
-            name_file_out = os.path.join(args.path_out, dataset, "results.json")
-            save_json(results, name_file_out)
-        else:
-            save_json(results, args.path_out)
-    else:
-        print_json(results)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/vecto/benchmarks/visualize.py b/vecto/benchmarks/visualize.py
@@ -24,7 +24,6 @@ def df_from_file(path):
         logger.warning(f"default_measurement not specified in {path}")
     dframe["result"] = dframe["result." + default_measurement]
     # df["reciprocal_rank"] = 1 / (df["rank"] + 1)
-    print(dframe)
     return dframe