Merge 20bb550 into fc7a512

vecto-ai · Jan 16, 2019 · 1a414f7 · 1a414f7
2 parents fc7a512 + 20bb550
commit 1a414f7
Show file tree

Hide file tree

Showing 26 changed files with 744 additions and 599 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,8 @@
+# project-specific
+
+_autosummary
+
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

diff --git a/tests/benchmarks/test_analogy.py b/tests/benchmarks/test_analogy.py
@@ -4,7 +4,7 @@
 import unittest
 import io
 from os import path
-from vecto.benchmarks.analogy import *
+from vecto.benchmarks.analogy import Analogy
 from vecto.benchmarks import visualize
 from vecto.embeddings import load_from_dir
 from ..test_setup import run_module
@@ -17,55 +17,59 @@ class Tests(unittest.TestCase):
 
     def test_api(self):
         embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header'))
-        analogy = LinearOffset()
+        analogy = Analogy(method="3CosAdd")
         result = analogy.get_result(embs, path_analogy_dataset)
         self.assertIsInstance(result[0], dict)
 
-        analogy = PairDistance()
+        analogy = Analogy(method="PairDistance")
         result = analogy.get_result(embs, path_analogy_dataset)
         self.assertIsInstance(result[0], dict)
 
-        analogy = ThreeCosMul()
+        analogy = Analogy(method="3CosMul")
         result = analogy.get_result(embs, path_analogy_dataset)
         self.assertIsInstance(result[0], dict)
 
-        analogy = ThreeCosMul2()
+        analogy = Analogy(method="3CosMul2")
         result = analogy.get_result(embs, path_analogy_dataset)
         self.assertIsInstance(result[0], dict)
 
-        analogy = ThreeCosAvg()
+        analogy = Analogy(method="3CosAvg")
         result = analogy.get_result(embs, path_analogy_dataset)
         self.assertIsInstance(result[0], dict)
 
-        # analogy = SimilarToAny()
-        # result = analogy.get_result(embs, path_analogy_dataset)
-        # print(result)
-        # analogy = SimilarToB()
-        # result = analogy.get_result(embs, path_analogy_dataset)
-        # print(result)
-        analogy = LRCos()
+        analogy = Analogy(method="SimilarToAny")
+        result = analogy.get_result(embs, path_analogy_dataset)
+        print(result)
+
+        analogy = Analogy(method="SimilarToB")
+        result = analogy.get_result(embs, path_analogy_dataset)
+        print(result)
+
+        analogy = Analogy(method="LRCos")
         result = analogy.get_result(embs, path_analogy_dataset)
         print(result)
 
     def test_cli(self):
         sio = io.StringIO()
         with contextlib.redirect_stdout(sio):
-            run_module("vecto.benchmarks.analogy",
+            run_module("vecto", "benchmark", "analogy",
                        "./tests/data/embeddings/text/plain_with_file_header/",
                        "./tests/data/benchmarks/analogy/",
-                       "--path_out", "/tmp/vecto/benchmarks/", "--method", "3CosAdd")
+                       "--path_out", "/tmp/vecto/benchmarks/",
+                       "--method", "3CosAdd")
 
         sio = io.StringIO()
         with contextlib.redirect_stdout(sio):
-            run_module("vecto.benchmarks.analogy",
+            run_module("vecto", "benchmark", "analogy",
                        "./tests/data/embeddings/text/plain_with_file_header/",
                        "./tests/data/benchmarks/analogy/",
-                       "--path_out", "/tmp/vecto/benchmarks/specific_filename.json",
+                       "--path_out",
+                       "/tmp/vecto/benchmarks/specific_filename.json",
                        "--method", "LRCos")
 
         sio = io.StringIO()
         with contextlib.redirect_stdout(sio):
-            run_module("vecto.benchmarks.analogy",
+            run_module("vecto", "benchmark", "analogy",
                        "./tests/data/embeddings/text/plain_with_file_header/",
                        "./tests/data/benchmarks/analogy/",
                        "--path_out", "/tmp/vecto/benchmarks/",
@@ -74,18 +78,19 @@ def test_cli(self):
         sio = io.StringIO()
         with self.assertRaises(RuntimeError):
             with contextlib.redirect_stdout(sio):
-                run_module("vecto.benchmarks.analogy",
+                run_module("vecto", "benchmark", "analogy",
                            "./tests/data/embeddings/text/plain_with_file_header/",
                            "./tests/data/benchmarks/analogy/",
                            "--method", "NONEXISTING")
 
         sio = io.StringIO()
         with contextlib.redirect_stdout(sio):
-            run_module("vecto.benchmarks.analogy",
+            run_module("vecto", "benchmark", "analogy",
                        "./tests/data/embeddings/text/plain_with_file_header/",
                        "./tests/data/benchmarks/analogy/",
                        "--method", "3CosAvg")
 
+        # TODO: either stop appending a timestamp to the output path or aggregate results from multiple runs
         from matplotlib import pyplot as plt
-        visualize.plot_accuracy("/tmp/vecto/benchmarks/analogy")
+        visualize.plot_accuracy("/tmp/vecto/benchmarks/analogical_reasoning")
         plt.savefig("/tmp/vecto/benchmarks/analogy.pdf", bbox_inches="tight")
diff --git a/tests/benchmarks/test_categorization.py b/tests/benchmarks/test_categorization.py
@@ -25,10 +25,13 @@ def test_categorization_method_works(self):
     def test_cli(self):
         sio = StringIO()
         with redirect_stdout(sio):
-            run_module('vecto.benchmarks.categorization',
+            run_module('vecto',
+                       'benchmark',
+                       'categorization',
                        './tests/data/embeddings/text/plain_with_file_header/',
                        './tests/data/benchmarks/categorization/',
-                       '--path_out', '/tmp/vecto/benchmarks', '--method', 'KMeansCategorization')
+                       '--path_out', '/tmp/vecto/benchmarks',
+                       '--method', 'KMeansCategorization')
 
     def test_categorization_scores(self):
         embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header'))

diff --git a/tests/benchmarks/test_sequence_labeling.py b/tests/benchmarks/test_sequence_labeling.py
@@ -9,6 +9,7 @@
 from vecto.embeddings import load_from_dir
 from tests.test_setup import run_module
 
+
 path_sequence_labeling_dataset = path.join('.', 'tests', 'data', 'benchmarks', 'sequence_labeling')
 path_sequence_labeling_dataset_ner = path.join('.', 'tests', 'data', 'benchmarks', 'sequence_labeling', 'ner') # sequence labeling need to specify a sub task (pos, chunk, or ner)
 path_emb = path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')
@@ -28,25 +29,32 @@ def test_api(self):
     def test_cli(self):
         sio = io.StringIO()
         with contextlib.redirect_stdout(sio):
-            run_module("vecto.benchmarks.sequence_labeling",
+            run_module("vecto",
+                       "benchmark",
+                       "sequence_labelling",
                        path_emb,
                        path_sequence_labeling_dataset_ner,
                        "--path_out", "/tmp/vecto/benchmarks/")
 
         sio = io.StringIO()
         with contextlib.redirect_stdout(sio):
-            run_module("vecto.benchmarks.sequence_labeling",
+            run_module("vecto",
+                       "benchmark",
+                       "sequence_labelling",
                        path_emb,
                        path_sequence_labeling_dataset_ner,
                        "--path_out", "/tmp/vecto/benchmarks/")
 
         with self.assertRaises(FileNotFoundError):
             sio = io.StringIO()
             with contextlib.redirect_stdout(sio):
-                run_module("vecto.benchmarks.sequence_labeling",
+                run_module("vecto",
+                           "benchmark",
+                           "sequence_labelling",
                            path_emb + "NONEXISTING",
                            path_sequence_labeling_dataset_ner,
-                           "--path_out", "/tmp/vecto/benchmarks/")
+                           "--path_out",
+                           "/tmp/vecto/benchmarks/")
 
         from matplotlib import pyplot as plt
         # here the visualization only for the ner sub task.

diff --git a/tests/benchmarks/test_similarity.py b/tests/benchmarks/test_similarity.py
@@ -9,6 +9,7 @@
 from vecto.embeddings import load_from_dir
 from tests.test_setup import run_module
 
+
 path_similarity_dataset = path.join('.', 'tests', 'data', 'benchmarks', 'similarity')
 path_emb = path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')
 
@@ -35,22 +36,28 @@ def test_api(self):
     def test_cli(self):
         sio = io.StringIO()
         with contextlib.redirect_stdout(sio):
-            run_module("vecto.benchmarks.similarity",
+            run_module("vecto",
+                       "benchmark",
+                       "similarity",
                        path_emb,
                        path_similarity_dataset,
                        "--path_out", "/tmp/vecto/benchmarks/")
 
         sio = io.StringIO()
         with contextlib.redirect_stdout(sio):
-            run_module("vecto.benchmarks.similarity",
+            run_module("vecto",
+                       "benchmark",
+                       "similarity",
                        path_emb,
                        path_similarity_dataset,
                        "--path_out", "/tmp/vecto/benchmarks/tmp")
 
         with self.assertRaises(FileNotFoundError):
             sio = io.StringIO()
             with contextlib.redirect_stdout(sio):
-                run_module("vecto.benchmarks.similarity",
+                run_module("vecto",
+                           "benchmark",
+                           "similarity",
                            path_emb + "NONEXISTING",
                            path_similarity_dataset,
                            "--path_out", "/tmp/vecto/benchmarks/")

diff --git a/tests/test_setup.py b/tests/test_setup.py
@@ -34,6 +34,7 @@ def run_pip(*args, **kwargs):
     run_program(pip_exec_name, *args, **kwargs)
 
 
+# TODO: move this to a helper module
 def run_module(name: str, *args, run_name: str = '__main__') -> None:
     backup_sys_argv = sys.argv
     sys.argv = [name + '.py'] + list(args)

diff --git a/vecto/__main__.py b/vecto/__main__.py
@@ -0,0 +1,4 @@
+from .cli import CLI
+
+if __name__ == "__main__":
+    CLI()
diff --git a/vecto/_version.py b/vecto/_version.py
@@ -1,3 +1,3 @@
 """Version of vecto package."""
 
-VERSION = "0.1.7"
+VERSION = "0.1.8"
diff --git a/vecto/benchmarks/__init__.py b/vecto/benchmarks/__init__.py
@@ -6,3 +6,52 @@
     analogy
 
 """
+
+import argparse
+import importlib
+
+
+def list_benhcmarks():
+    print("available benchmarks:")
+    # TODO: list benchmarks
+
+
+def _run(args=None):
+    parser = argparse.ArgumentParser(
+        description='run benchmarks',
+        add_help=True,
+        usage="vecto benchmark [name]")
+
+    parser.add_argument('name', help='Subcommand to run')
+    args, unknownargs = parser.parse_known_args(args)
+    if args.name == "help":
+        list_benhcmarks()
+        return
+    # if args.name == "all":
+        # print("running all benchmarks")
+
+    options = {}
+
+    if args.name == "analogy":
+        print("running analogy")
+        from .analogy import run
+        run(unknownargs)
+    elif args.name == "categorization":
+        print("running categorization")
+        from .categorization import run
+        run(options, unknownargs)
+    elif args.name == "similarity":
+        print("running similarity")
+        from .similarity import run
+        run(options, unknownargs)
+    elif args.name == "sequence_labelling":
+        print("running sequence labelling")
+        from .sequence_labeling import run
+        run(options, unknownargs)
+    else:
+        print("unknown benchmark name")
+        list_benhcmarks()
+        exit(-1)
+    # if "all" is specified, run all benchmarks
+    # if the benchmark name matches, run the corresponding module
+    # list all available benchmarks
diff --git a/vecto/benchmarks/analogy/__init__.py b/vecto/benchmarks/analogy/__init__.py
@@ -1 +1,41 @@
-from .analogy import ThreeCosAvg, ThreeCosMul, ThreeCosMul2, LinearOffset, LRCos, PairDistance
+import argparse
+import logging
+import os
+from .analogy import Analogy
+from vecto.utils.data import save_json, print_json
+# from vecto.config import load_config
+from vecto.embeddings import load_from_dir
+from vecto.utils import get_time_str
+
+logging.basicConfig(level=logging.DEBUG)
+
+
+def run(args):
+    # config = load_config()
+    # print(config)
+    print(args)
+    parser = argparse.ArgumentParser()
+    parser.add_argument("embeddings")
+    parser.add_argument("dataset")
+    parser.add_argument("--method", help="analogy solving method", default="LRCos")
+    parser.add_argument("--path_out", help="destination folder to save results")
+    args = parser.parse_args(args)
+    embeddings = load_from_dir(args.embeddings)
+    # print("embeddings", embeddings)
+    benchmark = Analogy(method=args.method)
+    results = benchmark.get_result(embeddings, args.dataset)
+    if args.path_out:
+        if os.path.isdir(args.path_out) or args.path_out.endswith("/"):
+            dataset = os.path.basename(os.path.normpath(args.dataset))
+            timestamp = get_time_str()
+            name_file_out = os.path.join(args.path_out,
+                                         "analogical_reasoning",
+                                         dataset,
+                                         args.method,
+                                         timestamp,
+                                         "results.json")
+            save_json(results, name_file_out)
+        else:
+            save_json(results, args.path_out)
+    else:
+        print_json(results)
diff --git a/vecto/benchmarks/analogy/__main__.py b/vecto/benchmarks/analogy/__main__.py