
Commit

Merge 3ea5bed into 373c017
undertherain committed Jan 23, 2019
2 parents 373c017 + 3ea5bed commit 1731a22
Showing 29 changed files with 488 additions and 386 deletions.
8 changes: 8 additions & 0 deletions tests/benchmarks/test_categorization.py
@@ -32,6 +32,14 @@ def test_cli(self):
'./tests/data/benchmarks/categorization/',
'--path_out', '/tmp/vecto/benchmarks',
'--method', 'KMeansCategorization')
# with redirect_stdout(sio):
# run_module('vecto',
# 'benchmark',
# 'categorization',
# './tests/data/embeddings/text/plain_with_file_header/',
# './tests/data/benchmarks/categorization/',
# '--path_out', '/tmp/vecto/benchmarks',
# '--method', 'SpectralCategorization')

def test_categorization_scores(self):
embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header'))
20 changes: 14 additions & 6 deletions tests/benchmarks/test_language_modeling.py
@@ -39,28 +39,36 @@ def test_api(self):
def test_cli(self):
sio = io.StringIO()
with contextlib.redirect_stdout(sio):
run_module("vecto.benchmarks.language_modeling",
run_module("vecto",
"benchmark",
"language_modeling",
path_emb,
"--window_size", "5",
"--path_out", "/tmp/vecto/benchmarks/")

sio = io.StringIO()
with contextlib.redirect_stdout(sio):
run_module("vecto.benchmarks.language_modeling",
run_module("vecto",
"benchmark",
"language_modeling",
path_emb,
"--method", "lr",
"--path_out", "/tmp/vecto/benchmarks/tmp")

with self.assertRaises(FileNotFoundError):
sio = io.StringIO()
with contextlib.redirect_stdout(sio):
run_module("vecto.benchmarks.language_modeling",
run_module("vecto",
"benchmark",
"language_modeling",
path_emb + "NONEXISTING",
"--path_out", "/tmp/vecto/benchmarks/")

from matplotlib import pyplot as plt
visualize.plot_accuracy("/tmp/vecto/benchmarks/language_modeling", key_secondary="experiment_setup.dataset")
plt.savefig("/tmp/vecto/benchmarks/language_modeling.pdf", bbox_inches="tight")
visualize.plot_accuracy("/tmp/vecto/benchmarks/language_modeling",
key_secondary="experiment_setup.dataset")
plt.savefig("/tmp/vecto/benchmarks/language_modeling.pdf",
bbox_inches="tight")


Tests().test_cli()
# Tests().test_cli()
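
The tests above exercise the unified "vecto benchmark language_modeling ..." entry point through the run_module helper imported from tests.test_setup. A minimal sketch of what such a helper might look like, assuming it simply runs a module as __main__ with a patched argv (an illustration, not the project's actual implementation):

# Hypothetical run_module-style helper: executes a module or package as
# __main__ with a patched sys.argv, so CLI behaviour can be tested in-process.
import runpy
import sys


def run_module(name, *args):
    saved_argv = sys.argv
    try:
        sys.argv = [name] + list(args)  # argparse inside the module sees these
        runpy.run_module(name, run_name="__main__", alter_sys=True)
    finally:
        sys.argv = saved_argv
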
18 changes: 9 additions & 9 deletions tests/benchmarks/test_misc.py
@@ -14,7 +14,7 @@
# from vecto.benchmarks.similarity import visualize as similarity_visualize
from vecto.benchmarks.text_classification import Text_classification
from vecto.embeddings import load_from_dir
from vecto.utils.fetch_benchmarks import fetch_benchmarks
# from vecto.utils.fetch_benchmarks import fetch_benchmarks
from os import path

# from shutil import rmtree
@@ -26,14 +26,14 @@

class Tests(unittest.TestCase):

def test_fetcher(self):
if path.isdir(path.join('.', 'tests', 'data', 'benchmarks_test')):
return
fetch_benchmarks(path.join('.', 'tests', 'data', 'benchmarks_test'))
embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header'))
similarity = Similarity()
path_similarity_dataset = path.join('.', 'tests', 'data', 'benchmarks_test', 'benchmarks', 'similarity', 'en')
similarity.get_result(embs, path_similarity_dataset)
# def test_fetcher(self):
# if path.isdir(path.join('.', 'tests', 'data', 'benchmarks_test')):
# return
# fetch_benchmarks(path.join('.', 'tests', 'data', 'benchmarks_test'))
# embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header'))
# similarity = Similarity()
# path_similarity_dataset = path.join('.', 'tests', 'data', 'benchmarks_test', 'benchmarks', 'similarity', 'en')
# similarity.get_result(embs, path_similarity_dataset)

def test_abc(self):
with self.assertRaises(NotImplementedError):
50 changes: 50 additions & 0 deletions tests/benchmarks/test_relation_extraction.py
@@ -0,0 +1,50 @@
"""Tests for analogy benchmark."""

import contextlib
import unittest
import io
from os import path
from vecto.benchmarks import visualize
from vecto.embeddings import load_from_dir
from tests.test_setup import run_module


path_emb = path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')
path_dataset = path.join('tests', 'data', 'benchmarks', 'relation_extraction')


class Tests(unittest.TestCase):
# def test_api(self):
# embs = load_from_dir(path_emb)

# for method in ['lr', '2FFNN']:
# sequence_labeling = Sequence_labeling(method=method)
# for subtask in ['chunk', 'pos', 'ner']: # , 'chunk', 'pos', 'ner'
# result = sequence_labeling.get_result(embs, path.join(path_sequence_labeling_dataset, subtask))
# self.assertIsInstance(result[0], dict)
# print(result)

def test_cli(self):
sio = io.StringIO()
with contextlib.redirect_stdout(sio):
run_module("vecto",
"benchmark",
"relation_extraction",
path_emb,
path_dataset,
"--path_out", "/tmp/vecto/benchmarks/")

with self.assertRaises(FileNotFoundError):
sio = io.StringIO()
with contextlib.redirect_stdout(sio):
run_module("vecto",
"benchmark",
"relation_extraction",
path_emb + "NONEXISTING",
path_dataset,
"--path_out",
"/tmp/vecto/benchmarks/")

from matplotlib import pyplot as plt
visualize.plot_accuracy("/tmp/vecto/benchmarks/relation_extraction", key_secondary="experiment_setup.dataset")
plt.savefig("/tmp/vecto/benchmarks/relation_extraction.pdf", bbox_inches="tight")
6 changes: 3 additions & 3 deletions tests/benchmarks/test_sequence_labeling.py
@@ -31,7 +31,7 @@ def test_cli(self):
with contextlib.redirect_stdout(sio):
run_module("vecto",
"benchmark",
"sequence_labelling",
"sequence_labeling",
path_emb,
path_sequence_labeling_dataset_ner,
"--path_out", "/tmp/vecto/benchmarks/")
@@ -40,7 +40,7 @@ def test_cli(self):
with contextlib.redirect_stdout(sio):
run_module("vecto",
"benchmark",
"sequence_labelling",
"sequence_labeling",
path_emb,
path_sequence_labeling_dataset_ner,
"--path_out", "/tmp/vecto/benchmarks/")
@@ -50,7 +50,7 @@ def test_cli(self):
with contextlib.redirect_stdout(sio):
run_module("vecto",
"benchmark",
"sequence_labelling",
"sequence_labeling",
path_emb + "NONEXISTING",
path_sequence_labeling_dataset_ner,
"--path_out",
14 changes: 14 additions & 0 deletions tests/test_cli_misc.py
@@ -0,0 +1,14 @@
import unittest
from io import StringIO
from contextlib import redirect_stdout
from .test_setup import run_module


class Tests(unittest.TestCase):

def test_cli(self):
with self.assertRaises(SystemExit):
sio = StringIO()
with redirect_stdout(sio):
run_module('vecto',
'WRONG_COMMAND')
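
The SystemExit expected here comes from argparse rejecting an unknown subcommand. A hypothetical top-level dispatcher of roughly this shape would behave that way (a sketch under that assumption, not necessarily the actual vecto entry point):

# Hypothetical sketch of a top-level vecto dispatcher. argparse calls
# sys.exit() when the positional argument is not one of the allowed choices,
# which is the SystemExit the test above expects.
import argparse


def main(args=None):
    parser = argparse.ArgumentParser(prog="vecto")
    parser.add_argument("command", choices=["benchmark"], help="subcommand to run")
    parsed, rest = parser.parse_known_args(args)
    if parsed.command == "benchmark":
        from vecto.benchmarks import _run
        _run(rest)
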
2 changes: 1 addition & 1 deletion vecto/_version.py
@@ -1,3 +1,3 @@
"""Version of vecto package."""

VERSION = "0.1.8"
VERSION = "0.1.9"
57 changes: 29 additions & 28 deletions vecto/benchmarks/__init__.py
@@ -4,19 +4,38 @@
:toctree: _autosummary
analogy
categorization
language_modeling
outliers
relation_extraction
sequence_labeling
similarity
synonymy_detection
text_classification
"""

import argparse
import importlib


def list_benhcmarks():
def list_benhcmarks(benchmarks):
print("available benchmarks:")
# TODO: list benchmarks
for i in benchmarks:
print(i)


def _run(args=None):
# TODO: load them from modules themselves
available_benchmarks = []
available_benchmarks.append("analogy")
available_benchmarks.append("categorization")
available_benchmarks.append("language_modeling")
available_benchmarks.append("relation_extraction")
available_benchmarks.append("similarity")
available_benchmarks.append("sequence_labeling")
available_benchmarks.append("text_classification")

parser = argparse.ArgumentParser(
description='run benchmarks',
add_help=True,
@@ -25,37 +44,19 @@ def _run(args=None):
parser.add_argument('name', help='Subcommand to run')
args, unknownargs = parser.parse_known_args(args)
if args.name == "help":
list_benhcmarks()
list_benhcmarks(available_benchmarks)
return

# TODO: implement running set of benchmarks defined in config
# if args.name == "all":
# print("running all benchmarks")

options = {}

if args.name == "analogy":
print("running analogy")
from .analogy import run
if args.name in available_benchmarks:
print("running ", args.name)
mod = importlib.import_module("vecto.benchmarks." + args.name)
run = getattr(mod, 'run')
run(unknownargs)
elif args.name == "categorization":
print("running categorization")
from .categorization import run
run(options, unknownargs)
elif args.name == "similarity":
print("running similarity")
from .similarity import run
run(options, unknownargs)
elif args.name == "sequence_labelling":
print("running sequence labelling")
from .sequence_labeling import run
run(options, unknownargs)
elif args.name == "text_classification":
print("running sequence labelling")
from .text_classification import run
run(options, unknownargs)
else:
print("unknown benchmark name", args.name)
list_benhcmarks()
list_benhcmarks(available_benchmarks)
exit(-1)
# check if all is specified - then run all
# if benchmark name matches - run corresponding module
# list all available benchmarks
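
The refactored _run above replaces the hard-coded if/elif chain with importlib-based dispatch, which assumes every vecto.benchmarks.<name> package exposes a module-level run(args) that parses the remaining command-line arguments itself. A minimal sketch of a submodule satisfying that contract, using a hypothetical benchmark for illustration:

# Hypothetical vecto/benchmarks/<new_benchmark>/__init__.py honoring the
# contract the dispatcher relies on: expose run(args) and parse the leftover
# CLI arguments locally.
import argparse


def run(args):
    parser = argparse.ArgumentParser()
    parser.add_argument("embeddings")
    parser.add_argument("dataset")
    parser.add_argument("--path_out", help="destination folder to save results")
    parsed = parser.parse_args(args)
    print("running benchmark on", parsed.embeddings, "against", parsed.dataset)
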
10 changes: 9 additions & 1 deletion vecto/benchmarks/analogy/__init__.py
@@ -1,3 +1,11 @@
"""Benchmark on word analogy
.. autosummary::
:toctree: _autosummary
analogy
"""

import argparse
import logging
from .analogy import Analogy
@@ -18,4 +26,4 @@ def run(args):
parser.add_argument("--path_out", help="destination folder to save results")
args = parser.parse_args(args)
benchmark = Analogy(method=args.method)
benchmark.run_with_args(args)
benchmark.run_with_args(args)
12 changes: 4 additions & 8 deletions vecto/benchmarks/analogy/analogy.py
@@ -1,15 +1,17 @@
import datetime
import os
import uuid
import numpy as np
import logging
import progressbar
# from tqdm import tqdm
import sklearn
from vecto.data import Dataset
from ..base import Benchmark
from .io import get_pairs
from .solvers import *
from .solvers import LinearOffset, LRCos, PairDistance
from .solvers import ThreeCosAvg, ThreeCosMul, ThreeCosMul2
from .solvers import SimilarToAny, SimilarToB


logger = logging.getLogger(__name__)

@@ -64,7 +66,6 @@ def __init__(self,

self.stats = {}


# this are some hard-coded bits which will be implemented later
self.result_miss = {
"rank": -1,
@@ -102,9 +103,7 @@ def __init__(self,
# distances[i] = scores[ids_max[i + 1]]
# return distances.mean()


def run_category(self, pairs):

details = []
kfold = sklearn.model_selection.KFold(n_splits=len(pairs) // self.size_cv_test)
cnt_splits = kfold.get_n_splits(pairs)
@@ -231,6 +230,3 @@ def get_result(self, embeddings, path_dataset):  # , group_subcategory=False
embeddings.normalize()
results = self.run(embeddings, path_dataset) #group_subcategory
return results



6 changes: 3 additions & 3 deletions vecto/benchmarks/categorization/__init__.py
@@ -7,16 +7,16 @@

def select_method(key):
options = {}
if key == 'SpectralCategorization':
method = SpectralCategorization(options)
# if key == 'SpectralCategorization':
# method = SpectralCategorization(options)
if key == 'KMeansCategorization':
method = KMeansCategorization(options)
else:
raise RuntimeError('The method name was not recognized.')
return method


def run(options, extra_args):
def run(extra_args):
parser = argparse.ArgumentParser()
parser.add_argument('embeddings')
parser.add_argument('dataset')
