Skip to content

Commit

Permalink
Merge 20bb550 into fc7a512
Browse files Browse the repository at this point in the history
  • Loading branch information
undertherain committed Jan 16, 2019
2 parents fc7a512 + 20bb550 commit 1a414f7
Show file tree
Hide file tree
Showing 26 changed files with 744 additions and 599 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# project-specific

_autosummary


# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
47 changes: 26 additions & 21 deletions tests/benchmarks/test_analogy.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import unittest
import io
from os import path
from vecto.benchmarks.analogy import *
from vecto.benchmarks.analogy import Analogy
from vecto.benchmarks import visualize
from vecto.embeddings import load_from_dir
from ..test_setup import run_module
Expand All @@ -17,55 +17,59 @@ class Tests(unittest.TestCase):

def test_api(self):
embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header'))
analogy = LinearOffset()
analogy = Analogy(method="3CosAdd")
result = analogy.get_result(embs, path_analogy_dataset)
self.assertIsInstance(result[0], dict)

analogy = PairDistance()
analogy = Analogy(method="PairDistance")
result = analogy.get_result(embs, path_analogy_dataset)
self.assertIsInstance(result[0], dict)

analogy = ThreeCosMul()
analogy = Analogy(method="3CosMul")
result = analogy.get_result(embs, path_analogy_dataset)
self.assertIsInstance(result[0], dict)

analogy = ThreeCosMul2()
analogy = Analogy(method="3CosMul2")
result = analogy.get_result(embs, path_analogy_dataset)
self.assertIsInstance(result[0], dict)

analogy = ThreeCosAvg()
analogy = Analogy(method="3CosAvg")
result = analogy.get_result(embs, path_analogy_dataset)
self.assertIsInstance(result[0], dict)

# analogy = SimilarToAny()
# result = analogy.get_result(embs, path_analogy_dataset)
# print(result)
# analogy = SimilarToB()
# result = analogy.get_result(embs, path_analogy_dataset)
# print(result)
analogy = LRCos()
analogy = Analogy(method="SimilarToAny")
result = analogy.get_result(embs, path_analogy_dataset)
print(result)

analogy = Analogy(method="SimilarToB")
result = analogy.get_result(embs, path_analogy_dataset)
print(result)

analogy = Analogy(method="LRCos")
result = analogy.get_result(embs, path_analogy_dataset)
print(result)

def test_cli(self):
sio = io.StringIO()
with contextlib.redirect_stdout(sio):
run_module("vecto.benchmarks.analogy",
run_module("vecto", "benchmark", "analogy",
"./tests/data/embeddings/text/plain_with_file_header/",
"./tests/data/benchmarks/analogy/",
"--path_out", "/tmp/vecto/benchmarks/", "--method", "3CosAdd")
"--path_out", "/tmp/vecto/benchmarks/",
"--method", "3CosAdd")

sio = io.StringIO()
with contextlib.redirect_stdout(sio):
run_module("vecto.benchmarks.analogy",
run_module("vecto", "benchmark", "analogy",
"./tests/data/embeddings/text/plain_with_file_header/",
"./tests/data/benchmarks/analogy/",
"--path_out", "/tmp/vecto/benchmarks/specific_filename.json",
"--path_out",
"/tmp/vecto/benchmarks/specific_filename.json",
"--method", "LRCos")

sio = io.StringIO()
with contextlib.redirect_stdout(sio):
run_module("vecto.benchmarks.analogy",
run_module("vecto", "benchmark", "analogy",
"./tests/data/embeddings/text/plain_with_file_header/",
"./tests/data/benchmarks/analogy/",
"--path_out", "/tmp/vecto/benchmarks/",
Expand All @@ -74,18 +78,19 @@ def test_cli(self):
sio = io.StringIO()
with self.assertRaises(RuntimeError):
with contextlib.redirect_stdout(sio):
run_module("vecto.benchmarks.analogy",
run_module("vecto", "benchmark", "analogy",
"./tests/data/embeddings/text/plain_with_file_header/",
"./tests/data/benchmarks/analogy/",
"--method", "NONEXISTING")

sio = io.StringIO()
with contextlib.redirect_stdout(sio):
run_module("vecto.benchmarks.analogy",
run_module("vecto", "benchmark", "analogy",
"./tests/data/embeddings/text/plain_with_file_header/",
"./tests/data/benchmarks/analogy/",
"--method", "3CosAvg")

# TODO: suppress concatenating timestamp or aggregate multiple runs
from matplotlib import pyplot as plt
visualize.plot_accuracy("/tmp/vecto/benchmarks/analogy")
visualize.plot_accuracy("/tmp/vecto/benchmarks/analogical_reasoning")
plt.savefig("/tmp/vecto/benchmarks/analogy.pdf", bbox_inches="tight")
7 changes: 5 additions & 2 deletions tests/benchmarks/test_categorization.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,13 @@ def test_categorization_method_works(self):
def test_cli(self):
sio = StringIO()
with redirect_stdout(sio):
run_module('vecto.benchmarks.categorization',
run_module('vecto',
'benchmark',
'categorization',
'./tests/data/embeddings/text/plain_with_file_header/',
'./tests/data/benchmarks/categorization/',
'--path_out', '/tmp/vecto/benchmarks', '--method', 'KMeansCategorization')
'--path_out', '/tmp/vecto/benchmarks',
'--method', 'KMeansCategorization')

def test_categorization_scores(self):
embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header'))
Expand Down
16 changes: 12 additions & 4 deletions tests/benchmarks/test_sequence_labeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from vecto.embeddings import load_from_dir
from tests.test_setup import run_module


path_sequence_labeling_dataset = path.join('.', 'tests', 'data', 'benchmarks', 'sequence_labeling')
path_sequence_labeling_dataset_ner = path.join('.', 'tests', 'data', 'benchmarks', 'sequence_labeling', 'ner') # sequence labeling need to specify a sub task (pos, chunk, or ner)
path_emb = path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')
Expand All @@ -28,25 +29,32 @@ def test_api(self):
def test_cli(self):
sio = io.StringIO()
with contextlib.redirect_stdout(sio):
run_module("vecto.benchmarks.sequence_labeling",
run_module("vecto",
"benchmark",
"sequence_labelling",
path_emb,
path_sequence_labeling_dataset_ner,
"--path_out", "/tmp/vecto/benchmarks/")

sio = io.StringIO()
with contextlib.redirect_stdout(sio):
run_module("vecto.benchmarks.sequence_labeling",
run_module("vecto",
"benchmark",
"sequence_labelling",
path_emb,
path_sequence_labeling_dataset_ner,
"--path_out", "/tmp/vecto/benchmarks/")

with self.assertRaises(FileNotFoundError):
sio = io.StringIO()
with contextlib.redirect_stdout(sio):
run_module("vecto.benchmarks.sequence_labeling",
run_module("vecto",
"benchmark",
"sequence_labelling",
path_emb + "NONEXISTING",
path_sequence_labeling_dataset_ner,
"--path_out", "/tmp/vecto/benchmarks/")
"--path_out",
"/tmp/vecto/benchmarks/")

from matplotlib import pyplot as plt
# here the visualization only for the ner sub task.
Expand Down
13 changes: 10 additions & 3 deletions tests/benchmarks/test_similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from vecto.embeddings import load_from_dir
from tests.test_setup import run_module


path_similarity_dataset = path.join('.', 'tests', 'data', 'benchmarks', 'similarity')
path_emb = path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')

Expand All @@ -35,22 +36,28 @@ def test_api(self):
def test_cli(self):
sio = io.StringIO()
with contextlib.redirect_stdout(sio):
run_module("vecto.benchmarks.similarity",
run_module("vecto",
"benchmark",
"similarity",
path_emb,
path_similarity_dataset,
"--path_out", "/tmp/vecto/benchmarks/")

sio = io.StringIO()
with contextlib.redirect_stdout(sio):
run_module("vecto.benchmarks.similarity",
run_module("vecto",
"benchmark",
"similarity",
path_emb,
path_similarity_dataset,
"--path_out", "/tmp/vecto/benchmarks/tmp")

with self.assertRaises(FileNotFoundError):
sio = io.StringIO()
with contextlib.redirect_stdout(sio):
run_module("vecto.benchmarks.similarity",
run_module("vecto",
"benchmark",
"similarity",
path_emb + "NONEXISTING",
path_similarity_dataset,
"--path_out", "/tmp/vecto/benchmarks/")
Expand Down
1 change: 1 addition & 0 deletions tests/test_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def run_pip(*args, **kwargs):
run_program(pip_exec_name, *args, **kwargs)


# TODO: move this to helper module
def run_module(name: str, *args, run_name: str = '__main__') -> None:
backup_sys_argv = sys.argv
sys.argv = [name + '.py'] + list(args)
Expand Down
4 changes: 4 additions & 0 deletions vecto/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from .cli import CLI

# Module entry point (this file is vecto/__main__.py): when executed as a
# script, start the command-line interface imported from .cli.
if __name__ == "__main__":
    CLI()
2 changes: 1 addition & 1 deletion vecto/_version.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""Version of vecto package."""

VERSION = "0.1.7"
VERSION = "0.1.8"
49 changes: 49 additions & 0 deletions vecto/benchmarks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,52 @@
analogy
"""

import argparse
import importlib


# Registry of the benchmarks runnable through ``vecto benchmark <name>``.
# Each command-line name maps to a tuple of:
#   (submodule that provides ``run``,
#    label used in the progress message,
#    whether ``run`` expects a leading ``options`` dict).
_BENCHMARKS = {
    "analogy": ("analogy", "analogy", False),
    "categorization": ("categorization", "categorization", True),
    "similarity": ("similarity", "similarity", True),
    "sequence_labelling": ("sequence_labeling", "sequence labelling", True),
}


def list_benchmarks():
    """Print the benchmark names accepted by ``vecto benchmark``."""
    print("available benchmarks:")
    for name in _BENCHMARKS:
        print("  " + name)


# The helper was first published under a misspelled name; keep that name
# bound so existing callers and imports continue to work.
list_benhcmarks = list_benchmarks


def _run(args=None):
    """Dispatch ``vecto benchmark <name> ...`` to the matching benchmark.

    Args:
        args: Argument tokens that follow ``benchmark`` on the command line.
            ``None`` lets argparse fall back to ``sys.argv[1:]``.  The first
            positional token selects the benchmark (or ``help``); all other
            tokens are forwarded untouched to the benchmark's ``run``.

    Raises:
        SystemExit: With code ``-1`` when the benchmark name is not
            registered; argparse raises it as well when no name is given.
    """
    # ``exit`` is injected by the ``site`` module for interactive use and is
    # not guaranteed to exist; ``sys.exit`` is the supported equivalent.
    import sys

    parser = argparse.ArgumentParser(
        description='run benchmarks',
        add_help=True,
        usage="vecto benchmark [name]")
    parser.add_argument('name', help='Subcommand to run')
    # Only the benchmark name is consumed here; everything else belongs to
    # the selected benchmark's own argument parser.
    parsed, unknownargs = parser.parse_known_args(args)

    if parsed.name == "help":
        list_benchmarks()
        return
    # TODO: support "all" to run every registered benchmark.

    if parsed.name not in _BENCHMARKS:
        print("unknown benchmark name")
        list_benchmarks()
        sys.exit(-1)

    module_name, label, takes_options = _BENCHMARKS[parsed.name]
    print("running " + label)
    # Import lazily so only the selected benchmark's dependencies get loaded.
    benchmark = importlib.import_module("." + module_name, __package__)
    if takes_options:
        benchmark.run({}, unknownargs)
    else:
        benchmark.run(unknownargs)
42 changes: 41 additions & 1 deletion vecto/benchmarks/analogy/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,41 @@
from .analogy import ThreeCosAvg, ThreeCosMul, ThreeCosMul2, LinearOffset, LRCos, PairDistance
import argparse
import logging
import os
from .analogy import Analogy
from vecto.utils.data import save_json, print_json
# from vecto.config import load_config
from vecto.embeddings import load_from_dir
from vecto.utils import get_time_str

logging.basicConfig(level=logging.DEBUG)


def run(args):
    """Run the analogy benchmark from command-line style arguments.

    Args:
        args: List of argument tokens: the embeddings directory, the dataset
            path, and optionally ``--method`` (default ``LRCos``) and
            ``--path_out``.

    Without ``--path_out`` the results are handed to ``print_json``.  When
    ``--path_out`` is an existing directory, or is written with a trailing
    path separator, the results are saved to
    ``<path_out>/analogical_reasoning/<dataset>/<method>/<timestamp>/results.json``.
    Any other ``--path_out`` value is used as the output file path itself.
    """
    # Echo the raw arguments through logging rather than print() so that
    # stdout carries nothing but the benchmark output.
    logging.getLogger(__name__).debug("analogy benchmark arguments: %s", args)

    parser = argparse.ArgumentParser()
    parser.add_argument("embeddings")
    parser.add_argument("dataset")
    parser.add_argument("--method", help="analogy solving method", default="LRCos")
    parser.add_argument("--path_out", help="destination folder to save results")
    parsed = parser.parse_args(args)

    embeddings = load_from_dir(parsed.embeddings)
    benchmark = Analogy(method=parsed.method)
    results = benchmark.get_result(embeddings, parsed.dataset)

    if not parsed.path_out:
        print_json(results)
        return

    # A trailing separator marks a directory even if it does not exist yet;
    # accept the platform separator as well as "/".
    is_directory = (os.path.isdir(parsed.path_out)
                    or parsed.path_out.endswith(("/", os.sep)))
    if not is_directory:
        save_json(results, parsed.path_out)
        return

    # normpath drops a trailing separator so basename yields the dataset
    # folder name rather than an empty string.
    dataset = os.path.basename(os.path.normpath(parsed.dataset))
    timestamp = get_time_str()
    name_file_out = os.path.join(parsed.path_out,
                                 "analogical_reasoning",
                                 dataset,
                                 parsed.method,
                                 timestamp,
                                 "results.json")
    save_json(results, name_file_out)
64 changes: 0 additions & 64 deletions vecto/benchmarks/analogy/__main__.py

This file was deleted.

Loading

0 comments on commit 1a414f7

Please sign in to comment.