Skip to content

Commit

Permalink
new cli for sequence labelling
Browse files Browse the repository at this point in the history
  • Loading branch information
undertherain committed Jan 15, 2019
1 parent 21d0927 commit bc98647
Show file tree
Hide file tree
Showing 6 changed files with 59 additions and 48 deletions.
16 changes: 12 additions & 4 deletions tests/benchmarks/test_sequence_labeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from vecto.embeddings import load_from_dir
from tests.test_setup import run_module


path_sequence_labeling_dataset = path.join('.', 'tests', 'data', 'benchmarks', 'sequence_labeling')
path_sequence_labeling_dataset_ner = path.join('.', 'tests', 'data', 'benchmarks', 'sequence_labeling', 'ner')  # sequence labeling needs a sub-task to be specified (pos, chunk, or ner)
path_emb = path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')
Expand All @@ -28,25 +29,32 @@ def test_api(self):
def test_cli(self):
sio = io.StringIO()
with contextlib.redirect_stdout(sio):
run_module("vecto.benchmarks.sequence_labeling",
run_module("vecto",
"benchmark",
"sequence_labelling",
path_emb,
path_sequence_labeling_dataset_ner,
"--path_out", "/tmp/vecto/benchmarks/")

sio = io.StringIO()
with contextlib.redirect_stdout(sio):
run_module("vecto.benchmarks.sequence_labeling",
run_module("vecto",
"benchmark",
"sequence_labelling",
path_emb,
path_sequence_labeling_dataset_ner,
"--path_out", "/tmp/vecto/benchmarks/")

with self.assertRaises(FileNotFoundError):
sio = io.StringIO()
with contextlib.redirect_stdout(sio):
run_module("vecto.benchmarks.sequence_labeling",
run_module("vecto",
"benchmark",
"sequence_labelling",
path_emb + "NONEXISTING",
path_sequence_labeling_dataset_ner,
"--path_out", "/tmp/vecto/benchmarks/")
"--path_out",
"/tmp/vecto/benchmarks/")

from matplotlib import pyplot as plt
# Here the visualization is done only for the NER sub-task.
Expand Down
13 changes: 10 additions & 3 deletions tests/benchmarks/test_similarity.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from vecto.embeddings import load_from_dir
from tests.test_setup import run_module


path_similarity_dataset = path.join('.', 'tests', 'data', 'benchmarks', 'similarity')
path_emb = path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')

Expand All @@ -35,22 +36,28 @@ def test_api(self):
def test_cli(self):
sio = io.StringIO()
with contextlib.redirect_stdout(sio):
run_module("vecto.benchmarks.similarity",
run_module("vecto",
"benchmark",
"similarity",
path_emb,
path_similarity_dataset,
"--path_out", "/tmp/vecto/benchmarks/")

sio = io.StringIO()
with contextlib.redirect_stdout(sio):
run_module("vecto.benchmarks.similarity",
run_module("vecto",
"benchmark",
"similarity",
path_emb,
path_similarity_dataset,
"--path_out", "/tmp/vecto/benchmarks/tmp")

with self.assertRaises(FileNotFoundError):
sio = io.StringIO()
with contextlib.redirect_stdout(sio):
run_module("vecto.benchmarks.similarity",
run_module("vecto",
"benchmark",
"similarity",
path_emb + "NONEXISTING",
path_similarity_dataset,
"--path_out", "/tmp/vecto/benchmarks/")
Expand Down
6 changes: 6 additions & 0 deletions vecto/benchmarks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,14 @@ def _run(args=None):
print("running similarity")
from .similarity import run
run(options, unknownargs)
elif args.name == "sequence_labelling":
print("running sequence labelling")
from .sequence_labeling import run
run(options, unknownargs)
else:
print("unknown benchmark name")
list_benhcmarks()
exit(-1)
# check if all is specified - then run all
# if benchmark name matches - run corresponding module
# list all available benchmarks
32 changes: 31 additions & 1 deletion vecto/benchmarks/sequence_labeling/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,31 @@
from .sequence_labeling import Sequence_labeling
import argparse
import os
from vecto.embeddings import load_from_dir
from vecto.utils.data import save_json, print_json
from .sequence_labeling import Sequence_labeling


def run(options, extra_args):
    """Run the sequence labeling benchmark from command-line style arguments.

    Args:
        options: accepted for interface compatibility with the benchmark
            dispatcher; not used inside this function.
        extra_args: list of argument strings parsed with argparse. Expects
            the positional arguments ``embeddings`` and ``dataset`` and
            optionally ``--window_size``, ``--method``, ``--normalize`` and
            ``--path_out``.

    The results are printed as JSON when ``--path_out`` is not given.
    Otherwise they are saved: if ``--path_out`` is an existing directory or
    ends with "/", the file is ``<path_out>/<dataset name>/results.json``;
    in any other case ``--path_out`` itself is used as the output file name.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("embeddings")
    cli.add_argument("dataset")
    cli.add_argument("--window_size", default=5, type=int)
    cli.add_argument("--method",
                     default='lr',
                     choices=['lr', '2FFNN'],
                     help='name of method')
    cli.add_argument('--normalize', dest='normalize', action='store_true')
    cli.add_argument("--path_out",
                     default=False,
                     help="destination folder to save results")
    parsed = cli.parse_args(extra_args)

    vectors = load_from_dir(parsed.embeddings)
    benchmark = Sequence_labeling(normalize=parsed.normalize,
                                  method=parsed.method,
                                  window_size=parsed.window_size)
    results = benchmark.get_result(vectors, parsed.dataset)

    destination = parsed.path_out
    if not destination:
        # No output path requested: dump the results to stdout.
        print_json(results)
        return

    if os.path.isdir(destination) or destination.endswith("/"):
        # A folder was given: nest the results under the dataset's name.
        dataset_name = os.path.basename(os.path.normpath(parsed.dataset))
        destination = os.path.join(destination, dataset_name, "results.json")
    save_json(results, destination)
39 changes: 0 additions & 39 deletions vecto/benchmarks/sequence_labeling/__main__.py
Original file line number Diff line number Diff line change
@@ -1,45 +1,6 @@
import argparse
import json
import logging
import os

from vecto.utils.data import save_json
from vecto.benchmarks.sequence_labeling import Sequence_labeling
from vecto.embeddings import load_from_dir

logging.basicConfig(level=logging.DEBUG)


def print_json(data):
    """Print ``data`` to stdout as JSON.

    The output is indented by 4 spaces, non-ASCII characters are written
    as-is, and keys keep their original order (no sorting).
    """
    rendered = json.dumps(data, ensure_ascii=False, indent=4, sort_keys=False)
    print(rendered)


def main():
    """Command-line entry point for the sequence labeling benchmark.

    Parses the process arguments: positional ``embeddings`` and ``dataset``,
    plus optional ``--window_size``, ``--method``, ``--normalize`` and
    ``--path_out``. It then runs the benchmark and emits the results.

    The results are printed as JSON when ``--path_out`` is not given.
    Otherwise they are saved: if ``--path_out`` is an existing directory or
    ends with "/", the file is ``<path_out>/<dataset name>/results.json``;
    in any other case ``--path_out`` itself is used as the output file name.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("embeddings")
    cli.add_argument("dataset")
    cli.add_argument("--window_size", default=5, type=int)
    cli.add_argument("--method",
                     default='lr',
                     choices=['lr', '2FFNN'],
                     help='name of method')
    cli.add_argument('--normalize', dest='normalize', action='store_true')
    cli.add_argument("--path_out",
                     default=False,
                     help="destination folder to save results")
    parsed = cli.parse_args()

    vectors = load_from_dir(parsed.embeddings)
    benchmark = Sequence_labeling(normalize=parsed.normalize,
                                  method=parsed.method,
                                  window_size=parsed.window_size)
    results = benchmark.get_result(vectors, parsed.dataset)

    destination = parsed.path_out
    if not destination:
        # No output path requested: dump the results to stdout.
        print_json(results)
        return

    if os.path.isdir(destination) or destination.endswith("/"):
        # A folder was given: nest the results under the dataset's name.
        dataset_name = os.path.basename(os.path.normpath(parsed.dataset))
        destination = os.path.join(destination, dataset_name, "results.json")
    save_json(results, destination)


if __name__ == "__main__":
main()
1 change: 0 additions & 1 deletion vecto/benchmarks/visualize.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ def df_from_file(path):
logger.warning(f"default_measurement not specified in {path}")
dframe["result"] = dframe["result." + default_measurement]
# df["reciprocal_rank"] = 1 / (df["rank"] + 1)
print(dframe)
return dframe


Expand Down

0 comments on commit bc98647

Please sign in to comment.