
Commit

Merge 3ea5bed into 373c017
undertherain committed Jan 23, 2019
2 parents 373c017 + 3ea5bed commit 1731a22
Showing 29 changed files with 488 additions and 386 deletions.
8 changes: 8 additions & 0 deletions tests/benchmarks/test_categorization.py
@@ -32,6 +32,14 @@ def test_cli(self):
'./tests/data/benchmarks/categorization/',
'--path_out', '/tmp/vecto/benchmarks',
'--method', 'KMeansCategorization')
# with redirect_stdout(sio):
# run_module('vecto',
# 'benchmark',
# 'categorization',
# './tests/data/embeddings/text/plain_with_file_header/',
# './tests/data/benchmarks/categorization/',
# '--path_out', '/tmp/vecto/benchmarks',
# '--method', 'SpectralCategorization')

def test_categorization_scores(self):
embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header'))
20 changes: 14 additions & 6 deletions tests/benchmarks/test_language_modeling.py
@@ -39,28 +39,36 @@ def test_api(self):
def test_cli(self):
sio = io.StringIO()
with contextlib.redirect_stdout(sio):
run_module("vecto.benchmarks.language_modeling",
run_module("vecto",
"benchmark",
"language_modeling",
path_emb,
"--window_size", "5",
"--path_out", "/tmp/vecto/benchmarks/")

sio = io.StringIO()
with contextlib.redirect_stdout(sio):
run_module("vecto.benchmarks.language_modeling",
run_module("vecto",
"benchmark",
"language_modeling",
path_emb,
"--method", "lr",
"--path_out", "/tmp/vecto/benchmarks/tmp")

with self.assertRaises(FileNotFoundError):
sio = io.StringIO()
with contextlib.redirect_stdout(sio):
run_module("vecto.benchmarks.language_modeling",
run_module("vecto",
"benchmark",
"language_modeling",
path_emb + "NONEXISTING",
"--path_out", "/tmp/vecto/benchmarks/")

from matplotlib import pyplot as plt
visualize.plot_accuracy("/tmp/vecto/benchmarks/language_modeling", key_secondary="experiment_setup.dataset")
plt.savefig("/tmp/vecto/benchmarks/language_modeling.pdf", bbox_inches="tight")
visualize.plot_accuracy("/tmp/vecto/benchmarks/language_modeling",
key_secondary="experiment_setup.dataset")
plt.savefig("/tmp/vecto/benchmarks/language_modeling.pdf",
bbox_inches="tight")


Tests().test_cli()
# Tests().test_cli()
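
The tests above exercise the unified "vecto benchmark language_modeling ..." entry point through the run_module helper imported from tests.test_setup. A minimal sketch of what such a helper might look like, assuming it simply runs a module as __main__ with a patched argv (an illustration, not the project's actual implementation):

# Hypothetical run_module-style helper: executes a module or package as
# __main__ with a patched sys.argv, so CLI behaviour can be tested in-process.
import runpy
import sys


def run_module(name, *args):
    saved_argv = sys.argv
    try:
        sys.argv = [name] + list(args)  # argparse inside the module sees these
        runpy.run_module(name, run_name="__main__", alter_sys=True)
    finally:
        sys.argv = saved_argv
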
18 changes: 9 additions & 9 deletions tests/benchmarks/test_misc.py
@@ -14,7 +14,7 @@
# from vecto.benchmarks.similarity import visualize as similarity_visualize
from vecto.benchmarks.text_classification import Text_classification
from vecto.embeddings import load_from_dir
from vecto.utils.fetch_benchmarks import fetch_benchmarks
# from vecto.utils.fetch_benchmarks import fetch_benchmarks
from os import path

# from shutil import rmtree
@@ -26,14 +26,14 @@

class Tests(unittest.TestCase):

def test_fetcher(self):
if path.isdir(path.join('.', 'tests', 'data', 'benchmarks_test')):
return
fetch_benchmarks(path.join('.', 'tests', 'data', 'benchmarks_test'))
embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header'))
similarity = Similarity()
path_similarity_dataset = path.join('.', 'tests', 'data', 'benchmarks_test', 'benchmarks', 'similarity', 'en')
similarity.get_result(embs, path_similarity_dataset)
# def test_fetcher(self):
# if path.isdir(path.join('.', 'tests', 'data', 'benchmarks_test')):
# return
# fetch_benchmarks(path.join('.', 'tests', 'data', 'benchmarks_test'))
# embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header'))
# similarity = Similarity()
# path_similarity_dataset = path.join('.', 'tests', 'data', 'benchmarks_test', 'benchmarks', 'similarity', 'en')
# similarity.get_result(embs, path_similarity_dataset)

def test_abc(self):
with self.assertRaises(NotImplementedError):
50 changes: 50 additions & 0 deletions tests/benchmarks/test_relation_extraction.py
@@ -0,0 +1,50 @@
"""Tests for analogy benchmark."""

import contextlib
import unittest
import io
from os import path
from vecto.benchmarks import visualize
from vecto.embeddings import load_from_dir
from tests.test_setup import run_module


path_emb = path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')
path_dataset = path.join('tests', 'data', 'benchmarks', 'relation_extraction')


class Tests(unittest.TestCase):
# def test_api(self):
# embs = load_from_dir(path_emb)

# for method in ['lr', '2FFNN']:
# sequence_labeling = Sequence_labeling(method=method)
# for subtask in ['chunk', 'pos', 'ner']: # , 'chunk', 'pos', 'ner'
# result = sequence_labeling.get_result(embs, path.join(path_sequence_labeling_dataset, subtask))
# self.assertIsInstance(result[0], dict)
# print(result)

def test_cli(self):
sio = io.StringIO()
with contextlib.redirect_stdout(sio):
run_module("vecto",
"benchmark",
"relation_extraction",
path_emb,
path_dataset,
"--path_out", "/tmp/vecto/benchmarks/")

with self.assertRaises(FileNotFoundError):
sio = io.StringIO()
with contextlib.redirect_stdout(sio):
run_module("vecto",
"benchmark",
"relation_extraction",
path_emb + "NONEXISTING",
path_dataset,
"--path_out",
"/tmp/vecto/benchmarks/")

from matplotlib import pyplot as plt
visualize.plot_accuracy("/tmp/vecto/benchmarks/relation_extraction", key_secondary="experiment_setup.dataset")
plt.savefig("/tmp/vecto/benchmarks/relation_extraction.pdf", bbox_inches="tight")
6 changes: 3 additions & 3 deletions tests/benchmarks/test_sequence_labeling.py
@@ -31,7 +31,7 @@ def test_cli(self):
with contextlib.redirect_stdout(sio):
run_module("vecto",
"benchmark",
"sequence_labelling",
"sequence_labeling",
path_emb,
path_sequence_labeling_dataset_ner,
"--path_out", "/tmp/vecto/benchmarks/")
@@ -40,7 +40,7 @@ def test_cli(self):
with contextlib.redirect_stdout(sio):
run_module("vecto",
"benchmark",
"sequence_labelling",
"sequence_labeling",
path_emb,
path_sequence_labeling_dataset_ner,
"--path_out", "/tmp/vecto/benchmarks/")
@@ -50,7 +50,7 @@ def test_cli(self):
with contextlib.redirect_stdout(sio):
run_module("vecto",
"benchmark",
"sequence_labelling",
"sequence_labeling",
path_emb + "NONEXISTING",
path_sequence_labeling_dataset_ner,
"--path_out",
14 changes: 14 additions & 0 deletions tests/test_cli_misc.py
@@ -0,0 +1,14 @@
import unittest
from io import StringIO
from contextlib import redirect_stdout
from .test_setup import run_module


class Tests(unittest.TestCase):

def test_cli(self):
with self.assertRaises(SystemExit):
sio = StringIO()
with redirect_stdout(sio):
run_module('vecto',
'WRONG_COMMAND')
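
The SystemExit expected here comes from argparse rejecting an unknown subcommand. A hypothetical top-level dispatcher of roughly this shape would behave that way (a sketch under that assumption, not necessarily the actual vecto entry point):

# Hypothetical sketch of a top-level vecto dispatcher. argparse calls
# sys.exit() when the positional argument is not one of the allowed choices,
# which is the SystemExit the test above expects.
import argparse


def main(args=None):
    parser = argparse.ArgumentParser(prog="vecto")
    parser.add_argument("command", choices=["benchmark"], help="subcommand to run")
    parsed, rest = parser.parse_known_args(args)
    if parsed.command == "benchmark":
        from vecto.benchmarks import _run
        _run(rest)
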
2 changes: 1 addition & 1 deletion vecto/_version.py
@@ -1,3 +1,3 @@
"""Version of vecto package."""

VERSION = "0.1.8"
VERSION = "0.1.9"
57 changes: 29 additions & 28 deletions vecto/benchmarks/__init__.py
@@ -4,19 +4,38 @@
:toctree: _autosummary
analogy
categorization
language_modeling
outliers
relation_extraction
sequence_labeling
similarity
synonymy_detection
text_classification
"""

import argparse
import importlib


def list_benhcmarks():
def list_benhcmarks(benchmarks):
print("available benchmarks:")
# TODO: list benchmarks
for i in benchmarks:
print(i)


def _run(args=None):
# TODO: load them from modules themselves
available_benchmarks = []
available_benchmarks.append("analogy")
available_benchmarks.append("categorization")
available_benchmarks.append("language_modeling")
available_benchmarks.append("relation_extraction")
available_benchmarks.append("similarity")
available_benchmarks.append("sequence_labeling")
available_benchmarks.append("text_classification")

parser = argparse.ArgumentParser(
description='run benchmarks',
add_help=True,
@@ -25,37 +44,19 @@ def _run(args=None):
parser.add_argument('name', help='Subcommand to run')
args, unknownargs = parser.parse_known_args(args)
if args.name == "help":
list_benhcmarks()
list_benhcmarks(available_benchmarks)
return

# TODO: implement running set of benchmarks defined in config
# if args.name == "all":
# print("running all benchmarks")

options = {}

if args.name == "analogy":
print("running analogy")
from .analogy import run
if args.name in available_benchmarks:
print("running ", args.name)
mod = importlib.import_module("vecto.benchmarks." + args.name)
run = getattr(mod, 'run')
run(unknownargs)
elif args.name == "categorization":
print("running categorization")
from .categorization import run
run(options, unknownargs)
elif args.name == "similarity":
print("running similarity")
from .similarity import run
run(options, unknownargs)
elif args.name == "sequence_labelling":
print("running sequence labelling")
from .sequence_labeling import run
run(options, unknownargs)
elif args.name == "text_classification":
print("running sequence labelling")
from .text_classification import run
run(options, unknownargs)
else:
print("unknown benchmark name", args.name)
list_benhcmarks()
list_benhcmarks(available_benchmarks)
exit(-1)
# check if all is specified - then run all
# if benchmark name matches - run corresponding module
# list all available benchmarks
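
The refactored _run above replaces the hard-coded if/elif chain with importlib-based dispatch, which assumes every vecto.benchmarks.<name> package exposes a module-level run(args) that parses the remaining command-line arguments itself. A minimal sketch of a submodule satisfying that contract, using a hypothetical benchmark for illustration:

# Hypothetical vecto/benchmarks/<new_benchmark>/__init__.py honoring the
# contract the dispatcher relies on: expose run(args) and parse the leftover
# CLI arguments locally.
import argparse


def run(args):
    parser = argparse.ArgumentParser()
    parser.add_argument("embeddings")
    parser.add_argument("dataset")
    parser.add_argument("--path_out", help="destination folder to save results")
    parsed = parser.parse_args(args)
    print("running benchmark on", parsed.embeddings, "against", parsed.dataset)
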
10 changes: 9 additions & 1 deletion vecto/benchmarks/analogy/__init__.py
@@ -1,3 +1,11 @@
"""Benchmark on word analogy
.. autosummary::
:toctree: _autosummary
analogy
"""

import argparse
import logging
from .analogy import Analogy
@@ -18,4 +26,4 @@ def run(args):
parser.add_argument("--path_out", help="destination folder to save results")
args = parser.parse_args(args)
benchmark = Analogy(method=args.method)
benchmark.run_with_args(args)
benchmark.run_with_args(args)
12 changes: 4 additions & 8 deletions vecto/benchmarks/analogy/analogy.py
@@ -1,15 +1,17 @@
import datetime
import os
import uuid
import numpy as np
import logging
import progressbar
# from tqdm import tqdm
import sklearn
from vecto.data import Dataset
from ..base import Benchmark
from .io import get_pairs
from .solvers import *
from .solvers import LinearOffset, LRCos, PairDistance
from .solvers import ThreeCosAvg, ThreeCosMul, ThreeCosMul2
from .solvers import SimilarToAny, SimilarToB


logger = logging.getLogger(__name__)

@@ -64,7 +66,6 @@ def __init__(self,

self.stats = {}


# this are some hard-coded bits which will be implemented later
self.result_miss = {
"rank": -1,
@@ -102,9 +103,7 @@ def __init__(self,
# distances[i] = scores[ids_max[i + 1]]
# return distances.mean()


def run_category(self, pairs):

details = []
kfold = sklearn.model_selection.KFold(n_splits=len(pairs) // self.size_cv_test)
cnt_splits = kfold.get_n_splits(pairs)
@@ -231,6 +230,3 @@ def get_result(self, embeddings, path_dataset):  # , group_subcategory=False
embeddings.normalize()
results = self.run(embeddings, path_dataset) #group_subcategory
return results



6 changes: 3 additions & 3 deletions vecto/benchmarks/categorization/__init__.py
@@ -7,16 +7,16 @@

def select_method(key):
options = {}
if key == 'SpectralCategorization':
method = SpectralCategorization(options)
# if key == 'SpectralCategorization':
# method = SpectralCategorization(options)
if key == 'KMeansCategorization':
method = KMeansCategorization(options)
else:
raise RuntimeError('The method name was not recognized.')
return method


def run(options, extra_args):
def run(extra_args):
parser = argparse.ArgumentParser()
parser.add_argument('embeddings')
parser.add_argument('dataset')
