diff --git a/.gitignore b/.gitignore index 216e06a5..d8dba26e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ +# project-specific + +_autosummary + + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/tests/benchmarks/test_analogy.py b/tests/benchmarks/test_analogy.py index 7bfe5480..78bee6e6 100644 --- a/tests/benchmarks/test_analogy.py +++ b/tests/benchmarks/test_analogy.py @@ -4,7 +4,7 @@ import unittest import io from os import path -from vecto.benchmarks.analogy import * +from vecto.benchmarks.analogy import Analogy from vecto.benchmarks import visualize from vecto.embeddings import load_from_dir from ..test_setup import run_module @@ -17,55 +17,59 @@ class Tests(unittest.TestCase): def test_api(self): embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) - analogy = LinearOffset() + analogy = Analogy(method="3CosAdd") result = analogy.get_result(embs, path_analogy_dataset) self.assertIsInstance(result[0], dict) - analogy = PairDistance() + analogy = Analogy(method="PairDistance") result = analogy.get_result(embs, path_analogy_dataset) self.assertIsInstance(result[0], dict) - analogy = ThreeCosMul() + analogy = Analogy(method="3CosMul") result = analogy.get_result(embs, path_analogy_dataset) self.assertIsInstance(result[0], dict) - analogy = ThreeCosMul2() + analogy = Analogy(method="3CosMul2") result = analogy.get_result(embs, path_analogy_dataset) self.assertIsInstance(result[0], dict) - analogy = ThreeCosAvg() + analogy = Analogy(method="3CosAvg") result = analogy.get_result(embs, path_analogy_dataset) self.assertIsInstance(result[0], dict) - # analogy = SimilarToAny() - # result = analogy.get_result(embs, path_analogy_dataset) - # print(result) - # analogy = SimilarToB() - # result = analogy.get_result(embs, path_analogy_dataset) - # print(result) - analogy = LRCos() + analogy = Analogy(method="SimilarToAny") + result = analogy.get_result(embs, path_analogy_dataset) + print(result) + + analogy = Analogy(method="SimilarToB") + result = analogy.get_result(embs, path_analogy_dataset) + print(result) + + analogy = Analogy(method="LRCos") result = analogy.get_result(embs, path_analogy_dataset) print(result) def test_cli(self): sio = io.StringIO() with contextlib.redirect_stdout(sio): - run_module("vecto.benchmarks.analogy", + run_module("vecto", "benchmark", "analogy", "./tests/data/embeddings/text/plain_with_file_header/", "./tests/data/benchmarks/analogy/", - "--path_out", "/tmp/vecto/benchmarks/", "--method", "3CosAdd") + "--path_out", "/tmp/vecto/benchmarks/", + "--method", "3CosAdd") sio = io.StringIO() with contextlib.redirect_stdout(sio): - run_module("vecto.benchmarks.analogy", + run_module("vecto", "benchmark", "analogy", "./tests/data/embeddings/text/plain_with_file_header/", "./tests/data/benchmarks/analogy/", - "--path_out", "/tmp/vecto/benchmarks/specific_filename.json", + "--path_out", + "/tmp/vecto/benchmarks/specific_filename.json", "--method", "LRCos") sio = io.StringIO() with contextlib.redirect_stdout(sio): - run_module("vecto.benchmarks.analogy", + run_module("vecto", "benchmark", "analogy", "./tests/data/embeddings/text/plain_with_file_header/", "./tests/data/benchmarks/analogy/", "--path_out", "/tmp/vecto/benchmarks/", @@ -74,18 +78,19 @@ def test_cli(self): sio = io.StringIO() with self.assertRaises(RuntimeError): with contextlib.redirect_stdout(sio): - run_module("vecto.benchmarks.analogy", + run_module("vecto", "benchmark", "analogy", 
"./tests/data/embeddings/text/plain_with_file_header/", "./tests/data/benchmarks/analogy/", "--method", "NONEXISTING") sio = io.StringIO() with contextlib.redirect_stdout(sio): - run_module("vecto.benchmarks.analogy", + run_module("vecto", "benchmark", "analogy", "./tests/data/embeddings/text/plain_with_file_header/", "./tests/data/benchmarks/analogy/", "--method", "3CosAvg") + # TODO: suppress concatenating timestamp or aggregate multiple runs from matplotlib import pyplot as plt - visualize.plot_accuracy("/tmp/vecto/benchmarks/analogy") + visualize.plot_accuracy("/tmp/vecto/benchmarks/word_analogy") plt.savefig("/tmp/vecto/benchmarks/analogy.pdf", bbox_inches="tight") diff --git a/tests/benchmarks/test_categorization.py b/tests/benchmarks/test_categorization.py index 3441fe35..1facdb4d 100644 --- a/tests/benchmarks/test_categorization.py +++ b/tests/benchmarks/test_categorization.py @@ -25,10 +25,13 @@ def test_categorization_method_works(self): def test_cli(self): sio = StringIO() with redirect_stdout(sio): - run_module('vecto.benchmarks.categorization', + run_module('vecto', + 'benchmark', + 'categorization', './tests/data/embeddings/text/plain_with_file_header/', './tests/data/benchmarks/categorization/', - '--path_out', '/tmp/vecto/benchmarks', '--method', 'KMeansCategorization') + '--path_out', '/tmp/vecto/benchmarks', + '--method', 'KMeansCategorization') def test_categorization_scores(self): embs = load_from_dir(path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header')) diff --git a/tests/benchmarks/test_sequence_labeling.py b/tests/benchmarks/test_sequence_labeling.py index 712e9c2c..9f5c71d1 100644 --- a/tests/benchmarks/test_sequence_labeling.py +++ b/tests/benchmarks/test_sequence_labeling.py @@ -9,6 +9,7 @@ from vecto.embeddings import load_from_dir from tests.test_setup import run_module + path_sequence_labeling_dataset = path.join('.', 'tests', 'data', 'benchmarks', 'sequence_labeling') path_sequence_labeling_dataset_ner = path.join('.', 'tests', 'data', 'benchmarks', 'sequence_labeling', 'ner') # sequence labeling need to specify a sub task (pos, chunk, or ner) path_emb = path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header') @@ -28,14 +29,18 @@ def test_api(self): def test_cli(self): sio = io.StringIO() with contextlib.redirect_stdout(sio): - run_module("vecto.benchmarks.sequence_labeling", + run_module("vecto", + "benchmark", + "sequence_labelling", path_emb, path_sequence_labeling_dataset_ner, "--path_out", "/tmp/vecto/benchmarks/") sio = io.StringIO() with contextlib.redirect_stdout(sio): - run_module("vecto.benchmarks.sequence_labeling", + run_module("vecto", + "benchmark", + "sequence_labelling", path_emb, path_sequence_labeling_dataset_ner, "--path_out", "/tmp/vecto/benchmarks/") @@ -43,12 +48,15 @@ def test_cli(self): with self.assertRaises(FileNotFoundError): sio = io.StringIO() with contextlib.redirect_stdout(sio): - run_module("vecto.benchmarks.sequence_labeling", + run_module("vecto", + "benchmark", + "sequence_labelling", path_emb + "NONEXISTING", path_sequence_labeling_dataset_ner, - "--path_out", "/tmp/vecto/benchmarks/") + "--path_out", + "/tmp/vecto/benchmarks/") from matplotlib import pyplot as plt # here the visualization only for the ner sub task. 
- visualize.plot_accuracy("/tmp/vecto/benchmarks/ner", key_secondary="experiment_setup.dataset") + visualize.plot_accuracy("/tmp/vecto/benchmarks/sequence_labeling/ner", key_secondary="experiment_setup.dataset") plt.savefig("/tmp/vecto/benchmarks/sequence_labeling.pdf", bbox_inches="tight") diff --git a/tests/benchmarks/test_similarity.py b/tests/benchmarks/test_similarity.py index 597143c1..4c776aba 100644 --- a/tests/benchmarks/test_similarity.py +++ b/tests/benchmarks/test_similarity.py @@ -9,6 +9,7 @@ from vecto.embeddings import load_from_dir from tests.test_setup import run_module + path_similarity_dataset = path.join('.', 'tests', 'data', 'benchmarks', 'similarity') path_emb = path.join('tests', 'data', 'embeddings', 'text', 'plain_with_file_header') @@ -35,14 +36,18 @@ def test_api(self): def test_cli(self): sio = io.StringIO() with contextlib.redirect_stdout(sio): - run_module("vecto.benchmarks.similarity", + run_module("vecto", + "benchmark", + "similarity", path_emb, path_similarity_dataset, "--path_out", "/tmp/vecto/benchmarks/") sio = io.StringIO() with contextlib.redirect_stdout(sio): - run_module("vecto.benchmarks.similarity", + run_module("vecto", + "benchmark", + "similarity", path_emb, path_similarity_dataset, "--path_out", "/tmp/vecto/benchmarks/tmp") @@ -50,7 +55,9 @@ def test_cli(self): with self.assertRaises(FileNotFoundError): sio = io.StringIO() with contextlib.redirect_stdout(sio): - run_module("vecto.benchmarks.similarity", + run_module("vecto", + "benchmark", + "similarity", path_emb + "NONEXISTING", path_similarity_dataset, "--path_out", "/tmp/vecto/benchmarks/") diff --git a/tests/test_setup.py b/tests/test_setup.py index 6d753564..e98661a3 100644 --- a/tests/test_setup.py +++ b/tests/test_setup.py @@ -34,6 +34,7 @@ def run_pip(*args, **kwargs): run_program(pip_exec_name, *args, **kwargs) +# TODO: move this to helper module def run_module(name: str, *args, run_name: str = '__main__') -> None: backup_sys_argv = sys.argv sys.argv = [name + '.py'] + list(args) diff --git a/vecto/__main__.py b/vecto/__main__.py new file mode 100644 index 00000000..386d3f45 --- /dev/null +++ b/vecto/__main__.py @@ -0,0 +1,4 @@ +from .cli import CLI + +if __name__ == "__main__": + CLI() diff --git a/vecto/_version.py b/vecto/_version.py index 19dda839..889d0894 100644 --- a/vecto/_version.py +++ b/vecto/_version.py @@ -1,3 +1,3 @@ """Version of vecto package.""" -VERSION = "0.1.7" +VERSION = "0.1.8" diff --git a/vecto/benchmarks/__init__.py b/vecto/benchmarks/__init__.py index 3e741b32..659ed222 100644 --- a/vecto/benchmarks/__init__.py +++ b/vecto/benchmarks/__init__.py @@ -6,3 +6,52 @@ analogy """ + +import argparse +import importlib + + +def list_benhcmarks(): + print("available benchmarks:") + # TODO: list benchmarks + + +def _run(args=None): + parser = argparse.ArgumentParser( + description='run benchmarks', + add_help=True, + usage="vecto benchmark [name]") + + parser.add_argument('name', help='Subcommand to run') + args, unknownargs = parser.parse_known_args(args) + if args.name == "help": + list_benhcmarks() + return + # if args.name == "all": + # print("running all benchmarks") + + options = {} + + if args.name == "analogy": + print("running analogy") + from .analogy import run + run(unknownargs) + elif args.name == "categorization": + print("running categorization") + from .categorization import run + run(options, unknownargs) + elif args.name == "similarity": + print("running similarity") + from .similarity import run + run(options, unknownargs) + elif args.name == 
"sequence_labelling": + print("running sequence labelling") + from .sequence_labeling import run + run(options, unknownargs) + else: + print("unknown benchmark name") + list_benhcmarks() + exit(-1) + # check if all is specified - then run all + # if benchmark name matches - run corresponding module + # list all available benchmarks diff --git a/vecto/benchmarks/analogy/__init__.py b/vecto/benchmarks/analogy/__init__.py index dcdf32bc..801054bd 100644 --- a/vecto/benchmarks/analogy/__init__.py +++ b/vecto/benchmarks/analogy/__init__.py @@ -1 +1,21 @@ -from .analogy import ThreeCosAvg, ThreeCosMul, ThreeCosMul2, LinearOffset, LRCos, PairDistance +import argparse +import logging +from .analogy import Analogy +# from vecto.config import load_config +from vecto.embeddings import load_from_dir + +logging.basicConfig(level=logging.DEBUG) + + +def run(args): + # config = load_config() + # print(config) + print(args) + parser = argparse.ArgumentParser() + parser.add_argument("embeddings") + parser.add_argument("dataset") + parser.add_argument("--method", help="analogy solving method", default="LRCos") + parser.add_argument("--path_out", help="destination folder to save results") + args = parser.parse_args(args) + benchmark = Analogy(method=args.method) + benchmark.run_with_args(args) \ No newline at end of file diff --git a/vecto/benchmarks/analogy/__main__.py b/vecto/benchmarks/analogy/__main__.py deleted file mode 100644 index 82a65a8a..00000000 --- a/vecto/benchmarks/analogy/__main__.py +++ /dev/null @@ -1,64 +0,0 @@ -import argparse -import json -import logging -import os - -from vecto.utils.data import save_json -from vecto.benchmarks.analogy import ThreeCosAvg, ThreeCosMul, LinearOffset, LRCos -# from vecto.config import load_config -from vecto.embeddings import load_from_dir - -logging.basicConfig(level=logging.DEBUG) - - -def print_json(data): - print(json.dumps(data, ensure_ascii=False, indent=4, sort_keys=False)) - - -def select_method(key): - options = {} - if key == "3CosAvg": - method = ThreeCosAvg(options) - #elif key == "SimilarToAny": - # method = SimilarToAny(options) - #elif key == "SimilarToB": - # method = SimilarToB(options) - elif key == "3CosMul": - method = ThreeCosMul(options) - elif key == "3CosAdd": - method = LinearOffset(options) - #elif key == "PairDistance": - # method = PairDistance(options) - elif key == "LRCos" or key == "SVMCos": - method = LRCos(options) - else: - raise RuntimeError("method name not recognized") - return method - - -def main(): - # config = load_config() - # print(config) - parser = argparse.ArgumentParser() - parser.add_argument("embeddings") - parser.add_argument("dataset") - parser.add_argument("--method", help="analogy solving method", default="LRCos") - parser.add_argument("--path_out", help="destination folder to save results") - args = parser.parse_args() - embeddings = load_from_dir(args.embeddings) - # print("embeddings", embeddings) - benchmark = select_method(args.method) - results = benchmark.get_result(embeddings, args.dataset) - if args.path_out: - if os.path.isdir(args.path_out) or args.path_out.endswith("/"): - dataset = os.path.basename(os.path.normpath(args.dataset)) - name_file_out = os.path.join(args.path_out, dataset, args.method, "results.json") - save_json(results, name_file_out) - else: - save_json(results, args.path_out) - else: - print_json(results) - - -if __name__ == "__main__": - main() diff --git a/vecto/benchmarks/analogy/analogy.py b/vecto/benchmarks/analogy/analogy.py index 9133c0f4..7e1a07c5 100644 --- 
a/vecto/benchmarks/analogy/analogy.py +++ b/vecto/benchmarks/analogy/analogy.py @@ -1,77 +1,69 @@ import datetime import os -import random -import scipy import uuid import numpy as np import logging import progressbar # from tqdm import tqdm import sklearn -from sklearn.linear_model import LogisticRegression -from itertools import product from vecto.data import Dataset from ..base import Benchmark +from .io import get_pairs +from .solvers import * logger = logging.getLogger(__name__) -def get_pairs(fname): # todo: optional lower-casing, move to some io module - pairs = [] - with open(fname) as file_in: - id_line = 0 - for line in file_in: - if line.strip() == '': - continue - try: - id_line += 1 - if "\t" in line: - parts = line.lower().split("\t") - else: - parts = line.lower().split() - left = parts[0] - right = parts[1] - right = right.strip() - if "/" in right: - right = [i.strip() for i in right.split("/")] - else: - right = [i.strip() for i in right.split(",")] - pairs.append([left, right]) - except: - print("error reading pairs") - print("in file", fname) - print("in line", id_line, line) - exit(-1) - return pairs +def select_method(key): + if key == "3CosAvg": + method = ThreeCosAvg + elif key == "SimilarToAny": + method = SimilarToAny + elif key == "SimilarToB": + method = SimilarToB + elif key == "3CosMul": + method = ThreeCosMul + elif key == "3CosMul2": + method = ThreeCosMul2 + elif key == "3CosAdd": + method = LinearOffset + elif key == "PairDistance": + method = PairDistance + elif key == "LRCos" or key == "SVMCos": + method = LRCos + else: + raise RuntimeError("method name not recognized") + return method class Analogy(Benchmark): - def __init__(self, normalize=True, + def __init__(self, + method="3CosAdd", + normalize=True, ignore_oov=True, do_top5=True, # need_subsample=False, size_cv_test=1, set_aprimes_test=None, - inverse_regularization_strength=1.0, exclude=True, name_classifier='LR', name_kernel="linear"): self.normalize = normalize + self.method = method self.ignore_oov = ignore_oov self.do_top5 = do_top5 # self.need_subsample = need_subsample self.normalize = normalize self.size_cv_test = size_cv_test self.set_aprimes_test = set_aprimes_test - self.inverse_regularization_strength = inverse_regularization_strength +# self.inverse_regularization_strength = inverse_regularization_strength self.exclude = exclude self.name_classifier = name_classifier self.name_kernel = name_kernel self.stats = {} - self.cnt_total_correct = 0 - self.cnt_total_total = 0 + # this are some hard-coded bits which will be implemented later self.result_miss = { @@ -79,31 +71,6 @@ def __init__(self, normalize=True, "reason": "missing words" } - @property - def method(self): - return type(self).__name__ - - def normed(self, v): - if self.normalize: - return v - else: - return v / np.linalg.norm(v) - - def get_most_similar_fast(self, v): - scores = self.normed(v) @ self.embs._normalized_matrix.T - scores = (scores + 1) / 2 - return scores - - def get_most_collinear_fast(self, a, ap, b): - scores = np.zeros(self.embs.matrix.shape[0]) - offset_target = ap - a - offset_target = offset_target / np.linalg.norm(offset_target) - m_diff = self.embs.matrix - b - norm = np.linalg.norm(m_diff, axis=1) - norm[norm == 0] = 100500 - m_diff /= norm[:, None] - scores = m_diff @ offset_target - return scores # def is_at_least_one_word_present(self, words): # for w in words: @@ -111,31 +78,6 @@ def get_most_collinear_fast(self, a, ap, b): # return True # return False - def is_pair_missing(self, pairs): - for pair 
in pairs: - if self.embs.vocabulary.get_id(pair[0]) < 0: - return True - if self.embs.vocabulary.get_id(pair[1][0]) < 0: - return True - # if not is_at_least_one_word_present(pair[1]): - # return True - return False - - def gen_vec_single(self, pairs): - a, a_prime = zip(*pairs) - a_prime = [i[0] for i in a_prime] - # a_prime=[i for sublist in a_prime for i in sublist] - a_prime = [i for i in a_prime if self.embs.vocabulary.get_id(i) >= 0] - a = [i for i in a if self.embs.vocabulary.get_id(i) >= 0] - cnt_noise = len(a) - noise = [random.choice(self.embs.vocabulary.lst_words) for i in range(cnt_noise)] - - if len(a_prime) == 0: - a_prime.append(random.choice(self.embs.vocabulary.lst_words)) - train_vectors = list(a_prime) + list(a) + list(a) + list(a) + list(a) + noise - train_vectors = np.array([self.embs.get_vector(i) for i in train_vectors]) - labels = np.hstack([np.ones(len(a_prime)), np.zeros(len(train_vectors) - len(a_prime))]) - return train_vectors, labels # def gen_vec_single_nonoise(self, pairs): # a, a_prime = zip(*pairs) @@ -146,11 +88,6 @@ def gen_vec_single(self, pairs): # Y = np.hstack([np.ones(len(a_prime)), np.zeros(len(x) - len(a_prime))]) # return X, Y - def get_crowndedness(self, vector): - scores = self.get_most_similar_fast(vector) - scores.sort() - return (scores[-11:-1][::-1]).tolist() - # def create_list_test_right(self, pairs): # global set_aprimes_test # a, a_prime = zip(*pairs) @@ -165,86 +102,9 @@ def get_crowndedness(self, vector): # distances[i] = scores[ids_max[i + 1]] # return distances.mean() - def get_rank(self, source, center): - if isinstance(center, str): - center = self.embs.get_vector(center) - if isinstance(source, str): - source = [source] - scores = self.get_most_similar_fast(center) - ids_max = np.argsort(scores)[::-1] - for i in range(ids_max.shape[0]): - if self.embs.vocabulary.get_word_by_id(ids_max[i]) in source: - break - rank = i - return rank - - @staticmethod - def get_verbose_question(pair_test, pairs_train): - extr = "" - if len(pairs_train) == 1: - extr = "as {} is to {}".format(pairs_train[0][1], pairs_train[0][0]) - res = "What is to {} {}".format(pair_test[0], extr) - return res - - def process_prediction(self, p_test_one, scores, score_reg, score_sim, p_train=[]): - ids_max = np.argsort(scores)[::-1] - result = dict() - cnt_answers_to_report = 6 - set_exclude = set() - if len(p_train) == 1: - set_exclude.update(set([p_train[0][0]]) | set(p_train[0][1])) - - set_exclude.add(p_test_one[0]) - result["question verbose"] = self.get_verbose_question(p_test_one, p_train) - result["b"] = p_test_one[0] - result["expected answer"] = p_test_one[1] - result["predictions"] = [] - result['set_exclude'] = [e for e in set_exclude] - - cnt_reported = 0 - for i in ids_max[:10]: - prediction = dict() - ans = self.embs.vocabulary.get_word_by_id(i) - if self.exclude and (ans in set_exclude): - continue - cnt_reported += 1 - prediction["score"] = float(scores[i]) - prediction["answer"] = ans - if ans in p_test_one[1]: - prediction["hit"] = True - else: - prediction["hit"] = False - result["predictions"].append(prediction) - if cnt_reported >= cnt_answers_to_report: - break - rank = 0 - for i in range(ids_max.shape[0]): - ans = self.embs.vocabulary.get_word_by_id(ids_max[i]) - if self.exclude and (ans in set_exclude): - continue - if ans in p_test_one[1]: - break - rank += 1 - result["rank"] = rank - if rank == 0: - self.cnt_total_correct += 1 - self.cnt_total_total += 1 - # vec_b_prime = self.embs.get_vector(p_test_one[1][0]) - # result["closest words to 
answer 1"] = get_distance_closest_words(vec_b_prime,1) - # result["closest words to answer 5"] = get_distance_closest_words(vec_b_prime,5) - # where prediction lands: - ans = self.embs.vocabulary.get_word_by_id(ids_max[0]) - result["landing_b"] = (ans == p_test_one[0]) - result["landing_b_prime"] = (ans in p_test_one[1]) - all_a = [i[0] for i in p_train] - all_a_prime = [item for sublist in p_train for item in sublist[1]] - result["landing_a"] = (ans in all_a) - result["landing_a_prime"] = (ans in all_a_prime) - return result def run_category(self, pairs): - self.cnt_total_correct = 0 - self.cnt_total_total = 0 + details = [] kfold = sklearn.model_selection.KFold(n_splits=len(pairs) // self.size_cv_test) cnt_splits = kfold.get_n_splits(pairs) @@ -278,23 +138,26 @@ def run_category(self, pairs): # p_train = [x for x in p_train if not is_pair_missing(x)] cnt += 1 my_prog.update(cnt) - details += self.do_test_on_pairs(p_train, p_test) + details += self.solver.do_test_on_pairs(p_train, p_test) out = dict() out["details"] = details results = {} - if self.cnt_total_total == 0: + # TODO: move this logic to solver + results["cnt_questions_correct"] = self.solver.cnt_total_correct + results["cnt_questions_total"] = self.solver.cnt_total_total + if self.solver.cnt_total_total == 0: results["accuracy"] = -1 else: - results["accuracy"] = self.cnt_total_correct / self.cnt_total_total - results["cnt_questions_correct"] = self.cnt_total_correct - results["cnt_questions_total"] = self.cnt_total_total + results["accuracy"] = self.solver.cnt_total_correct / self.solver.cnt_total_total out["result"] = results # str_results = json.dumps(jsonify(out), indent=4, separators=(',', ': '), sort_keys=True) return out def run(self, embs, path_dataset): # group_subcategory self.embs = embs + self.solver = select_method(self.method)(self.embs, exclude=self.exclude) + if self.normalize: self.embs.normalize() @@ -370,192 +233,4 @@ def get_result(self, embeddings, path_dataset): # , group_subcategory=False return results -class PairWise(Analogy): - def do_test_on_pairs(self, pairs_train, pairs_test): - results = [] - for p_train, p_test in product(pairs_train, pairs_test): - if self.is_pair_missing([p_train, p_test]): - self.cnt_total_total += 1 - result = {} - result["rank"] = -1 - result["question verbose"] = self.get_verbose_question(p_test, [p_train]) - # todo: report which exaclt words are missing - else: - result = self.do_on_two_pairs(p_train, p_test) - result["b in neighbourhood of b_prime"] = self.get_rank(p_test[0], p_test[1][0]) - result["b_prime in neighbourhood of b"] = self.get_rank(p_test[1], p_test[0]) - results.append(result) - return results - def do_on_two_pairs(self, p_train, p_test): - vec_a = self.embs.get_vector(p_train[0]) - vec_a_prime = self.embs.get_vector(p_train[1][0]) - vec_b = self.embs.get_vector(p_test[0]) - vec_b_prime = self.embs.get_vector(p_test[1][0]) - if scipy.sparse.issparse(self.embs.matrix): - vec_a = vec_a.toarray()[0] - vec_a_prime = vec_a_prime.toarray()[0] - vec_b = vec_b.toarray()[0] - - scores, vec_b_prime_predicted = self.compute_scores(vec_a, vec_a_prime, vec_b) - # ids_max = np.argsort(scores)[::-1] - result = self.process_prediction(p_test, scores, None, None, [p_train]) - self.collect_stats(result, vec_a, vec_a_prime, vec_b, vec_b_prime, vec_b_prime_predicted) - return result - - def collect_stats(self, result, vec_a, vec_a_prime, vec_b, vec_b_prime, vec_b_prime_predicted): - if vec_b_prime_predicted is not None: - result["similarity predicted to b_prime cosine"] = 
float( - self.embs.cmp_vectors(vec_b_prime_predicted, vec_b_prime)) - - result["similarity a to a_prime cosine"] = float(self.embs.cmp_vectors(vec_a, vec_a_prime)) - result["similarity a_prime to b_prime cosine"] = float(self.embs.cmp_vectors(vec_a_prime, vec_b_prime)) - result["similarity b to b_prime cosine"] = float(self.embs.cmp_vectors(vec_b, vec_b_prime)) - result["similarity a to b_prime cosine"] = float(self.embs.cmp_vectors(vec_a, vec_b_prime)) - - result["distance a to a_prime euclidean"] = float(scipy.spatial.distance.euclidean(vec_a, vec_a_prime)) - result["distance a_prime to b_prime euclidean"] = float( - scipy.spatial.distance.euclidean(vec_a_prime, vec_b_prime)) - result["distance b to b_prime euclidean"] = float(scipy.spatial.distance.euclidean(vec_b, vec_b_prime)) - result["distance a to b_prime euclidean"] = float(scipy.spatial.distance.euclidean(vec_a, vec_b_prime)) - - result["crowdedness of b_prime"] = self.get_crowndedness(vec_b_prime) - - -class LinearOffset(PairWise): - def compute_scores(self, vec_a, vec_a_prime, vec_b): - vec_b_prime_predicted = vec_a_prime - vec_a + vec_b - vec_b_prime_predicted = self.normed(vec_b_prime_predicted) - scores = self.get_most_similar_fast(vec_b_prime_predicted) - return scores, vec_b_prime_predicted - - -class PairDistance(PairWise): - def compute_scores(self, vec_a, vec_a_prime, vec_b): - scores = self.get_most_collinear_fast(vec_a, vec_a_prime, vec_b) - return scores, None - - -class ThreeCosMul(PairWise): - def compute_scores(self, vec_a, vec_a_prime, vec_b): - epsilon = 0.001 - sim_a = self.get_most_similar_fast(vec_a) - sim_a_prime = self.get_most_similar_fast(vec_a_prime) - sim_b = self.get_most_similar_fast(vec_b) - scores = (sim_a_prime * sim_b) / (sim_a + epsilon) - return scores, None - - -class ThreeCosMul2(PairWise): - def compute_scores(self, vec_a, vec_a_prime, vec_b): - epsilon = 0.001 - # sim_a = get_most_similar_fast(vec_a) - # sim_a_prime = get_most_similar_fast(vec_a_prime) - # sim_b = get_most_similar_fast(vec_b) - # scores = (sim_a_prime * sim_b) / (sim_a + epsilon) - predicted = (((vec_a_prime + 0.5) / 2) * ((vec_b + 0.5) / 2)) / (((vec_a + 0.5) / 2) + epsilon) - scores = self.get_most_similar_fast(predicted) - return scores, predicted - - -# class SimilarToAny(PairWise): -# def compute_scores(self, vectors): -# scores = self.get_most_similar_fast(vectors) -# best = scores.max(axis=0) -# return best -# -# -# class SimilarToB(Analogy): -# def do_test_on_pairs(self, pairs_train, pairs_test): -# results = [] -# for p_test in pairs_test: -# if self.is_pair_missing([p_test]): -# continue -# result = self.do_on_two_pairs(p_test) -# result["b in neighbourhood of b_prime"] = self.get_rank(p_test[0], p_test[1][0]) -# result["b_prime in neighbourhood of b"] = self.get_rank(p_test[1], p_test[0]) -# results.append(result) -# return results -# -# def do_on_two_pairs(self, pair_test): -# if self.is_pair_missing([pair_test]): -# result = self.result_miss -# else: -# vec_b = self.embs.get_vector(pair_test[0]) -# vec_b_prime = self.embs.get_vector(pair_test[1][0]) -# scores = self.get_most_similar_fast(vec_b) -# result = self.process_prediction(pair_test, scores, None, None) -# result["similarity to correct cosine"] = self.embs.cmp_vectors(vec_b, vec_b_prime) -# return result - - -class ThreeCosAvg(Analogy): - - def do_test_on_pairs(self, p_train, p_test): - vecs_a = [] - vecs_a_prime = [] - for pair in p_train: - if self.is_pair_missing([pair]): - continue - vecs_a_prime_local = [] - for token in pair[1]: - if 
self.embs.vocabulary.get_id(token) >= 0: - vecs_a_prime_local.append(self.embs.get_vector(token)) - break - if len(vecs_a_prime_local) > 0: - vecs_a.append(self.embs.get_vector(pair[0])) - vecs_a_prime.append(np.vstack(vecs_a_prime_local).mean(axis=0)) - if len(vecs_a_prime) == 0: - print("AAAA SOMETHIGN MISSING") - return ([]) - - vec_a = np.vstack(vecs_a).mean(axis=0) - vec_a_prime = np.vstack(vecs_a_prime).mean(axis=0) - - results = [] - for p_test_one in p_test: - if self.is_pair_missing([p_test_one]): - continue - vec_b_prime = self.embs.get_vector(p_test_one[1][0]) - vec_b = self.embs.get_vector(p_test_one[0]) - vec_b_prime_predicted = vec_a_prime - vec_a + vec_b - # oh crap, why are we not normalizing here? - scores = self.get_most_similar_fast(vec_b_prime_predicted) - result = self.process_prediction(p_test_one, scores, None, None) - result["distances to correct cosine"] = self.embs.cmp_vectors(vec_b_prime_predicted, vec_b_prime) - results.append(result) - return results - - -class LRCos(Analogy): - - def do_test_on_pairs(self, p_train, p_test): - results = [] - X_train, Y_train = self.gen_vec_single(p_train) - if self.name_classifier.startswith("LR"): - # model_regression = LogisticRegression(class_weight = 'balanced') - # model_regression = Pipeline([('poly', PolynomialFeatures(degree=3)), ('logistic', LogisticRegression(class_weight = 'balanced',C=C))]) - model_regression = LogisticRegression( - class_weight='balanced', - C=self.inverse_regularization_strength) - if self.name_classifier == "SVM": - model_regression = sklearn.svm.SVC( - kernel=self.name_kernel, - cache_size=1000, - class_weight='balanced', - probability=True) - model_regression.fit(X_train, Y_train) - score_reg = model_regression.predict_proba(self.embs.matrix)[:, 1] - for p_test_one in p_test: - if self.is_pair_missing([p_test_one]): - # file_out.write("{}\t{}\t{}\n".format(p_test_one[0],p_test_one[1],"MISSING")) - continue - vec_b = self.embs.get_vector(p_test_one[0]) - vec_b_normed = vec_b / np.linalg.norm(vec_b) - score_sim = vec_b_normed @ self.embs._normalized_matrix.T - scores = score_sim * score_reg - result = self.process_prediction(p_test_one, scores, score_reg, score_sim) - vec_b_prime = self.embs.get_vector(p_test_one[1][0]) - result["similarity b to b_prime cosine"] = float(self.embs.cmp_vectors(vec_b, vec_b_prime)) - results.append(result) - return results diff --git a/vecto/benchmarks/analogy/io.py b/vecto/benchmarks/analogy/io.py new file mode 100644 index 00000000..c6c06adc --- /dev/null +++ b/vecto/benchmarks/analogy/io.py @@ -0,0 +1,27 @@ +def get_pairs(fname): + pairs = [] + with open(fname) as file_in: + id_line = 0 + for line in file_in: + if line.strip() == '': + continue + try: + id_line += 1 + if "\t" in line: + parts = line.lower().split("\t") + else: + parts = line.lower().split() + left = parts[0] + right = parts[1] + right = right.strip() + if "/" in right: + right = [i.strip() for i in right.split("/")] + else: + right = [i.strip() for i in right.split(",")] + pairs.append([left, right]) + except: + print("error reading pairs") + print("in file", fname) + print("in line", id_line, line) + exit(-1) + return pairs diff --git a/vecto/benchmarks/analogy/solvers.py b/vecto/benchmarks/analogy/solvers.py new file mode 100644 index 00000000..f1bbaefc --- /dev/null +++ b/vecto/benchmarks/analogy/solvers.py @@ -0,0 +1,351 @@ +import random +import scipy +import numpy as np +from sklearn.linear_model import LogisticRegression +from itertools import product + + +class Solver: + def 
__init__(self, + embs, + exclude, + name_classifier='LR', + name_kernel="linear", + inverse_regularization_strength=1.0, + ): + self.embs = embs + self.name_classifier = name_classifier + self.name_kernel = name_kernel + self.inverse_regularization_strength = inverse_regularization_strength + self.exclude = exclude + self.cnt_total_correct = 0 + self.cnt_total_total = 0 + + @property + def method(self): + return type(self).__name__ + + def normed(self, v): + # if self.normalize: + # return v + # else: + return v / np.linalg.norm(v) + + # TODO: move this to embeddings module + def get_crowndedness(self, vector): + scores = self.get_most_similar_fast(vector) + scores.sort() + return (scores[-11:-1][::-1]).tolist() + + def get_most_similar_fast(self, v): + scores = self.normed(v) @ self.embs._normalized_matrix.T + scores = (scores + 1) / 2 + return scores + + def get_most_collinear_fast(self, a, ap, b): + scores = np.zeros(self.embs.matrix.shape[0]) + offset_target = ap - a + offset_target = offset_target / np.linalg.norm(offset_target) + m_diff = self.embs.matrix - b + norm = np.linalg.norm(m_diff, axis=1) + norm[norm == 0] = 100500 + m_diff /= norm[:, None] + scores = m_diff @ offset_target + return scores + + def gen_vec_single(self, pairs): + a, a_prime = zip(*pairs) + a_prime = [i[0] for i in a_prime] + # a_prime=[i for sublist in a_prime for i in sublist] + a_prime = [i for i in a_prime if self.embs.vocabulary.get_id(i) >= 0] + a = [i for i in a if self.embs.vocabulary.get_id(i) >= 0] + cnt_noise = len(a) + noise = [random.choice(self.embs.vocabulary.lst_words) for i in range(cnt_noise)] + + if len(a_prime) == 0: + a_prime.append(random.choice(self.embs.vocabulary.lst_words)) + train_vectors = list(a_prime) + list(a) + list(a) + list(a) + list(a) + noise + train_vectors = np.array([self.embs.get_vector(i) for i in train_vectors]) + labels = np.hstack([np.ones(len(a_prime)), np.zeros(len(train_vectors) - len(a_prime))]) + return train_vectors, labels + + def is_pair_missing(self, pairs): + for pair in pairs: + if self.embs.vocabulary.get_id(pair[0]) < 0: + return True + if self.embs.vocabulary.get_id(pair[1][0]) < 0: + return True + # if not is_at_least_one_word_present(pair[1]): + # return True + return False + + def get_rank(self, source, center): + if isinstance(center, str): + center = self.embs.get_vector(center) + if isinstance(source, str): + source = [source] + scores = self.get_most_similar_fast(center) + ids_max = np.argsort(scores)[::-1] + for i in range(ids_max.shape[0]): + if self.embs.vocabulary.get_word_by_id(ids_max[i]) in source: + break + rank = i + return rank + + @staticmethod + def get_verbose_question(pair_test, pairs_train): + extr = "" + if len(pairs_train) == 1: + extr = "as {} is to {}".format(pairs_train[0][1], pairs_train[0][0]) + res = "What is to {} {}".format(pair_test[0], extr) + return res + + def process_prediction(self, p_test_one, scores, score_reg, score_sim, p_train=[]): + ids_max = np.argsort(scores)[::-1] + result = dict() + cnt_answers_to_report = 6 + set_exclude = set() + if len(p_train) == 1: + set_exclude.update(set([p_train[0][0]]) | set(p_train[0][1])) + + set_exclude.add(p_test_one[0]) + result["question verbose"] = self.get_verbose_question(p_test_one, p_train) + result["b"] = p_test_one[0] + result["expected answer"] = p_test_one[1] + result["predictions"] = [] + result['set_exclude'] = [e for e in set_exclude] + + cnt_reported = 0 + for i in ids_max[:10]: + prediction = dict() + ans = self.embs.vocabulary.get_word_by_id(i) + if 
self.exclude and (ans in set_exclude): + continue + cnt_reported += 1 + prediction["score"] = float(scores[i]) + prediction["answer"] = ans + if ans in p_test_one[1]: + prediction["hit"] = True + else: + prediction["hit"] = False + result["predictions"].append(prediction) + if cnt_reported >= cnt_answers_to_report: + break + rank = 0 + for i in range(ids_max.shape[0]): + ans = self.embs.vocabulary.get_word_by_id(ids_max[i]) + if self.exclude and (ans in set_exclude): + continue + if ans in p_test_one[1]: + break + rank += 1 + result["rank"] = rank + if rank == 0: + self.cnt_total_correct += 1 + self.cnt_total_total += 1 + # vec_b_prime = self.embs.get_vector(p_test_one[1][0]) + # result["closest words to answer 1"] = get_distance_closest_words(vec_b_prime,1) + # result["closest words to answer 5"] = get_distance_closest_words(vec_b_prime,5) + # where prediction lands: + ans = self.embs.vocabulary.get_word_by_id(ids_max[0]) + result["landing_b"] = (ans == p_test_one[0]) + result["landing_b_prime"] = (ans in p_test_one[1]) + all_a = [i[0] for i in p_train] + all_a_prime = [item for sublist in p_train for item in sublist[1]] + result["landing_a"] = (ans in all_a) + result["landing_a_prime"] = (ans in all_a_prime) + return result + + +class PairWise(Solver): + def do_test_on_pairs(self, pairs_train, pairs_test): + results = [] + for p_train, p_test in product(pairs_train, pairs_test): + if self.is_pair_missing([p_train, p_test]): + self.cnt_total_total += 1 + result = {} + result["rank"] = -1 + result["question verbose"] = self.get_verbose_question(p_test, [p_train]) + # todo: report which exaclt words are missing + else: + result = self.do_on_two_pairs(p_train, p_test) + result["b in neighbourhood of b_prime"] = self.get_rank(p_test[0], p_test[1][0]) + result["b_prime in neighbourhood of b"] = self.get_rank(p_test[1], p_test[0]) + results.append(result) + return results + + def do_on_two_pairs(self, p_train, p_test): + vec_a = self.embs.get_vector(p_train[0]) + vec_a_prime = self.embs.get_vector(p_train[1][0]) + vec_b = self.embs.get_vector(p_test[0]) + vec_b_prime = self.embs.get_vector(p_test[1][0]) + if scipy.sparse.issparse(self.embs.matrix): + vec_a = vec_a.toarray()[0] + vec_a_prime = vec_a_prime.toarray()[0] + vec_b = vec_b.toarray()[0] + + scores, vec_b_prime_predicted = self.compute_scores(vec_a, vec_a_prime, vec_b) + # ids_max = np.argsort(scores)[::-1] + result = self.process_prediction(p_test, scores, None, None, [p_train]) + self.collect_stats(result, vec_a, vec_a_prime, vec_b, vec_b_prime, vec_b_prime_predicted) + return result + + def collect_stats(self, result, vec_a, vec_a_prime, vec_b, vec_b_prime, vec_b_prime_predicted): + if vec_b_prime_predicted is not None: + result["similarity predicted to b_prime cosine"] = float( + self.embs.cmp_vectors(vec_b_prime_predicted, vec_b_prime)) + + result["similarity a to a_prime cosine"] = float(self.embs.cmp_vectors(vec_a, vec_a_prime)) + result["similarity a_prime to b_prime cosine"] = float(self.embs.cmp_vectors(vec_a_prime, vec_b_prime)) + result["similarity b to b_prime cosine"] = float(self.embs.cmp_vectors(vec_b, vec_b_prime)) + result["similarity a to b_prime cosine"] = float(self.embs.cmp_vectors(vec_a, vec_b_prime)) + + result["distance a to a_prime euclidean"] = float(scipy.spatial.distance.euclidean(vec_a, vec_a_prime)) + result["distance a_prime to b_prime euclidean"] = float( + scipy.spatial.distance.euclidean(vec_a_prime, vec_b_prime)) + result["distance b to b_prime euclidean"] = 
float(scipy.spatial.distance.euclidean(vec_b, vec_b_prime)) + result["distance a to b_prime euclidean"] = float(scipy.spatial.distance.euclidean(vec_a, vec_b_prime)) + + result["crowdedness of b_prime"] = self.get_crowndedness(vec_b_prime) + + +class LinearOffset(PairWise): + def compute_scores(self, vec_a, vec_a_prime, vec_b): + vec_b_prime_predicted = vec_a_prime - vec_a + vec_b + vec_b_prime_predicted = self.normed(vec_b_prime_predicted) + scores = self.get_most_similar_fast(vec_b_prime_predicted) + return scores, vec_b_prime_predicted + + +class PairDistance(PairWise): + def compute_scores(self, vec_a, vec_a_prime, vec_b): + scores = self.get_most_collinear_fast(vec_a, vec_a_prime, vec_b) + return scores, None + + +class ThreeCosMul(PairWise): + def compute_scores(self, vec_a, vec_a_prime, vec_b): + epsilon = 0.001 + sim_a = self.get_most_similar_fast(vec_a) + sim_a_prime = self.get_most_similar_fast(vec_a_prime) + sim_b = self.get_most_similar_fast(vec_b) + scores = (sim_a_prime * sim_b) / (sim_a + epsilon) + return scores, None + + +class ThreeCosMul2(PairWise): + def compute_scores(self, vec_a, vec_a_prime, vec_b): + epsilon = 0.001 + # sim_a = get_most_similar_fast(vec_a) + # sim_a_prime = get_most_similar_fast(vec_a_prime) + # sim_b = get_most_similar_fast(vec_b) + # scores = (sim_a_prime * sim_b) / (sim_a + epsilon) + predicted = (((vec_a_prime + 0.5) / 2) * ((vec_b + 0.5) / 2)) / (((vec_a + 0.5) / 2) + epsilon) + scores = self.get_most_similar_fast(predicted) + return scores, predicted + + +class SimilarToAny(PairWise): + def compute_scores(self, *vectors): + vectors = np.array(vectors) + scores = self.get_most_similar_fast(vectors) + best = scores.max(axis=0) + return best, None + + +class SimilarToB(PairWise): + def do_test_on_pairs(self, pairs_train, pairs_test): + results = [] + for p_test in pairs_test: + if self.is_pair_missing([p_test]): + continue + # TODO: try to reuse more from pairwise + result = self.do_on_two_pair(p_test) + result["b in neighbourhood of b_prime"] = self.get_rank(p_test[0], p_test[1][0]) + result["b_prime in neighbourhood of b"] = self.get_rank(p_test[1], p_test[0]) + results.append(result) + return results + + def do_on_two_pair(self, pair_test): + if self.is_pair_missing([pair_test]): + result = self.result_miss + else: + vec_b = self.embs.get_vector(pair_test[0]) + vec_b_prime = self.embs.get_vector(pair_test[1][0]) + scores = self.get_most_similar_fast(vec_b) + result = self.process_prediction(pair_test, scores, None, None) + result["similarity to correct cosine"] = self.embs.cmp_vectors(vec_b, vec_b_prime) + return result + + +class ThreeCosAvg(Solver): + + def do_test_on_pairs(self, p_train, p_test): + vecs_a = [] + vecs_a_prime = [] + for pair in p_train: + if self.is_pair_missing([pair]): + continue + vecs_a_prime_local = [] + for token in pair[1]: + if self.embs.vocabulary.get_id(token) >= 0: + vecs_a_prime_local.append(self.embs.get_vector(token)) + break + if len(vecs_a_prime_local) > 0: + vecs_a.append(self.embs.get_vector(pair[0])) + vecs_a_prime.append(np.vstack(vecs_a_prime_local).mean(axis=0)) + if len(vecs_a_prime) == 0: + print("AAAA SOMETHIGN MISSING") + return ([]) + + vec_a = np.vstack(vecs_a).mean(axis=0) + vec_a_prime = np.vstack(vecs_a_prime).mean(axis=0) + + results = [] + for p_test_one in p_test: + if self.is_pair_missing([p_test_one]): + continue + vec_b_prime = self.embs.get_vector(p_test_one[1][0]) + vec_b = self.embs.get_vector(p_test_one[0]) + vec_b_prime_predicted = vec_a_prime - vec_a + vec_b + # oh crap, why are we 
not normalizing here? + scores = self.get_most_similar_fast(vec_b_prime_predicted) + result = self.process_prediction(p_test_one, scores, None, None) + result["distances to correct cosine"] = self.embs.cmp_vectors(vec_b_prime_predicted, vec_b_prime) + results.append(result) + return results + + +class LRCos(Solver): + + def do_test_on_pairs(self, p_train, p_test): + results = [] + X_train, Y_train = self.gen_vec_single(p_train) + if self.name_classifier.startswith("LR"): + # model_regression = LogisticRegression(class_weight = 'balanced') + # model_regression = Pipeline([('poly', PolynomialFeatures(degree=3)), ('logistic', LogisticRegression(class_weight = 'balanced',C=C))]) + model_regression = LogisticRegression( + solver="liblinear", + class_weight='balanced', + C=self.inverse_regularization_strength) + if self.name_classifier == "SVM": + model_regression = sklearn.svm.SVC( + kernel=self.name_kernel, + cache_size=1000, + class_weight='balanced', + probability=True) + model_regression.fit(X_train, Y_train) + score_reg = model_regression.predict_proba(self.embs.matrix)[:, 1] + for p_test_one in p_test: + if self.is_pair_missing([p_test_one]): + # file_out.write("{}\t{}\t{}\n".format(p_test_one[0],p_test_one[1],"MISSING")) + continue + vec_b = self.embs.get_vector(p_test_one[0]) + vec_b_normed = vec_b / np.linalg.norm(vec_b) + score_sim = vec_b_normed @ self.embs._normalized_matrix.T + scores = score_sim * score_reg + result = self.process_prediction(p_test_one, scores, score_reg, score_sim) + vec_b_prime = self.embs.get_vector(p_test_one[1][0]) + result["similarity b to b_prime cosine"] = float(self.embs.cmp_vectors(vec_b, vec_b_prime)) + results.append(result) + return results diff --git a/vecto/benchmarks/base.py b/vecto/benchmarks/base.py index b16142fe..5e1f43e5 100644 --- a/vecto/benchmarks/base.py +++ b/vecto/benchmarks/base.py @@ -1,5 +1,9 @@ import abc +import os from vecto.utils.metadata import WithMetaData +from vecto.embeddings import load_from_dir +from vecto.utils.data import save_json, print_json +from vecto.utils import get_time_str class Benchmark(): @@ -12,3 +16,23 @@ def __init__(self): @abc.abstractmethod def get_result(self, embeddings, path_dataset): raise NotImplementedError + + def run_with_args(self, args): + embeddings = load_from_dir(args.embeddings) + print("SHAPE:", embeddings.matrix.shape) + results = self.get_result(embeddings, args.dataset) + if args.path_out: + if os.path.isdir(args.path_out) or args.path_out.endswith("/"): + dataset = os.path.basename(os.path.normpath(args.dataset)) + timestamp = get_time_str() + task = results[0]["experiment_setup"]["task"] + name_file_out = os.path.join(args.path_out, + task, + dataset, + timestamp, + "results.json") + save_json(results, name_file_out) + else: + save_json(results, args.path_out) + else: + print_json(results) diff --git a/vecto/benchmarks/categorization/__init__.py b/vecto/benchmarks/categorization/__init__.py index 130c9189..26e515ba 100644 --- a/vecto/benchmarks/categorization/__init__.py +++ b/vecto/benchmarks/categorization/__init__.py @@ -1 +1,42 @@ -from .categorization import * \ No newline at end of file +import argparse +from .categorization import * +from vecto.embeddings import load_from_dir +from vecto.utils.data import save_json, print_json +from vecto.utils import get_time_str + + +def select_method(key): + options = {} + if key == 'SpectralCategorization': + method = SpectralCategorization(options) + if key == 'KMeansCategorization': + method = KMeansCategorization(options) + else: + raise 
RuntimeError('The method name was not recognized.') + return method + + +def run(options, extra_args): + parser = argparse.ArgumentParser() + parser.add_argument('embeddings') + parser.add_argument('dataset') + parser.add_argument('--method', help='Categorization method', default='KMeansCategorization') + parser.add_argument('--path_out', help='Destination folder to save the results') + args = parser.parse_args(extra_args) + embeddings = load_from_dir(args.embeddings) + benchmark = select_method(args.method) + results = benchmark.get_result(embeddings, args.dataset) + if args.path_out: + if path.isdir(args.path_out) or args.path_out.endswith('/'): + dataset = path.basename(path.normpath(args.dataset)) + timestamp = get_time_str() + name_file_out = path.join(args.path_out, + dataset, + args.method, + timestamp, + 'results.json') + save_json(results, name_file_out) + else: + save_json(results, args.path_out) + else: + print_json(results) diff --git a/vecto/benchmarks/categorization/__main__.py b/vecto/benchmarks/categorization/__main__.py deleted file mode 100644 index f99330a0..00000000 --- a/vecto/benchmarks/categorization/__main__.py +++ /dev/null @@ -1,50 +0,0 @@ -import argparse -import json -import logging - -from vecto.utils.data import save_json -from vecto.benchmarks.categorization import * -from vecto.embeddings import load_from_dir -from vecto.config import load_config - -logging.basicConfig(level=logging.DEBUG) - - -def print_json(data): - print(json.dumps(data, ensure_ascii=False, indent=4, sort_keys=False)) - - -def select_method(key): - options = {} - if key == 'SpectralCategorization': - method = SpectralCategorization(options) - if key == 'KMeansCategorization': - method = KMeansCategorization(options) - else: - raise RuntimeError('The method name was not recognized.') - return method - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument('embeddings') - parser.add_argument('dataset') - parser.add_argument('--method', help='Categorization method', default='KMeansCategorization') - parser.add_argument('--path_out', help='Destination folder to save the results') - args = parser.parse_args() - embeddings = load_from_dir(args.embeddings) - benchmark = select_method(args.method) - results = benchmark.get_result(embeddings, args.dataset) - if args.path_out: - if path.isdir(args.path_out) or args.path_out.endswith('/'): - dataset = path.basename(path.normpath(args.dataset)) - name_file_out = path.join(args.path_out, dataset, args.method, 'results.json') - save_json(results, name_file_out) - else: - save_json(results, args.path_out) - else: - print_json(results) - - -if __name__ == '__main__': - main() diff --git a/vecto/benchmarks/sequence_labeling/__init__.py b/vecto/benchmarks/sequence_labeling/__init__.py index 81f38e49..4ad498cb 100644 --- a/vecto/benchmarks/sequence_labeling/__init__.py +++ b/vecto/benchmarks/sequence_labeling/__init__.py @@ -1 +1,17 @@ -from .sequence_labeling import Sequence_labeling \ No newline at end of file +import argparse +from .sequence_labeling import Sequence_labeling + + +def run(options, extra_args): + + parser = argparse.ArgumentParser() + parser.add_argument("embeddings") + parser.add_argument("dataset") + parser.add_argument("--window_size", default=5, type=int) + parser.add_argument("--method", default='lr', choices=['lr', '2FFNN'], + help='name of method') + parser.add_argument('--normalize', dest='normalize', action='store_true') + parser.add_argument("--path_out", default=False, help="destination folder to save 
results") + args = parser.parse_args(extra_args) + sequence_labeling = Sequence_labeling(normalize=args.normalize, method=args.method, window_size=args.window_size) + sequence_labeling.run_with_args(args) diff --git a/vecto/benchmarks/sequence_labeling/__main__.py b/vecto/benchmarks/sequence_labeling/__main__.py index 19ff1ecd..02c2364e 100644 --- a/vecto/benchmarks/sequence_labeling/__main__.py +++ b/vecto/benchmarks/sequence_labeling/__main__.py @@ -1,45 +1,6 @@ -import argparse -import json -import logging -import os from vecto.utils.data import save_json from vecto.benchmarks.sequence_labeling import Sequence_labeling from vecto.embeddings import load_from_dir -logging.basicConfig(level=logging.DEBUG) - -def print_json(data): - print(json.dumps(data, ensure_ascii=False, indent=4, sort_keys=False)) - - -def main(): - # config = load_config() - # print(config) - parser = argparse.ArgumentParser() - parser.add_argument("embeddings") - parser.add_argument("dataset") - parser.add_argument("--window_size", default=5, type=int) - parser.add_argument("--method", default='lr', choices=['lr', '2FFNN'], - help='name of method') - parser.add_argument('--normalize', dest='normalize', action='store_true') - parser.add_argument("--path_out", default=False, help="destination folder to save results") - args = parser.parse_args() - embeddings = load_from_dir(args.embeddings) - # print("embeddings", embeddings) - sequence_labeling = Sequence_labeling(normalize=args.normalize, method=args.method, window_size=args.window_size) - results = sequence_labeling.get_result(embeddings, args.dataset) - if args.path_out: - if os.path.isdir(args.path_out) or args.path_out.endswith("/"): - dataset = os.path.basename(os.path.normpath(args.dataset)) - name_file_out = os.path.join(args.path_out, dataset, "results.json") - save_json(results, name_file_out) - else: - save_json(results, args.path_out) - else: - print_json(results) - - -if __name__ == "__main__": - main() diff --git a/vecto/benchmarks/sequence_labeling/sequence_labeling.py b/vecto/benchmarks/sequence_labeling/sequence_labeling.py index e45c406d..0f5500c3 100644 --- a/vecto/benchmarks/sequence_labeling/sequence_labeling.py +++ b/vecto/benchmarks/sequence_labeling/sequence_labeling.py @@ -109,7 +109,7 @@ def getX(self, input, m): x = [] OOV_count = 0 token_count = 0 - print(m.matrix.shape[0]) + # print(m.matrix.shape[0]) random_vector = m.matrix.sum(axis=0) / m.matrix.shape[0] # random_vector = m.matrix[0] for wordList in input: @@ -155,7 +155,7 @@ def run_lr(self, embeddings, my_train_x, my_train_y, my_test_x, my_test_y, metho print(idx2label) # fit LR classifier if method == 'lr': - lrc = LogisticRegression() + lrc = LogisticRegression(solver="liblinear") if method == '2FFNN': lrc = MLPClassifier() @@ -202,7 +202,7 @@ def run_lr(self, embeddings, my_train_x, my_train_y, my_test_x, my_test_y, metho out['details']['pred_test'] = pred_test return out - def run(self, embs, path_dataset): + def _run(self, embs, path_dataset): # specify the task (can be ner, pos or chunk) task = os.path.basename(path_dataset) @@ -237,10 +237,10 @@ def run(self, embs, path_dataset): out = self.run_lr(embs, my_train_x, my_train_y, my_test_x, my_test_y, self.method, idx2label, dataset, task) if self.method == 'crf': - # todo + # TODO: implement pass if self.method == 'lstm': - # todo + # TODO: implement pass experiment_setup = dict() @@ -264,4 +264,4 @@ def run(self, embs, path_dataset): def get_result(self, embeddings, path_dataset): if self.normalize: embeddings.normalize() - 
return [self.run(embeddings, path_dataset)] + return [self._run(embeddings, path_dataset)] diff --git a/vecto/benchmarks/similarity/__init__.py b/vecto/benchmarks/similarity/__init__.py index 36f8d38f..93c450c5 100644 --- a/vecto/benchmarks/similarity/__init__.py +++ b/vecto/benchmarks/similarity/__init__.py @@ -1 +1,14 @@ +import argparse from .similarity import Similarity + + +def run(options, extra_args): + parser = argparse.ArgumentParser() + parser.add_argument("embeddings") + parser.add_argument("dataset") + parser.add_argument('--normalize', dest='normalize', action='store_true') + parser.add_argument('--ignore_oov', dest='ignore_oov', action='store_true') + parser.add_argument("--path_out", default=False, help="destination folder to save results") + args = parser.parse_args(extra_args) + similarity = Similarity(normalize=args.normalize, ignore_oov=args.ignore_oov) + similarity.run_with_args(args) diff --git a/vecto/benchmarks/similarity/__main__.py b/vecto/benchmarks/similarity/__main__.py deleted file mode 100644 index 5a69554b..00000000 --- a/vecto/benchmarks/similarity/__main__.py +++ /dev/null @@ -1,44 +0,0 @@ -import argparse -import json -import logging -import os - -from vecto.utils.data import save_json -from vecto.benchmarks.similarity import Similarity -from vecto.embeddings import load_from_dir - -logging.basicConfig(level=logging.DEBUG) - - -def print_json(data): - print(json.dumps(data, ensure_ascii=False, indent=4, sort_keys=False)) - - -def main(): - # config = load_config() - # print(config) - parser = argparse.ArgumentParser() - parser.add_argument("embeddings") - parser.add_argument("dataset") - parser.add_argument('--normalize', dest='normalize', action='store_true') - parser.add_argument('--ignore_oov', dest='ignore_oov', action='store_true') - parser.add_argument("--path_out", default=False, help="destination folder to save results") - args = parser.parse_args() - embeddings = load_from_dir(args.embeddings) - # print("embeddings", embeddings) - # print(args.normalize) - similarity = Similarity(normalize=args.normalize, ignore_oov=args.ignore_oov) - results = similarity.get_result(embeddings, args.dataset) - if args.path_out: - if os.path.isdir(args.path_out) or args.path_out.endswith("/"): - dataset = os.path.basename(os.path.normpath(args.dataset)) - name_file_out = os.path.join(args.path_out, dataset, "results.json") - save_json(results, name_file_out) - else: - save_json(results, args.path_out) - else: - print_json(results) - - -if __name__ == "__main__": - main() diff --git a/vecto/benchmarks/text_classification/text_classification.py b/vecto/benchmarks/text_classification/text_classification.py index 0991c26f..3c2d1cc8 100644 --- a/vecto/benchmarks/text_classification/text_classification.py +++ b/vecto/benchmarks/text_classification/text_classification.py @@ -94,6 +94,7 @@ def __init__(self, batchsize=64, epoch=5, gpu=-1, layer=1, dropout=0, model=['cn self.char_based = char_based self.shrink = shrink + # TODO: let all benchmarks set output path in init def get_result(self, embeddings, path_dataset, path_output='/tmp/text_classification/'): self.out = path_output self.unit = embeddings.matrix.shape[1] diff --git a/vecto/benchmarks/visualize.py b/vecto/benchmarks/visualize.py index d6035368..28577166 100644 --- a/vecto/benchmarks/visualize.py +++ b/vecto/benchmarks/visualize.py @@ -1,16 +1,28 @@ +import logging import os import pandas from pandas.io.json import json_normalize from vecto.utils.data import load_json +logger = logging.getLogger(__name__) + + 
def df_from_file(path): data = load_json(path) - meta = [["experiment_setup", "subcategory"], ["experiment_setup", "method"], ["experiment_setup", "embeddings"]] + meta = [["experiment_setup", "task"], + ["experiment_setup", "subcategory"], + ["experiment_setup", "method"], + ["experiment_setup", "embeddings"]] dframe = json_normalize(data, meta=meta) if "details" in dframe: dframe.drop("details", axis="columns", inplace=True) - dframe["result"] = dframe["result." + dframe["experiment_setup.default_measurement"].unique()[0]] + default_measurement = "accuracy" + try: + default_measurement = dframe["experiment_setup.default_measurement"].unique()[0] + except: + logger.warning(f"default_measurement not specified in {path}") + dframe["result"] = dframe["result." + default_measurement] # df["reciprocal_rank"] = 1 / (df["rank"] + 1) return dframe @@ -19,8 +31,9 @@ def df_from_dir(path): dfs = [] for (dirpath, _, filenames) in os.walk(path): for filename in filenames: - dfs.append(df_from_file(os.path.join(dirpath, filename))) - dframe = pandas.concat(dfs) + if filename.endswith(".json"): + dfs.append(df_from_file(os.path.join(dirpath, filename))) + dframe = pandas.concat(dfs, sort=True) return dframe @@ -41,3 +54,11 @@ def plot_accuracy(path, key_primary="experiment_setup.method", key_secondary="experiment_setup.subcategory"): unstacked = get_filtered_dataframe(path, key_primary, key_secondary) unstacked.plot.bar(rot=0) + + +if __name__ == "__main__": + plot_accuracy("/mnt/work/scratch", + key_primary="experiment_setup.task", + key_secondary="experiment_setup.embeddings.name") + from matplotlib import pyplot as plt + plt.savefig("results.pdf", bbox_inches="tight") diff --git a/vecto/cli.py b/vecto/cli.py new file mode 100644 index 00000000..ff6eb933 --- /dev/null +++ b/vecto/cli.py @@ -0,0 +1,42 @@ +import argparse +import vecto + + +class CLI(object): + + def __init__(self): + parser = argparse.ArgumentParser( + description='vecto commad line interface', + add_help=True, + usage='''vecto [], + +The most commonly used vecto commands are: + benchmark Run benchmarks + create_vocab Create vocabulary from a folder +''') + + parser.add_argument('--version', action='version', + version=f'Vecto version {vecto.__version__}') + parser.add_argument('command', help='Subcommand to run') + args, self.unknownargs = parser.parse_known_args() + if not hasattr(self, args.command): + print('Unrecognized command') + parser.print_help() + exit(1) + # use dispatch pattern to invoke method with same name + getattr(self, args.command)() + + def benchmark(self): + from vecto.benchmarks import _run + _run(self.unknownargs) + + def create_vocab(self): + print("CLI for vocabulary routines not implemented yet") + + +def main(): + CLI() + + +if __name__ == '__main__': + main() diff --git a/vecto/embeddings/__init__.py b/vecto/embeddings/__init__.py index 17f56e7c..1dc9daa3 100644 --- a/vecto/embeddings/__init__.py +++ b/vecto/embeddings/__init__.py @@ -13,6 +13,7 @@ import numpy as np import vecto.embeddings.dense from vecto.embeddings.dense import WordEmbeddingsDense +from .legacy_w2v import ModelW2V from vecto.vocabulary import Vocabulary logger = logging.getLogger(__name__) @@ -39,12 +40,7 @@ def load_from_dir(path): # result.load(path) # result.load_metadata(path) # return result -# if os.path.isfile(os.path.join(path, "vectors.bin")): -# logger.info("this is w2v original binary format") -# result = ModelW2V() -# result.load_from_dir(path) -# result.load_metadata(path) -# return result + # if 
os.path.isfile(os.path.join(path, "sgns.words.npy")): # result = ModelLevy() # logger.info("this is Levi") @@ -79,12 +75,12 @@ def load_from_dir(path): result.vocabulary.load(path) result.load_metadata(path) return result - # if any(file.endswith('bin') for file in os.listdir(path)): - # result = ModelW2V() - # logger.info("Detected VSM in the w2v original binary format") - # result.load_from_dir(path) - # result.load_metadata(path) - # return result + if any(file.endswith('bin') for file in os.listdir(path)): + result = ModelW2V() + logger.info("Detected VSM in the w2v original binary format") + result.load_from_dir(path) + result.load_metadata(path) + return result # if f.startswith("words") and f.endswith(".npy") \ # and os.path.isfile(os.path.join(path, f.replace(".npy", ".vocab"))): # result = Model_Fun() diff --git a/vecto/embeddings/legacy_w2v.py b/vecto/embeddings/legacy_w2v.py new file mode 100644 index 00000000..19e333e4 --- /dev/null +++ b/vecto/embeddings/legacy_w2v.py @@ -0,0 +1,45 @@ +import os +import numpy as np +from vecto.vocabulary import Vocabulary +from .dense import WordEmbeddingsDense + + +class ModelW2V(WordEmbeddingsDense): + """extends dense embeddings to support loading + of original binary format from Mikolov's w2v""" + + @staticmethod + def _load_word(file): + result = b'' + w = b'' + while w != b' ': + w = file.read(1) + result = result + w + return result[:-1] + + def load_from_file(self, filename): + self.vocabulary = Vocabulary() + f = open(filename, "rb") + header = f.readline().split() + cnt_rows = int(header[0]) + size_row = int(header[1]) + # self.name += "_{}".format(size_row) + self.matrix = np.zeros((cnt_rows, size_row), dtype=np.float32) + # logger.debug("cnt rows = {}, size row = {}".format(cnt_rows, size_row)) + for i in range(cnt_rows): + word = ModelW2V._load_word(f).decode( + 'UTF-8', errors="ignore").strip() + self.vocabulary.dic_words_ids[word] = i + self.vocabulary.lst_words.append(word) + s_row = f.read(size_row * 4) + row = np.fromstring(s_row, dtype=np.float32) + # row = row / np.linalg.norm(row) + self.matrix[i] = row + f.close() + + def load_from_dir(self, path): + # self.name += "w2v_" + os.path.basename(os.path.normpath(path)) + filename = [file for file in os.listdir(path) if file.endswith("bin")][0] + self.load_from_file(os.path.join(path, filename)) +# self.load_from_file(os.path.join(path, "vectors.bin")) + # self.load_provenance(path) diff --git a/vecto/utils/__init__.py b/vecto/utils/__init__.py index cccec5bb..8b07a5b3 100644 --- a/vecto/utils/__init__.py +++ b/vecto/utils/__init__.py @@ -1,2 +1,13 @@ """Helpers for various things """ + +import datetime + + +def get_time_str(): + """ + returs current time formatted nicely + """ + time_now = datetime.datetime.now() + str_time = time_now.strftime("%y.%m.%d_%H.%M.%S") + return str_time diff --git a/vecto/utils/data.py b/vecto/utils/data.py index eaf63e87..5a891404 100644 --- a/vecto/utils/data.py +++ b/vecto/utils/data.py @@ -3,6 +3,7 @@ import bz2 import os + def detect_archive_format_and_open(path): if path.endswith(".bz2"): return bz2.open(path, mode='rt') @@ -11,6 +12,10 @@ def detect_archive_format_and_open(path): return open(path, encoding='utf8') +def print_json(data): + print(json.dumps(data, ensure_ascii=False, indent=4, sort_keys=False)) + + def save_json(data, path): basedir = os.path.dirname(path) os.makedirs(basedir, exist_ok=True)
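
Usage sketch: this change removes the per-benchmark `python -m vecto.benchmarks.<name>` entry points and routes everything through the new top-level entry point (`vecto/__main__.py` → `vecto/cli.py` → `vecto.benchmarks._run`). A minimal invocation mirroring the arguments exercised by the updated tests — the paths are the repo's test fixtures and the output directory is illustrative:

    python -m vecto benchmark analogy \
        ./tests/data/embeddings/text/plain_with_file_header/ \
        ./tests/data/benchmarks/analogy/ \
        --path_out /tmp/vecto/benchmarks/ \
        --method 3CosAdd

`python -m vecto benchmark help` prints a placeholder list of available benchmarks (populating it is still a TODO), and `Benchmark.run_with_args` writes results to `<path_out>/<task>/<dataset>/<timestamp>/results.json`, which is why the test now points `visualize.plot_accuracy` at `/tmp/vecto/benchmarks/word_analogy`.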
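
On the library side, the solver classes (`LinearOffset`, `ThreeCosMul`, `LRCos`, ...) now live in `vecto/benchmarks/analogy/solvers.py` and are selected by name through `select_method`, so callers construct a single `Analogy` benchmark. A short sketch of the new API against the same test fixtures; the printed keys assume the result layout that `Benchmark.run_with_args` and `visualize.py` read:

    from vecto.benchmarks.analogy import Analogy
    from vecto.embeddings import load_from_dir

    # method names accepted by select_method(): 3CosAdd, 3CosAvg, 3CosMul,
    # 3CosMul2, PairDistance, SimilarToAny, SimilarToB, LRCos
    embs = load_from_dir("./tests/data/embeddings/text/plain_with_file_header/")
    analogy = Analogy(method="LRCos")
    results = analogy.get_result(embs, "./tests/data/benchmarks/analogy/")

    # each entry is a dict; "experiment_setup" and "result" are the fields the
    # output-path logic and plotting helpers rely on (layout assumed here)
    print(results[0]["experiment_setup"]["task"])   # "word_analogy"
    print(results[0]["result"]["accuracy"])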