diff --git a/tests/data/benchmarks_results/text_classification/args.json b/tests/data/benchmarks_results/text_classification/args.json
index c021774f..9f178149 100644
--- a/tests/data/benchmarks_results/text_classification/args.json
+++ b/tests/data/benchmarks_results/text_classification/args.json
@@ -1 +1 @@
-{"current_datetime": "2018-05-04 02:29:28.460915", "batchsize": 64, "epoch": 5, "gpu": -1, "layer": 1, "dropout": 0, "model": "bow", "char_based": false, "out": "./tests/data/benchmarks_results/text_classification/", "unit": 4, "dataset": "./tests/data/benchmarks/text_classification/", "vocab_path": "./tests/data/benchmarks_results/text_classification/vocab.json", "model_path": "./tests/data/benchmarks_results/text_classification/best_model.npz", "n_class": 2, "datetime": "2018-05-04 02:29:28.460915"}
\ No newline at end of file
+{"current_datetime": "2018-05-04 11:39:50.824318", "batchsize": 64, "epoch": 5, "gpu": -1, "layer": 1, "dropout": 0, "model": "bow", "char_based": false, "out": "./tests/data/benchmarks_results/text_classification/", "unit": 4, "dataset": "./tests/data/benchmarks/text_classification/", "vocab_path": "./tests/data/benchmarks_results/text_classification/vocab.json", "model_path": "./tests/data/benchmarks_results/text_classification/best_model.npz", "n_class": 2, "datetime": "2018-05-04 11:39:50.824318"}
\ No newline at end of file
diff --git a/tests/data/benchmarks_results/text_classification/best_model.npz b/tests/data/benchmarks_results/text_classification/best_model.npz
index 66d4d2ad..2fc102ce 100644
Binary files a/tests/data/benchmarks_results/text_classification/best_model.npz and b/tests/data/benchmarks_results/text_classification/best_model.npz differ
diff --git a/tests/data/benchmarks_results/text_classification/log b/tests/data/benchmarks_results/text_classification/log
index e16dea67..f3d48d60 100644
--- a/tests/data/benchmarks_results/text_classification/log
+++ b/tests/data/benchmarks_results/text_classification/log
@@ -1,38 +1,38 @@
 [
     {
-        "main/loss": 0.6962488293647766,
-        "main/accuracy": 0.4375,
-        "validation/main/loss": 0.6995988488197327,
-        "validation/main/accuracy": 0.46666666865348816,
+        "main/loss": 0.7636308670043945,
+        "main/accuracy": 0.46875,
+        "validation/main/loss": 0.7174736261367798,
+        "validation/main/accuracy": 0.5333333611488342,
         "epoch": 1,
         "iteration": 1,
-        "elapsed_time": 0.0065841870091389865
+        "elapsed_time": 0.00541385097312741
     },
     {
-        "main/loss": 0.6889790296554565,
-        "main/accuracy": 0.53125,
-        "validation/main/loss": 0.6992061138153076,
-        "validation/main/accuracy": 0.46666666865348816,
+        "main/loss": 0.7468587160110474,
+        "main/accuracy": 0.484375,
+        "validation/main/loss": 0.716312050819397,
+        "validation/main/accuracy": 0.5333333611488342,
         "epoch": 2,
         "iteration": 2,
-        "elapsed_time": 0.01622019399655983
+        "elapsed_time": 0.012854741973569617
     },
     {
-        "main/loss": 0.6849422454833984,
-        "main/accuracy": 0.5909090638160706,
-        "validation/main/loss": 0.6990599632263184,
-        "validation/main/accuracy": 0.46666666865348816,
+        "main/loss": 0.7709426283836365,
+        "main/accuracy": 0.4545454680919647,
+        "validation/main/loss": 0.7152009010314941,
+        "validation/main/accuracy": 0.5333333611488342,
         "epoch": 3,
         "iteration": 3,
-        "elapsed_time": 0.022680485009914264
+        "elapsed_time": 0.020706564973806962
     },
     {
-        "main/loss": 0.6885284185409546,
-        "main/accuracy": 0.5581395626068115,
-        "validation/main/loss": 0.6989443302154541,
-        "validation/main/accuracy": 0.46666666865348816,
+        "main/loss": 0.7399059534072876,
+        "main/accuracy": 0.5348837375640869,
+        "validation/main/loss": 0.7141532897949219,
+        "validation/main/accuracy": 0.5333333611488342,
         "epoch": 4,
         "iteration": 4,
-        "elapsed_time": 0.02903877801145427
+        "elapsed_time": 0.026553130999673158
     }
 ]
\ No newline at end of file
diff --git a/tests/test_benchmarks.py b/tests/test_benchmarks.py
index ed3856f9..a5f2d4af 100644
--- a/tests/test_benchmarks.py
+++ b/tests/test_benchmarks.py
@@ -67,15 +67,15 @@ def test_text_classification(self):
         tc = Text_classification(model='cnn')
         result = tc.get_result(embs, path_text_classification_dataset,
-                               "./tests/data/benchmarks_results/text_classification/")
+                               "/tmp/tests/data/benchmarks_results/text_classification/")
         print(result)

         tc = Text_classification(model='rnn')
         result = tc.get_result(embs, path_text_classification_dataset,
-                               "./tests/data/benchmarks_results/text_classification/")
+                               "/tmp/tests/data/benchmarks_results/text_classification/")
         print(result)

         tc = Text_classification(model='bow')
         result = tc.get_result(embs, path_text_classification_dataset,
-                               "./tests/data/benchmarks_results/text_classification/")
+                               "/tmp/tests/data/benchmarks_results/text_classification/")
         print(result)

diff --git a/vecto/benchmarks/analogy/visualize.py b/vecto/benchmarks/analogy/visualize.py
index 35de8a96..ae5cf56d 100644
--- a/vecto/benchmarks/analogy/visualize.py
+++ b/vecto/benchmarks/analogy/visualize.py
@@ -51,7 +51,7 @@ def run_results(path_embeds=["tests/data/embeddings/text/plain_with_file_header"
         analogy = method()
         results = analogy.run(embs, path_analogy_dataset)
         print(results)
-        save_json(results, os.path.join("tests/data/benchmarks_results/analogy/", datetime.datetime.now().isoformat()))
+        save_json(results, os.path.join("/tmp/tests/data/benchmarks_results/analogy/", datetime.datetime.now().isoformat()))



diff --git a/vecto/benchmarks/similarity/visualize.py b/vecto/benchmarks/similarity/visualize.py
index dfd9dcd8..090761b2 100644
--- a/vecto/benchmarks/similarity/visualize.py
+++ b/vecto/benchmarks/similarity/visualize.py
@@ -50,7 +50,7 @@ def run_results(path_embeds=["tests/data/embeddings/text/plain_with_file_header"
         similarity = Similarity()
         results = similarity.run(embs, path_analogy_dataset)
         print(results)
-        save_json(results, os.path.join("tests/data/benchmarks_results/similarity/", datetime.datetime.now().isoformat()))
+        save_json(results, os.path.join("/tmp/tests/data/benchmarks_results/similarity/", datetime.datetime.now().isoformat()))



diff --git a/vecto/benchmarks/text_classification/text_classification.py b/vecto/benchmarks/text_classification/text_classification.py
index babbd04e..cac24f5e 100755
--- a/vecto/benchmarks/text_classification/text_classification.py
+++ b/vecto/benchmarks/text_classification/text_classification.py
@@ -49,19 +49,21 @@ def load_model(model_path, wv):

     return model, vocab, setup

+
 def predict(model, sentence):
     model, vocab, setup = model
     sentence = sentence.strip()
     text = nlp_utils.normalize_text(sentence)
     words = nlp_utils.split_text(text, char_based=setup['char_based'])
     xs = nlp_utils.transform_to_array([words], vocab, with_label=False)
-    xs = nlp_utils.convert_seq(xs, device=-1, with_label=False) # todo use GPU
+    xs = nlp_utils.convert_seq(xs, device=-1, with_label=False)  # todo use GPU
     with chainer.using_config('train', False), chainer.no_backprop_mode():
         prob = model.predict(xs, softmax=True)[0]
     answer = int(model.xp.argmax(prob))
     score = float(prob[answer])
     return answer, score

+
 def get_vectors(model, sentences):
     model, vocab, setup = model
     vectors = []
@@ -70,7 +72,7 @@ def get_vectors(model, sentences):
         text = nlp_utils.normalize_text(sentence)
         words = nlp_utils.split_text(text, char_based=setup['char_based'])
         xs = nlp_utils.transform_to_array([words], vocab, with_label=False)
-        xs = nlp_utils.convert_seq(xs, device=-1, with_label=False) # todo use GPU
+        xs = nlp_utils.convert_seq(xs, device=-1, with_label=False)  # todo use GPU
         with chainer.using_config('train', False), chainer.no_backprop_mode():
             vector = model.encoder(xs)
         vectors.append(vector.data[0])
@@ -78,8 +80,6 @@ def get_vectors(model, sentences):
     return vectors


-
-
 class Text_classification(Benchmark):
     def __init__(self, batchsize=64, epoch=5, gpu=-1, layer=1, dropout=0,
                  model=['cnn', 'rnn', 'bow'][1],
@@ -97,21 +97,23 @@ def get_result(self, embs, path_dataset, path_output='/tmp/text_classification/'

         self.out = path_output
         self.unit = embs.matrix.shape[1]
+        if not os.path.isdir(path_output):
+            os.makedirs(path_output)

         # Load a dataset
         self.dataset = path_dataset
         if self.dataset == 'dbpedia':
             train, test, vocab = text_datasets.get_dbpedia(
-                char_based=self.char_based, vocab=embs.vocabulary.dic_words_ids,)
+                char_based=self.char_based, vocab=embs.vocabulary.dic_words_ids, )
         elif self.dataset.startswith('imdb.'):
             train, test, vocab = text_datasets.get_imdb(
                 fine_grained=self.dataset.endswith('.fine'),
-                char_based=self.char_based, vocab=embs.vocabulary.dic_words_ids,)
+                char_based=self.char_based, vocab=embs.vocabulary.dic_words_ids, )
         elif self.dataset in ['TREC', 'stsa.binary', 'stsa.fine',
                               'custrev', 'mpqa', 'rt-polarity', 'subj']:
             train, test, vocab = text_datasets.get_other_text_dataset(
-                self.dataset, char_based=self.char_based, vocab=embs.vocabulary.dic_words_ids,)
-        else: # finallly, if file is not downloadable, load from local path
+                self.dataset, char_based=self.char_based, vocab=embs.vocabulary.dic_words_ids, )
+        else:  # finally, if file is not downloadable, load from local path
             train, test, vocab = text_datasets.get_dataset_from_path(path_dataset,
                                                                      vocab=embs.vocabulary.dic_words_ids,
                                                                      char_based=self.char_based)
@@ -194,4 +196,4 @@ def get_result(self, embs, path_dataset, path_output='/tmp/text_classification/'
         result['experiment_setup'] = experiment_setup
         result['log'] = load_json(os.path.join(self.out, 'log'))
         result['result'] = result['log'][-1]['validation/main/accuracy']
-        return result
\ No newline at end of file
+        return result
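
Note on the two lines added to get_result(): they create the output directory before the Chainer training run writes its artifacts (log, vocab.json, best_model.npz) into it, which is what allows the tests and visualize scripts above to point at a fresh /tmp path that may not exist yet. A minimal sketch of the same pattern, using only the Python standard library; the demo path is taken from the tests above, and the exist_ok variant is an alternative idiom, not what this patch uses:

    import os

    path_output = "/tmp/tests/data/benchmarks_results/text_classification/"

    # Pattern introduced in get_result(): create the directory on first use.
    if not os.path.isdir(path_output):
        os.makedirs(path_output)

    # Equivalent single call that also tolerates concurrent creation (Python >= 3.2):
    os.makedirs(path_output, exist_ok=True)

Redirecting the test and visualize outputs to /tmp keeps regenerated results out of the repository tree, so the fixtures under tests/data/benchmarks_results/ change only when refreshed deliberately, as in the first three hunks of this patch.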