Commit

improve the test rate
libofang committed Jun 18, 2018
1 parent b31b832 commit 0a89488
Showing 4 changed files with 85 additions and 114 deletions.
29 changes: 0 additions & 29 deletions vecto/benchmarks/analogy/__main__.py
@@ -1,31 +1,2 @@
import sys
import yaml
from .analogy import Analogy


def main():
if len(sys.argv) > 1:
path_config = sys.argv[1]
else:
print("usage: python3 -m vecto.benchmarks.analogy <config file>")
print("config file example can be found at ")
print("https://github.com/undertherain/vsmlib/blob/master/vsmlib/benchmarks/analogy/config_analogy.yaml")
return


with open(path_config, 'r') as ymlfile:
cfg = yaml.load(ymlfile)
options = {}
options["name_method"] = cfg["method"]
options["exclude"] = cfg["exclude"]
options["path_dataset"] = cfg["path_dataset"]
options["path_results"] = cfg["path_results"]
options["normalize"] = cfg["normalize"]
options["path_vectors"] = cfg["path_vectors"]

analogy = Analogy()
analogy.run()


if __name__ == "__main__":
main()
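Note: if this CLI entry point is ever restored, the config parsing could use yaml.safe_load, which only constructs plain Python objects and is generally preferred for config files. A minimal sketch, keeping the structure and option keys of the removed main(); as in the removed code, the options dict is built but not wired into Analogy, since that API is not shown here:

import sys
import yaml
from .analogy import Analogy


def main():
    if len(sys.argv) < 2:
        print("usage: python3 -m vecto.benchmarks.analogy <config file>")
        return
    with open(sys.argv[1], 'r') as ymlfile:
        cfg = yaml.safe_load(ymlfile)  # safe_load never builds arbitrary Python objects
    # same keys the removed code read from the YAML config
    options = {"name_method": cfg["method"],
               "exclude": cfg["exclude"],
               "path_dataset": cfg["path_dataset"],
               "path_results": cfg["path_results"],
               "normalize": cfg["normalize"],
               "path_vectors": cfg["path_vectors"]}
    # as in the removed code, options is not passed to Analogy here;
    # how it should reach the benchmark is left to Analogy's actual API
    analogy = Analogy()
    analogy.run()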
4 changes: 4 additions & 0 deletions vecto/benchmarks/similarity/similarity.py
@@ -12,6 +12,7 @@
BENCHMARK = 'benchmark'
METADATA_EXT = '.json'
PLAINTEXT_EXT = '.txt'
CSV_EXT = '.csv'
OTHER_EXT = 'None'


@@ -80,6 +81,9 @@ def read_single_dataset(self, path_to_dir, file_name):
elif file_extension == PLAINTEXT_EXT:
data = self.read_test_set(os.path.join(path_to_dir, file_name))
return BENCHMARK, dataset_name, data
elif file_extension == CSV_EXT:
data = self.read_test_set(os.path.join(path_to_dir, file_name))
return BENCHMARK, dataset_name, data
else:
return OTHER_EXT, None, None

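Since the new CSV branch is identical to the plaintext branch, the two extensions could be folded into one condition. A possible consolidation (a sketch, not the repository's actual code, using only names that appear in the hunk above):

elif file_extension in (PLAINTEXT_EXT, CSV_EXT):
    data = self.read_test_set(os.path.join(path_to_dir, file_name))
    return BENCHMARK, dataset_name, data
else:
    return OTHER_EXT, None, None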
48 changes: 24 additions & 24 deletions vecto/benchmarks/text_classification/nlp_utils.py
@@ -18,30 +18,30 @@ def normalize_text(text):
return text.strip().lower()


def make_vocab(dataset, max_vocab_size=20000, min_freq=2):
counts = collections.defaultdict(int)
for tokens, _ in dataset:
for token in tokens:
counts[token] += 1

vocab = {'<eos>': 0, '<unk>': 1}
for w, c in sorted(counts.items(), key=lambda x: (-x[1], x[0])):
if len(vocab) >= max_vocab_size or c < min_freq:
break
vocab[w] = len(vocab)
return vocab


def read_vocab_list(path, max_vocab_size=20000):
vocab = {'<eos>': 0, '<unk>': 1}
with io.open(path, encoding='utf-8', errors='ignore') as f:
for l in f:
w = l.strip()
if w not in vocab and w:
vocab[w] = len(vocab)
if len(vocab) >= max_vocab_size:
break
return vocab
# def make_vocab(dataset, max_vocab_size=20000, min_freq=2):
# counts = collections.defaultdict(int)
# for tokens, _ in dataset:
# for token in tokens:
# counts[token] += 1
#
# vocab = {'<eos>': 0, '<unk>': 1}
# for w, c in sorted(counts.items(), key=lambda x: (-x[1], x[0])):
# if len(vocab) >= max_vocab_size or c < min_freq:
# break
# vocab[w] = len(vocab)
# return vocab


# def read_vocab_list(path, max_vocab_size=20000):
# vocab = {'<eos>': 0, '<unk>': 1}
# with io.open(path, encoding='utf-8', errors='ignore') as f:
# for l in f:
# w = l.strip()
# if w not in vocab and w:
# vocab[w] = len(vocab)
# if len(vocab) >= max_vocab_size:
# break
# return vocab
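With make_vocab and read_vocab_list commented out above, a caller that still needs a frequency-based vocabulary has to build one itself. A minimal standalone sketch following the commented-out make_vocab logic (the name build_vocab is illustrative):

import collections


def build_vocab(dataset, max_vocab_size=20000, min_freq=2):
    # dataset is an iterable of (tokens, label) pairs, as in the commented-out make_vocab
    counts = collections.defaultdict(int)
    for tokens, _ in dataset:
        for token in tokens:
            counts[token] += 1
    vocab = {'<eos>': 0, '<unk>': 1}
    # most frequent first, ties broken alphabetically, as in the original
    for word, count in sorted(counts.items(), key=lambda x: (-x[1], x[0])):
        if len(vocab) >= max_vocab_size or count < min_freq:
            break
        vocab[word] = len(vocab)
    return vocab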


def make_array(tokens, vocab, add_eos=True):
118 changes: 57 additions & 61 deletions vecto/benchmarks/text_classification/text_datasets.py
@@ -10,7 +10,6 @@

import chainer

from vecto.benchmarks.text_classification.nlp_utils import make_vocab
from vecto.benchmarks.text_classification.nlp_utils import normalize_text
from vecto.benchmarks.text_classification.nlp_utils import split_text
from vecto.benchmarks.text_classification.nlp_utils import transform_to_array
@@ -55,40 +54,40 @@
# return train, test, vocab


def download_imdb():
path = chainer.dataset.cached_download(URL_IMDB)
tf = tarfile.open(path, 'r')
# To read many files fast, tarfile is untared
path = tempfile.mkdtemp()
tf.extractall(path)
return path


def read_imdb(path, split,
shrink=1, fine_grained=False, char_based=False):
fg_label_dict = {'1': 0, '2': 0, '3': 1, '4': 1,
'7': 2, '8': 2, '9': 3, '10': 3}

def read_and_label(posneg, label):
dataset = []
target = os.path.join(path, 'aclImdb', split, posneg, '*')
for i, f_path in enumerate(glob.glob(target)):
if i % shrink != 0:
continue
with io.open(f_path, encoding='utf-8', errors='ignore') as f:
text = f.read().strip()
tokens = split_text(normalize_text(text), char_based)
if fine_grained:
# extract from f_path. e.g. /pos/200_8.txt -> 8
label = fg_label_dict[f_path.split('_')[-1][:-4]]
dataset.append((tokens, label))
else:
dataset.append((tokens, label))
return dataset

pos_dataset = read_and_label('pos', 0)
neg_dataset = read_and_label('neg', 1)
return pos_dataset + neg_dataset
# def download_imdb():
# path = chainer.dataset.cached_download(URL_IMDB)
# tf = tarfile.open(path, 'r')
# # To read many files fast, tarfile is untared
# path = tempfile.mkdtemp()
# tf.extractall(path)
# return path


# def read_imdb(path, split,
# shrink=1, fine_grained=False, char_based=False):
# fg_label_dict = {'1': 0, '2': 0, '3': 1, '4': 1,
# '7': 2, '8': 2, '9': 3, '10': 3}
#
# def read_and_label(posneg, label):
# dataset = []
# target = os.path.join(path, 'aclImdb', split, posneg, '*')
# for i, f_path in enumerate(glob.glob(target)):
# if i % shrink != 0:
# continue
# with io.open(f_path, encoding='utf-8', errors='ignore') as f:
# text = f.read().strip()
# tokens = split_text(normalize_text(text), char_based)
# if fine_grained:
# # extract from f_path. e.g. /pos/200_8.txt -> 8
# label = fg_label_dict[f_path.split('_')[-1][:-4]]
# dataset.append((tokens, label))
# else:
# dataset.append((tokens, label))
# return dataset
#
# pos_dataset = read_and_label('pos', 0)
# neg_dataset = read_and_label('neg', 1)
# return pos_dataset + neg_dataset


def get_dataset_from_path(path_dataset, vocab=None, shrink=1,
@@ -100,38 +99,35 @@ def get_dataset_from_path(path_dataset, vocab=None, shrink=1,
shrink=shrink,
char_based=char_based)

if vocab is None:
print('constract vocabulary based on frequency')
vocab = make_vocab(train)

train = transform_to_array(train, vocab)
test = transform_to_array(test, vocab)

return train, test, vocab
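With the make_vocab import and the vocab-is-None fallback removed, get_dataset_from_path appears to rely on the caller supplying a vocabulary. A usage sketch under that assumption (the path and my_vocab are illustrative):

# my_vocab: a {token: id} dict, e.g. loaded from disk or built with a helper
# like the build_vocab sketch shown under nlp_utils.py above
train, test, vocab = get_dataset_from_path('/path/to/dataset',
                                           vocab=my_vocab,
                                           shrink=1)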


def get_imdb(vocab=None, shrink=1, fine_grained=False,
char_based=False):
tmp_path = download_imdb()

print('read imdb')
train = read_imdb(tmp_path, 'train',
shrink=shrink, fine_grained=fine_grained,
char_based=char_based)
test = read_imdb(tmp_path, 'test',
shrink=shrink, fine_grained=fine_grained,
char_based=char_based)

shutil.rmtree(tmp_path)

if vocab is None:
print('constract vocabulary based on frequency')
vocab = make_vocab(train)

train = transform_to_array(train, vocab)
test = transform_to_array(test, vocab)

return train, test, vocab
#
# def get_imdb(vocab=None, shrink=1, fine_grained=False,
# char_based=False):
# tmp_path = download_imdb()
#
# print('read imdb')
# train = read_imdb(tmp_path, 'train',
# shrink=shrink, fine_grained=fine_grained,
# char_based=char_based)
# test = read_imdb(tmp_path, 'test',
# shrink=shrink, fine_grained=fine_grained,
# char_based=char_based)
#
# shutil.rmtree(tmp_path)
#
# if vocab is None:
# print('constract vocabulary based on frequency')
# vocab = make_vocab(train)
#
# train = transform_to_array(train, vocab)
# test = transform_to_array(test, vocab)
#
# return train, test, vocab

#
# def download_other_dataset(name):
