Skip to content

Commit

Permalink
cosmetic refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
bakarov committed Jul 31, 2018
1 parent d0777af commit 563a378
Showing 1 changed file with 82 additions and 79 deletions.
161 changes: 82 additions & 79 deletions vecto/benchmarks/analogy/analogy.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ def __init__(self, normalize=True,
"reason": "missing words"
}

self.group_subcategory = False

@property
def method(self):
    """Name of the analogy-solving method: simply the concrete class name."""
    return self.__class__.__name__
Expand Down Expand Up @@ -206,7 +208,7 @@ def process_prediction(self, p_test_one, scores, score_reg, score_sim, p_train=[
result["landing_a_prime"] = (ans in all_a_prime)
return result

def run_category(self, pairs, name_category, name_subcategory):
def evaluate(self, pairs):
self.cnt_total_correct = 0
self.cnt_total_total = 0
details = []
Expand Down Expand Up @@ -248,12 +250,6 @@ def run_category(self, pairs, name_category, name_subcategory):
experiment_setup = dict()
experiment_setup["cnt_questions_correct"] = self.cnt_total_correct
experiment_setup["cnt_questions_total"] = self.cnt_total_total
experiment_setup["embeddings"] = self.embs.metadata
experiment_setup["category"] = name_category
experiment_setup["subcategory"] = name_subcategory
experiment_setup["task"] = "word_analogy"
experiment_setup["measurement"] = "accuracy"
experiment_setup["method"] = self.method
if not self.exclude:
experiment_setup["method"] += "_honest"
experiment_setup["timestamp"] = datetime.datetime.now().isoformat()
Expand All @@ -266,59 +262,71 @@ def run_category(self, pairs, name_category, name_subcategory):
# str_results = json.dumps(jsonify(out), indent=4, separators=(',', ': '), sort_keys=True)
return out

def get_pairs(self, fname):
def read_test_sample(self, row):
    """Parse one analogy test row into (question word, list of answer words).

    The row is lower-cased.  The question is separated from the answers by a
    tab when one is present, otherwise by whitespace.  The answer field may
    hold several acceptable words, delimited by '/' or, failing that, by ','.
    """
    field_sep = '\t' if '\t' in row else None  # None -> any whitespace
    fields = row.lower().split(field_sep)
    question = fields[0]
    answers = fields[1].strip()
    answer_sep = '/' if '/' in answers else ','
    return question, [token.strip() for token in answers.split(answer_sep)]

def read_test_set(self, path):
    """Read an analogy test file into a list of (word, answer words) pairs.

    Blank lines are skipped.  A malformed row (fewer than two fields, which
    makes read_test_sample raise IndexError) aborts the run with a diagnostic
    naming the file and line.
    """
    pairs = []
    with open(path) as file_in:
        # enumerate over the raw file so the reported line number matches the
        # actual position in the file (the old counter only advanced on
        # successfully parsed rows, so the diagnostic was off)
        for id_line, row in enumerate(file_in, start=1):
            if row.strip() == '':
                continue
            try:
                pairs.append(self.read_test_sample(row))
            except IndexError:
                print('Error reading pairs in file {} in line {} {}'.format(path, id_line, row))
                exit(-1)
    return pairs

def run(self, embs, path_dataset, group_subcategory):
self.embs = embs
def set_group_subcategory(self, group_subcategory):
    # Toggle aggregation of per-subcategory results into per-category totals
    # (consumed by run()); stored verbatim, no coercion to bool.
    self.group_subcategory = group_subcategory

def collect_metadata(self, name_category, name_subcategory):
    """Assemble the experiment-setup metadata dict for one test file."""
    return {
        'Embeddings': self.embs.metadata,
        'Category': name_category,
        'Subcategory': name_subcategory,
        'Task': 'word_analogy',
        'Measurement': 'accuracy',
        'Method': self.method,
    }

def run(self, embeddings, path_dataset):
    """Evaluate `embeddings` on every analogy test file under `path_dataset`.

    Walks the dataset directory, scores each non-json file with evaluate(),
    and pairs each score with its metadata.  When self.group_subcategory is
    set, per-category aggregates are appended after the per-file results.

    Raises Exception when path_dataset does not exist.
    """
    self.embs = embeddings
    if self.normalize:
        self.embs.normalize()
        self.embs.cache_normalized_copy()
    if not os.path.exists(path_dataset):
        raise Exception('Test dataset directory does not exist: {}'.format(path_dataset))
    results = []
    for root, _, file_names in os.walk(path_dataset):
        for filename in fnmatch.filter(sorted(file_names), '*'):
            if filename.endswith('json'):  # json files are metadata, not test sets
                continue
            logger.info('Processing {}'.format(filename))
            pairs = self.read_test_set(os.path.join(root, filename))
            result = dict()
            result['Evaluation_result'] = self.evaluate(pairs)
            # the category name is the parent directory of the test file's dir
            result['Setup'] = self.collect_metadata(os.path.basename(os.path.dirname(root)), filename)
            results.append(result)
    if self.group_subcategory:
        results.extend(self.group_subcategory_results(results))
    return results

def group_subcategory_results(self, results):
# group analogy results, based on the category
new_results = {}
for result in results:
cnt_correct = 0
Expand All @@ -344,20 +352,17 @@ def group_subcategory_results(self, results):
new_results[k]['experiment_setup']['cnt_questions_correct'] = cnt_correct
new_results[k]['experiment_setup']['cnt_questions_total'] = cnt_total
for k, v in new_results.items():
new_results[k]['result'] = new_results[k]['experiment_setup']['cnt_questions_correct'] * 1.0 / new_results[k]['experiment_setup']['cnt_questions_total']
new_results[k]['result'] = new_results[k]['experiment_setup']['cnt_questions_correct'] * 1.0 / \
new_results[k]['experiment_setup']['cnt_questions_total']
out = []
for k, v in new_results.items():
out.append(new_results[k])
return out

#def subsample_dims(self, newdim):
#self.embs.matrix = self.embs.matrix[:, 0:newdim]
#self.embs.name = re.sub("_d(\d+)", "_d{}".format(newdim), self.embs.name)

def get_result(self, embeddings, path_dataset, group_subcategory=False):
def get_result(self, embeddings, path_dataset):
    """Normalize `embeddings` when configured, then run the full benchmark.

    Returns the list of per-file result dicts produced by run().
    """
    if self.normalize:
        # NOTE(review): run() normalizes again via self.embs.normalize();
        # redundant but harmless — confirm before removing either call.
        embeddings.normalize()
    return self.run(embeddings, path_dataset)


Expand Down Expand Up @@ -444,39 +449,38 @@ def compute_scores(self, vec_a, vec_a_prime, vec_b):
return scores, predicted


# class SimilarToAny(PairWise):
# def compute_scores(self, vectors):
# scores = self.get_most_similar_fast(vectors)
# best = scores.max(axis=0)
# return best
#
#
# class SimilarToB(Analogy):
# def do_test_on_pairs(self, pairs_train, pairs_test):
# results = []
# for p_test in pairs_test:
# if self.is_pair_missing([p_test]):
# continue
# result = self.do_on_two_pairs(p_test)
# result["b in neighbourhood of b_prime"] = self.get_rank(p_test[0], p_test[1][0])
# result["b_prime in neighbourhood of b"] = self.get_rank(p_test[1], p_test[0])
# results.append(result)
# return results
#
# def do_on_two_pairs(self, pair_test):
# if self.is_pair_missing([pair_test]):
# result = self.result_miss
# else:
# vec_b = self.embs.get_vector(pair_test[0])
# vec_b_prime = self.embs.get_vector(pair_test[1][0])
# scores = self.get_most_similar_fast(vec_b)
# result = self.process_prediction(pair_test, scores, None, None)
# result["similarity to correct cosine"] = self.embs.cmp_vectors(vec_b, vec_b_prime)
# return result
class SimilarToAny(PairWise):
    """Scores each candidate by its single best similarity over all query vectors."""

    def compute_scores(self, vectors):
        # column-wise max: keep, for every candidate, its highest score
        return self.get_most_similar_fast(vectors).max(axis=0)


class ThreeCosAvg(Analogy):
class SimilarToB(Analogy):
    """Baseline that predicts from the neighbourhood of b alone (ignores training pairs)."""

    def do_test_on_pairs(self, pairs_train, pairs_test):
        # pairs_train is unused by this baseline; kept for interface parity
        results = []
        for pair in pairs_test:
            if self.is_pair_missing([pair]):
                continue
            entry = self.do_on_two_pairs(pair)
            entry["b in neighbourhood of b_prime"] = self.get_rank(pair[0], pair[1][0])
            # NOTE(review): asymmetric arguments — the whole answer list is
            # passed here vs a single word above; confirm get_rank accepts a list
            entry["b_prime in neighbourhood of b"] = self.get_rank(pair[1], pair[0])
            results.append(entry)
        return results

    def do_on_two_pairs(self, pair_test):
        if self.is_pair_missing([pair_test]):
            return self.result_miss
        vec_b = self.embs.get_vector(pair_test[0])
        vec_b_prime = self.embs.get_vector(pair_test[1][0])
        prediction = self.process_prediction(
            pair_test, self.get_most_similar_fast(vec_b), None, None)
        prediction["similarity to correct cosine"] = self.embs.cmp_vectors(vec_b, vec_b_prime)
        return prediction


class ThreeCosAvg(Analogy):
def do_test_on_pairs(self, p_train, p_test):
vecs_a = []
vecs_a_prime = []
Expand Down Expand Up @@ -514,7 +518,6 @@ def do_test_on_pairs(self, p_train, p_test):


class LRCos(Analogy):

def do_test_on_pairs(self, p_train, p_test):
results = []
X_train, Y_train = self.gen_vec_single(p_train)
Expand Down

0 comments on commit 563a378

Please sign in to comment.