-
Notifications
You must be signed in to change notification settings - Fork 0
/
trial_data.py
195 lines (157 loc) · 7.65 KB
/
trial_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
from sklearn.decomposition import PCA
from metric_learn import *
from sklearn.neighbors import BallTree, DistanceMetric
import random
import sys
from multiprocessing import Pool
class TrialData(object):
    """Cross-validated evaluation of a learned Mahalanobis metric on (A, B) pairs.

    Each trial PCA-reduces the paired feature vectors, fits a metric through
    the CVX helper functions, and scores nearest-neighbour retrieval with a
    ``BallTree`` built on that metric.

    Attributes are plain configuration values:

    * ``pca_components`` -- ``n_components`` handed to ``PCA``.
    * ``cross_validation_k`` -- number of folds.
    * ``correct_within_top_fraction`` -- a match counts as correct when the
      paired item is among this fraction of nearest neighbours.
    * ``match_a_to_b`` -- when true, the B element of each pair is the query
      and the A elements populate the search tree; when false the roles are
      swapped.  ``print_results`` labels the true case "matching A sections
      to B sections".
    """

    def __init__(self, pca_components=35, cross_validation_k=8,
                 correct_within_top_fraction=0.1, match_a_to_b=True):
        self.pca_components = pca_components
        self.cross_validation_k = cross_validation_k
        self.correct_within_top_fraction = correct_within_top_fraction
        self.match_a_to_b = match_a_to_b

    def split_data(self, all_pairs, i):
        """Return ``(train, test)`` for fold ``i`` (0-based) of the k-fold split.

        ``all_pairs`` must support slicing and ``+`` concatenation (e.g. a
        list); the test fold is the ``i``-th contiguous slice and the
        training set is everything else, in original order.
        """
        num_pairs = len(all_pairs)
        i = float(i)  # force true division under Python 2
        start_split = int(i / self.cross_validation_k * num_pairs)
        end_split = int((i + 1) / self.cross_validation_k * num_pairs)
        train = all_pairs[0:start_split] + all_pairs[end_split:]
        test = all_pairs[start_split:end_split]
        return train, test

    @staticmethod
    def _flatten_pairs(pairs):
        """Flatten ``[(a0, b0), (a1, b1), ...]`` into ``[a0, b0, a1, b1, ...]``."""
        return [item for pair in pairs for item in pair]

    @staticmethod
    def _regroup_pairs(flat):
        """Inverse of ``_flatten_pairs``: zip consecutive rows back into 2-tuples."""
        return [(flat[i], flat[i + 1]) for i in range(0, len(flat), 2)]

    @staticmethod
    def _learn_metric(pairs_pca):
        """Fit a Mahalanobis metric on PCA-space pairs.

        The optimisation is delegated to ``ys_from_pairs``,
        ``save_cvx_params``, ``run_cvx`` and ``load_cvx_result``, which are
        not defined in this file (presumably provided by the ``metric_learn``
        star import).  A random digit string is passed to them as ``file_id``.

        Returns ``(dist, M, file_id)`` where ``M`` is the loaded result and
        ``dist`` is the ``DistanceMetric`` built with ``VI=M``.
        """
        ys = ys_from_pairs(pairs_pca)
        file_id = str(random.random())[2:]
        save_cvx_params(ys, file_id)
        run_cvx(file_id)
        M = load_cvx_result(file_id)
        dist = DistanceMetric.get_metric('mahalanobis', VI=M)
        return dist, M, file_id

    @staticmethod
    def _rank_of(neighbours, i):
        """Return the position of ``i`` in ``neighbours``, or -1 if absent."""
        try:
            return neighbours.index(i)
        except ValueError:
            return -1

    def fit_pca(self, train_pairs, test_pairs):
        """Fit PCA on the training pairs and project both sets.

        Returns ``(train_pairs_pca, test_pairs_pca)``, each a list of
        2-tuples of projected rows in the same order as the input pairs.
        """
        train_pairs_flat = self._flatten_pairs(train_pairs)
        test_pairs_flat = self._flatten_pairs(test_pairs)
        pca = PCA(n_components=self.pca_components)
        pca.fit(train_pairs_flat)
        train_pairs_pca = self._regroup_pairs(pca.transform(train_pairs_flat))
        test_pairs_pca = self._regroup_pairs(pca.transform(test_pairs_flat))
        return train_pairs_pca, test_pairs_pca

    def get_full_metric(self, train_pairs):
        """Fit PCA and the metric on ``train_pairs`` with no held-out set.

        Returns ``(dist, M, pca)``: the ``DistanceMetric``, the matrix it was
        built from, and the fitted ``PCA`` object.
        """
        train_pairs_flat = self._flatten_pairs(train_pairs)
        pca = PCA(n_components=self.pca_components)
        pca.fit(train_pairs_flat)
        train_pairs_pca = self._regroup_pairs(pca.transform(train_pairs_flat))
        dist, M, _ = self._learn_metric(train_pairs_pca)
        return dist, M, pca

    def run_single_trial(self, train_pairs, test_pairs, train_tune_data, test_tune_data):
        """Run one train/test fold and score retrieval on both sets.

        Side effect: writes ``successful_tunes_<file_id>`` in the current
        directory, containing ``test_tune_data[i]`` for every test query whose
        paired item was the single nearest neighbour.  ``train_tune_data`` is
        accepted for signature symmetry but not used.

        Returns ``([[train_correct, n_train], [test_correct, n_test]],
        test_indices)`` where ``test_indices`` holds, per test query, the rank
        of its paired item among all test candidates.
        """
        print("Running PCA...")
        train_pairs_pca, test_pairs_pca = self.fit_pca(train_pairs, test_pairs)
        dist, _, file_id = self._learn_metric(train_pairs_pca)

        train_a = [pair[0] for pair in train_pairs_pca]
        train_b = [pair[1] for pair in train_pairs_pca]
        test_a = [pair[0] for pair in test_pairs_pca]
        test_b = [pair[1] for pair in test_pairs_pca]
        if self.match_a_to_b:
            # B sections are the queries; A sections are the candidates.
            train_given, train_to_match = train_b, train_a
            test_given, test_to_match = test_b, test_a
        else:
            train_given, train_to_match = train_a, train_b
            test_given, test_to_match = test_a, test_b

        print("Constructing BallTrees...")
        train_bt = BallTree(train_to_match, metric=dist)
        test_bt = BallTree(test_to_match, metric=dist)

        # NOTE(review): k truncates to 0 when a fold holds fewer than
        # 1 / correct_within_top_fraction items -- confirm BallTree.query
        # accepts that for the data sizes in use.
        train_top_fraction = int(len(train_given) * self.correct_within_top_fraction)
        test_top_fraction = int(len(test_given) * self.correct_within_top_fraction)

        print("Querying the BallTrees...")
        _, train_neighbours = train_bt.query(train_given, train_top_fraction)
        _, test_neighbours = test_bt.query(test_given, test_top_fraction)

        print("Looking at correctness of results...")
        # Query i and candidate i come from the same pair, so a hit is
        # "own index appears among the returned neighbour indices".
        train_correct = sum(int(i in row) for i, row in enumerate(train_neighbours))
        test_correct = sum(int(i in row) for i, row in enumerate(test_neighbours))

        print("Finding indices of correct matches...")
        _, test_ranking = test_bt.query(test_given, len(test_given))
        ranks = [self._rank_of(list(row), i) for i, row in enumerate(test_ranking)]

        # Walk the unfiltered ranks so position i still lines up with
        # test_tune_data[i] even if some query had no rank.
        with open("successful_tunes_{}".format(file_id), 'w') as successful_tunes_f:
            for i, rank in enumerate(ranks):
                if rank == 0:
                    successful_tunes_f.write(str(test_tune_data[i]) + '\n\n')

        test_indices = [rank for rank in ranks if rank != -1]
        return [[train_correct, len(train_given)],
                [test_correct, len(test_given)]], test_indices

    def print_results(self, results, indices, outfile=sys.stdout, indices_outfile=sys.stdout):
        """Write a human-readable summary to ``outfile`` and ``indices`` to ``indices_outfile``.

        ``results`` is ``[[train_correct, num_train], [test_correct, num_test]]``.
        """
        ((train_correct, num_train),
         (test_correct, num_test)) = results

        header = (
            "Ran {}-fold cross validation with {} PCA'd components, matching {} sections\n"
            "to {} sections, and checking whether the corresponding section was in the\n"
            "top {}% closest by the learned metric.\n"
        )
        outfile.write(header.format(
            self.cross_validation_k, self.pca_components,
            'A' if self.match_a_to_b else 'B', 'B' if self.match_a_to_b else 'A',
            self.correct_within_top_fraction * 100))

        test_summary = (
            "ON THE TEST SET:\n"
            "{} correct by learned metric, {} total\n"
        )
        outfile.write(test_summary.format(test_correct, num_test))

        train_summary = (
            "ON THE TRAINING SET:\n"
            "{} correct by learned metric, {} total\n"
        )
        outfile.write(train_summary.format(train_correct, num_train))
        outfile.write("\n")

        indices_outfile.write(str(indices))
        indices_outfile.write("\n")

    def run_trial(self, all_pairs, tune_data, outfile=sys.stdout, indices_outfile=sys.stdout):
        """Run every cross-validation fold in a process pool and report totals.

        Each fold is dispatched to the module-level ``single_trial`` with
        chunksize 1; per-fold counts are summed and the rank lists
        concatenated before being passed to ``print_results``.
        """
        pool = Pool()
        map_result = pool.map_async(
            single_trial,
            [(i, self, all_pairs, tune_data) for i in range(self.cross_validation_k)],
            1)
        pool.close()
        pool.join()
        results_list = map_result.get()

        results = [[0, 0], [0, 0]]
        all_indices = []
        for result, indices in results_list:
            all_indices.extend(indices)
            for j in range(2):
                for k in range(2):
                    results[j][k] += result[j][k]
        self.print_results(results, all_indices, outfile, indices_outfile)
def single_trial(args):
    """Run one cross-validation fold; worker entry point for ``Pool.map_async``.

    ``args`` is the 4-tuple ``(i, trial_data, all_pairs, tune_data)``: the
    0-based fold index, the object providing ``split_data`` and
    ``run_single_trial``, and the two parallel sequences to split.  A single
    tuple argument is used because pool map functions pass exactly one item
    per call.

    Returns whatever ``trial_data.run_single_trial`` returns for this fold.
    """
    i, trial_data, all_pairs, tune_data = args
    print("Running the {}th cross validation trial...".format(i + 1))
    # Both sequences are split with the same fold index so they stay aligned.
    train, test = trial_data.split_data(all_pairs, i)
    train_tune_data, test_tune_data = trial_data.split_data(tune_data, i)
    return trial_data.run_single_trial(train, test, train_tune_data, test_tune_data)
if __name__ == '__main__':
    # Script entry point: build the pair and tune data, then run the full
    # cross-validated trial, sending the summary and the rank list to two
    # separate files.  build_a_b_pairs_vector is not defined in this file
    # (presumably supplied by the metric_learn star import); the meaning of
    # its (2, 6) arguments is not visible here.
    ab_pairs, tunes = build_a_b_pairs_vector(2, 6)
    with open('trial_data.log_norhythm_btoa', 'w') as log_file, \
            open('trial_data_indices_norhythm_btoa', 'w') as rank_file:
        runner = TrialData(pca_components=35)
        runner.run_trial(ab_pairs, tunes, log_file, rank_file)