/
analysis.py
586 lines (490 loc) · 20.2 KB
/
analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
import db
import imageGen
import json
import operator
import os
import time
from PIL import Image
from math import floor, exp
from numpy import *
from scipy.stats import t, truncnorm
mu0 = 255.0 / 2.0 # prior mean
l0 = 0.05 # confidence in prior mean
sig_sq0 = (256.0 / 4.0)**2 # prior variance
a0 = 0.05 # confidence in prior variance
DISPERSION_PARAMETER = 200.0
walk_in = 40
samples = 5
"""
Function: get_image_matrix
Gets the image representing a matrix from the image's image_id. Memoizes the
responses to not waste time
Parameters:
image_id - id of the image to get a matrix representation of
Returns:
Numpy.ndarray of the image
"""
image_matrices = {}
def get_image_matrix(image_id):
# make sure not an ObjectId
image_id = str(image_id)
# check if have already calculated the image matrix for it
if image_id in image_matrices:
return image_matrices[image_id]
# calculate the matrix for it, store it, and return it
else:
image_file = db.get_image_file(image_id)
image_matrices[image_id] = array(imageGen.loadImage(image_file))
return image_matrices[image_id]
"""
Function: images_in_group
Calculate and return the list of image ids in the gived group.
Parameters:
partition - the partition of the objects
group - the group who's images we need to return
Returns:
A list of images in the group.
"""
def images_in_group(partition, group):
group_members = []
for key, val in partition.iteritems():
if val == group:
group_members.append(key)
return group_members
"""
Function: groups_in_partition
Calculate the number of groups currently being used by the partition
Parameters:
partition - the partition to count groups in
Returns:
The number of groups in the partition
"""
def groups_in_partition(partition):
groups = []
for key, val in partition.iteritems():
if val not in groups:
groups.append(val)
return groups
"""
Function: log_prior
Calculate the log probability of a given partition of i+1 elements given the partition of i elements.
Parameters:
oldPartition - the old partitioning of the previous i elements
group - the group that the new object is being added to
Returns:
The probability of the new partition given the old
"""
def log_prior(oldPartition, group, dispersion):
i = len(oldPartition)
num_in_group = len(images_in_group(oldPartition, group)) + 1
denominator = i - 1.0 + dispersion
if num_in_group == 1:
# this is the only image in this group
return log(dispersion / denominator)
else:
return log(num_in_group / denominator)
"""
Function: discete_trunc_t_logpdf
Calulates a discrete truncated logpdf for a given set of values
Parameters:
x - array-like values to be compared
df - array-like the degrees of freedom
loc - array-like the center point of the t distribution
scale - array-like the scale of the t distribution
domain: list of the allowed values in the domain of the discrete distribution
Returns:
The log_prob for the x values
"""
def discrete_trunc_t_logpdf(x, df, domain, loc=0, scale=1):
# make sure is numpy array
x = array(x)
# get the indices of the values we are interested in
indices = empty(x.shape)
for index, val in ndenumerate(x):
indices[index] = domain.index(val)
# number of elements in domain
n = len(domain)
# the shape for each of the values in the domain
shape = (n,) + x.shape
# get the values from the unmodified t over the domain
all_log_prob = empty(shape)
for i in range(n):
all_log_prob[i] = t.logpdf(domain[i]*ones(x.shape), df, loc=loc, scale=scale)
# normalize these values
total_log_prob = logaddexp.reduce(all_log_prob, axis=0)
norm_log_prob = all_log_prob - total_log_prob
# get the values needed to return
log_prob = empty(x.shape)
for i, val in ndenumerate(x):
# first get the index of the domain that x is at
index = domain.index(val)
# now get the appropriate value for the log_prob of x
log_prob[i] = norm_log_prob[(index,)+i]
return log_prob
"""
Function: log_likelihood
Calculates the log_likelihood of observering the set of features of the current
object given that it is from the group group_num
Parameters:
current_partition - the current partitioning of the other images
group_num - the group that the current image would be assigned to
move - the move dict holding the information about the current move
"""
def log_likelihood(current_partition, group_num, image_id, prior_mean, mean_conf, prior_var, var_conf):
# the images grouped so far
images = images_in_group(current_partition, group_num)
# matrix for image
image_matrix = get_image_matrix(image_id)
# number of images
n = len(images)
if n > 0:
# get the shape for the group
image_shape = image_matrix.shape
shape = (n, image_shape[0], image_shape[0])
# get the matricies for all the images in the group
group = empty(shape)
for i in range(n):
group[i] = get_image_matrix(images[i])
# get mean, variance matrix
group_mean = mean(group, axis=0)
group_var = var(group, axis=0)
else:
group_mean = 0
group_var = 0
# calculate the parameters of the student_t distribution
l = mean_conf + n
a = var_conf + n
mu = (mean_conf * prior_mean + n * group_mean) / l
sig_sq = (var_conf * prior_var + (n - 1) * group_var +
mean_conf * n * (prior_mean - group_mean) ** 2 / l) / a
scale = sig_sq * (1 + 1 / l)
# calculate the log probability over each dimension
# from timing this takes about 0.09 sec
log_p = discrete_trunc_t_logpdf(image_matrix, a, range(256), loc=mu, scale=scale)
return sum(log_p)
"""
Function: move_probability
Calculates the probability of a given move
Parameters:
current_partition - representation of the current partition for the images. Is a dict where each key is an image ID, and each value is the group number that image is in
move - dict of the move being made. Same form as in db
current_partition - representation of the current partition fo the images. Is a dict where each key is an image ID, and each value is the group number that image is in
newGroup - the group that the image is actually being moved into, if any
image_id - the id of the image being added to the partition
Returns:
The probability of the move
"""
def move_probability(current_partition, image_id, newGroup=None, prior_mean=mu0, mean_conf=l0, prior_var=sig_sq0, var_conf=a0, dispersion=DISPERSION_PARAMETER):
moveProbabilities = {}
groups = groups_in_partition(current_partition)
if newGroup != None and newGroup not in groups:
# if the move moves the image into a new group, add that group to groups.
groups.append(newGroup)
else:
# otherwise, add the minimum val less than 8 to groups
for i in range(8):
if i not in groups:
groups.append(i)
break
# for each of the groups in groups, calculate the probability of that one being selected
for group in groups:
# the probability of a given move is the likelihood times the prior given the move that happened and all prior data
likelihood = log_likelihood(current_partition, group, image_id, prior_mean, mean_conf, prior_var, var_conf)
prior = log_prior(current_partition, group, dispersion)
moveProbabilities[group] = likelihood + prior
# normalize all the probabilities properly
total = logaddexp.reduce(moveProbabilities.values())
for key, val in moveProbabilities.iteritems():
moveProbabilities[key] = val - total
return moveProbabilities
"""
Function: compare_trial
Compares the moves in a trial to the moves that a particle filter would have made
Parameters:
trial_id - the ObjectId of the trial to analyze
move_probability - a function for calculating the probability of a given move. Takes arguments of current_partition and move, where current_partition is a dict of image_id to group number, and move is a dict for a move
Returns:
Move objects (as described in readme) augmented with move_probs, a dictionary of group-probability pairs, and partition, a dictionary of image_id-group pairs.
"""
def compare_trial(trial_id, move_probability):
# get the trial
trial = db.get_trial(trial_id)
# list of the images initially currently grouped
initial_images = [x for x in trial['init_state'] if x['group'] != -1]
# a dict of the currently partitioned objects. The key is the image_id, and
# the value is the group
current_partition = {}
for image in initial_images:
image_id = str(image['_id'])
current_partition[image_id] = image['group']
# iterate over each move in the trial
for move in trial['moves']:
# calculate the normalized log probability of each potential move according to the particle filter
probs = move_probability(current_partition, move['image_id'], move['new_group'])
# augment the move object
move['move_results'] = probs
move['partition'] = copy(current_partition)
# update the current partition
current_partition[move['image_id']] = move['new_group']
return trial
"""
Function: run_all_trials
Gathers all trial ids from the database and runs them all through compare_trial, saving the results out into a file.
This method now runs each child through compare_trial with find_params_for_move, rather than move_probability.
"""
def run_all_trials():
# runs all the trials and writes out all data into a json string saved to results.json
trials = db.get_all_trials()
trialsData = {}
for trial in trials:
trial = compare_trial(trial._id, find_params_for_move)
# find a results file that does not yet exist
base = "./results"
name = base
number = 0
while os.path.exists(name + ".json"):
name = base + str(number)
number += 1
f = open(name + ".json")
json.dump(trials, f)
f.close()
"""
Gibbs Sampling Functions
These functions sample the various parameters of our particle filter model for the gibbs sampler.
"""
def sample_mu(current_partition, image_id, group, params):
#print "Sampling mu..."
# new random mu
new_mu = truncnorm.rvs((0.0-params[0])/10.0, (255.0-params[0])/10.0, params[0], 10.0) # pretty random...
#print "original is " + str(params[0])
#print "new is " + str(new_mu)
# sample both probs
mu_log_prob = move_probability(current_partition, image_id, group, params[0], params[1], params[2], params[3], params[4])[group]
new_mu_log_prob = move_probability(current_partition, image_id, group, new_mu, params[1], params[2], params[3], params[4])[group]
diff = exp(new_mu_log_prob - mu_log_prob)
# choose one
prob = mu_log_prob
if diff >= 1.0 or random.uniform(0.0, 1.0) <= diff:
params[0] = new_mu
prob = new_mu_log_prob
return [params, prob]
def sample_mu_conf(current_partition, image_id, group, params):
#print "Sampling mu_conf..."
# new random mu_conf
new_mu_conf = truncnorm.rvs((0.0-params[1])/0.25, (1.0-params[1])/0.25, params[1], 0.25) # pretty random...
#print "original is " + str(params[1])
#print "new is " + str(new_mu_conf)
# sample both probs
mu_conf_log_prob = move_probability(current_partition, image_id, group, params[0], params[1], params[2], params[3], params[4])[group]
new_mu_conf_log_prob = move_probability(current_partition, image_id, group, params[0], new_mu_conf, params[2], params[3], params[4])[group]
diff = exp(new_mu_conf_log_prob - mu_conf_log_prob)
# choose one
prob = mu_conf_log_prob
if diff >= 1.0 or random.uniform(0.0, 1.0) <= diff:
params[1] = new_mu_conf
prob = new_mu_conf_log_prob
return [params, prob]
def sample_var(current_partition, image_id, group, params):
#print "Sampling variance..."
# new random var
new_var = truncnorm.rvs((0.0-params[2])/8.0, (256.0*256.0-params[2])/8.0, params[1], 8.0) # pretty random...
#print "original is " + str(params[2])
#print "new is " + str(new_var)
# sample both probs
var_log_prob = move_probability(current_partition, image_id, group, params[0], params[1], params[2], params[3], params[4])[group]
new_var_log_prob = move_probability(current_partition, image_id, group, params[0], params[1], new_var, params[3], params[4])[group]
diff = exp(new_var_log_prob - var_log_prob)
# choose one
prob = var_log_prob
if diff >= 1.0 or random.uniform(0.0, 1.0) <= diff:
params[2] = new_var
prob = new_var_log_prob
return [params, prob]
def sample_var_conf(current_partition, image_id, group, params):
#print "Sampling variance conf..."
# new random var_conf
new_var_conf = truncnorm.rvs((0.0-params[3])/0.25, (1.0-params[3])/0.25, params[3], 0.25) # pretty random...
#print "original is " + str(params[3])
#print "new is " + str(new_var_conf)
# sample both probs
var_conf_log_prob = move_probability(current_partition, image_id, group, params[0], params[1], params[2], params[3], params[4])[group]
new_var_conf_log_prob = move_probability(current_partition, image_id, group, params[0], params[1], params[2], new_var_conf, params[4])[group]
diff = exp(new_var_conf_log_prob - var_conf_log_prob)
# choose one
prob = var_conf_log_prob
if diff >= 1.0 or random.uniform(0.0, 1.0) <= diff:
params[3] = new_var_conf
prob = new_var_conf_log_prob
return params, prob
def sample_disp(current_partition, image_id, group, params):
#print "Sampling distribution..."
# new random disp
new_disp = truncnorm.rvs((0.0-params[4])/4.0, (1000.0-params[4])/4.0, params[4], 4.0) # pretty random...
#print "original is " + str(params[4])
#print "new is " + str(new_disp)
# sample both probs
disp_log_prob = move_probability(current_partition, image_id, group, params[0], params[1], params[2], params[3], params[4])[group]
new_disp_log_prob = move_probability(current_partition, image_id, group, params[0], params[1], params[2], params[3], new_disp)[group]
diff = exp(new_disp_log_prob - disp_log_prob)
# choose one
prob = disp_log_prob
if diff >= 1.0 or random.uniform(0.0, 1.0) <= diff:
params[4] = new_disp
prob = new_disp_log_prob
return [params, prob]
"""
Function: find_params_for_move
Runs gibbs over the variables in our particle filter, attempting to find the ideal parameters for a given human move.d
Variables to Sample:
mu - mean - >= 0, <= 255
mu_conf - confidence in prior mean -
var - variance - >= 0, <=
var_conf - confidence in prior variance -
dispersion - dispersion parameter to dirichlet - > 0
All moves are made via a random sample from a normal distribution
Parameters:
current_partition - the current partition of the images into groups
image_id - the id of the image that is being moved
group - the group that the image was moved into
Returns:
The mean of the samples taken
"""
def find_params_for_move(current_partition, image_id, group):
# first generate a random start point
mu = random.normal(255.0/2.0, 256.0/4.0)
mu_conf = random.uniform(0.0, 1.0)
sig = random.normal((255.0/4.0) ** 0.5, ((255.0/4.0) ** 0.5)/8.0) # pretty arbitrary normal dist params
var_conf = random.uniform(0.0, 1.0)
disp = random.normal(50.0, 25.0) # another pretty arbitrary distribution
params = [mu, mu_conf, sig, var_conf, disp]
sample_params = []
# iterate across the variables, testing each new suggestion in turn
print "Walking in...."
for i in range(walk_in):
# sample mu
params, prob = sample_mu(current_partition, image_id, group, params)
# sample mu_conf
params, prob = sample_mu_conf(current_partition, image_id, group, params)
# sample var
params, prob = sample_var(current_partition, image_id, group, params)
# sample var_conf
params, prob = sample_var_conf(current_partition, image_id, group, params)
# sample disp
params, prob = sample_disp(current_partition, image_id, group, params)
print "Sampling..."
for j in range(samples):
# sample mu
params, prob = sample_mu(current_partition, image_id, group, params)
# sample mu_conf
params, prob = sample_mu_conf(current_partition, image_id, group, params)
# sample var
params, prob = sample_var(current_partition, image_id, group, params)
# sample var_conf
params, prob = sample_var_conf(current_partition, image_id, group, params)
# sample disp
params, prob = sample_disp(current_partition, image_id, group, params)
print "New sample: " + str(params)
# save the sample we just generated
sample_params.append(params[:])
print "Move of image " + str(image_id) + " to group " + str(group) + " found params: " + str(params)
print "Resulting probability is " + str(move_probability(current_partition, image_id, group, params[0], params[1], params[2], params[3], params[4]))
print "mean is " + str(mean(array(sample_params), axis=0))
return mean(array(sample_params), axis=0)
"""
Function: randomize
Randomize the list by permuting in place. Knuth's algorithm
Parameters:
l - the list to randomize
"""
def randomize(l):
for i in range(len(l) - 1, -1, -1):
randIndex = int(floor(random.uniform(0, i+1)))
switchElem = l[randIndex]
otherElem = l[i]
l[randIndex] = otherElem
l[i] = switchElem
"""
Function: decide_group
Given an image and a partition, decides with group to add the image to.
Parameters:
partition - the exisiting partition of the data
image_id - the id of the image that is going to be added
Returns:
The group to add the image to.
"""
def decide_group(partition, image_id):
probabilities = move_probability(partition, image_id)
print 'Probabilities:', probabilities
sorted_probabilities = sorted(probabilities.iteritems(), key=operator.itemgetter(1))
maxVal = -inf
group = -1
for key, val in probabilities.iteritems():
if val > maxVal:
group = key
maxVal = val
return (probabilities, group)
"""
Function: sort_image_set
Returns a sorted set of image groups given sort_image_set. This is a sanity checker for the main job of analysis: analyzing all the sample human data we have gathered.
Parameters:
set_id - the id of the image set to sort
"""
def sort_image_set(set_id):
# get the image set
image_set = db.get_image_set(set_id)
# get everything from the images field
images = image_set['images']
# randomize images
randomize(images)
# for each image in the list, run it through the algoritm
partition = {}
probs = {}
for image in images:
image_id = str(image['image_id'])
probs, group = decide_group(partition, image_id)
print "Adding image to group " + str(group)
partition[image_id] = group
probs[image_id] = probs
return (probs, partition)
"""
Function: sort_and_print_set
Prints out the groupings of the images in the set given by the provided id
Parameters:
set_id - the id of the image set to sort
"""
def sort_and_print_set(set_id=None):
if set_id == None:
# choose a set
image_sets = db.get_all_image_sets()
set_id = image_sets[0]['_id']
print "Sorting image set with id " + str(set_id)
# partition it
(probs, partition) = sort_image_set(set_id)
print "Image set sorted"
# save the results to disk
base = "./sort"
name = base
number = 0
while os.path.exists(name) or os.path.exists(name + ".txt"):
name = base + str(number)
number += 1
# print the results
os.makedirs(name + "/")
f = open(name + ".txt", 'w')
f.write("SORT\n")
groups = groups_in_partition(partition)
for group in groups:
print "Group " + str(group) + ":"
f.write("Group " + str(group) + ":\n")
group_images = images_in_group(partition, group)
i = 0
for image in group_images:
fileName = name + "/" + str(group) + "-" + str(i) + ".png"
imageObj = Image.open(db.get_image_file(image))
imageObj.save(fileName, "PNG")
print "\t" + fileName
f.write("\t" + fileName + "\n")
i += 1
print "DONE"
f.write("DONE\n")
f.close()