-
Notifications
You must be signed in to change notification settings - Fork 1
/
rank.py
165 lines (153 loc) · 7.63 KB
/
rank.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
from util.stats import *
import os, pickle, time
from itertools import combinations
# Pre: "indexable" is a list type, "value" is an indexable whose
# elements have comparison operators, "key" is a function to use
# for sorting
# Post: A binary search over the indexable is performed and "value" is
# inserted in a sorted order left of anything with equal key,
# "key" defaults to sorting by direct element value comparison.
def insert_sorted(indexable, value, key=lambda i:i):
low = 0
high = len(indexable)
index = (low + high) // 2
while high != low:
if key(indexable[index]) >= key(value): high = index
elif key(indexable[index]) < key(value): low = index + 1
index = (low + high) // 2
indexable.insert(index, value)
# Pre: "numbers" is a python list of numbers
# Post: All of the values in "numbers" multiplied together
def product(numbers):
value = 1
for n in numbers:
value *= n
return value
# Pre: "indexable" is an object that can be index accessed
# "indexable" must be in sorted order
# "value" is a value that can be compared with values in "indexable"
# Post: The number of elements in "indexable" that are greater than "value"
def count_greater(indexable, value):
low = 0
high = len(indexable)
index = (low + high) // 2
while high != low:
if indexable[index] > value:
high = index
elif indexable[index] <= value:
low = index + 1
else:
string = "High: %i\tLow: %i\tIndex: %i\n"%(high,low,index)
string += "Indexable:\n%s"%str(indexable)
raise(Exception("Encountered invalid number in "+
"comparison.\n\n"+string))
index = (low + high) // 2
return len(indexable) - index
# Pre: "interest" is an iterable of length > 0
# "other_data" is a list of iterables each of length > 0
# "rank" is a 0-indexed position in an ordering, 0'th being best
# "order" is 'True' if order matters for data set values.
# Post: The probability that if random single elements are selected
# from each of the sets in "other_data" and "interest", that the
# value selected from "interest" will be in position "rank" from
# the smallest value
def rank_probability(interest, other_data, rank=0, order=False):
# Check basic assumptions that should hold at the start of this function.
assert(len(interest) > 0)
assert(len(other_data) > 0)
assert(sum(map(len,other_data)) > 0)
# Convert negative ranks into their positive counterparts.
while (rank < 0): rank += len(other_data) + 1
# Prepare all the data in tuple format (less memory) make sure
# that all lists in other data are sorted for improved runtime
interest = sorted(interest)
other_data = [sorted(d) for d in other_data]
# If order matters, cycle through the data once computing
# likelihood and exit.
if order:
length = len(interest)
if not all(l == length for l in map(len,other_data)):
raise(Exception("All provided value series must have same length."))
# Initialize holder for rank occurrence.
rank_count = 0
for group in zip(interest, *other_data):
interest_value = group[0]
order = sorted(group)
rank_count += int(order.index(interest_value) == rank)
# return the probability that "interest" takes "rank" among peers.
return rank_count / length
# If order does *not* matter, then use combinatorics to determine
# the probability that any value in "interest" is the lowest of
# all values.
# Initialize list for holding all probabilities
probabilities = []
# Calculate the total different selections possible
total = product(map(len,other_data))
# Cycle trough and calculate a probability that each value in
# "interest" has exactly "rank" greater than it
for value in interest:
# The probability that value from one list is better than current value
greater = [count_greater(d, value) for d in other_data]
# Handle the special case of rank 0 (no fancy operations necessary).
if rank == 0:
num_outcomes = product(greater)
# Handle the special case of rank -1 (no fancy operations necessary).
elif rank == len(other_data):
num_outcomes = product([(len(d) - g) for (d,g) in zip(other_data,greater)])
# Handle the general case, where we mix lessers and greaters.
else:
# Initialize the number greater to 0
num_outcomes = 0
# Get (in terms of indices) all combinations of the
# other data sets that could be less than value
for s in combinations(range(len(other_data)), rank):
# Get the substitued "lesser" lengths of the selected indices
lesser = [len(other_data[index]) - greater[index] for index in s]
# Temporarily remove those elements from the list of lengths
temp = [greater.pop(index) for index in s[::-1]][::-1]
# Count up the number of possible unique sets
num_outcomes += product(lesser + greater)
# Reinsert the temporarily popped values back
for i,index in enumerate(s):
greater.insert(index,temp[i])
# Append the probability for this value to the list
probabilities.append( num_outcomes / total )
# Return the average of all probabilities (assuming each value is
# equally likely to be selected from interest set)
return sum(probabilities) / len(probabilities)
# Pre: "interest" is an iterable of length > 0
# "other_data" is a list of iterables each of length > 0
# ** The minimum values at each index in "interest" and the
# sub-sequences of "other_data" MUST NOT BE 0
# Post: The ratio defined in More and Wild in "Benchmarking
# Derivative-Free Optimization Algorithms" (2009) as a
# subsequent to performance profiles and data profiles.
def performance_ratio(interest, other_data):
absolute_mins = [min(o) for o in zip(*(other_data+[interest]))]
return [ i / m for i,m in zip(interest, absolute_mins) ]
# Pre: "performance" is a list of performance ratios (numbers) where
# smaller values denote better performances
# "to_beat" is the performance ratio that we want to measure
# against and guage percentage passed
# Post: The percentage of performances that were better than "to_beat"
def performance_profile(performance, to_beat):
return sum(p <= to_beat for p in performance) / len(performance)
# Pre: "performance" is a 2D python list of floats of performances
# [iterations [trials]], where lower is better.
# "best" is a 1D python list of floats of the absolute minimum
# value that could be seen in trials for a given iteration
# "to_beat" is a list of percentages that represent the maximum
# distance from convergence allowed after n iterations for an
# algorithm to be considered "passing"
# Post: The data profile for the performances given is returned in
# terms of a dictionary as defined below
# {max tolerance: [percentage passed per iteration]}
def data_profile(performances, best, to_beat=[10**(-i) for i in (1,3,5,7)]):
profile = {t:[] for t in to_beat}
for trials,abs_min in zip(performances,best):
for t in to_beat:
passed = sum( performances[0][i] - trials[i] >=
(1 - t)*(performances[0][i] - abs_min)
for i in range(len(trials)) ) / len(trials)
profile[t].append( passed )
return profile