Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Browse files

Fixed some of the test code and added comments

  • Loading branch information...
commit 8ee308d52778eb07d54cd856cda623df5097ed3c 1 parent 916370f
@MalcolmSlaney MalcolmSlaney authored
Showing with 38 additions and 25 deletions.
  1. +16 −7 CalculateLSHParameters.m
  2. +1 −1  doc/examples.html
  3. +21 −17 lsh.py
View
23 CalculateLSHParameters.m
@@ -381,6 +381,8 @@
results.exactCost = optimalCost;
results.wExactK = binK;
results.wExactL = binL;
+results.wCandidateCount = N*optimalL.*choose(floor(binK),r).* ...
+ binAnyProb.^(floor(binK)-r) .* binAnyProb2.^r;
fprintf('Exact Optimization:\n');
fprintf('\tFor %d points of data use: ', N);
@@ -396,23 +398,30 @@
% nnHitProbL1 = binNnProb(optimalBin)^desiredOptimalK;
% anyHitProbL1 = binAnyProb(optimalBin)^desiredOptimalK;
% From the definition of p_nn in Eq. (46)
-nnHitProbL1 = choose(desiredOptimalK, r)*binNnProb(optimalBin)^(desiredOptimalK-r)*...
+nnHitProbL1 = choose(desiredOptimalK, r) * ...
+ binNnProb(optimalBin)^(desiredOptimalK-r)*...
binNnProb2(optimalBin)^(r);
-anyHitProbL1 = choose(desiredOptimalK, r)*binAnyProb(optimalBin)^(desiredOptimalK-r)*...
+anyHitProbL1 = choose(desiredOptimalK, r) * ...
+ binAnyProb(optimalBin)^(desiredOptimalK-r)*...
binAnyProb2(optimalBin)^(r);
nnHitProb = 1 - (1-nnHitProbL1)^desiredOptimalL;
anyHitProb = 1 - (1-anyHitProbL1)^desiredOptimalL;
fprintf('Expected statistics for optimal solution:\n');
-fprintf('\tAssuming K=%d, L=%d, hammingR=%d\n', desiredOptimalK, desiredOptimalL, r);
+fprintf('\tAssuming K=%d, L=%d, hammingR=%d\n', desiredOptimalK, ...
+ desiredOptimalL, r);
fprintf('\tp_nn(w) is %g\n', binNnProb(optimalBin));
fprintf('\tp_any(w) is %g\n', binAnyProb(optimalBin));
fprintf('\tProbability of finding NN for L=1: %g\n', nnHitProbL1);
-fprintf('\tProbability of finding ANY for L=1: %g\n', anyHitProbL1);
-fprintf('\tProbability of finding NN for L=%d: %g\n', desiredOptimalL, nnHitProb);
-fprintf('\tProbability of finding ANY for L=%d: %g\n', desiredOptimalL, anyHitProb);
-fprintf('\tExpected number of hits per query: %g\n', anyHitProb*N);
+fprintf('\tProbability of finding ANY for L=1: %g\n', ...
+ anyHitProbL1);
+fprintf('\tProbability of finding NN for L=%d: %g\n', ...
+ desiredOptimalL, nnHitProb);
+fprintf('\tProbability of finding ANY for L=%d: %g\n', ...
+ desiredOptimalL, anyHitProb);
+fprintf('\tExpected number of hits per query: %g\n', ...
+ results.wCandidateCount(optimalBin));
%%
if debugPlot
View
2  doc/examples.html
@@ -70,7 +70,7 @@
<p>
Run this Python command to generate some data. This creates the testData005.data and testData005.nn
files, which contain the raw data and the nearest-neighbor data.
-<PythonCode> python2.6 lsh.py -d 5 -histogram
+<PythonCode> python2.6 lsh.py -d 5 -create
</PythonCode>
Now we can create the distance histograms.
<PythonCode> python2.6 lsh.py -d 5 -histogram
View
38 lsh.py
@@ -402,12 +402,14 @@ def FindXXObsolete(self, data):
reverse=True)
return [(self.FindID(i),c) for (i,c) in s]
- def Find(self, data, multiprobeR=0):
+ def Find(self, queryData, multiprobeR=0):
'''Find some data in all the LSH tables. Use Multiprobe, with
- the given radius, to search neighboring buckets.'''
+ the given radius, to search neighboring buckets. Return a list of
+ results. Each result is a tuple consisting of the candidate ID
+ and the number of times it was found in the index.'''
results = {}
for p in self.projections:
- ids = p.Find(data,multiprobeR)
+ ids = p.Find(queryData, multiprobeR)
# print "Got back these IDs from p.Find:", ids
for id in ids:
if id in results:
@@ -418,13 +420,14 @@ def Find(self, data, multiprobeR=0):
reverse=True)
return [(self.FindID(i),c) for (i,c) in s]
- def FindExact(self, data, GetData, multiprobeR=0):
+ def FindExact(self, queryData, GetData, multiprobeR=0):
'''Return a list of results sorted by their exact
distance from the query. GetData is a function that
- returns the original data given its key.'''
- s = self.Find(data, multiprobeR)
+ returns the original data given its key. This function returns
+ a list of results, each result has the candidate ID and distance.'''
+ s = self.Find(queryData, multiprobeR)
# print "Intermediate results are:", s
- d = map(lambda (id,count): (id,((GetData(id)-data)**2).sum(), \
+ d = map(lambda (id,count): (id,((GetData(id)-queryData)**2).sum(), \
count), s)
s = sorted(d, key=operator.itemgetter(1))
return [(self.FindID(i),d) for (i,d,c) in s]
@@ -453,11 +456,12 @@ def GetAllIndices(self):
return None
# Return the buckets (t1 and t2 hashes) associated with a data point
- def GetBuckets(data):
+ def GetBuckets(self, data):
b = []
for p in self.projections:
- h = p.CalculateHashes(data)
- b += h
+ ( t1, t2, bins, parray) = p.CalculateHashes2(data)
+ print "Bucket:", t1, t2, bins, parray
+ b += (t1, t2)
return b
#
@@ -486,7 +490,7 @@ def FindBuckets(self, data):
pi = 0
for p in self.projections:
prefix = self.DictionaryPrefix(pi)
- (t1,t2) = p.CalculateHashes(data)
+ ( t1, t2, bins, parray) = p.CalculateHashes2(data)
word = prefix + str(t1)
theWords += [word]
pi += 1
@@ -758,15 +762,15 @@ def ComputePnnPany(self, w, k, l, multiprobe=0):
sys.stdout.flush()
continue
startQueryTime = time.clock() # Measure CPU time
- matches = self.myIndex.FindExact(queryData, self.RetrieveData, multiprobe)
+ matches = self.myIndex.Find(queryData, multiprobe)
totalQueryTime += time.clock() - startQueryTime
for (m,c) in matches:
if nnKey == m: # See if NN was found!!!
cnn += c
cnnFull += 1
- if m != queryKey:
+ if m != queryKey: # Don't count the query
cany += c
- canyFull += len(matches)-1
+ canyFull += len(matches)-1 # Total candidates minus 1 for query
queryCount += 1
# Some debugging for k curve.. print individual results
# print "ComputePnnPany Debug:", w, k, l, len(matches), numPoints, cnn, cnnFull, cany, canyFull
@@ -1086,16 +1090,16 @@ def OutputAllProjections(myTestData, myTestIndex, filename):
myTestData.LoadNearestNeighbors(defaultFileName + '.nn')
# ComputePnnPanyCurve(myData, [.291032])
kList = [math.floor(math.sqrt(2)**k) for k in range(0,10)]
- kList = [1,2,3,4,5,6,8,10,12,14,16,18,20]
+ kList = [1,2,3,4,5,6,8,10,12,14,16,18,20,22,25,30,35,40]
myTestData.ComputeKCurve(kList, defaultW)
elif arg == '-ltest': # Calculate bucket probabilities as a function of l
random.seed(0)
myTestData = TestDataClass()
- myTestData.LoadData(defaultFileName + 'dat')
+ myTestData.LoadData(defaultFileName + '.dat')
myTestData.LoadNearestNeighbors(defaultFileName + '.nn')
# ComputePnnPanyCurve(myData, [.291032])
lList = [math.floor(math.sqrt(2)**k) for k in range(0,10)]
- lList = [1,2,3,4,5,6,10]
+ lList = [1,2,3,4,5,6,8,10,12,14,16,18,20,22,25,30]
myTestData.ComputeLCurve(lList, w=defaultW, k=10)
elif arg == '-timing':
# sys.argv.pop(0)
Please sign in to comment.
Something went wrong with that request. Please try again.