Browse files

Implemented weighted search

  • Loading branch information...
1 parent 893faad commit 1f1178717275bb8b6abe94182366157972a1777c Kurt Symanzik committed Aug 21, 2012
Showing with 79 additions and 16 deletions.
  1. +38 −13 lib/reds.js
  2. +41 −3 test/index.js
View
51 lib/reds.js
@@ -39,10 +39,10 @@ exports.Query = Query;
*/
var types = {
- intersect: 'sinter'
- , union: 'sunion'
- , and: 'sinter'
- , or: 'sunion'
+ intersect: 'zinterstore'
+ , union: 'zunionstore'
+ , and: 'zinterstore'
+ , or: 'zunionstore'
};
/**
@@ -117,6 +117,23 @@ exports.stripStopWords = function(words){
};
/**
+ * Returns an object mapping each word in an Array
+ * to the number of times it occurs in the Array.
+ *
+ * @param {Array} words
+ * @return {Object}
+ * @api private
+ */
+
+exports.countWords = function(words){
+ var obj = {};
+ for (var i = 0, len = words.length; i < len; ++i) {
+ obj[words[i]]? obj[words[i]] += 1: obj[words[i]] = 1;
+ }
+ return obj;
+};
+
+/**
* Return the given `words` mapped to the metaphone constant.
*
* Examples:
@@ -222,7 +239,15 @@ Query.prototype.end = function(fn){
, type = this._type;
if (!keys.length) return fn(null, []);
- db[type](keys, fn);
+ var tkey = key + 'tmpkey';
+ db.multi([
+ [type, tkey, keys.length, keys],
+ ['zrevrange', tkey, 0, -1],
+ ['zremrangebyrank', tkey, 0, -1],
+ ]).exec(function(err, ids) {
+ ids = ids[1];
+ fn(err, ids);
+ });
return this;
};
@@ -252,16 +277,16 @@ Search.prototype.index = function(str, id, fn){
var key = this.key
, db = this.client
, words = exports.stem(exports.stripStopWords(exports.words(str)))
+ , counts = exports.countWords(words)
, map = exports.metaphoneMap(words)
- , keys = Object.keys(map)
- , len = keys.length;
+ , keys = Object.keys(map);
- var multi = db.multi();
+ var dbCmds = [];
keys.forEach(function(word, i){
- multi.sadd(key + ':word:' + map[word], id);
- multi.sadd(key + ':object:' + id, map[word]);
+ dbCmds.push(['zadd', key + ':word:' + map[word], counts[word], id]);
+ dbCmds.push(['zadd', key + ':object:' + id, counts[word], map[word]]);
});
- multi.exec(fn || noop);
+ db.multi(dbCmds).exec(fn || noop);
@tj
Owner
tj added a line comment Aug 21, 2012

any specific reason for using the array instead?

@kbsymanz
kbsymanz added a line comment Aug 22, 2012

No great reason. Redis multi sends each command through to the server, where it is queued until exec. I thought that if the Redis DB was not local, then latency might come into play. Since there is no advantage to sending it piecemeal, sending it all as an array at the end eliminates that as a possible issue.

@tj
Owner
tj added a line comment Aug 23, 2012

Well, multi returns a new Multi which queues the other calls; they're not actually sent right away (I wrote that part of node_redis). I didn't even know you could pass an array, but arrays are a lot uglier IMO.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
return this;
};
@@ -277,11 +302,11 @@ Search.prototype.remove = function(id, fn){
fn = fn || noop;
var key = this.key
, db = this.client;
- db.smembers(key + ':object:' + id, function(err, constants){
+ db.zrevrangebyscore(key + ':object:' + id, '+inf', 0, function(err, constants){
if (err) return fn(err);
var multi = db.multi().del(key + ':object:' + id);
constants.forEach(function(c){
- multi.srem(key + ':word:' + c, id);
+ multi.zrem(key + ':word:' + c, id);
});
multi.exec(fn);
});
View
44 test/index.js
@@ -22,6 +22,15 @@ reds
.should.eql(['just', 'test']);
reds
+ .countWords(['foo', 'bar', 'baz', 'foo', 'jaz', 'foo', 'baz'])
+ .should.eql({
+ foo: 3
+ , bar: 1
+ , baz: 2
+ , jaz: 1
+ });
+
+reds
.metaphoneMap(['foo', 'bar', 'baz'])
.should.eql({
foo: 'F'
@@ -41,15 +50,17 @@ reds
.metaphoneKeys('foobar', ['foo', 'bar', 'baz'])
.should.eql(['foobar:word:F', 'foobar:word:BR', 'foobar:word:BS']);
-
db.flushdb(function(){
search
.index('Tobi wants 4 dollars', 0)
.index('Loki is a ferret', 2)
.index('Tobi is also a ferret', 3)
.index('Jane is a bitchy ferret', 4)
.index('Tobi is employed by LearnBoost', 5, test)
- .index('computing stuff', 6);
+ .index('computing stuff', 6)
+ .index('simple words do not mean simple ideas', 7)
+ .index('The dog spoke the words, much to our unbelief.', 8)
+ .index('puppy dog eagle puppy frog puppy dog simple', 9);
});
function test() {
@@ -159,6 +170,33 @@ function test() {
++pending;
search
+ .query('simple')
+ .end(function(err, ids){
+ if (err) throw err;
+ ids.should.have.length(2);
+ ids.should.include('7');
+ ids.should.include('9');
+ ids[0].should.eql('7');
+ ids[1].should.eql('9');
+ --pending || done();
+ });
+
+ ++pending;
+ search
+ .query('dog ideas')
+ .type('or')
+ .end(function(err, ids){
+ if (err) throw err;
+ ids.should.have.length(3);
+ ids.should.include('7');
+ ids.should.include('8');
+ ids.should.include('9');
+ ids[0].should.eql('9');
+ --pending || done();
+ });
+
+ ++pending;
+ search
.index('keyboard cat', 6, function(err){
if (err) throw err;
search.query('keyboard').end(function(err, ids){
@@ -189,4 +227,4 @@ function done() {
console.log(' tests completed in %dms', new Date - start);
console.log();
process.exit();
-}
+}

3 comments on commit 1f11787

@tj
Owner

it would be nice if the weights were arbitrary, and word-count just happened to be one of the solutions

@kbsymanz

What other weighting were you thinking of? Support for quoted phrases, where a matched phrase would weigh higher? Preference for certain source documents over others, say, if the documents also had associated tags that matched the search as well? What other ideas did you have?

@tj
Owner

mostly related to weighing the docs associated with the ids, which could be anything really but I guess you'd always be querying another db anyway so the sorting can be done there

Please sign in to comment.