Permalink
Browse files

slight refactor of the porter stemmer

  • Loading branch information...
tj committed Jul 28, 2011
1 parent b13504d commit 86396d5376a3a202a58281bc29f6aefe6a73c56f
Showing with 35 additions and 21 deletions.
  1. +2 −2 benchmarks/index.js
  2. +1 −1 benchmarks/porter-stemmer.js
  3. +32 −18 lib/natural/porter_stemmer.js
View
@@ -35,8 +35,8 @@ suite = new uubench.Suite({
}
});
-require('./metaphone');
-require('./soundex');
+// require('./metaphone');
+// require('./soundex');
require('./porter-stemmer');
suite.run();
@@ -1,6 +1,6 @@
// single word
-
+console.log(stem('counting'));
suite.bench('stem() word', function(next){
stem('counting');
next();
@@ -25,21 +25,26 @@ var Stemmer = require('./stemmer');
// denote groups of consecutive consonants with a C and consecutive vowels
// with a V.
function categorizeGroups(token) {
- return token.replace(/[^aeiou]+/g, 'C').replace(/[aeiouy]+/g, 'V');
+ return token
+ .replace(/[^aeiou]+/g, 'C')
+ .replace(/[aeiouy]+/g, 'V');
}
// denote single consonants with a C and single vowels with a V
function categorizeChars(token) {
- return token.replace(/[^aeiou]/g, 'C').replace(/[aeiouy]/g, 'V');
+ return token
+ .replace(/[^aeiou]/g, 'C')
+ .replace(/[aeiouy]/g, 'V');
}
// calculate the "measure" M of a word. M is the number of VC sequences that
// remain after dropping an initial C, if present, and a trailing V, if present.
function measure(token) {
- if(!token)
- return -1;
-
- return categorizeGroups(token).replace(/^C/, '').replace(/V$/, '').length / 2;
+ if (!token) return -1;
+ return categorizeGroups(token)
+ .replace(/^C/, '')
+ .replace(/V$/, '')
+ .length / 2;
}
// determine if a token ends with a double consonant, e.g. "happ"
@@ -145,28 +150,37 @@ function step1c(token) {
// step 2 as defined for the porter stemmer algorithm.
function step2(token) {
- return replacePatterns(token, [['ational', 'ate'], ['tional', 'tion'], ['enci', 'ence'], ['anci', 'ance'],
- ['izer', 'ize'], ['abli', 'able'], ['alli', 'al'], ['entli', 'ent'], ['eli', 'e'],
- ['ousli', 'ous'], ['ization', 'ize'], ['ation', 'ate'], ['ator', 'ate'],['alism', 'al'],
- ['iveness', 'ive'], ['fulness', 'ful'], ['ousness', 'ous'], ['aliti', 'al'],
- ['iviti', 'ive'], ['biliti', 'ble']], 0);
+ return replacePatterns(token, step2.patterns, 0);
}
+step2.patterns =
+ [['ational', 'ate'], ['tional', 'tion'], ['enci', 'ence'], ['anci', 'ance'],
+ ['izer', 'ize'], ['abli', 'able'], ['alli', 'al'], ['entli', 'ent'], ['eli', 'e'],
+ ['ousli', 'ous'], ['ization', 'ize'], ['ation', 'ate'], ['ator', 'ate'],['alism', 'al'],
+ ['iveness', 'ive'], ['fulness', 'ful'], ['ousness', 'ous'], ['aliti', 'al'],
+ ['iviti', 'ive'], ['biliti', 'ble']];
+
// step 3 as defined for the porter stemmer algorithm.
function step3(token) {
- return replacePatterns(token, [['icate', 'ic'], ['ative', ''], ['alize', 'al'],
- ['iciti', 'ic'], ['ical', 'ic'], ['ful', ''], ['ness', '']], 0);
+ return replacePatterns(token, step3.patterns, 0);
}
+step3.patterns =
+ [['icate', 'ic'], ['ative', ''], ['alize', 'al'],
+ ['iciti', 'ic'], ['ical', 'ic'], ['ful', ''], ['ness', '']];
+
// step 4 as defined for the porter stemmer algorithm.
function step4(token) {
- return replacePatterns(token, [['al', ''], ['ance', ''], ['ence', ''], ['er', ''],
- ['ic', ''], ['able', ''], ['ible', ''], ['ant', ''],
- ['ement', ''], ['ment', ''], ['ent', ''], [/([st])ion/, '$1'], ['ou', ''], ['ism', ''],
- ['ate', ''], ['iti', ''], ['ous', ''], ['ive', ''],
- ['ize', '']], 1);
+ return replacePatterns(token, step4.patterns, 1);
}
+step4.patterns =
+ [['al', ''], ['ance', ''], ['ence', ''], ['er', ''],
+ ['ic', ''], ['able', ''], ['ible', ''], ['ant', ''],
+ ['ement', ''], ['ment', ''], ['ent', ''], [/([st])ion/, '$1'], ['ou', ''], ['ism', ''],
+ ['ate', ''], ['iti', ''], ['ous', ''], ['ive', ''],
+ ['ize', '']];
+
// step 5a as defined for the porter stemmer algorithm.
function step5a(token) {
var m = measure(token);

0 comments on commit 86396d5

Please sign in to comment.