Skip to content

Commit

Permalink
Fix bugs, add support for using paper-style algo
Browse files Browse the repository at this point in the history
Closes GH-8.

Co-authored-by: Hopper262 <hopper@whpress.com>
  • Loading branch information
wooorm and Hopper262 committed Jun 20, 2023
1 parent 870cb4b commit 78b0fe9
Show file tree
Hide file tree
Showing 4 changed files with 122 additions and 24 deletions.
79 changes: 61 additions & 18 deletions index.js
Original file line number Diff line number Diff line change
@@ -1,18 +1,47 @@
const stop = -1
const intact = 0
const cont = 1
const protect = 2
const vowels = /[aeiouy]/

/**
* @typedef {Object} RuleSet
* @property {string} match
* @property {string} replacement
* @property {number} type
*
* @typedef {Record<string, Array<RuleSet>>} RuleCollection
*
* @typedef {'c' | 'paper'} Style
* Style of algorithm.
*
* There are small algorithmic differences between how the algorithm was
* implemented over the years.
* Looking at [Algorithm Implementations][algos] on the archived website,
* there are four styles available, in addition to the original paper.
*
* The only difference currently implemented in this package is whether a
* final `s` is kept before stopping (`paper`) or dropped before stopping
* (`c`).
*
* ###### Values
*
* * `'c'`
* — rules from the ANSI C (Stark, 1994) and Perl (Taffet, 2001)
* implementations (`compensation` -> `compen`)
* * `'paper'`
* — rules from the original paper (1990), and Pascal (Paice/Husk) and
* Java (O’Neill, 2000) implementations (`compensation` -> `compens`)
*
* @typedef Options
* Configuration.
* @property {Style | null | undefined} [style='c']
* Style of algorithm (default: `'c'`).
*/

/** @type {Record<string, Array<RuleSet>>} */
const rules = {
const stop = -1
const intact = 0
const cont = 1
const protect = 2
const contint = 3
const vowels = /[aeiouy]/

/** @type {RuleCollection} */
const rulesPaper = {
a: [
{match: 'ia', replacement: '', type: intact},
{match: 'a', replacement: '', type: intact}
Expand Down Expand Up @@ -101,7 +130,6 @@ const rules = {
{match: 'er', replacement: '', type: cont},
{match: 'ear', replacement: '', type: protect},
{match: 'ar', replacement: '', type: stop},
{match: 'ior', replacement: '', type: cont},
{match: 'or', replacement: '', type: cont},
{match: 'ur', replacement: '', type: cont},
{match: 'rr', replacement: 'r', type: stop},
Expand All @@ -116,8 +144,10 @@ const rules = {
{match: 'ss', replacement: '', type: protect},
{match: 'ous', replacement: '', type: cont},
{match: 'us', replacement: '', type: intact},
{match: 's', replacement: '', type: cont},
{match: 's', replacement: '', type: stop}
{match: 's', replacement: '', type: contint},
// Note: the following rule is mutated (by its index in this list) for the C
// set; be careful when touching or reordering it.
{match: 's', replacement: '', type: protect}
],
t: [
{match: 'plicat', replacement: 'ply', type: stop},
Expand Down Expand Up @@ -169,25 +199,38 @@ const rules = {
]
}

/**
 * Rules for the `c` style: a deep copy of `rulesPaper` (made with a JSON
 * round trip, so the two collections share no rule objects) in which the
 * rule at index 8 of the `s` list has its type changed to `stop`.
 *
 * NOTE(review): the index is positional — presumably it targets the final
 * `s` rule of `rulesPaper`; confirm whenever the `s` rules are reordered.
 *
 * @type {RuleCollection}
 */
const rulesC = JSON.parse(JSON.stringify(rulesPaper))
rulesC.s[8].type = stop

/**
* Get the stem from a given value.
*
* @param {string} value
* Value to stem.
* @param {Options | null | undefined} [options]
* Configuration.
* @returns {string}
* Stem for `value`.
*/
export function lancasterStemmer(value) {
return applyRules(String(value).toLowerCase(), true)
export function lancasterStemmer(value, options) {
const settings = options || {}
const style = settings.style

return applyRules(
String(value).toLowerCase(),
true,
style === 'paper' ? rulesPaper : rulesC
)
}

/**
* @param {string} value
* @param {boolean} isIntact
* @param {RuleCollection} rules
* @returns {string}
*/
function applyRules(value, isIntact) {
/** @type {Array<RuleSet>} */
function applyRules(value, isIntact, rules) {
const ruleset = rules[value.charAt(value.length - 1)]
let index = -1

Expand All @@ -198,7 +241,7 @@ function applyRules(value, isIntact) {
while (++index < ruleset.length) {
const rule = ruleset[index]

if (!isIntact && rule.type === intact) {
if (!isIntact && (rule.type === intact || rule.type === contint)) {
continue
}

Expand All @@ -218,8 +261,8 @@ function applyRules(value, isIntact) {
continue
}

if (rule.type === cont) {
return applyRules(next, false)
if (rule.type === cont || rule.type === contint) {
return applyRules(next, false, rules)
}

return next
Expand Down
51 changes: 46 additions & 5 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,9 @@
* [Install](#install)
* [Use](#use)
* [API](#api)
* [`lancasterStemmer(value)`](#lancasterstemmervalue)
* [`lancasterStemmer(value, options?)`](#lancasterstemmervalue-options)
* [`Options`](#options)
* [`Style`](#style)
* [CLI](#cli)
* [Types](#types)
* [Compatibility](#compatibility)
Expand Down Expand Up @@ -78,19 +80,51 @@ lancasterStemmer('analytic') === lancasterStemmer('AnAlYtIc') // => true
This package exports the identifier [`lancasterStemmer`][api-lancasterstemmer].
There is no default export.

### `lancasterStemmer(value)`
### `lancasterStemmer(value, options?)`

Get the stem from a given value.

###### Parameters

* `value` (`string`, required)
— value to stem
* `options` ([`Options`][api-options], optional)
— configuration

###### Returns

Stem for `value` (`string`).

### `Options`

Configuration (TypeScript type).

###### Fields

* `style` ([`Style`][api-style], default: `'c'`)
— style of algorithm

### `Style`

Style of algorithm (TypeScript type).

There are small algorithmic differences between how the algorithm was
implemented over the years.
Looking at [Algorithm Implementations][algos] on the archived website,
there are four styles available, in addition to the original paper.

The only difference currently implemented in this package is whether a final
`s` is kept before stopping (`paper`) or dropped before stopping (`c`).

###### Values

* `'c'`
— rules from the ANSI C (Stark, 1994) and Perl (Taffet, 2001)
implementations (`compensation` -> `compen`)
* `'paper'`
— rules from the original paper (1990), and Pascal (Paice/Husk) and
Java (O’Neill, 2000) implementations (`compensation` -> `compens`)

## CLI

```txt
Expand All @@ -117,7 +151,8 @@ detest vil
## Types

This package is fully typed with [TypeScript][].
It exports no additional types.
It exports the additional types [`Options`][api-options] and
[`Style`][api-style].

## Compatibility

Expand Down Expand Up @@ -185,6 +220,12 @@ This package is safe.

[author]: https://wooorm.com

[source]: https://web.archive.org/web/20150215002618/http://www.comp.lancs.ac.uk:80/computing/research/stemming/index.htm
[source]: https://web.archive.org/web/20150215002618/http://www.comp.lancs.ac.uk/computing/research/stemming/index.htm

[algos]: https://web.archive.org/web/20060819173645/http://www.comp.lancs.ac.uk/computing/research/stemming/Links/implementations.htm

[api-lancasterstemmer]: #lancasterstemmervalue-options

[api-options]: #options

[api-lancasterstemmer]: #lancasterstemmervalue
[api-style]: #style
14 changes: 14 additions & 0 deletions test.js
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,20 @@ test('api', function () {
assert.ok(!m('showbiz').endsWith('iz'), 'should drop iz$')

assert.ok(m('agryze').endsWith('ys'), 'should transform yz$ into ys')

// `style: 'paper'` keeps the final `s` before stopping.
assert.equal(
m('compensation', {style: 'paper'}),
'compens',
"should support `style: 'paper'`"
)

// `style: 'c'` drops the final `s` before stopping.
assert.equal(
m('compensation', {style: 'c'}),
'compen',
"should support `style: 'c'`"
)

// Without options, the result matches `style: 'c'` (the documented default).
assert.equal(m('compensation'), 'compen', "should default to `style: 'c'`")
})

test('cli', async function () {
Expand Down
2 changes: 1 addition & 1 deletion tsconfig.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@
"target": "es2020"
},
"include": ["**/*.js"],
"exclude": ["coverage/", "lib/", "node_modules/", "index.js"]
"exclude": ["coverage/", "lib/", "node_modules/"]
}

0 comments on commit 78b0fe9

Please sign in to comment.