Permalink
Browse files

Added step 1a

  • Loading branch information...
1 parent 7253bdc commit d0514be188fb6e906b5a83cc89dd4d8cb4df07be @weavejester committed Nov 16, 2011
Showing with 20 additions and 0 deletions.
  1. +8 −0 src/stemmer/snowball/english.clj
  2. +12 −0 src/stemmer/util.clj
@@ -1,7 +1,15 @@
(ns stemmer.snowball.english
"Implementation of the Snowball stemming algorithm for the English language.
See: http://snowball.tartarus.org/algorithms/english/stemmer.html"
+ (:use [stemmer.util :only (replace-longest)])
(:require [clojure.string :as str]))
(defn step-0
  "Step 0 of the stemmer: removes a trailing possessive marker from `word`.
  The suffix removed is one of `'s'`, `'s` or `'` when it sits at the very
  end of the word; a word without such a suffix is returned unchanged."
  [word]
  (let [possessive-suffix #"('s'|'s|')$"]
    (str/replace word possessive-suffix "")))
+
+(defn step-1a
+  "Step 1a of the Snowball English stemmer. Applies the rule whose suffix
+  match on `word` is longest:
+
+    sses     -> ss
+    ied, ies -> i when preceded by more than one letter, otherwise ie
+                (ties -> tie, cries -> cri)
+    s        -> removed when a vowel occurs earlier in the word and is not
+                the letter immediately before the s (gaps -> gap, gas -> gas)
+    us, ss   -> left alone
+
+  Returns the rewritten word, or `word` unchanged when no rule applies."
+  [word]
+  (replace-longest word
+    #"sses$" "ss"
+    ;; The suffix is three letters long, so \"preceded by more than one
+    ;; letter\" is the same as the whole word being longer than four.
+    #"ie[ds]$" (fn [_] (if (> (count word) 4) "i" "ie"))
+    #"s$" #(if (re-find #"[aeiouy].*..$" word) "" %)
+    ;; Anchored and non-capturing: only a *trailing* us/ss may outrank the
+    ;; plain `s` rule, and the match stays a plain string for length ranking.
+    #"(?:us|ss)$" identity))
View
@@ -0,0 +1,12 @@
+(ns stemmer.util
+ "General utility functions for creating stemmers."
+ (:require [clojure.string :as str]))
+
+(defn longest-match
+  "Returns the longest substring of `s` matched by the regex `re`, or nil
+  when `re` does not match anywhere in `s`.
+
+  Length is always measured on the whole match. When `re` contains capturing
+  groups, `re-seq` yields vectors of [whole-match group ...]; those are
+  reduced to the whole-match string first so that `count` measures characters
+  rather than the number of groups."
+  [re s]
+  (when-let [matches (seq (re-seq re s))]
+    (->> matches
+         (map #(if (vector? %) (first %) %))
+         (apply max-key count))))
+
+(defn replace-longest
+  "Applies one of several regex replacement rules to the string `s`.
+
+  `rules` is a flat sequence of regex/replacement pairs; each replacement is
+  anything `clojure.string/replace` accepts for a regex (a string or a
+  function of the match). The pair whose regex has the longest match in `s`
+  is chosen and applied via `str/replace`.
+
+  If the chosen regex does not match, `str/replace` leaves `s` untouched, so
+  a word that no rule matches is returned unchanged. With no rules at all,
+  `s` is returned as-is instead of failing on an empty `max-key` call."
+  [s & rules]
+  (if-let [pairs (seq (partition 2 rules))]
+    (let [match-length (fn [[re _]] (count (longest-match re s)))
+          [re replacement] (apply max-key match-length pairs)]
+      (str/replace s re replacement))
+    s))

0 comments on commit d0514be

Please sign in to comment.