From 1262af9f851e9db851f869c311c0726fa3b9484a Mon Sep 17 00:00:00 2001 From: Jonathan Feinberg Date: Wed, 2 Dec 2009 15:32:36 -0500 Subject: [PATCH] NGramIterator now takes optional StopWords to exclude n-grams containing stop words --- src/cue/lang/NGramIterator.java | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/src/cue/lang/NGramIterator.java b/src/cue/lang/NGramIterator.java index f349cd3..783a3d8 100644 --- a/src/cue/lang/NGramIterator.java +++ b/src/cue/lang/NGramIterator.java @@ -20,6 +20,8 @@ import java.util.Locale; import java.util.NoSuchElementException; +import cue.lang.stop.StopWords; + /** * Construct with a {@link String}, some integer n, and a {@link Locale}; * retrieve a sequence of {@link String}s, each of which has n words @@ -66,6 +68,7 @@ public class NGramIterator extends IterableText private final SentenceIterator sentenceIterator; private final LinkedList grams = new LinkedList(); private final int n; + private final StopWords stopWords; private String next; private Iterator currentWordIterator; @@ -76,9 +79,16 @@ public NGramIterator(final int n, final String text) } public NGramIterator(final int n, final String text, final Locale locale) + { + this(n, text, locale, null); + } + + public NGramIterator(final int n, final String text, final Locale locale, + final StopWords stopWords) { this.n = n; this.sentenceIterator = new SentenceIterator(text, locale); + this.stopWords = stopWords; loadNext(); } @@ -123,11 +133,11 @@ private void loadNext() .iterator(); for (int i = 0; currentWordIterator.hasNext() && i < n - 1; i++) { - grams.add(currentWordIterator.next()); + maybeAddWord(); } } // now grams has n-1 words in it and currentWordIterator hasNext - grams.add(currentWordIterator.next()); + maybeAddWord(); } final StringBuilder sb = new StringBuilder(); for (final String gram : grams) @@ -141,6 +151,19 @@ private void loadNext() next = sb.toString(); } + private void maybeAddWord() + { + final String nextWord = currentWordIterator.next(); + if (stopWords != null && stopWords.isStopWord(nextWord)) + { + grams.clear(); + } + else + { + grams.add(nextWord); + } + } + public static void main(final String[] args) { final String lyric = "This happened once before. I came to your door. No reply.";