Permalink
Browse files

Moving text mining classes over from akela

  • Loading branch information...
1 parent c005a86 commit a482ab202f5e1e572599f4353e07c78569c364b5 @xstevens xstevens committed Aug 3, 2011
Showing with 2,846 additions and 11 deletions.
  1. +1 −0 language-profiles/af
  2. +1 −0 language-profiles/ar
  3. +1 −0 language-profiles/bg
  4. +1 −0 language-profiles/bn
  5. +1 −0 language-profiles/cs
  6. +1 −0 language-profiles/da
  7. +1 −0 language-profiles/de
  8. +1 −0 language-profiles/el
  9. +1 −0 language-profiles/en
  10. +1 −0 language-profiles/es
  11. +1 −0 language-profiles/fa
  12. +1 −0 language-profiles/fi
  13. +1 −0 language-profiles/fr
  14. +1 −0 language-profiles/gu
  15. +1 −0 language-profiles/he
  16. +1 −0 language-profiles/hi
  17. +1 −0 language-profiles/hr
  18. +1 −0 language-profiles/hu
  19. +1 −0 language-profiles/id
  20. +1 −0 language-profiles/it
  21. +1 −0 language-profiles/ja
  22. +1 −0 language-profiles/kn
  23. +1 −0 language-profiles/ko
  24. +1 −0 language-profiles/mk
  25. +1 −0 language-profiles/ml
  26. +1 −0 language-profiles/mr
  27. +1 −0 language-profiles/ne
  28. +1 −0 language-profiles/nl
  29. +1 −0 language-profiles/no
  30. +1 −0 language-profiles/pa
  31. +1 −0 language-profiles/pl
  32. +1 −0 language-profiles/pt
  33. +1 −0 language-profiles/ro
  34. +1 −0 language-profiles/ru
  35. +1 −0 language-profiles/sk
  36. +1 −0 language-profiles/so
  37. +1 −0 language-profiles/sq
  38. +1 −0 language-profiles/sv
  39. +1 −0 language-profiles/sw
  40. +1 −0 language-profiles/ta
  41. +1 −0 language-profiles/te
  42. +1 −0 language-profiles/th
  43. +1 −0 language-profiles/tl
  44. +1 −0 language-profiles/tr
  45. +1 −0 language-profiles/uk
  46. +1 −0 language-profiles/ur
  47. +1 −0 language-profiles/vi
  48. +1 −0 language-profiles/zh-cn
  49. +1 −0 language-profiles/zh-tw
  50. BIN lib/akela-0.2-SNAPSHOT.jar
  51. BIN lib/jsonic-1.2.0.jar
  52. BIN lib/langdetect.jar
  53. BIN lib/opencloud-0.2.jar
  54. +44 −0 pom.xml
  55. +139 −0 src/main/java/com/mozilla/grouperfish/lucene/analysis/en/EnglishAnalyzer.java
  56. +118 −0 src/main/java/com/mozilla/grouperfish/lucene/analysis/en/NGramEnglishAnalyzer.java
  57. +594 −0 src/main/java/com/mozilla/grouperfish/lucene/analysis/en/ShingleAllStopFilter.java
  58. +64 −0 src/main/java/com/mozilla/grouperfish/mahout/clustering/display/kmeans/DisplayKMeansBase.java
  59. +156 −0 src/main/java/com/mozilla/grouperfish/mahout/clustering/display/kmeans/OriginalText.java
  60. +147 −0 src/main/java/com/mozilla/grouperfish/mahout/clustering/display/kmeans/WordCloud.java
  61. +140 −0 src/main/java/com/mozilla/grouperfish/mahout/clustering/display/lda/DisplayLDABase.java
  62. +229 −0 src/main/java/com/mozilla/grouperfish/mahout/clustering/display/lda/DisplayLDATopics.java
  63. +191 −0 src/main/java/com/mozilla/grouperfish/mahout/clustering/display/lda/OriginalText.java
  64. +102 −0 src/main/java/com/mozilla/grouperfish/pig/eval/ml/TFIDFVectorizer.java
  65. +102 −0 src/main/java/com/mozilla/grouperfish/pig/eval/ml/TFVectorizer.java
  66. +97 −0 src/main/java/com/mozilla/grouperfish/pig/eval/ml/Vectorizer.java
  67. +107 −0 src/main/java/com/mozilla/grouperfish/pig/eval/text/NGramTokenize.java
  68. +64 −0 src/main/java/com/mozilla/grouperfish/pig/eval/text/TermFrequency.java
  69. +104 −0 src/main/java/com/mozilla/grouperfish/pig/eval/text/Tokenize.java
  70. +106 −0 src/main/java/com/mozilla/grouperfish/pig/storage/DocumentVectorStorage.java
  71. +168 −0 src/main/java/com/mozilla/grouperfish/text/Dictionary.java
  72. +118 −0 src/main/java/com/mozilla/grouperfish/text/filter/LanguageFilter.java
  73. +7 −11 src/main/pig/generate_feature_index.pig
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View
@@ -0,0 +1 @@
+{"freq":{"D":3892,"E":3299,"F":3285,"G":3045,"A":6416,"B":4070,"C":6617,"L":3054,"M":5299,"N":2815,"O":2578,"H":2337,"I":4798,"J":1557,"K":2201,"U":1696,"T":4435,"W":1804,"V":1999,"P":5115,"S":7482,"R":3099,"Y":511,"X":864,"Z":322,"f":4104,"g":6365,"d":8971,"e":33474,"b":3763,"c":11043,"a":29207,"n":24208,"o":24326,"l":15315,"m":9717,"j":569,"k":3352,"h":9005,"i":26119,"w":2253,"v":3158,"u":10429,"t":20234,"s":16203,"r":23102,"q":406,"p":6165,"z":1050,"y":5016,"x":1132,"·":3643,"é":335,"α":340," l":537," m":1211," n":447," o":1533," h":426," i":705," k":705," d":1030," e":627," f":736," g":373," a":1340," b":551," c":1312," t":1282," v":337," p":1037," s":1367," r":515," J":1393," K":1837," H":1901," I":2769," N":1730," O":1347," L":1992," M":3624," B":2598," C":4248," A":3943," F":2209," G":2328," D":2552," E":1916," Y":379," X":512,"и":342,"о":346," S":4739," R":2101," P":3310,"а":357," W":1489," V":980," U":1053," T":2804," ·":608,"가가》":1806,"A ":1090,"F ":430,"Da":387,"Co":1100,"Ch":633,"FA":344,"G ":347,"De":463,"Di":385,"Ge":351,"I ":810,"Fr":387,"B ":341,"C ":1323,"BS":411,"Ar":322,"D ":656,"Ba":510,"An":388,"Al":410,"Br":354,"Ca":585,"E ":441,"Be":332,"Bo":326,"Le":342,"Li":387,"N ":341,"La":422,"Lo":321,"Me":417,"Mi":434,"O ":403,"Ma":994,"Mo":474,"Ne":370,"Na":336,"P ":715,"Gr":424,"Ha":385,"He":353,"II":481,"In":800,"Ja":347,"L ":466,"Jo":379,"M ":639,"Un":351,"Tr":351,"Th":790,"Te":335,"V ":541,"St":667,"TV":438,"Wi":435,"Pr":609,"S ":1197,"Pa":670,"R ":385,"Se":487,"Sc":370,"So":342,"U ":356,"Sa":396,"Re":588,"Ro":375,"T ":528,"b ":354,"a ":3271,"i ":1146,"ge":1157,"ga":443,"ff":332,"fi":424,"fo":504,"Int":339,"he":2310,"ha":1401,"gi":481,"gh":427,"gu":409,"gr":369,"g ":1575,"ea":1479,"ec":1049,"ed":1247,"de":1816,"di":1071,"do":684,"dr":338,"ew":384,"ev":385,"h ":1328,"fe":378,"eg":405,"ee":580,"el":1725,"ei":535,"ep":401,"eo":702,"en":3472,"em":983,"et":1358,"es":2573,"er":5897,"ca":1157,"e ":7980,"bo":372,"bl":393,"bi":443,"be":654,"da":619,"f 
":1427,"cu":340,"ct":1002,"co":1189,"ck":563,"ci":913,"ch":1939,"ce":1477,"c ":925,"ay":495,"ba":511,"d ":2871,"at":3499,"as":1398,"ar":3373,"av":444,"au":580,"al":3307,"ai":767,"ap":659,"am":1381,"an":5017,"ac":1106,"ad":861,"ab":469,"ag":721,"ae":329,"nt":2416,"ns":1211,"no":896,"nn":583,"of":1300,"oc":729,"od":611,"ob":397,"om":1422,"on":5415,"ol":1638,"og":510,"ot":772,"os":950,"ov":494,"ou":1250,"op":740,"oo":649,"or":3530,"r ":3525,"ow":507,"pe":982,"pa":741,"pl":372,"po":651,"ph":572,"pi":450,"lo":1098,"ll":1500,"lu":455,"lt":410,"ly":332,"o ":1739,"ma":1374,"mb":376,"me":1635,"mi":890,"mm":492,"mp":610,"mo":691,"mu":332,"p ":731,"na":1797,"nc":1140,"nd":2000,"ne":1878,"ng":2212,"ni":1786,"ke":469,"m ":2197,"km":481,"li":1968,"le":2422,"ld":489,"la":1786,"n ":6058,"ht":434,"hu":339,"hi":1040,"ho":848,"id":750,"ic":2901,"ia":1671,"ig":816,"if":357,"ie":1267,"k ":934,"ir":791,"is":2128,"it":2101,"iu":388,"iv":722,"il":1499,"im":625,"in":4140,"io":2942,"ip":462,"l ":3328,"y ":2799,"wa":428,"vi":761,"ve":1364,"va":525,"x ":549,"ui":350,"ul":813,"ue":443,"ur":1440,"us":1642,"ut":842,"um":757,"un":1114,"up":321,"ty":669,"tu":765,"tt":612,"ua":440,"uc":387,"w ":493,"to":1445,"ts":387,"tr":1222,"te":3099,"ti":3945,"th":1754,"ta":1709,"su":334,"ss":1114,"st":2539,"sp":332,"so":726,"sc":476,"se":1269,"sh":547,"si":1432,"u ":440,"sa":574,"rr":396,"rs":871,"rt":1127,"ru":469,"ry":772,"ro":2074,"rn":743,"rm":593,"rl":533,"rk":352,"ri":2784,"rg":488,"re":2795,"rd":807,"rc":621,"ra":2892,"t ":3426,"s ":5482,"pr":409,"ys":349,"丞丞 ":694,"丞一 ":1816,"》가":1345,"《가":2228,"丞丕 ":551,"Com":345,"アアア":475,"》가 ":1068,"一一一":5909,"一一丞":1736,"一一丕":1309,"丕一 ":1155,"Pro":348," ·가":327,"가·가":2886," 《":2439," 》":343," 〈":356,"あ":978," 가 ":130222,"》":2546,"《":2558,"〉":392,"〈":395,"ア":871,"丕丞 ":472,"一丞丞":570,"丕丕 ":348,"가가 
":783362,"一丞一":1754,"一丞丕":441,"가가·":2619,"一丕一":1043,"一丕丞":389,"丕丞一":339,"両":634,"丞":11346,"丐":1267,"丕":7734,"一":36326,"丞一":4590,"丞丕":1151,"丞丞":1515,"丕丞":1076,"丕丕":711,"丕一":2824,"丐一":586,"一丕":3304,"一丐":489,"一丞":5004,"一一":15752," 丞":3275," 丐":491," 丕":2250," 一":11320,"The":618,"丞一丕":423,"丞一丞":536,"丞一一":1649,"ああ":644," 一 ":651,"ber":375,"ce ":804,"al ":1627,"ant":323,"ang":422,"anc":360,"and":945,"ame":329,"all":348,"an ":1234,"ard":453,"ari":335,"art":327,"ar ":393,"ate":490,"ati":1694,"アア":629,"가》":1921,"丕一一":978,"丕一丞":350,"ity":425,"ist":505,"ive":498,"is ":521,"ion":2383,"가가가":1019221,"》 ":1160," Ge":348," Fr":383," Ha":384," He":345," Gr":419," Ja":344," In":792,"har":332," Jo":377," La":409," Le":335," Li":369," Ma":978," Mi":422," Me":403,"he ":1035," Ne":354," Na":322," Mo":461,"her":322," An":382," Al":401," Ba":502," Be":321," Br":346," Ca":565," Ch":622," Co":1074," Da":378," Di":376," De":454," Wi":415," Pa":649," Ro":366," Re":579," Pr":602," St":642," Th":749," Te":327," Tr":341," Sa":393," Sc":357," Se":468," So":330," Un":342,"ian":427," in":405,"ic ":729,"ia ":540," 丞丞":589," 丞一":1845," of":1120," 丞丕":471,"ich":356," km":461,"ica":619," an":470," 가가":780590,"ine":530,"ing":1131," 丕丞":431," co":491,"in ":695," 丕一":1167,"ill":401," de":521," 一丞":2175," 一丕":1192," 一一":6508," th":680,"est":422,"ess":440,"er ":2043,"es ":1105,"eri":458,"era":435,"et ":385,"ers":564,"ern":454,"en ":605,"ell":389,"enc":362,"ent":998,"el ":452,"ge ":450,"for":417," 《가":2128,"de ":544,"cti":343,"ch ":513,"che":357,"ed ":647,"ect":433,"·가가":3042,"der":419,"rea":453,"re ":598,"rch":365,"rd ":425,"rat":517,"ran":534,"ric":460,"ry ":656,"rt ":353," 가":911452,"se ":376,"st ":511,"ss ":355,"ste":469,"sti":465,"str":431,"te ":613,"가":2734665,"per":399,"ng ":1232,"nce":521,"ne ":683,"nal":636,"nd ":828,"가·":2913,"가 ":919426,"nte":685,"nt ":696,"ns ":350,"m가":495,"of ":1107,"or ":608,"ore":443,"on ":2671,"ona":592,"ons":366,"le ":899,"lan":381,"A가":441,"lli":322,"ll 
":408,"S가":359,"man":401,"丞丕一":331,"men":485,"C가":415,"A가 ":326,"丞丞一":486,"·가":3221,"《가가":2147,"ver":541,"ve ":360,"가》가":891,"us ":933,"um ":349,"ty ":554,"tra":420,"tor":430,"tin":357,"tio":1917,"tic":456,"th ":330,"ter":1180,"the":803,"一丞 ":2036,"一丕 ":1523,"一一 ":6123,"丞 ":3597,"丐 ":360,"丕 ":2805,"가가":1808522,"가》 ":998,"一 ":10399,"あああ":427},"n_words":[3186351,4098367,2871816],"name":"ko"}
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View

Large diffs are not rendered by default.

Oops, something went wrong.
View
Binary file not shown.
View
Binary file not shown.
View
Binary file not shown.
View
Binary file not shown.
View
44 pom.xml
@@ -155,6 +155,49 @@
</dependency>
<!-- End of logging configuration. -->
+ <dependency>
+ <groupId>cybozulabs</groupId>
+ <artifactId>langdetect</artifactId>
+ <version>05-09-2011</version>
+ <scope>system</scope>
+ <systemPath>${basedir}/lib/langdetect.jar</systemPath>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.mahout</groupId>
+ <artifactId>mahout-core</artifactId>
+ <version>0.5</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.mahout</groupId>
+ <artifactId>mahout-utils</artifactId>
+ <version>0.5</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.pig</groupId>
+ <artifactId>pig</artifactId>
+ <version>0.8.0-cdh3u0</version>
+ </dependency>
+
+ <!-- OpenCloud -->
+ <dependency>
+ <groupId>org.mcavallo</groupId>
+ <artifactId>opencloud</artifactId>
+ <version>0.2</version>
+ <scope>system</scope>
+ <systemPath>${basedir}/lib/opencloud-0.2.jar</systemPath>
+ </dependency>
+
+ <dependency>
+ <groupId>com.mozilla.metrics</groupId>
+ <artifactId>akela</artifactId>
+ <version>0.2-SNAPSHOT</version>
+ <scope>system</scope>
+ <systemPath>${basedir}/lib/akela-0.2-SNAPSHOT.jar</systemPath>
+ </dependency>
+
+
</dependencies>
<build>
@@ -189,6 +232,7 @@
<artifactId>maven-jar-plugin</artifactId>
<version>2.3.1</version>
<configuration>
+ <finalName>${project.name}-${project.version}</finalName>
<archive>
<manifest>
<addClasspath>true</addClasspath>
@@ -0,0 +1,139 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.mozilla.grouperfish.lucene.analysis.en;
+
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerFilter;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.PorterStemFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.en.EnglishPossessiveFilter;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
+
+/**
+ * {@link Analyzer} for English text with optional Porter stemming.
+ * <p>
+ * The filter chain itself is assembled in
+ * {@link #createComponents(String, Reader)}.
+ */
+public final class EnglishAnalyzer extends StopwordAnalyzerBase {
+
+    /**
+     * Holder for the default stop word set. Its static initializer only runs
+     * the first time the outer class reads {@code DEFAULT_STOP_SET}, so the
+     * set is loaded lazily.
+     */
+    private static class DefaultSetHolder {
+        static final Set<?> DEFAULT_STOP_SET = StandardAnalyzer.STOP_WORDS_SET;
+    }
+
+    /** Whether tokens are run through the {@link PorterStemFilter}. */
+    private final boolean stem;
+
+    /** Terms that are marked as keywords so the stemmer leaves them alone. */
+    private final Set<?> stemExclusionSet;
+
+    /**
+     * Returns the default stop word set, i.e.
+     * {@link StandardAnalyzer#STOP_WORDS_SET}.
+     *
+     * @return default stop words set.
+     */
+    public static Set<?> getDefaultStopSet() {
+        return DefaultSetHolder.DEFAULT_STOP_SET;
+    }
+
+    /**
+     * Builds a non-stemming analyzer with the default stop words:
+     * {@link #getDefaultStopSet}.
+     *
+     * @param matchVersion
+     *            lucene compatibility version
+     */
+    public EnglishAnalyzer(Version matchVersion) {
+        this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET, false);
+    }
+
+    /**
+     * Builds an analyzer with the default stop words:
+     * {@link #getDefaultStopSet}.
+     *
+     * @param matchVersion
+     *            lucene compatibility version
+     * @param stem
+     *            whether tokens should be stemmed
+     */
+    public EnglishAnalyzer(Version matchVersion, boolean stem) {
+        this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET, stem);
+    }
+
+    /**
+     * Builds an analyzer with the given stop words and an empty stem
+     * exclusion set.
+     *
+     * @param matchVersion
+     *            lucene compatibility version
+     * @param stopwords
+     *            a stopword set
+     * @param stem
+     *            whether tokens should be stemmed
+     */
+    public EnglishAnalyzer(Version matchVersion, Set<?> stopwords, boolean stem) {
+        this(matchVersion, stopwords, stem, CharArraySet.EMPTY_SET);
+    }
+
+    /**
+     * Builds an analyzer with the given stop words. When stemming is enabled
+     * and a non-empty stem exclusion set is provided, a
+     * {@link KeywordMarkerFilter} is added before stemming.
+     *
+     * @param matchVersion
+     *            lucene compatibility version
+     * @param stopwords
+     *            a stopword set
+     * @param stem
+     *            whether tokens should be stemmed
+     * @param stemExclusionSet
+     *            a set of terms not to be stemmed; an unmodifiable copy is
+     *            stored
+     */
+    public EnglishAnalyzer(Version matchVersion, Set<?> stopwords, boolean stem, Set<?> stemExclusionSet) {
+        super(matchVersion, stopwords);
+        this.stem = stem;
+        final CharArraySet exclusionCopy = CharArraySet.copy(matchVersion, stemExclusionSet);
+        this.stemExclusionSet = CharArraySet.unmodifiableSet(exclusionCopy);
+    }
+
+    /**
+     * Creates the
+     * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+     * that tokenize all the text in the provided {@link Reader}.
+     *
+     * @return components built from a {@link StandardTokenizer} filtered with
+     *         {@link StandardFilter}, {@link EnglishPossessiveFilter} (for
+     *         match versions on or after 3.1), {@link LowerCaseFilter} and
+     *         {@link StopFilter}; when stemming is enabled the chain
+     *         continues with {@link KeywordMarkerFilter} (only if a stem
+     *         exclusion set is provided), {@link PorterStemFilter} and a
+     *         second {@link StopFilter}.
+     */
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        final Tokenizer tokenizer = new StandardTokenizer(matchVersion, reader);
+        TokenStream chain = new StandardFilter(matchVersion, tokenizer);
+        // Before 3.1 StandardFilter already gives the classic behavior, so the
+        // possessive filter is only added for newer match versions.
+        if (matchVersion.onOrAfter(Version.LUCENE_31)) {
+            chain = new EnglishPossessiveFilter(chain);
+        }
+        chain = new StopFilter(matchVersion, new LowerCaseFilter(matchVersion, chain), stopwords);
+        if (!stem) {
+            return new TokenStreamComponents(tokenizer, chain);
+        }
+        return new TokenStreamComponents(tokenizer, appendStemming(chain));
+    }
+
+    /**
+     * Appends the stemming stage to the given chain: an optional
+     * {@link KeywordMarkerFilter}, the {@link PorterStemFilter}, and a second
+     * {@link StopFilter} that runs on the stemmed terms.
+     */
+    private TokenStream appendStemming(TokenStream chain) {
+        TokenStream stemmed = chain;
+        if (!stemExclusionSet.isEmpty()) {
+            stemmed = new KeywordMarkerFilter(stemmed, stemExclusionSet);
+        }
+        stemmed = new PorterStemFilter(stemmed);
+        return new StopFilter(matchVersion, stemmed, stopwords);
+    }
+}
@@ -0,0 +1,118 @@
+/*
+ * Copyright 2011 Mozilla Foundation
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.mozilla.grouperfish.lucene.analysis.en;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.Set;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.KeywordMarkerFilter;
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.PorterStemFilter;
+import org.apache.lucene.analysis.StopwordAnalyzerBase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.en.EnglishPossessiveFilter;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.standard.StandardFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.util.Version;
+
+import com.mozilla.grouperfish.text.Dictionary;
+
+/**
+ * English analyzer that feeds lower-cased standard tokens through a
+ * {@link ShingleAllStopFilter} to produce word n-grams, with optional Porter
+ * stemming afterwards.
+ * <p>
+ * The stop word set is not applied through a separate stop filter here; it is
+ * handed to the {@link ShingleAllStopFilter}, which decides how to use it.
+ */
+public class NGramEnglishAnalyzer extends StopwordAnalyzerBase {
+
+    /**
+     * Stop word file used by {@link #main(String[])} when no path is given on
+     * the command line.
+     */
+    private static final String DEFAULT_STOPWORDS_PATH = "file:///Users/xstevens/workspace/akela/stopwords-en.txt";
+
+    /** Terms that are marked as keywords so the stemmer leaves them alone. */
+    private final Set<?> stemExclusionSet;
+    /** Whether the resulting tokens are run through the {@link PorterStemFilter}. */
+    private final boolean stem;
+    /** Passed to {@link ShingleAllStopFilter#setOutputUnigrams(boolean)}. */
+    private final boolean outputUnigrams;
+    /** Minimum shingle size handed to the {@link ShingleAllStopFilter}. */
+    private final int minNGram;
+    /** Maximum shingle size handed to the {@link ShingleAllStopFilter}. */
+    private final int maxNGram;
+
+    /**
+     * Builds a non-stemming analyzer with
+     * {@link StandardAnalyzer#STOP_WORDS_SET} as stop words.
+     *
+     * @param version
+     *            lucene compatibility version
+     */
+    public NGramEnglishAnalyzer(Version version) {
+        this(version, StandardAnalyzer.STOP_WORDS_SET, false);
+    }
+
+    /**
+     * Builds an analyzer with {@link StandardAnalyzer#STOP_WORDS_SET} as stop
+     * words.
+     *
+     * @param version
+     *            lucene compatibility version
+     * @param stem
+     *            whether tokens should be stemmed
+     */
+    public NGramEnglishAnalyzer(Version version, boolean stem) {
+        this(version, StandardAnalyzer.STOP_WORDS_SET, stem);
+    }
+
+    /**
+     * Builds an analyzer with the given stop words that also outputs
+     * unigrams.
+     *
+     * @param version
+     *            lucene compatibility version
+     * @param stopwords
+     *            a stopword set
+     * @param stem
+     *            whether tokens should be stemmed
+     */
+    public NGramEnglishAnalyzer(Version version, Set<?> stopwords, boolean stem) {
+        this(version, stopwords, stem, true);
+    }
+
+    /**
+     * Builds an analyzer with the given stop words and the default shingle
+     * sizes of {@link ShingleAllStopFilter}.
+     *
+     * @param version
+     *            lucene compatibility version
+     * @param stopwords
+     *            a stopword set
+     * @param stem
+     *            whether tokens should be stemmed
+     * @param outputUnigrams
+     *            whether single tokens are output in addition to shingles
+     */
+    public NGramEnglishAnalyzer(Version version, Set<?> stopwords, boolean stem, boolean outputUnigrams) {
+        this(version, stopwords, stem, outputUnigrams, ShingleAllStopFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleAllStopFilter.DEFAULT_MAX_SHINGLE_SIZE, CharArraySet.EMPTY_SET);
+    }
+
+    /**
+     * Builds an analyzer with the given stop words and shingle sizes and an
+     * empty stem exclusion set.
+     *
+     * @param version
+     *            lucene compatibility version
+     * @param stopwords
+     *            a stopword set
+     * @param stem
+     *            whether tokens should be stemmed
+     * @param outputUnigrams
+     *            whether single tokens are output in addition to shingles
+     * @param minNGram
+     *            minimum shingle size
+     * @param maxNGram
+     *            maximum shingle size
+     */
+    public NGramEnglishAnalyzer(Version version, Set<?> stopwords, boolean stem, boolean outputUnigrams, int minNGram, int maxNGram) {
+        this(version, stopwords, stem, outputUnigrams, minNGram, maxNGram, CharArraySet.EMPTY_SET);
+    }
+
+    /**
+     * Builds a fully configured analyzer.
+     *
+     * @param matchVersion
+     *            lucene compatibility version
+     * @param stopwords
+     *            a stopword set
+     * @param stem
+     *            whether tokens should be stemmed
+     * @param outputUnigrams
+     *            whether single tokens are output in addition to shingles
+     * @param minNGram
+     *            minimum shingle size
+     * @param maxNGram
+     *            maximum shingle size
+     * @param stemExclusionSet
+     *            a set of terms not to be stemmed; an unmodifiable copy is
+     *            stored
+     */
+    public NGramEnglishAnalyzer(Version matchVersion, Set<?> stopwords, boolean stem, boolean outputUnigrams, int minNGram, int maxNGram, Set<?> stemExclusionSet) {
+        super(matchVersion, stopwords);
+        this.stem = stem;
+        this.outputUnigrams = outputUnigrams;
+        this.minNGram = minNGram;
+        this.maxNGram = maxNGram;
+        this.stemExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(matchVersion, stemExclusionSet));
+    }
+
+    /**
+     * Creates the token stream components for the provided {@link Reader}.
+     *
+     * @return components built from a {@link StandardTokenizer} filtered with
+     *         {@link StandardFilter}, {@link EnglishPossessiveFilter} (for
+     *         match versions on or after 3.1), {@link LowerCaseFilter} and
+     *         {@link ShingleAllStopFilter}; when stemming is enabled the
+     *         chain continues with {@link KeywordMarkerFilter} (only if a
+     *         stem exclusion set is provided) and {@link PorterStemFilter}.
+     */
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+        final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+        TokenStream result = new StandardFilter(matchVersion, source);
+        if (matchVersion.onOrAfter(Version.LUCENE_31)) {
+            result = new EnglishPossessiveFilter(result);
+        }
+        result = new LowerCaseFilter(matchVersion, result);
+        ShingleAllStopFilter sf = new ShingleAllStopFilter(result, minNGram, maxNGram, stopwords);
+        sf.setOutputUnigrams(outputUnigrams);
+        if (!outputUnigrams) {
+            // Also switch off the unigram fallback used when no shingles are available.
+            sf.setOutputUnigramsIfNoShingles(false);
+        }
+        result = sf;
+
+        if (stem) {
+            if (!stemExclusionSet.isEmpty()) {
+                result = new KeywordMarkerFilter(result, stemExclusionSet);
+            }
+            result = new PorterStemFilter(result);
+        }
+
+        return new TokenStreamComponents(source, result);
+    }
+
+    /**
+     * Small manual demo: analyzes a fixed sentence and prints every non-empty
+     * term to standard output.
+     *
+     * @param args
+     *            optional; {@code args[0]} is the path of the stop word file.
+     *            When absent, the original developer-local default path is
+     *            used.
+     * @throws IOException
+     *             if the stop word file cannot be loaded or the token stream
+     *             fails
+     */
+    public static void main(String[] args) throws IOException {
+        String stopwordsPath = (args != null && args.length > 0) ? args[0] : DEFAULT_STOPWORDS_PATH;
+        Set<String> stopwords = Dictionary.loadDictionary(new Path(stopwordsPath));
+        NGramEnglishAnalyzer analyzer = new NGramEnglishAnalyzer(Version.LUCENE_31, stopwords, false, true);
+        TokenStream stream = analyzer.tokenStream("", new StringReader("When I was growing up this was so much fun."));
+        try {
+            CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
+            // Follow the documented consumer workflow: reset, iterate, end, close.
+            stream.reset();
+            while (stream.incrementToken()) {
+                if (termAttr.length() > 0) {
+                    System.out.println(termAttr.toString());
+                    // Kept from the original: clear the term buffer after printing.
+                    termAttr.setEmpty();
+                }
+            }
+            stream.end();
+        } finally {
+            // Release the stream even when iteration fails.
+            stream.close();
+        }
+    }
+}
Oops, something went wrong.

0 comments on commit a482ab2

Please sign in to comment.