Skip to content

Commit

Permalink
GoSenSearch
Browse files Browse the repository at this point in the history
  • Loading branch information
yusuke committed Apr 6, 2011
1 parent 5a49e05 commit 2d86071
Show file tree
Hide file tree
Showing 5 changed files with 266 additions and 2 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -1,4 +1,7 @@
target
cjkindex
gosenindex
dictionary
*.iml
*.ipr
*.gif
Expand Down
12 changes: 11 additions & 1 deletion pom.xml
Expand Up @@ -18,12 +18,22 @@
<artifactId>lucene-analyzers</artifactId>
<version>3.1.0</version>
</dependency>
<dependency>
<groupId>com.github.lucenejapaneseanalyzer</groupId>
<artifactId>japaneseanalyzer</artifactId>
<version>0.0.1-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>gosen</groupId>
<artifactId>gosen</artifactId>
<version>1.0beta</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.8.2</version>
</dependency>
</dependencies>


</project>
176 changes: 176 additions & 0 deletions src/main/resources/japanese-gosen-analyzer.xml
@@ -0,0 +1,176 @@
<?xml version='1.0' encoding='utf-8' ?>

<analyzer>
<!-- specify gosen tokenizer class -->
<tokenizerClass>com.github.lucenejapaneseanalyzer.japaneseanalyzer.GoSenTokenizer</tokenizerClass>

<!-- specify sen tokenizer class
<tokenizerClass>org.apache.lucene.analysis.ja.sen.SenTokenizer</tokenizerClass>
-->
<!--
// use this class for chasen
<tokenizerClass>org.apache.lucene.analysis.ja.chasen.ChasenTokenizer</tokenizerClass>
-->

<!--
#
# Stop word list
#
# An array containing some common English & Japanese words that
# are usually not useful for searching.
#
# The Japanese stop words are based on GAEA:
# http://galaga.jaist.ac.jp:8000/pub/tools/GAEA/manual.html
-->
<stop>
<letters><![CDATA[!"#$%&'()*+,-./:;<=>?@[\]^_`{|}]]></letters>
<word>a</word>
<word>and</word>
<word>are</word>
<word>as</word>
<word>at</word>
<word>be</word>
<word>but</word>
<word>by</word>
<word>for</word>
<word>if</word>
<word>in</word>
<word>into</word>
<word>is</word>
<word>it</word>
<word>no</word>
<word>not</word>
<word>of</word>
<word>on</word>
<word>or</word>
<word>s</word>
<word>such</word>
<word>t</word>
<word>that</word>
<word>the</word>
<word>their</word>
<word>then</word>
<word>there</word>
<word>these</word>
<word>they</word>
<word>this</word>
<word>to</word>
<word>was</word>
<word>will</word>
<word>with</word>
<word>いう</word>
<word>する</word>
<word>人物</word>
<word>さま</word>
<word>すること</word>
<word>ため</word>
<word>もの</word>
<word>おいて</word>
<word>なる</word>
<word>できる</word>
<word>おく</word>
<word>ある</word>
</stop>

<!--
#
# List of the parts of speech (POS) that are used in indexing.
#
# Note: These POS tags depend on ChaSen and the IPA dictionary.
-->
<accept>
<pos>名詞</pos>
<pos>名詞-一般</pos>
<pos>名詞-固有名詞</pos>
<pos>名詞-固有名詞-一般</pos>
<pos>名詞-固有名詞-人名</pos>
<pos>名詞-固有名詞-人名-一般</pos>
<pos>名詞-固有名詞-人名-姓</pos>
<pos>名詞-固有名詞-人名-名</pos>
<pos>名詞-固有名詞-組織</pos>
<pos>名詞-固有名詞-地域</pos>
<pos>名詞-固有名詞-地域-一般</pos>
<pos>名詞-固有名詞-地域-国</pos>
<pos>名詞-代名詞</pos>
<pos>名詞-代名詞-一般</pos>
<pos>名詞-代名詞-縮約</pos>
<pos>名詞-副詞可能</pos>
<pos>名詞-サ変接続</pos>
<pos>名詞-形容動詞語幹</pos>
<pos>名詞-数</pos>
<pos>名詞-非自立</pos>
<pos>名詞-非自立-一般</pos>
<pos>名詞-非自立-副詞可能</pos>
<pos>名詞-非自立-助動詞語幹</pos>
<pos>名詞-非自立-形容動詞語幹</pos>
<pos>名詞-特殊</pos>
<pos>名詞-特殊-助動詞語幹</pos>
<pos>名詞-接尾</pos>
<pos>名詞-接尾-一般</pos>
<pos>名詞-接尾-人名</pos>
<pos>名詞-接尾-地域</pos>
<pos>名詞-接尾-サ変接続</pos>
<pos>名詞-接尾-助動詞語幹</pos>
<pos>名詞-接尾-形容動詞語幹</pos>
<pos>名詞-接尾-副詞可能</pos>
<pos>名詞-接尾-助数詞</pos>
<pos>名詞-接尾-特殊</pos>
<pos>名詞-接続詞的</pos>
<pos>名詞-動詞非自立的</pos>
<pos>名詞-引用文字列</pos>
<pos>名詞-ナイ形容詞語幹</pos>
<pos>接頭詞</pos>
<pos>接頭詞-名詞接続</pos>
<pos>接頭詞-動詞接続</pos>
<pos>接頭詞-形容詞接続</pos>
<pos>接頭詞-数接続</pos>
<pos>動詞</pos>
<pos>動詞-自立</pos>
<pos>動詞-非自立</pos>
<pos>動詞-接尾</pos>
<pos>形容詞</pos>
<pos>形容詞-自立</pos>
<pos>形容詞-非自立</pos>
<pos>形容詞-接尾</pos>
<pos>副詞</pos>
<pos>副詞-一般</pos>
<pos>副詞-助詞類接続</pos>
<pos>連体詞</pos>
<pos>接続詞</pos>
<pos>助詞</pos>
<pos>助詞-格助詞</pos>
<pos>助詞-格助詞-一般</pos>
<pos>助詞-格助詞-引用</pos>
<pos>助詞-格助詞-連語</pos>
<pos>助詞-接続助詞</pos>
<pos>助詞-係助詞</pos>
<pos>助詞-副助詞</pos>
<pos>助詞-間投助詞</pos>
<pos>助詞-並立助詞</pos>
<pos>助詞-終助詞</pos>
<pos>助詞-副助詞/並立助詞/終助詞</pos>
<pos>助詞-連体化</pos>
<pos>助詞-副詞化</pos>
<pos>助詞-特殊</pos>
<pos>助動詞</pos>
<pos>感動詞</pos>
<pos>記号</pos>

<!-- <pos>記号-一般</pos> -->
<!-- <pos>記号-句点</pos> -->
<!-- <pos>記号-読点</pos> -->
<!-- <pos>記号-空白</pos> -->
<pos>記号-アルファベット</pos>

<!-- <pos>記号-括弧開</pos> -->
<!-- <pos>記号-括弧閉</pos> -->

<pos>その他</pos>
<pos>その他-間投</pos>
<pos>フィラー</pos>
<pos>非言語音</pos>
<pos>語断片</pos>
<pos>未知語</pos>
</accept>

</analyzer>
70 changes: 70 additions & 0 deletions src/test/java/luceneexamples/GoSenSearch.java
@@ -0,0 +1,70 @@
/*
* Copyright 2011 Yusuke Yamamoto
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package luceneexamples;

import com.github.lucenejapaneseanalyzer.japaneseanalyzer.GoSenAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;

import java.io.File;

import static org.hamcrest.CoreMatchers.is;
import static org.junit.Assert.assertThat;

/**
 * Example test that indexes one English and one Japanese sentence with
 * {@link GoSenAnalyzer} and verifies that a Japanese term query matches
 * exactly one of them.
 *
 * @author Yusuke Yamamoto - yusuke at mac.com
 */
public class GoSenSearch {
    /**
     * Builds a two-document in-memory index and searches it for "記者".
     *
     * <p>The writer, the searcher and the directory are each released in a
     * {@code finally} block, so a failed assertion or an indexing/search error
     * no longer leaves them open.
     *
     * @throws Exception if indexing, query parsing or searching fails
     */
    @Test
    public void index() throws Exception {
        // JVM-wide settings. Presumably read by the GoSen analyzer/tokenizer to
        // locate its XML configuration and the Sen dictionary -- TODO confirm.
        System.setProperty("org.apache.lucene.ja.config.file","japanese-gosen-analyzer.xml");
        System.setProperty("sen.home","dictionary");

        Directory directory = new RAMDirectory();
        // To inspect the index on disk, use this instead of the RAMDirectory:
        // Directory directory = FSDirectory.open(new File("gosenindex"));
        try {
            Analyzer analyzer = new GoSenAnalyzer();

            IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_31, analyzer);
            IndexWriter writer = new IndexWriter(directory, iwc);
            try {
                Document doc = new Document();
                doc.add(new Field("str_field", "quick brown fox jumped over the lazy dog.",
                        Field.Store.YES, Field.Index.ANALYZED));
                writer.addDocument(doc);

                Document doc2 = new Document();
                doc2.add(new Field("str_field", "貴社の記者が汽車で帰社した",
                        Field.Store.YES, Field.Index.ANALYZED));
                writer.addDocument(doc2);
            } finally {
                // The writer is closed before the searcher is opened, as in the
                // original statement order.
                writer.close();
            }

            IndexSearcher searcher = new IndexSearcher(directory, true);
            try {
                QueryParser parser = new QueryParser(Version.LUCENE_31, "str_field", analyzer);
                TopDocs td = searcher.search(parser.parse("記者"), 1000);
                // Only the Japanese document is expected to match the query.
                assertThat(td.totalHits, is(1));
            } finally {
                searcher.close();
            }
        } finally {
            directory.close();
        }
    }
}
7 changes: 6 additions & 1 deletion src/test/java/luceneexamples/JapaneseSearch.java
Expand Up @@ -24,10 +24,14 @@
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;

import java.io.File;

import static org.hamcrest.CoreMatchers.is;
import static org.junit.Assert.assertThat;

Expand All @@ -37,7 +41,8 @@
public class JapaneseSearch {
@Test
public void index() throws Exception {
RAMDirectory directory = new RAMDirectory();
Directory directory = new RAMDirectory();
// Directory directory = FSDirectory.open(new File("cjkindex"));
Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_31);

IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_31, analyzer);
Expand Down

0 comments on commit 2d86071

Please sign in to comment.