Upgrade Lucene from 4.10.4 to 5.1.0
ysc committed May 6, 2015
1 parent 89947eb commit 50ace13
Showing 6 changed files with 33 additions and 22 deletions.
@@ -106,7 +106,7 @@ public String name() {
             }
             @Override
             public Tokenizer create(Reader reader) {
-                return new ChineseWordTokenizer(reader, tokenizerSegmentation);
+                return new ChineseWordTokenizer(tokenizerSegmentation);
             }
         }));
     }
@@ -57,6 +57,6 @@ public ChineseWordTokenizerFactory(Index index, @IndexSettings Settings indexSet
     }
     @Override
    public Tokenizer create(Reader reader) {
-        return new ChineseWordTokenizer(reader, segmentation);
+        return new ChineseWordTokenizer(segmentation);
    }
}
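
Note: the two Elasticsearch factory changes above follow from Lucene 5 removing the Reader argument from Tokenizer constructors; input is now attached through setReader(Reader) before the stream is reset. A minimal sketch of driving the reworked tokenizer directly (the class name TokenizerDemo and the sample text are illustrative):

    import java.io.IOException;
    import java.io.StringReader;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apdplat.word.lucene.ChineseWordTokenizer;

    public class TokenizerDemo {
        public static void main(String[] args) throws IOException {
            // Lucene 5: construct without a Reader, then attach input via setReader().
            Tokenizer tokenizer = new ChineseWordTokenizer();
            tokenizer.setReader(new StringReader("杨尚川是APDPlat应用级产品开发平台的作者"));
            CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
            tokenizer.reset();                  // mandatory before incrementToken()
            while (tokenizer.incrementToken()) {
                System.out.println(term.toString());
            }
            tokenizer.end();
            tokenizer.close();
        }
    }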
14 changes: 11 additions & 3 deletions src/main/java/org/apdplat/word/lucene/ChineseWordAnalyzer.java
@@ -21,7 +21,7 @@
 package org.apdplat.word.lucene;

 import java.io.IOException;
-import java.io.Reader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
@@ -52,28 +52,35 @@ public ChineseWordAnalyzer(Segmentation segmentation) {
     }

     @Override
-    protected TokenStreamComponents createComponents(String string, Reader reader) {
-        Tokenizer tokenizer = new ChineseWordTokenizer(reader, segmentation);
+    protected TokenStreamComponents createComponents(String fieldName) {
+        Tokenizer tokenizer = new ChineseWordTokenizer(segmentation);
         return new TokenStreamComponents(tokenizer);
     }

     public static void main(String args[]) throws IOException {
         Analyzer analyzer = new ChineseWordAnalyzer();
         TokenStream tokenStream = analyzer.tokenStream("text", "杨尚川是APDPlat应用级产品开发平台的作者");
+        tokenStream.reset();
         while(tokenStream.incrementToken()){
             CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
             OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
             PositionIncrementAttribute positionIncrementAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);
             LOGGER.info(charTermAttribute.toString()+" ("+offsetAttribute.startOffset()+" - "+offsetAttribute.endOffset()+") "+positionIncrementAttribute.getPositionIncrement());
         }
+        tokenStream.close();

         tokenStream = analyzer.tokenStream("text", "word是一个中文分词项目,作者是杨尚川,杨尚川的英文名叫ysc");
+        tokenStream.reset();
         while(tokenStream.incrementToken()){
             CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
             OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
             PositionIncrementAttribute positionIncrementAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);
             LOGGER.info(charTermAttribute.toString()+" ("+offsetAttribute.startOffset()+" - "+offsetAttribute.endOffset()+") "+positionIncrementAttribute.getPositionIncrement());
         }
+        tokenStream.close();

         tokenStream = analyzer.tokenStream("text", "5月初有哪些电影值得观看");
+        tokenStream.reset();
         while(tokenStream.incrementToken()){
             CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
             OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
@@ -91,5 +98,6 @@ public static void main(String args[]) throws IOException {
             LOGGER.info("Synonym:"+synonymAttribute.toString());
             LOGGER.info("Antonym:"+antonymAttribute.toString());
         }
+        tokenStream.close();
     }
 }
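
Note: Lucene 5 enforces the TokenStream lifecycle more strictly, which is why reset() and close() were added around each loop in main(): the contract is reset() → incrementToken()* → end() → close(), and a Tokenizer read without reset() typically fails with an IllegalStateException. A sketch of a small helper wrapping that workflow (the class TokenDump is illustrative):

    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class TokenDump {
        // Collects the terms an analyzer produces for the given text.
        public static List<String> tokens(Analyzer analyzer, String text) throws IOException {
            List<String> result = new ArrayList<>();
            try (TokenStream ts = analyzer.tokenStream("text", text)) {
                CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
                ts.reset();                     // required before the first incrementToken()
                while (ts.incrementToken()) {
                    result.add(term.toString());
                }
                ts.end();                       // records the final offset state
            }                                   // try-with-resources closes the stream
            return result;
        }
    }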
15 changes: 6 additions & 9 deletions src/main/java/org/apdplat/word/lucene/ChineseWordTokenizer.java
@@ -22,15 +22,13 @@

 import java.io.BufferedReader;
 import java.io.IOException;
-import java.io.Reader;
 import java.util.Arrays;
 import java.util.Queue;
 import java.util.concurrent.LinkedTransferQueue;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttributeImpl;
 import org.apdplat.word.lucene.attribute.*;
 import org.apdplat.word.segmentation.Segmentation;
 import org.apdplat.word.segmentation.SegmentationAlgorithm;
@@ -66,19 +64,18 @@ public class ChineseWordTokenizer extends Tokenizer {
     private final Queue<Word> words = new LinkedTransferQueue<>();
     private int startOffset=0;

-    public ChineseWordTokenizer(Reader input) {
-        super(input);
+    public ChineseWordTokenizer() {
         segmentation = SegmentationFactory.getSegmentation(SegmentationAlgorithm.BidirectionalMaximumMatching);
-        reader = new BufferedReader(input);
     }
-    public ChineseWordTokenizer(Reader input, Segmentation segmentation) {
-        super(input);
+    public ChineseWordTokenizer(Segmentation segmentation) {
         this.segmentation = segmentation;
-        reader = new BufferedReader(input);
     }
     private Word getWord() throws IOException {
         Word word = words.poll();
         if(word == null){
+            if(reader==null){
+                reader = new BufferedReader(input);
+            }
             String line;
             while( (line = reader.readLine()) != null ){
                 words.addAll(segmentation.seg(line));
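
Note: the constructors can no longer wrap the Reader because in Lucene 5 the inherited input field is only usable after reset(); hence the lazy BufferedReader creation in getWord(). An alternative sketch, assuming it lives inside ChineseWordTokenizer with the fields shown above, is to (re)build the reader in reset(), which also clears leftover state when the tokenizer is reused across documents:

    @Override
    public void reset() throws IOException {
        super.reset();                          // must run first; it makes 'input' valid
        reader = new BufferedReader(input);     // safe here, unlike in the constructor
        words.clear();                          // drop tokens from any previous document
        startOffset = 0;
    }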
6 changes: 4 additions & 2 deletions src/main/java/org/apdplat/word/solr/ChineseWordTokenizerFactory.java
@@ -20,7 +20,9 @@

 package org.apdplat.word.solr;

+import java.io.BufferedReader;
 import java.io.Reader;
+import java.io.StringReader;
 import java.util.Map;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.util.TokenizerFactory;
@@ -73,7 +75,7 @@ public ChineseWordTokenizerFactory(Map<String, String> args){
         }
     }
     @Override
-    public Tokenizer create(AttributeFactory af, Reader reader) {
-        return new ChineseWordTokenizer(reader, segmentation);
+    public Tokenizer create(AttributeFactory af) {
+        return new ChineseWordTokenizer(segmentation);
     }
 }
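
Note: on the Solr side, Lucene 5's TokenizerFactory declares create(AttributeFactory) instead of create(AttributeFactory, Reader); the factory no longer sees the input, which the consumer attaches afterwards. A sketch of calling the factory directly (the empty args map and sample text are illustrative; any factory-specific options would go into the map):

    import java.io.StringReader;
    import java.util.HashMap;
    import java.util.Map;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.util.AttributeFactory;
    import org.apdplat.word.solr.ChineseWordTokenizerFactory;

    public class FactoryDemo {
        public static void main(String[] args) throws Exception {
            Map<String, String> config = new HashMap<>();   // factory-specific options, if any
            Tokenizer tokenizer = new ChineseWordTokenizerFactory(config)
                    .create(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
            tokenizer.setReader(new StringReader("中文分词")); // input arrives after creation
            // ... consume via reset()/incrementToken()/end()/close() as shown earlier
            tokenizer.close();
        }
    }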
16 changes: 10 additions & 6 deletions src/test/java/org/apdplat/word/lucene/ChineseWordAnalyzerTest.java
@@ -39,7 +39,6 @@
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.SimpleFSDirectory;
-import org.apache.lucene.util.Version;
 import org.apdplat.word.util.Utils;
 import static org.junit.Assert.*;
 import org.junit.Test;
@@ -55,11 +54,13 @@ public void test1() {
         Analyzer analyzer = new ChineseWordAnalyzer();
         TokenStream tokenStream = analyzer.tokenStream("text", "杨尚川是APDPlat应用级产品开发平台的作者");
         List<String> words = new ArrayList<>();
+        tokenStream.reset();
         while(tokenStream.incrementToken()){
             CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
             words.add(charTermAttribute.toString());
         }
-        String expResult = "[杨尚川, apdplat, 应用级, 产品开发, 平台, 作者]";
+        tokenStream.close();
+        String expResult = "[杨尚川, apdplat, 应用级, 产品, 开发平台, 作者]";
         assertEquals(expResult, words.toString());
     }catch(IOException e){
         fail("分词出错"+e.getMessage());
@@ -71,10 +72,12 @@ public void test2() {
         Analyzer analyzer = new ChineseWordAnalyzer();
         TokenStream tokenStream = analyzer.tokenStream("text", "叔叔亲了我妈妈也亲了我");
         List<String> words = new ArrayList<>();
+        tokenStream.reset();
         while(tokenStream.incrementToken()){
             CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
             words.add(charTermAttribute.toString());
         }
+        tokenStream.close();
         String expResult = "[叔叔, 亲了, 妈妈, 亲了]";
         assertEquals(expResult, words.toString());
     }catch(IOException e){
@@ -118,7 +121,7 @@ public void test3() {
         sentences.add("反映了一个人的精神面貌");
         sentences.add("美国加州大学的科学家发现");
         sentences.add("我好不挺好");
-        sentences.add("木有");
+        sentences.add("木有");
         sentences.add("下雨天留客天天留我不留");
         sentences.add("叔叔亲了我妈妈也亲了我");
         sentences.add("白马非马");
@@ -127,11 +130,11 @@
         sentences.add("张掖市明乐县");
         sentences.add("中华人民共和国万岁万岁万万岁");
         sentences.add("word是一个中文分词项目,作者是杨尚川,杨尚川的英文名叫ysc");
-        IndexWriterConfig config = new IndexWriterConfig(Version.LATEST, analyzer);
+        IndexWriterConfig config = new IndexWriterConfig(analyzer);
         config.setUseCompoundFile(false);
         File index = new File("target/indexes");
         Utils.deleteDir(index);
-        try (Directory directory = new SimpleFSDirectory(index);
+        try (Directory directory = new SimpleFSDirectory(index.toPath());
             IndexWriter indexWriter = new IndexWriter(directory, config)) {
             for(String sentence : sentences){
                 Document doc = new Document();
@@ -141,9 +144,10 @@ public void test3() {
             }
             indexWriter.commit();
         } catch(Exception e){
+            e.printStackTrace();
             fail("索引失败"+e.getMessage());
         }
-        try (Directory directory = new SimpleFSDirectory(index);
+        try (Directory directory = new SimpleFSDirectory(index.toPath());
             DirectoryReader directoryReader = DirectoryReader.open(directory)) {
             IndexSearcher indexSearcher = new IndexSearcher(directoryReader);
             QueryParser queryParser = new QueryParser("text", analyzer);
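
Note: the test changes track two further Lucene 5 API moves: IndexWriterConfig dropped its Version parameter, and FSDirectory implementations are opened on java.nio.file.Path rather than File. A condensed sketch of the updated indexing setup (the index path and field name are illustrative):

    import java.nio.file.Paths;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.document.TextField;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.store.Directory;
    import org.apache.lucene.store.SimpleFSDirectory;
    import org.apdplat.word.lucene.ChineseWordAnalyzer;

    public class IndexDemo {
        public static void main(String[] args) throws Exception {
            Analyzer analyzer = new ChineseWordAnalyzer();
            IndexWriterConfig config = new IndexWriterConfig(analyzer);  // no Version argument in 5.x
            try (Directory directory = new SimpleFSDirectory(Paths.get("target/demo-index"));
                 IndexWriter writer = new IndexWriter(directory, config)) {
                Document doc = new Document();
                doc.add(new TextField("text", "word是一个中文分词项目", Field.Store.YES));
                writer.addDocument(doc);
                writer.commit();
            }
        }
    }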
