https://lucene.apache.org/core/8_7_0/core/overview-summary.html#overview.description

https://lucene.apache.org/core/8_7_0/core/org/apache/lucene/analysis/package-summary.html#package.description

In [1]:
@file:DependsOn("org.apache.lucene:lucene-core:8.7.0")
@file:DependsOn("org.apache.lucene:lucene-queryparser:8.7.0")
@file:DependsOn("org.apache.lucene:lucene-analyzers-common:8.7.0")
@file:DependsOn("com.jianggujin:IKAnalyzer-lucene:8.0.0")
import java.io.*
import java.nio.file.*
import org.apache.lucene.analysis.standard.*
import org.apache.lucene.analysis.core.*
import org.apache.lucene.store.*
import org.apache.lucene.index.*
import org.apache.lucene.document.*
import org.apache.lucene.search.*
import org.apache.lucene.util.*
import org.apache.lucene.analysis.tokenattributes.*
import org.apache.lucene.queryparser.classic.*
import org.wltea.analyzer.lucene.*

In [50]:
// End-to-end Lucene smoke test: index one stored text field in an on-disk index,
// search it with the classic QueryParser, print the parsed query, the hit count and
// each hit's stored field value, then delete the scratch index directory.

val path = Path.of("data/lucene/temp")
// createDirectories (unlike createDirectory) also creates the missing parents
// ("data/lucene") and does not throw when the directory is left over from an
// earlier run that failed before the cleanup at the bottom of this cell.
val indexPath = Files.createDirectories(path)

try {
    // Analyzer, Directory, IndexWriter and DirectoryReader are all Closeable; nested
    // `use` closes each of them in reverse order even when indexing or searching throws.
    StandardAnalyzer().use { analyzer ->
        FSDirectory.open(indexPath).use { directory ->
            // OpenMode.CREATE starts from an empty index, so stale segments from a
            // leftover directory can never show up as extra hits.
            val config = IndexWriterConfig(analyzer)
                .setOpenMode(IndexWriterConfig.OpenMode.CREATE)
            IndexWriter(directory, config).use { iwriter ->
                val doc = Document()
                val text = "This is the text to be indexed."
                doc.add(Field("fieldname", text, TextField.TYPE_STORED))
                iwriter.addDocument(doc)
            }

            // Now search the index (the writer is closed at this point, so the
            // document has been committed and is visible to a new reader):
            DirectoryReader.open(directory).use { ireader ->
                val isearcher = IndexSearcher(ireader)
                // Parse a simple query that searches for "text":
                val parser = QueryParser("fieldname", analyzer)
                val query = parser.parse("text")
                println(query.toString())
                val hits = isearcher.search(query, 10).scoreDocs
                println(hits.size)
                // Iterate through the results:
                for (hit in hits) {
                    val hitDoc = isearcher.doc(hit.doc)
                    println(hitDoc.get("fieldname"))
                }
            }
        }
    }
} finally {
    // Always remove the scratch index, including after a failure, so the cell can be re-run.
    IOUtils.rm(indexPath)
}

fieldname:text
1
This is the text to be indexed.


In [52]:
// Print every token StandardAnalyzer emits for a Chinese sentence, one line per token
// with all of its attributes.
// Analyzer and TokenStream are both Closeable: nested `use` closes the stream first and
// then the analyzer, and still closes the analyzer when tokenization throws (the
// original only reached analyzer.close() on success).
StandardAnalyzer().use { analyzer ->
    analyzer.tokenStream("myfield", StringReader("今天是高兴的一天")).use { ts ->
        ts.reset()
        while (ts.incrementToken()) {
            println("token: " + ts.reflectAsString(false))
        }
        // Only reached when the whole stream was consumed without an exception.
        ts.end()
    }
}

token: term=今,bytes=[e4 bb 8a],startOffset=0,endOffset=1,positionIncrement=1,positionLength=1,type=<IDEOGRAPHIC>,termFrequency=1
token: term=天,bytes=[e5 a4 a9],startOffset=1,endOffset=2,positionIncrement=1,positionLength=1,type=<IDEOGRAPHIC>,termFrequency=1
token: term=是,bytes=[e6 98 af],startOffset=2,endOffset=3,positionIncrement=1,positionLength=1,type=<IDEOGRAPHIC>,termFrequency=1
token: term=高,bytes=[e9 ab 98],startOffset=3,endOffset=4,positionIncrement=1,positionLength=1,type=<IDEOGRAPHIC>,termFrequency=1
token: term=兴,bytes=[e5 85 b4],startOffset=4,endOffset=5,positionIncrement=1,positionLength=1,type=<IDEOGRAPHIC>,termFrequency=1
token: term=的,bytes=[e7 9a 84],startOffset=5,endOffset=6,positionIncrement=1,positionLength=1,type=<IDEOGRAPHIC>,termFrequency=1
token: term=一,bytes=[e4 b8 80],startOffset=6,endOffset=7,positionIncrement=1,positionLength=1,type=<IDEOGRAPHIC>,termFrequency=1
token: term=天,bytes=[e5 a4 a9],startOffset=7,endOffset=8,positionIncrement=1,positionLength=1,type=<IDEOGRAPHIC>,termFrequency=1

In [2]:
// Print every token KeywordAnalyzer emits for a Chinese sentence, one line per token
// with all of its attributes.
// Analyzer and TokenStream are both Closeable: nested `use` closes the stream first and
// then the analyzer, and still closes the analyzer when tokenization throws (the
// original only reached analyzer.close() on success).
KeywordAnalyzer().use { analyzer ->
    analyzer.tokenStream("myfield", StringReader("今天是高兴的一天")).use { ts ->
        ts.reset()
        while (ts.incrementToken()) {
            println("token: " + ts.reflectAsString(false))
        }
        // Only reached when the whole stream was consumed without an exception.
        ts.end()
    }
}

token: term=今天是高兴的一天,bytes=[e4 bb 8a e5 a4 a9 e6 98 af e9 ab 98 e5 85 b4 e7 9a 84 e4 b8 80 e5 a4 a9],startOffset=0,endOffset=8,positionIncrement=1,positionLength=1,type=word,termFrequency=1


https://github.com/wks/ik-analyzer

In [7]:
// Print every token IKAnalyzer emits for a Chinese sentence, one line per token
// with all of its attributes.
// The analyzer and its TokenStream are both closed via nested `use`: the stream first,
// then the analyzer, and the analyzer is still closed when tokenization throws (the
// original only reached analyzer.close() on success).
IKAnalyzer().use { analyzer ->
    analyzer.tokenStream("myfield", StringReader("今天是高兴的一天")).use { ts ->
        ts.reset()
        while (ts.incrementToken()) {
            println("token: " + ts.reflectAsString(false))
        }
        // Only reached when the whole stream was consumed without an exception.
        ts.end()
    }
}

token: term=今天是,bytes=[e4 bb 8a e5 a4 a9 e6 98 af],startOffset=0,endOffset=3,positionIncrement=1,positionLength=1,type=CN_WORD,termFrequency=1
token: term=今天,bytes=[e4 bb 8a e5 a4 a9],startOffset=0,endOffset=2,positionIncrement=1,positionLength=1,type=CN_WORD,termFrequency=1
token: term=高兴,bytes=[e9 ab 98 e5 85 b4],startOffset=3,endOffset=5,positionIncrement=1,positionLength=1,type=CN_WORD,termFrequency=1
token: term=一天,bytes=[e4 b8 80 e5 a4 a9],startOffset=6,endOffset=8,positionIncrement=1,positionLength=1,type=CN_WORD,termFrequency=1
token: term=一,bytes=[e4 b8 80],startOffset=6,endOffset=7,positionIncrement=1,positionLength=1,type=TYPE_CNUM,termFrequency=1
token: term=天,bytes=[e5 a4 a9],startOffset=7,endOffset=8,positionIncrement=1,positionLength=1,type=COUNT,termFrequency=1
