Skip to content

Commit

Permalink
优先使用一次性加载词典的方法
Browse files Browse the repository at this point in the history
  • Loading branch information
ysc committed May 7, 2015
1 parent 44a67e5 commit 22deaef
Showing 1 changed file with 50 additions and 45 deletions.
95 changes: 50 additions & 45 deletions src/main/java/org/apdplat/word/dictionary/DictionaryFactory.java
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@


import java.util.*; import java.util.*;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;


import org.apdplat.word.dictionary.impl.DictionaryTrie; import org.apdplat.word.dictionary.impl.DictionaryTrie;
import org.apdplat.word.recognition.PersonName; import org.apdplat.word.recognition.PersonName;
Expand Down Expand Up @@ -72,7 +73,7 @@ private static Dictionary constructDictionary(){
reload(); reload();
} }
public static void reload(){ public static void reload(){
AutoDetector.loadAndWatch(new ResourceLoader(){ AutoDetector.loadAndWatch(new ResourceLoader() {


@Override @Override
public void clear() { public void clear() {
Expand All @@ -82,32 +83,51 @@ public void clear() {
@Override @Override
public void load(List<String> lines) { public void load(List<String> lines) {
LOGGER.info("初始化词典"); LOGGER.info("初始化词典");
int count=0; int count = 0;
for(String surname : PersonName.getSurnames()){ for (String surname : PersonName.getSurnames()) {
if(surname.length() == 2){ if (surname.length() == 2) {
count++; count++;
lines.add(surname); lines.add(surname);
} }
} }
LOGGER.info("将 "+count+" 个复姓加入词典"); LOGGER.info("将 " + count + " 个复姓加入词典");
Map<Integer, AtomicInteger> map = new HashMap<>(); List<String> words = getAllWords(lines);
for(String line : lines){ //构造词典
getWords(line).forEach(word -> { DIC.addAll(words);
//加入词典 //输出统计信息
DIC.add(word); showStatistics(words);
//统计不同长度的词的数目 if (DIC instanceof DictionaryTrie) {
map.putIfAbsent(word.length(), new AtomicInteger()); DictionaryTrie dictionaryTrie = (DictionaryTrie) DIC;
map.get(word.length()).incrementAndGet();
});
}
showStatistics(map);
if(DIC instanceof DictionaryTrie){
DictionaryTrie dictionaryTrie = (DictionaryTrie)DIC;
dictionaryTrie.showConflict(); dictionaryTrie.showConflict();
} }
LOGGER.info("词典初始化完毕"); LOGGER.info("词典初始化完毕");
} }


/**
 * Logs summary statistics for the given words: the total word count together
 * with the dictionary's reported maximum word length, the number of words for
 * each distinct word length, and the average word length.
 *
 * @param words the words to summarise; may be empty
 */
private void showStatistics(List<String> words) {
    // A TreeMap keeps the lengths sorted, so the per-length lines are always
    // logged in ascending order (HashMap iteration order is unspecified).
    Map<Integer, AtomicInteger> countByLength = new TreeMap<>();
    // Total number of characters over all words, used for the average.
    int totalLength = 0;
    for (String word : words) {
        // computeIfAbsent: single lookup, and a counter is only allocated
        // the first time a given length is seen.
        countByLength.computeIfAbsent(word.length(), len -> new AtomicInteger()).incrementAndGet();
        totalLength += word.length();
    }
    int wordCount = words.size();
    LOGGER.info("词数目:" + wordCount + ",词典最大词长:" + DIC.getMaxLength());
    // NOTE(review): the previous code branched on len < 10 but both branches
    // logged the same text; if an alignment variant was intended, restore it here.
    for (Map.Entry<Integer, AtomicInteger> entry : countByLength.entrySet()) {
        LOGGER.info("词长 " + entry.getKey() + " 的词数为:" + entry.getValue());
    }
    // Avoid the float 0/0 division (NaN) when no words were supplied.
    float averageLength = wordCount == 0 ? 0f : (float) totalLength / wordCount;
    LOGGER.info("词典平均词长:" + averageLength);
}

@Override @Override
public void add(String line) { public void add(String line) {
//加入词典 //加入词典
Expand All @@ -120,11 +140,15 @@ public void remove(String line) {
getWords(line).forEach(DIC::remove); getWords(line).forEach(DIC::remove);
} }


private List<String> getWords(String line){ private List<String> getAllWords(List<String> lines) {
return lines.stream().flatMap(line -> getWords(line).stream()).collect(Collectors.toSet()).stream().collect(Collectors.toList());
}

private List<String> getWords(String line) {
List<String> words = new ArrayList<>(); List<String> words = new ArrayList<>();
//一行以空格分隔可以放多个词 //一行以空格分隔可以放多个词
for(String word : line.split("\\s+")) { for (String word : line.split("\\s+")) {
if(word.length()==1){ if (word.length() == 1) {
System.out.println(word); System.out.println(word);
} }
//处理词性词典 //处理词性词典
Expand All @@ -139,29 +163,10 @@ private List<String> getWords(String line){
return words; return words;
} }
}, WordConfTools.get("dic.path", "classpath:dic.txt") }, WordConfTools.get("dic.path", "classpath:dic.txt")
+","+WordConfTools.get("punctuation.path", "classpath:punctuation.txt") + "," + WordConfTools.get("punctuation.path", "classpath:punctuation.txt")
+","+WordConfTools.get("part.of.speech.dic.path", "classpath:part_of_speech_dic.txt") + "," + WordConfTools.get("part.of.speech.dic.path", "classpath:part_of_speech_dic.txt")
+","+WordConfTools.get("word.synonym.path", "classpath:word_synonym.txt") + "," + WordConfTools.get("word.synonym.path", "classpath:word_synonym.txt")
+","+WordConfTools.get("word.antonym.path", "classpath:word_antonym.txt")); + "," + WordConfTools.get("word.antonym.path", "classpath:word_antonym.txt"));
}
private static void showStatistics(Map<Integer, AtomicInteger> map) {
//统计词数
int wordCount=0;
//统计平均词长
int totalLength=0;
for(int len : map.keySet()){
totalLength += len * map.get(len).get();
wordCount += map.get(len).get();
}
LOGGER.info("词数目:"+wordCount+",词典最大词长:"+DIC.getMaxLength());
for(int len : map.keySet()){
if(len<10){
LOGGER.info("词长 "+len+" 的词数为:"+map.get(len));
}else{
LOGGER.info("词长 "+len+" 的词数为:"+map.get(len));
}
}
LOGGER.info("词典平均词长:"+(float)totalLength/wordCount);
} }
} }
} }

0 comments on commit 22deaef

Please sign in to comment.