Skip to content

Commit

Permalink
相同版本的韩语象形分词第一版初步完成, 未优化语料库. 这个工程有力论证, Deta 类人催化分词适用于所有象形语言. 2019-04-21
Browse files Browse the repository at this point in the history
  • Loading branch information
yaoguangluo committed Apr 20, 2019
1 parent 0ce6401 commit 95081d2
Show file tree
Hide file tree
Showing 7 changed files with 14,006 additions and 13,799 deletions.
1 change: 1 addition & 0 deletions wordSegment/org/tinos/engine/analysis/Analyzer.java
Expand Up @@ -5,6 +5,7 @@
import org.tinos.view.obj.WordFrequency;
public interface Analyzer {
void init() throws IOException;
void initMixed() throws IOException;
List<String> parserString(String input);
void addFixWords(int charPosition, String inputString,StringBuilder[] fixWords);
Map<String, WordFrequency> getWordFrequencyMap(List<String> sets) throws IOException;
Expand Down
19 changes: 19 additions & 0 deletions wordSegment/org/tinos/engine/analysis/imp/AnalyzerImp.java
Expand Up @@ -49,6 +49,25 @@ public void init() throws IOException {
wordsForests=fHMMList.getWordsForests();
}

/**
 * Initializes this analyzer from the mixed index.
 *
 * <p>Creates a fresh {@code FMHMMListOneTimeImp}, runs {@code indexMixed()} followed by the
 * POS and translation index builders, instantiates the NERO, NLP and POS controllers and
 * the sorter, and finally caches the lookup structures exposed by the index.
 *
 * @throws IOException if any of the index builders fails
 */
public void initMixed() throws IOException {
    // Phase 1: build the indexes. The field is assigned before indexing starts, so it
    // already references the new list even if one of the builders throws.
    this.fHMMList = new FMHMMListOneTimeImp();
    this.fHMMList.indexMixed();
    this.fHMMList.indexPosEnToCn();
    this.fHMMList.indexPosEnToEn();
    this.fHMMList.indexEnToCn();
    this.fHMMList.indexCnToEn();
    this.fHMMList.indexFullEnToCn();
    this.fHMMList.indexFullCnToEn();

    // Phase 2: create the collaborating controllers and the sorter.
    this.neroController = new NEROControllerOneTimeImp();
    this.nlpController = new NLPControllerImp();
    this.posController = new POSControllerImp();
    this.quick6DLuoYaoguangSort = new Quick6DLuoYaoguangSortMapImp();

    // Phase 3: cache the lookup structures produced by the index.
    this.forestRoots = this.fHMMList.getMap();
    this.forestsRoots = this.fHMMList.getMaps();
    this.wordsForest = this.fHMMList.getPosCnToCn();
    this.wordsForests = this.fHMMList.getWordsForests();
}

public List<String> parserMixedString(String mixedString) {
mixedString += StableData.SPACE_STRING_DISTINCTION;
int inputStringLength = mixedString.length();
Expand Down
1 change: 1 addition & 0 deletions wordSegment/org/tinos/ortho/fhmm/FHMMList.java
Expand Up @@ -6,6 +6,7 @@
import java.util.Map;
public interface FHMMList {
void index() throws IOException;
void indexMixed() throws IOException;
void indexPosEnToCn() throws IOException;
void indexPosEnToEn() throws IOException;
void indexEnToCn() throws IOException;
Expand Down
6 changes: 6 additions & 0 deletions wordSegment/org/tinos/ortho/fhmm/imp/FMHMMListImp.java
Expand Up @@ -264,4 +264,10 @@ public Map<Long, FMHMMNode>[] getMaps() {
public Map<Long, Map<String, String>> getWordsForests() {
// This implementation keeps no words-forests structure and always returns null.
// NOTE(review): callers presumably have to tolerate a null result here - confirm.
return null;
}

@Override
public void indexMixed() throws IOException {
// Empty stub: calling this method loads nothing and never throws.
// NOTE(review): FMHMMListOneTimeImp has a real indexMixed(); confirm whether this class needs one too.

}
}
188 changes: 188 additions & 0 deletions wordSegment/org/tinos/ortho/fhmm/imp/FMHMMListOneTimeImp.java
Expand Up @@ -72,6 +72,194 @@ public Map<Long, FMHMMNode>[] getMaps() {
return maps;
}

/**
 * Builds the mixed index from two dictionary resources that are read in lockstep, one line
 * from each per iteration: {@code WORDS_SOURSE_LINK_POS_CN_TO_CN} and
 * {@code WORDS_SOURSE_LINK_POS_CN_TO_KO}.
 *
 * <p>For every line pair:
 * <ul>
 *   <li>the raw lines are appended to {@code listCn} / {@code listKo} (the Korean line only
 *       when that file still has lines left);</li>
 *   <li>lines from the first file that are blank, or that do not split into at least two
 *       fields on {@code NLP_SYMBO_SLASH}, are skipped after being recorded;</li>
 *   <li>field 0 (word) and field 1 (POS) of the first file's line are registered in each
 *       matching {@code StableMaps} POS map, together with field 0 of the Korean line;</li>
 *   <li>the word/POS pair is stored in {@code posCnToCn} and both raw lines are passed to
 *       {@code loopLoadForest}.</li>
 * </ul>
 *
 * <p>Both readers are closed on every exit path.
 *
 * @throws IOException if a dictionary resource cannot be found or read
 */
public void indexMixed() throws IOException {
    posCnToCn = new HashMap<>();
    linkedHashMap = new HashMap<>();
    listCn = new CopyOnWriteArrayList<>();
    listKo = new CopyOnWriteArrayList<>();
    try (BufferedReader cReader = openMixedDictionaryReader(StableData.WORDS_SOURSE_LINK_POS_CN_TO_CN);
            BufferedReader cReaderKorea = openMixedDictionaryReader(StableData.WORDS_SOURSE_LINK_POS_CN_TO_KO)) {
        String cInputString;
        while ((cInputString = cReader.readLine()) != null) {
            // May be null once the Korean file runs out; the first file drives the loop.
            String cInputStringKorea = cReaderKorea.readLine();
            listCn.add(cInputString);
            if (null != cInputStringKorea) {
                listKo.add(cInputStringKorea);
            }
            if (cInputString.replace(StableData.SPACE_STRING, StableData.EMPTY_STRING)
                    .equals(StableData.EMPTY_STRING)) {
                continue;
            }
            // Split once per line instead of once per field access.
            String[] cnParts = cInputString.split(StableData.NLP_SYMBO_SLASH);
            if (cnParts.length <= StableData.INT_ONE) {
                continue;
            }
            String cnWord = cnParts[StableData.INT_ZERO];
            String cnPos = cnParts[StableData.INT_ONE];

            // A Korean line consisting only of separators splits into an empty array; in that
            // case there is no word to register, so koWord stays null instead of failing.
            String koWord = null;
            if (null != cInputStringKorea) {
                String[] koParts = cInputStringKorea.split(StableData.NLP_SYMBO_SLASH);
                if (koParts.length > StableData.INT_ZERO) {
                    koWord = koParts[StableData.INT_ZERO];
                }
            }

            registerMixedPos(StableMaps.fuCi, StableData.NLP_CI_FU, cnWord, cnPos, koWord);
            registerMixedPos(StableMaps.dongCi, StableData.NLP_CI_DONG, cnWord, cnPos, koWord);
            registerMixedPos(StableMaps.liangCi, StableData.NLP_CI_LIANG, cnWord, cnPos, koWord);
            registerMixedPos(StableMaps.lianCi, StableData.NLP_CI_LIAN, cnWord, cnPos, koWord);
            registerMixedPos(StableMaps.baDongCi, StableData.NLP_CI_BA_DONG, cnWord, cnPos, koWord);
            registerMixedPos(StableMaps.xianDingCi, StableData.NLP_CI_XIAN_DING, cnWord, cnPos, koWord);
            registerMixedPos(StableMaps.mingCi, StableData.NLP_CI_MING, cnWord, cnPos, koWord);
            registerMixedPos(StableMaps.daiCi, StableData.NLP_CI_DAI, cnWord, cnPos, koWord);
            registerMixedPos(StableMaps.jieCi, StableData.NLP_CI_JIE, cnWord, cnPos, koWord);
            registerMixedPos(StableMaps.xingRongCi, StableData.NLP_CI_XING_RONG, cnWord, cnPos, koWord);
            registerMixedPos(StableMaps.zhuCi, StableData.NLP_CI_ZHU, cnWord, cnPos, koWord);
            registerMixedPos(StableMaps.weiCi, StableData.NLP_CI_WEI, cnWord, cnPos, koWord);
            registerMixedPos(StableMaps.shengLueCi, StableData.NLP_CI_SHENG_LUE, cnWord, cnPos, koWord);
            registerMixedPos(StableMaps.qingTaiCi, StableData.NLP_CI_QING_TAI, cnWord, cnPos, koWord);
            registerMixedPos(StableMaps.xingWeiCi, StableData.NLP_CI_XING_WEI, cnWord, cnPos, koWord);
            registerMixedPos(StableMaps.shiTaiCi, StableData.NLP_CI_SHI_TAI, cnWord, cnPos, koWord);
            registerMixedPos(StableMaps.dingMingCi, StableData.NLP_CI_DING_MING, cnWord, cnPos, koWord);

            posCnToCn.put(cnWord, cnPos);

            linkedHashMap = loopLoadForest(cInputString);
            if (null != cInputStringKorea) {
                linkedHashMap = loopLoadForest(cInputStringKorea);
            }
        }
    }
}

/**
 * Opens a classpath dictionary resource as a buffered reader using the charset named by
 * {@code StableData.UTF8_STRING}.
 *
 * @param resourcePath classpath location handed to {@code getResourceAsStream}
 * @return a reader the caller must close
 * @throws IOException if the resource does not exist or the charset name is not supported
 */
private BufferedReader openMixedDictionaryReader(String resourcePath) throws IOException {
    InputStream stream = getClass().getResourceAsStream(resourcePath);
    if (null == stream) {
        // getResourceAsStream signals "not found" with null; fail with a message that names
        // the resource instead of a bare NullPointerException further down.
        throw new IOException("Dictionary resource not found: " + resourcePath);
    }
    try {
        return new BufferedReader(new InputStreamReader(stream, StableData.UTF8_STRING));
    } catch (IOException e) {
        // Do not leak the stream when the reader cannot be constructed.
        stream.close();
        throw e;
    }
}

/**
 * Registers a word in one POS map when its POS string carries the given tag and the word is
 * not present yet; the Korean word, if any, is stored under the same POS string.
 *
 * @param posMap target POS map
 * @param posTag tag that must occur in {@code cnPos} for the word to be registered
 * @param cnWord word taken from the first dictionary's line
 * @param cnPos  POS string taken from the first dictionary's line
 * @param koWord word taken from the Korean line, or {@code null} when there is none
 */
private static void registerMixedPos(Map<? super String, ? super String> posMap, CharSequence posTag,
        String cnWord, String cnPos, String koWord) {
    if (posMap.containsKey(cnWord) || !cnPos.contains(posTag)) {
        return;
    }
    posMap.put(cnWord, cnPos);
    if (null != koWord) {
        posMap.put(koWord, cnPos);
    }
}


public void index() throws IOException {
posCnToCn= new HashMap<>();
linkedHashMap= new HashMap<>();
Expand Down

0 comments on commit 95081d2

Please sign in to comment.