Skip to content

Commit

Permalink
Re-add old deconjugator (option can live toggle)
Browse files Browse the repository at this point in the history
  • Loading branch information
wareya committed Mar 23, 2017
1 parent 1f0b7f4 commit ff106ec
Show file tree
Hide file tree
Showing 7 changed files with 630 additions and 306 deletions.
2 changes: 0 additions & 2 deletions src/language/deconjugator/ValidWord.java
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,4 @@ public String toString()
temp = temp.replaceAll("[(].*?[)]", "");
return word + "―" + temp;
}


}
318 changes: 39 additions & 279 deletions src/language/deconjugator/WordScanner.java

Large diffs are not rendered by default.

305 changes: 305 additions & 0 deletions src/language/deconjugator/WordScannerNew.java

Large diffs are not rendered by default.

262 changes: 262 additions & 0 deletions src/language/deconjugator/WordScannerOld.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,262 @@
/*
* Copyright (C) 2017 Laurens Weyn
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package language.deconjugator;

import language.dictionary.DefTag;
import language.dictionary.Japanese;

import java.util.ArrayList;

/**
* Created by wareya on 2017/03/23.
*/
public class WordScannerOld extends WordScanner implements SubScanner
{
public void subinit()
{
if(ruleList != null)return;
ruleList = new ArrayList<>();


//Hiragana->Katakana: words are often written in katakana for emphasis but won't be found in EDICT in that form
ruleList.add(word ->
{
String hiragana = Japanese.toHiragana(word.getWord(), false);
if(!word.getWord().equals(hiragana))
{
return new ValidWord(hiragana, "hiragana");
}
else return null;
});


//Decensor: simple, but actually works well enough with a lot of 'censored' words
ruleList.add(l_word ->
{
if(l_word.getWord().contains("○"))
{
return new ValidWord(l_word.getWord().replace('○', 'っ'), (l_word.getProcess() + " " + "censored").trim());
}
else return null;
});
ruleList.add(l_word ->
{
if(l_word.getWord().contains("○"))
{
return new ValidWord(l_word.getWord().replace('○', 'ん'), (l_word.getProcess() + " " + "censored").trim());
}
else return null;
});

//entire Japanese deconjugation lookup table
//see http://www.wikiwand.com/en/Japanese_verb_conjugation

//conditional/past conditional (ra) (+ba for formal) (adds on to past, must come before)
ruleList.add(new StdRule("ったら", "った", "conditional", DefTag.v5u));
ruleList.add(new StdRule("ったらば", "った", "conditional-formal", DefTag.v5u));
ruleList.add(new StdRule("いたら", "いた", "conditional", DefTag.v5k));
ruleList.add(new StdRule("いたらば", "いた", "conditional-formal", DefTag.v5k));
ruleList.add(new StdRule("いだら", "いだ", "conditional", DefTag.v5g));
ruleList.add(new StdRule("いだらば", "いだ", "conditional-formal", DefTag.v5g));
ruleList.add(new StdRule("したら", "した", "conditional", DefTag.v5s));
ruleList.add(new StdRule("したらば", "した", "conditional-formal", DefTag.v5s));
ruleList.add(new StdRule("ったら", "った", "conditional", DefTag.v5t));
ruleList.add(new StdRule("ったらば", "った", "conditional-formal", DefTag.v5t));
ruleList.add(new StdRule("んだら", "んだ", "conditional", DefTag.v5b));
ruleList.add(new StdRule("んだらば", "んだ", "conditional-formal", DefTag.v5b));
ruleList.add(new StdRule("んだら", "んだ", "conditional", DefTag.v5n));
ruleList.add(new StdRule("んだらば", "んだ", "conditional-formal", DefTag.v5n));
ruleList.add(new StdRule("んだら", "んだ", "conditional", DefTag.v5m));
ruleList.add(new StdRule("んだらば", "んだ", "conditional-formal", DefTag.v5m));
ruleList.add(new StdRule("ったら", "った", "conditional", DefTag.v5r));
ruleList.add(new StdRule("ったらば", "った", "conditional-formal", DefTag.v5r));

ruleList.add(new StdRule("かったら", "かった", "conditional", DefTag.adj_i));//TODO does this work with i adjectives?
ruleList.add(new StdRule("かったらば", "かった", "conditional-formal", DefTag.adj_i));//TODO does this work with i adjectives?
ruleList.add(new StdRule("たら", "た", "conditional", DefTag.v1));
ruleList.add(new StdRule("たらば", "た", "conditional-formal", DefTag.v1));

//potential (can do verb, can combine with past)
//further conjugates like v1
ruleList.add(new StdRule("える", "う", "potential", DefTag.v5u, DefTag.v1));
ruleList.add(new StdRule("ける", "く", "potential", DefTag.v5k, DefTag.v1));
ruleList.add(new StdRule("げる", "ぐ", "potential", DefTag.v5g, DefTag.v1));
ruleList.add(new StdRule("せる", "す", "potential", DefTag.v5s, DefTag.v1));
ruleList.add(new StdRule("てる", "つ", "potential", DefTag.v5t, DefTag.v1));
ruleList.add(new StdRule("べる", "ぶ", "potential", DefTag.v5b, DefTag.v1));
ruleList.add(new StdRule("ねる", "ぬ", "potential", DefTag.v5n, DefTag.v1));
ruleList.add(new StdRule("める", "む", "potential", DefTag.v5m, DefTag.v1));
ruleList.add(new StdRule("れる", "る", "potential", DefTag.v5r, DefTag.v1));

//past->dict
ruleList.add(new StdRule("った", "う", "past", DefTag.v5u));
ruleList.add(new StdRule("いた", "く", "past", DefTag.v5k));
ruleList.add(new StdRule("いだ", "ぐ", "past", DefTag.v5g));
ruleList.add(new StdRule("した", "す", "past", DefTag.v5s));
ruleList.add(new StdRule("った", "つ", "past", DefTag.v5t));
ruleList.add(new StdRule("んだ", "ぶ", "past", DefTag.v5b));
ruleList.add(new StdRule("んだ", "ぬ", "past", DefTag.v5n));
ruleList.add(new StdRule("んだ", "む", "past", DefTag.v5m));
ruleList.add(new StdRule("った", "る", "past", DefTag.v5r));

ruleList.add(new StdRule("かった", "い", "past", DefTag.adj_i));
ruleList.add(new StdRule("た", "る", "past", DefTag.v1));

//te->dict
ruleList.add(new StdRule("って", "う", "て form", DefTag.v5u));
ruleList.add(new StdRule("いて", "く", "て form", DefTag.v5k));
ruleList.add(new StdRule("いで", "ぐ", "て form", DefTag.v5g));
ruleList.add(new StdRule("して", "す", "て form", DefTag.v5s));
ruleList.add(new StdRule("って", "つ", "て form", DefTag.v5t));
ruleList.add(new StdRule("んで", "ぶ", "て form", DefTag.v5b));
ruleList.add(new StdRule("んで", "ぬ", "て form", DefTag.v5n));
ruleList.add(new StdRule("んで", "む", "て form", DefTag.v5m));
ruleList.add(new StdRule("って", "る", "て form", DefTag.v5r));

ruleList.add(new StdRule("くて", "い", "て form", DefTag.adj_i));
ruleList.add(new StdRule("て", "る", "て form", DefTag.v1));

//neg->dict (te form must be done first)
//further conjugates like i adjective
ruleList.add(new StdRule("わない", "う", "negative", DefTag.v5u, DefTag.adj_i));
ruleList.add(new StdRule("かない", "く", "negative", DefTag.v5k, DefTag.adj_i));
ruleList.add(new StdRule("がない", "ぐ", "negative", DefTag.v5g, DefTag.adj_i));
ruleList.add(new StdRule("さない", "す", "negative", DefTag.v5s, DefTag.adj_i));
ruleList.add(new StdRule("たない", "つ", "negative", DefTag.v5t, DefTag.adj_i));
ruleList.add(new StdRule("ばない", "ぶ", "negative", DefTag.v5b, DefTag.adj_i));
ruleList.add(new StdRule("なない", "ぬ", "negative", DefTag.v5n, DefTag.adj_i));
ruleList.add(new StdRule("まない", "む", "negative", DefTag.v5m, DefTag.adj_i));
ruleList.add(new StdRule("らない", "る", "negative", DefTag.v5r, DefTag.adj_i));

ruleList.add(new StdRule("くない", "い", "negative", DefTag.adj_i));
ruleList.add(new StdRule("ない", "る", "negative", DefTag.v1, DefTag.adj_i));

//masu/tai/etc removal (handled after past so that still conjugates, before i stem)
//TODO make these only work with verbs (not with gatai!)
ruleList.add(new StdRule("ます", "", "polite"));
ruleList.add(new StdRule("ません", "", "negative polite"));
ruleList.add(new StdRule("たい", "", "want"));
ruleList.add(new StdRule("なさい", "", "command"));

//i stem (polite/tai/etc)
ruleList.add(new StdRule("い", "う", "i stem", DefTag.v5u));
ruleList.add(new StdRule("き", "く", "i stem", DefTag.v5k));
ruleList.add(new StdRule("ぎ", "ぐ", "i stem", DefTag.v5g));
ruleList.add(new StdRule("し", "す", "I stem", DefTag.v5s));//note: capital I to stop removal of v5s tag
ruleList.add(new StdRule("ち", "つ", "i stem", DefTag.v5t));
ruleList.add(new StdRule("に", "ぬ", "i stem", DefTag.v5n));
ruleList.add(new StdRule("び", "ぶ", "i stem", DefTag.v5b));
ruleList.add(new StdRule("み", "む", "i stem", DefTag.v5m));
ruleList.add(new StdRule("り", "る", "i stem", DefTag.v5r));

ruleList.add(new StdRule("", "る", "i stem", DefTag.v1));
//adjective stems moved to bottom to avoid conflict with provisional-conditional

//adjective conjugations
ruleList.add(new StdRule("く", "い", "adverb", DefTag.adj_i));
ruleList.add(new StdRule("な", "", "adjective", DefTag.adj_na));
ruleList.add(new StdRule("の", "", "adjective", DefTag.adj_no));

//potential was here

//not for adjectives
ruleList.add(new StdRule("られる", "る", "potential", DefTag.v1));//normal
ruleList.add(new StdRule("らる", "る", "potential", DefTag.v1));//coloquial

//passive
//further conjugates like v1
ruleList.add(new StdRule("われる", "う", "passive", DefTag.v5u, DefTag.v1));
ruleList.add(new StdRule("かれる", "く", "passive", DefTag.v5k, DefTag.v1));
ruleList.add(new StdRule("がれる", "ぐ", "passive", DefTag.v5g, DefTag.v1));
ruleList.add(new StdRule("される", "す", "passive", DefTag.v5s, DefTag.v1));
ruleList.add(new StdRule("たてる", "つ", "passive", DefTag.v5t, DefTag.v1));
ruleList.add(new StdRule("ばれる", "ぶ", "passive", DefTag.v5b, DefTag.v1));
ruleList.add(new StdRule("なれる", "ぬ", "passive", DefTag.v5n, DefTag.v1));
ruleList.add(new StdRule("まれる", "む", "passive", DefTag.v5m, DefTag.v1));
ruleList.add(new StdRule("られる", "る", "passive", DefTag.v5r, DefTag.v1));

//not for adjectives
ruleList.add(new StdRule("られる", "る", "passive", DefTag.v1));


//causative

//causative passive (colloquial version, add polite later?)


//-eba form (provisional conditional)
ruleList.add(new StdRule("えば", "う", "provisional-conditional", DefTag.v5u));
ruleList.add(new StdRule("けば", "く", "provisional-conditional", DefTag.v5k));
ruleList.add(new StdRule("げば", "ぐ", "provisional-conditional", DefTag.v5g));
ruleList.add(new StdRule("せば", "す", "provisional-conditional", DefTag.v5s));
ruleList.add(new StdRule("てば", "つ", "provisional-conditional", DefTag.v5t));
ruleList.add(new StdRule("べば", "ぶ", "provisional-conditional", DefTag.v5b));
ruleList.add(new StdRule("ねば", "ぬ", "provisional-conditional", DefTag.v5n));
ruleList.add(new StdRule("めば", "む", "provisional-conditional", DefTag.v5m));
ruleList.add(new StdRule("れば", "る", "provisional-conditional", DefTag.v5r));
ruleList.add(new StdRule("れば", "る", "provisional-conditional", DefTag.v5r_i));

ruleList.add(new StdRule("くない", "い", "provisional-conditional", DefTag.adj_i));
ruleList.add(new StdRule("なければ", "ない", "provisional-conditional", DefTag.aux_adj));
ruleList.add(new StdRule("れば", "る", "provisional-conditional", DefTag.v1));


//imperative (for orders)
ruleList.add(new StdRule("え", "う", "imperative", DefTag.v5u));
ruleList.add(new StdRule("け", "く", "imperative", DefTag.v5k));
ruleList.add(new StdRule("げ", "ぐ", "imperative", DefTag.v5g));
ruleList.add(new StdRule("せ", "す", "imperative", DefTag.v5s));
ruleList.add(new StdRule("て", "つ", "imperative", DefTag.v5t));
ruleList.add(new StdRule("べ", "ぶ", "imperative", DefTag.v5b));
ruleList.add(new StdRule("ね", "ぬ", "imperative", DefTag.v5n));
ruleList.add(new StdRule("め", "む", "imperative", DefTag.v5m));
ruleList.add(new StdRule("れ", "る", "imperative", DefTag.v5r));
ruleList.add(new StdRule("れ", "る", "imperative", DefTag.v5r_i));

//not for i-adj, 4 exist for v1
ruleList.add(new StdRule("いろ", "いる", "imperative", DefTag.v1));
ruleList.add(new StdRule("いよ", "いる", "imperative", DefTag.v1));
ruleList.add(new StdRule("えろ", "える", "imperative", DefTag.v1));
ruleList.add(new StdRule("えよ", "える", "imperative", DefTag.v1));

//volitional (let's)
ruleList.add(new StdRule("おう", "う", "volitional", DefTag.v5u));
ruleList.add(new StdRule("こう", "く", "volitional", DefTag.v5k));
ruleList.add(new StdRule("ごう", "ぐ", "volitional", DefTag.v5g));
ruleList.add(new StdRule("そう", "す", "volitional", DefTag.v5s));
ruleList.add(new StdRule("とう", "つ", "volitional", DefTag.v5t));
ruleList.add(new StdRule("ぼう", "ぶ", "volitional", DefTag.v5b));
ruleList.add(new StdRule("のう", "ぬ", "volitional", DefTag.v5n));
ruleList.add(new StdRule("もう", "む", "volitional", DefTag.v5m));
ruleList.add(new StdRule("ろう", "る", "volitional", DefTag.v5r));

ruleList.add(new StdRule("かろう", "い", "volitional", DefTag.adj_i));
ruleList.add(new StdRule("よう", "る", "volitional", DefTag.v1));


ruleList.add(new StdRule("", "い", "adj 'stem'", DefTag.adj_i));//Try match stem anyways, needed for things like '頼もしげに'
//no stem for adjectives, but -sou sort-of uses a stem
ruleList.add(new StdRule("そう", "い", "-sou", DefTag.adj_i));
}
// nasty subroutine: make functional? how much overhead does passing data structures have in java?
public void ScanWord(String word)
{
matches.add(new ValidWord(word, ""));//add initial unmodified word
test_rules(0);
}
}
47 changes: 22 additions & 25 deletions src/language/splitter/WordSplitter.java
Original file line number Diff line number Diff line change
Expand Up @@ -58,42 +58,40 @@ private List<FoundWord> splitSection(String text, boolean firstSection)

// select the initial "overly long and certainly bogus" segment for deconjugation

/*
// look for the longest segment covered as-is in the dictionary
while(pos > start)
if(!options.getOption("automaticallyParse").equals("none")) // (unless parsing is disabled)
{
String string_at = text.substring(start, pos);
if(dict.find(string_at) == null && !dict.hasEpwingDef(string_at))
// look for the longest segment covered as-is in the dictionary
while(pos > start)
{
// not in dictionary, see if adding possible deconjugation match endings to it gives us a dictionary entry (fixes 振り返ります etc)
string_at = string_at.substring(0, string_at.length()-1);
boolean good_match = false;
for(String ending:WordScanner.possibleEndings())
{
String attempt = string_at+ending;
if(dict.find(attempt) != null || dict.hasEpwingDef(attempt))
good_match = true;
}
if(!good_match)
String string_at = text.substring(start, pos);
if(dict.find(string_at) == null && !dict.hasEpwingDef(string_at))
{
pos--;
continue; // don't fall through to "break;"
// not in dictionary, see if adding possible deconjugation match endings to it gives us a dictionary entry (fixes 振り返ります etc)
string_at = string_at.substring(0, string_at.length()-1);
boolean good_match = false;
for(String ending:WordScanner.possibleEndings())
{
String attempt = string_at+ending;
if(dict.find(attempt) != null || dict.hasEpwingDef(attempt))
good_match = true;
}
if(!good_match)
{
pos--;
continue; // don't fall through to "break;"
}
}
break;
}
break;
}
// extend it until extending it picks up things other than just kana
if(!options.getOption("automaticallyParse").equals("none")) // (unless parsing is disabled)
{
// extend it until it's about to pick up characters that aren't acceptable in conjugations
while(pos < text.length())
{
if(isKana(text.charAt(pos))) // character past the end of substr start...pos
if(WordScanner.isAcceptableCharacter(text.charAt(pos))) // character past the end of substr start...pos
pos++;
else
break;
}
}
*/
FoundWord matchedWord = null;
//until we've tried all lengths and failed
while(pos > start)
Expand All @@ -115,7 +113,6 @@ private List<FoundWord> splitSection(String text, boolean firstSection)
if(matchedWord.getDefinitionCount() == 0 && options.getOption("automaticallyParse").equals("full")) // (only if full parsing is enabled)
{
matchedWord = null;
//System.out.println("--->Trying a shorter segment");
pos--;//try shorter word
}
else//found a word
Expand Down
1 change: 1 addition & 0 deletions src/options/Options.java
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ public Options()
options.put("screenshotExportPath", "screenshots/");
options.put("fullscreenScreenshot", "false");
options.put("automaticallyParse", "true");
options.put("useOldParser", "false");

}
public Options(File file)throws IOException
Expand Down

0 comments on commit ff106ec

Please sign in to comment.