Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Remove statistical conversion code

  • Loading branch information...
commit 48d5cc478200bb354bff879aff34ed174f8874b5 1 parent 5d20a3f
@ueno authored
View
3  README
@@ -16,9 +16,6 @@ Features:
* GObject based API with gobject-introspection support.
-* Experimental support for intelligent kana-to-kanji conversion based
- on Viterbi algorithm.
-
Documentation:
* file:tests/context.c for basic usage
View
1  libskk/Makefile.am
@@ -56,7 +56,6 @@ libskkinclude_HEADERS = libskk.h
libskk_la_SOURCES = \
rom-kana.vala \
- kana-kan.vala \
encoding.vala \
dict.vala \
file-dict.vala \
View
212 libskk/kana-kan.vala
@@ -1,212 +0,0 @@
-// A naive kana-kanji converter based on:
-// http://gihyo.jp/magazine/wdpress/archive/2011/vol64 (Japanese)
-// dictionary and score map generation scripts can be found at:
-// http://gihyo.jp/assets/files/magazine/wdpress/2011/64/WDB64-toku3-kanakan.zip
-// See tests/kana-kan.c for example.
-
-using Gee;
-
-namespace Skk {
- /**
- * Experimental kana-to-kanji converter using Viterbi algorithm.
- */
- public class KanaKanConverter : Object {
- KanaKanDict dict;
- KanaKanScoreMap map;
-
- public KanaKanConverter (KanaKanDict dict, KanaKanScoreMap map) {
- this.dict = dict;
- this.map = map;
- }
-
- public string convert (string kana) {
- var graph = new KanaKanGraph (dict, kana);
- StringBuilder builder = new StringBuilder ();
- string[] words = viterbi (graph, map);
- foreach (var word in words) {
- builder.append (word);
- }
- return builder.str;
- }
-
- static string[] viterbi (KanaKanGraph graph, KanaKanScoreMap map) {
- foreach (var nodes in graph.nodes) {
- foreach (var node in nodes) {
- if (node.is_bos ())
- continue;
- node.score = -1000000.0;
- var node_score = map.get_node_score (node);
- var prev_nodes = graph.get_prev_nodes (node);
- foreach (var prev_node in prev_nodes) {
- var score = prev_node.score + map.get_edge_score (prev_node, node) + node_score;
- if (score >= node.score) {
- node.score = score;
- node.prev = prev_node;
- }
- }
- }
- }
- ArrayList<string> result = new ArrayList<string> ();
- var node = graph.eos.prev;
- while (!node.is_bos ()) {
- result.insert (0, node.word);
- node = node.prev;
- }
- return result.to_array ();
- }
- }
-
- public class KanaKanDict : Object {
- HashMap<string,Set<string>> dict =
- new HashMap<string,Set<string>> ();
- public KanaKanDict (string path) throws GLib.Error, GLib.IOError {
- File file = File.new_for_path (path);
- DataInputStream input = new DataInputStream (file.read ());
- while (true) {
- size_t length;
- string? line = input.read_line (out length);
- if (line == null)
- break;
- string[] a = line.chomp ().split ("\t");
- add (a[0], a[1]);
- }
- }
-
- public void add (string pron, string word) {
- if (!dict.has_key (pron)) {
- dict.set (pron, new HashSet<string> ());
- }
- dict.get (pron).add (word);
- }
-
- internal Set<string> lookup (string pron) {
- if (!dict.has_key (pron)) {
- return new HashSet<string> ();
- }
- return dict.get (pron);
- }
- }
-
- public class KanaKanScoreMap : Object {
- Map<string,double?> map = new HashMap<string,double?> ();
- public KanaKanScoreMap (string path, KanaKanDict dict) throws GLib.Error, GLib.IOError {
- File file = File.new_for_path (path);
- DataInputStream input = new DataInputStream (file.read ());
- while (true) {
- size_t length;
- string? line = input.read_line (out length);
- if (line == null)
- break;
- string[] a = line.chomp ().split ("\t\t");
- map.set (a[0], double.parse (a[1]));
- string[] b = a[0].split ("\t");
- if (b.length == 2 &&
- b[0].has_prefix ("S") &&
- b[1].has_prefix ("R")) {
- var word = b[0].substring (1);
- var pron = b[1].substring (1);
- dict.add (pron, word);
- }
- }
- }
-
- double get_score (string feature) {
- if (map.has_key (feature))
- return map.get (feature);
- return 0.0;
- }
-
- internal double get_node_score (KanaKanNode node) {
- double score = 0.0;
- string feature;
- feature = "S%s\tR%s".printf (node.word, node.pron);
- score += get_score (feature);
- feature = "S%s".printf (node.word);
- score += get_score (feature);
- return score;
- }
-
- internal double get_edge_score (KanaKanNode prev_node, KanaKanNode node) {
- var feature = "S%s\tS%s".printf (prev_node.word, node.word);
- return get_score (feature);
- }
- }
-
- class KanaKanNode : Object {
- internal string word;
- internal string pron;
- internal int endpos;
- internal double score = 0.0;
- internal KanaKanNode? prev = null;
-
- internal KanaKanNode (string word, string pron, int endpos) {
- this.word = word;
- this.pron = pron;
- this.endpos = endpos;
- }
-
- internal int length {
- get {
- return pron.char_count ();
- }
- }
-
- internal bool is_bos () {
- return endpos == 0;
- }
-
- internal bool is_eos () {
- return length == 0 && endpos != 0;
- }
- }
-
- class KanaKanGraph : Object {
- KanaKanDict dict;
- internal ArrayList<KanaKanNode>[] nodes;
- internal KanaKanNode bos;
- internal KanaKanNode eos;
-
- internal KanaKanGraph (KanaKanDict dict, string str) {
- this.dict = dict;
- UnicodeString ustr = new UnicodeString (str);
- nodes = new ArrayList<KanaKanNode>[ustr.length + 2];
- for (int i = 0; i < ustr.length + 2; i++) {
- nodes[i] = new ArrayList<KanaKanNode> ();
- }
-
- bos = new KanaKanNode ("", "", 0);
- nodes[0].add (bos);
-
- eos = new KanaKanNode ("", "", ustr.length + 1);
- nodes[ustr.length + 1].add (eos);
-
- for (int i = 0; i < ustr.length; i++) {
- for (int j = i + 1; j <= int.min (ustr.length, i + 16); j++) {
- var pron = ustr.substring (i, j - i);
- var words = dict.lookup (pron);
- foreach (var word in words) {
- var node = new KanaKanNode (word, pron, j);
- nodes[j].add (node);
- }
- }
- if (i < ustr.length) {
- var pron = ustr.substring (i, 1);
- var node = new KanaKanNode (pron, pron, i + 1);
- nodes[i + 1].add (node);
- }
- }
- }
-
- internal ArrayList<KanaKanNode> get_prev_nodes (KanaKanNode node) {
- if (node.is_eos ()) {
- int startpos = node.endpos - 1;
- return nodes[startpos];
- } else if (node.is_bos ()) {
- return new ArrayList<KanaKanNode> ();
- } else {
- int startpos = node.endpos - node.length;
- return nodes[startpos];
- }
- }
- }
-}
View
5 tests/Makefile.am
@@ -33,9 +33,6 @@ AM_CPPFLAGS = \
rom_kana_SOURCES = rom-kana.c
rom_kana_LDADD = $(top_builddir)/libskk/libskk.la $(LIBSKK_LIBS)
-# kana_kan_SOURCES = kana-kan.c
-# kana_kan_LDADD = $(top_builddir)/libskk/libskk.la $(LIBSKK_LIBS)
-
file_dict_SOURCES = file-dict.c
file_dict_LDADD = $(top_builddir)/libskk/libskk.la $(LIBSKK_LIBS)
@@ -57,7 +54,7 @@ context_LDADD = $(top_builddir)/libskk/libskk.la $(LIBSKK_LIBS)
basic_SOURCES = basic.c common.c
basic_LDADD = $(top_builddir)/libskk/libskk.la $(LIBSKK_LIBS)
-EXTRA_DIST = file-dict.dat cdb-dict.dat kana-kan.c
+EXTRA_DIST = file-dict.dat cdb-dict.dat
CLEANFILES = user-dict.dat valgrind.log.*
-include $(top_srcdir)/git.mk
View
42 tests/kana-kan.c
@@ -1,42 +0,0 @@
-#include <libskk/libskk.h>
-
-static void
-kana_kan (void)
-{
- SkkKanaKanDict *dict;
- SkkKanaKanScoreMap *map;
- SkkKanaKanConverter *converter;
- GError *error;
- gchar *output;
-
- error = NULL;
- dict = skk_kana_kan_dict_new ("juman.dic", &error);
- g_assert_no_error (error);
-
- error = NULL;
- map = skk_kana_kan_score_map_new ("mk.model", dict, &error);
- g_assert_no_error (error);
-
- converter = skk_kana_kan_converter_new (dict, map);
-
- output = skk_kana_kan_converter_convert (converter, "かなかんじへんかんのれい");
- printf ("%s\n", output);
- g_free (output);
-
- output = skk_kana_kan_converter_convert (converter, "かなからかんじにへんかん");
- printf ("%s\n", output);
- g_free (output);
-
- g_object_unref (converter);
- g_object_unref (map);
- g_object_unref (dict);
-}
-
-int
-main (int argc, char **argv) {
- g_type_init ();
- skk_init ();
- g_test_init (&argc, &argv, NULL);
- g_test_add_func ("/libskk/kana-kan", kana_kan);
- return g_test_run ();
-}
Please sign in to comment.
Something went wrong with that request. Please try again.