Skip to content

Commit

Permalink
Fixes for the cj break engine
Browse files (browse the repository at this point in the history)
  • Loading branch information
camertron committed Apr 17, 2018
1 parent 882354a commit 31cccf0
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 21 deletions.
8 changes: 5 additions & 3 deletions lib/twitter_cldr/segmentation/break_iterator.rb
Expand Up @@ -119,9 +119,11 @@ def each_word_boundary(str, &block)
end

# break with normal, regex-based rule set
rule_set.each_boundary(cursor, stop) do |boundary|
last_boundary = boundary
yield boundary
if stop > cursor.position
rule_set.each_boundary(cursor, stop) do |boundary|
last_boundary = boundary
yield boundary
end
end

# make sure we're not at the end of the road after breaking the
Expand Down
39 changes: 21 additions & 18 deletions lib/twitter_cldr/segmentation/cj_break_engine.rb
Expand Up @@ -42,23 +42,26 @@ def word_set
end

def divide_up_dictionary_range(cursor, end_pos)
best_snlp = Array.new(cursor.length + 1) { LARGE_NUMBER }
prev = Array.new(cursor.length + 1) { -1 }
input_length = end_pos - cursor.position
best_snlp = Array.new(input_length + 1) { LARGE_NUMBER }
prev = Array.new(input_length + 1) { -1 }

best_snlp[0] = 0
start_pos = cursor.position
is_prev_katakana = false

until cursor.eos?
if best_snlp[cursor.position] == LARGE_NUMBER
until cursor.position >= end_pos
idx = cursor.position - start_pos

if best_snlp[idx] == LARGE_NUMBER
cursor.advance
next
end

max_search_length = if cursor.position + MAX_WORD_SIZE < cursor.length
max_search_length = if cursor.position + MAX_WORD_SIZE < end_pos
MAX_WORD_SIZE
else
cursor.length - cursor.position
end_pos - cursor.position
end

count, values, lengths, _ = dictionary.matches(
Expand All @@ -72,11 +75,11 @@ def divide_up_dictionary_range(cursor, end_pos)
end

count.times do |j|
new_snlp = best_snlp[cursor.position] + values[j]
new_snlp = best_snlp[idx] + values[j]

if new_snlp < best_snlp[lengths[j] + cursor.position]
best_snlp[lengths[j] + cursor.position] = new_snlp
prev[lengths[j] + cursor.position] = cursor.position
if new_snlp < best_snlp[lengths[j] + idx]
best_snlp[lengths[j] + idx] = new_snlp
prev[lengths[j] + idx] = idx
end
end

Expand All @@ -93,17 +96,17 @@ def divide_up_dictionary_range(cursor, end_pos)
j = cursor.position + 1
cursor.advance

while j < cursor.length && (j - cursor.position) < MAX_KATAKANA_GROUP_LENGTH && is_katakana?(cursor.current_cp)
while j < end_pos && (j - idx) < MAX_KATAKANA_GROUP_LENGTH && is_katakana?(cursor.current_cp)
cursor.advance
j += 1
end

if (j - cursor.position) < MAX_KATAKANA_GROUP_LENGTH
new_snlp = best_snlp[cursor.position] + get_katakana_cost(j - cursor.position)
if (j - idx) < MAX_KATAKANA_GROUP_LENGTH
new_snlp = best_snlp[idx] + get_katakana_cost(j - idx)

if new_snlp < best_snlp[j]
best_snlp[j] = new_snlp
prev[j] = cursor.position
prev[j] = idx
end
end
end
Expand All @@ -113,13 +116,13 @@ def divide_up_dictionary_range(cursor, end_pos)

t_boundary = []

if best_snlp[cursor.length] == LARGE_NUMBER
t_boundary << cursor.length
if best_snlp[input_length] == LARGE_NUMBER
t_boundary << end_pos
else
idx = cursor.length
idx = end_pos - start_pos

while idx > 0
t_boundary << idx
t_boundary << idx + start_pos
idx = prev[idx]
end
end
Expand Down

0 comments on commit 31cccf0

Please sign in to comment.