Skip to content

Commit

Permalink
Fast segmentation (#229)
Browse files Browse the repository at this point in the history
Improves text segmentation engine by switching from a regex-based approach to a state table-based approach.
  • Loading branch information
camertron committed Dec 1, 2019
2 parents 906aecf + 20a49a8 commit ea16c16
Show file tree
Hide file tree
Showing 56 changed files with 16,080 additions and 2,317 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# TwitterCldr Changelog

### 5.2.0 (December 1st, 2019)
* Improve performance of the text segmentation algorithm.
- Break engine now uses state tables from ICU instead of regular expressions.
- It was... embarrassing how slow it was before.
* Added support for line and grapheme cluster segmentation.

### 5.1.0 (November 21st, 2019)
* Upgrade to CLDR v36, ICU 65.1, and Emoji 12.1.
* Full timezone support in formatted dates and times (eg. "Eastern Standard Time" instead of simply "UTC").
Expand Down
10 changes: 5 additions & 5 deletions Rakefile
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,11 @@ namespace :update do
TwitterCldr::Resources::CollationTestsImporter.new.import
end

# Rake task that instantiates TwitterCldr::Resources::SegmentRulesImporter
# and calls #import on it.
desc 'Import text segmentation rules'
task :segment_rules do
TwitterCldr::Resources::SegmentRulesImporter.new.import
end

desc 'Import (generate) bidi tests (should be executed using JRuby 1.7 in 1.9 mode)'
task :bidi_tests do
TwitterCldr::Resources::BidiTestImporter.new.import
Expand Down Expand Up @@ -195,11 +200,6 @@ namespace :update do
TwitterCldr::Resources::TransformTestsImporter.new.import
end

desc 'Import segment exceptions'
task :segment_exceptions do
TwitterCldr::Resources::Uli::SegmentExceptionsImporter.new.import
end

desc 'Import segment tests'
task :segment_tests do
TwitterCldr::Resources::SegmentTestsImporter.new.import
Expand Down
1 change: 1 addition & 0 deletions lib/twitter_cldr.rb
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ module TwitterCldr
def_delegator :resources, :resource_exists?
def_delegator :resources, :locale_resource_exists?
def_delegator :resources, :absolute_resource_path
def_delegator :resources, :resource_file_path

class << self

Expand Down
10 changes: 2 additions & 8 deletions lib/twitter_cldr/resources.rb
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ module Resources
autoload :NumberFormatsImporter, 'twitter_cldr/resources/number_formats_importer'
autoload :PostalCodesImporter, 'twitter_cldr/resources/postal_codes_importer'
autoload :Properties, 'twitter_cldr/resources/properties'
autoload :SegmentRulesImporter, 'twitter_cldr/resources/segment_rules_importer'
autoload :RbnfTestImporter, 'twitter_cldr/resources/rbnf_test_importer'
autoload :ReadmeRenderer, 'twitter_cldr/resources/readme_renderer'
autoload :RegexpAstGenerator, 'twitter_cldr/resources/regexp_ast_generator'
Expand All @@ -37,7 +38,6 @@ module Resources
autoload :UnicodeDataImporter, 'twitter_cldr/resources/unicode_data_importer'
autoload :UnicodeFileParser, 'twitter_cldr/resources/unicode_file_parser'
autoload :UnicodePropertyAliasesImporter, 'twitter_cldr/resources/unicode_property_aliases_importer'
autoload :Uli, 'twitter_cldr/resources/uli'
autoload :ValidityDataImporter, 'twitter_cldr/resources/validity_data_importer'

class << self
Expand All @@ -60,6 +60,7 @@ def standard_importer_classes
NumberFormatsImporter,
PostalCodesImporter,
RbnfTestImporter,
SegmentRulesImporter,
SegmentTestsImporter,
TailoringImporter,
TerritoriesImporter,
Expand All @@ -74,12 +75,6 @@ def standard_importer_classes
]
end

def uli_importer_classes
@uli_importer_classes ||= [
Uli::SegmentExceptionsImporter
]
end

def property_importer_classes
@property_importer_classes ||= [
Properties::AgePropertyImporter,
Expand Down Expand Up @@ -107,7 +102,6 @@ def property_importer_classes
def importer_classes
@importer_classes ||=
standard_importer_classes +
uli_importer_classes +
property_importer_classes
end

Expand Down
10 changes: 6 additions & 4 deletions lib/twitter_cldr/resources/loader.rb
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,12 @@ def preload_all_resources
nil
end

# Builds the relative file path for a resource.
#
# Each element of +path+ is stringified and the pieces are joined with the
# platform file separator. A ".yml" extension is appended unless the joined
# path already ends with one.
#
# path - Enumerable of path segments (strings, symbols, ...)
#
# Returns the joined path as a String.
def resource_file_path(path)
  joined = File.join(*path.map { |segment| segment.to_s })
  joined.end_with?('.yml') ? joined : "#{joined}.yml"
end

private

def locale_resource_path(locale, resource_name)
Expand All @@ -85,10 +91,6 @@ def resources_cache
end
end

def resource_file_path(path)
"#{File.join(*path.map(&:to_s))}.yml"
end

def load_resource(path, merge_custom = true)
base = YAML.load(read_resource_file(path))
custom_path = File.join("custom", path)
Expand Down
1 change: 0 additions & 1 deletion lib/twitter_cldr/resources/locales_resources_importer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ class LocalesResourcesImporter < Importer
currency_digits_and_rounding
rbnf_root
numbering_systems
segments_root
territories_containment
likely_subtags
metazones
Expand Down
202 changes: 202 additions & 0 deletions lib/twitter_cldr/resources/segment_rules_importer.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,202 @@
# encoding: UTF-8

# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0

require 'base64'
require 'fileutils'
require 'nokogiri'
require 'yaml'

module TwitterCldr
module Resources

class SegmentRulesImporter < Importer

# Boundary kinds handled by this importer. #execute iterates this hash: the
# key names the output file ("<key>.yml") and is used to look up
# TYPES_TO_ATTRS, while the value is handed to #rule_data_for, which uses it
# to build the "boundaries/<value>" lookup key for the ICU break iterator
# bundle.
# @TODO: more boundary types
BOUNDARY_TYPES = {
'word' => 'word',
'sentence' => 'sentence',
'grapheme' => 'grapheme',
'line' => 'line' # loose, normal, strict (presumably the line break variants still to be supported - TODO confirm)
}.freeze

# Maps each boundary kind to the value of the `type` attribute matched on
# <segmentation> elements in the CLDR segments XML (see the XPath in #execute).
TYPES_TO_ATTRS = {
'word' => 'WordBreak',
'sentence' => 'SentenceBreak',
'grapheme' => 'GraphemeClusterBreak',
'line' => 'LineBreak'
}.freeze

# Shorthand alias for the shared locale class.
Locale = TwitterCldr::Shared::Locale

# Shorthand aliases for the table classes used to serialize the ICU data
# (see #encode_rbbi_data and #encode_trie).
StateTable = TwitterCldr::Segmentation::StateTable
StatusTable = TwitterCldr::Segmentation::StatusTable
CategoryTable = TwitterCldr::Segmentation::CategoryTable

# Importer configuration. The methods below read these back through
# `requirements[:icu]`, `requirements[:cldr]` and `params[:output_path]`.
# NOTE(review): these macros presumably come from the Importer superclass,
# which is not visible here - confirm their exact semantics there.
requirement :icu, Versions.icu_version
requirement :cldr, Versions.cldr_version
output_path File.join('shared', 'segments')
ruby_engine :jruby

# Runs the import. For every locale yielded by #each_locale and every
# boundary kind in BOUNDARY_TYPES, finds the matching <segmentation> node
# and writes up to two YAML files:
#
#   <output_path>/rules/<locale>/<kind>.yml        - result of #rule_data_for
#   <output_path>/suppressions/<locale>/<kind>.yml - result of #suppressions_for
#
# A file is written only when the corresponding payload is non-empty.
def execute
  # Dumps +payload+ as YAML under the given section directory, creating the
  # directory on demand. Empty payloads are skipped entirely.
  emit = lambda do |section, locale, kind, payload|
    next if payload.empty?

    target_dir = File.join(output_path, section, locale)
    FileUtils.mkdir_p(target_dir)
    File.write(File.join(target_dir, "#{kind}.yml"), YAML.dump(payload))
  end

  each_locale do |locale, doc|
    BOUNDARY_TYPES.each_pair do |kind, icu_kind|
      type_attr = TYPES_TO_ATTRS[kind]
      segmentation = doc.xpath(
        "//ldml/segmentations/segmentation[@type=\"#{type_attr}\"]"
      )

      emit.call('rules', locale, kind, rule_data_for(icu_kind, locale, segmentation))
      emit.call('suppressions', locale, kind, suppressions_for(icu_kind, locale, segmentation))
    end
  end
end

private

# Iterates over the CLDR segmentation files found at
# <cldr common_path>/segments/*.xml.
#
# Yields two values per file: the locale code derived from the file name
# (underscores replaced with dashes, e.g. "en_US.xml" -> "en-US") and the
# file's contents parsed with Nokogiri::XML.
#
# Files are visited in sorted order so the import is deterministic
# regardless of the order in which the file system or Ruby version returns
# glob results.
#
# Returns an Enumerator when called without a block.
def each_locale
  return to_enum(__method__) unless block_given?

  pattern = File.join(requirements[:cldr].common_path, 'segments', '*.xml')

  Dir.glob(pattern).sort.each do |file|
    locale = File.basename(file, '.xml').tr('_', '-')
    yield locale, Nokogiri::XML(File.read(file))
  end
end

# Produces the encoded rule data for one boundary kind and locale.
#
# kind   - boundary kind passed on to #rbbi_data_for
# locale - locale code passed on to #rbbi_data_for
# doc    - node set for the matching <segmentation> element(s)
#
# Returns an empty Hash when the segmentation declares neither variables nor
# rules; otherwise a Hash holding the entries of #encode_rbbi_data for the
# data returned by #rbbi_data_for.
def rule_data_for(kind, locale, doc)
  variables = doc.xpath('variables/variable')
  segment_rules = doc.xpath('segmentRules/rule')

  return {} if variables.empty? && segment_rules.empty?

  {}.merge(encode_rbbi_data(rbbi_data_for(kind, locale)))
end

# Collects the text of every suppressions/suppression node under +doc+ and
# encodes the list with #encode_suppressions.
#
# kind and locale are accepted for symmetry with #rule_data_for but are not
# used here.
#
# Returns an empty Hash when there are no suppressions.
def suppressions_for(kind, locale, doc)
  texts = doc.xpath('suppressions/suppression').map { |node| node.text }
  texts.empty? ? {} : encode_suppressions(texts)
end

# Converts an RBBI data object (as returned by #rbbi_data_for) into a Hash of
# serialized tables.
#
# Returns a Hash with the keys :metadata, :forward_table, :backward_table,
# :status_table and :category_table (in that order).
def encode_rbbi_data(data)
  encoded = {}

  encoded[:metadata] = metadata_from(data.fHeader)

  encoded[:forward_table] =
    StateTable.new(data.fFTable.fTable.to_a, data.fFTable.fFlags).dump16

  encoded[:backward_table] =
    StateTable.new(data.fRTable.fTable.to_a, data.fRTable.fFlags).dump16

  encoded[:status_table] = StatusTable.new(data.fStatusTable.to_a).dump

  # despite the field name, this really isn't a trie
  encoded[:category_table] = encode_trie(data.fTrie)

  encoded
end

# Extracts the metadata stored alongside the tables.
#
# header - object responding to #fCatCount
#
# Returns a Hash with the single key :category_count.
def metadata_from(header)
  category_count = header.fCatCount
  { category_count: category_count }
end

# Builds two tries from the suppression strings: one keyed by each string's
# code points and one keyed by the code points of the reversed string. Every
# key is stored with the value true.
#
# Returns a Hash with :forwards_trie and :backwards_trie, each holding the
# Marshal-serialized trie.
def encode_suppressions(suppressions)
  forward, backward = Array.new(2) { TwitterCldr::Utils::Trie.new }

  suppressions.each do |text|
    forward.add(text.codepoints, true)
    backward.add(text.reverse.codepoints, true)
  end

  serialized = {}
  serialized[:forwards_trie] = Marshal.dump(forward)
  serialized[:backwards_trie] = Marshal.dump(backward)
  serialized
end

# Walks the ranges exposed by +trie+'s iterator, converts each one with
# #range_to_a and serializes the collected list through CategoryTable.
#
# Returns the stripped result of CategoryTable#dump16.
def encode_trie(trie)
  ranges = []
  cursor = trie.iterator

  while cursor.hasNext
    entry = cursor.next
    ranges.push(range_to_a(entry))

    # The range ending at the Unicode maximum should be the last entry, but
    # for some reason ICU returns one more out-of-order range past it, so
    # stop here.
    break if entry.endCodePoint == 0x10FFFF
  end

  # @TODO: Distinguish between the 16- and 32-bit flavors
  CategoryTable.new(ranges).dump16.strip
end

# Flattens a range object into a three-element array:
# [start code point, end code point, value].
def range_to_a(range)
  first = range.startCodePoint
  last = range.endCodePoint
  [first, last, range.value]
end

# Loads the RBBI data for a boundary kind and locale.
#
# The break rules file name is looked up in the locale's bundle under the
# key "boundaries/<kind>"; the corresponding binary is then read from
# "<brkiter_name>/<file name>" and handed to the RBBI data wrapper.
#
# Returns whatever rbbi_data_wrapper.get returns for the loaded buffer.
def rbbi_data_for(kind, locale)
  ulocale = ulocale_class.new(locale)
  rules_file = bundle_for(ulocale).getStringWithFallback("boundaries/#{kind}")
  resource_name = "#{brkiter_name}/#{rules_file}"
  raw = icu_binary.getData(resource_name)

  rbbi_data_wrapper.get(raw)
end

# Returns the break iterator resource bundle for +locale+.
#
# Bundles are cached per locale (keyed by the locale's string form). A
# single shared memo would hand the first locale's bundle to every later
# locale, which makes all locales resolve the same break rules files.
#
# locale - locale object passed through to getBundleInstance
def bundle_for(locale)
  @bundles ||= {}
  @bundles[locale.to_s] ||=
    resource_bundle.getBundleInstance(brkiter_base_name, locale, locale_root)
end

# Memoized value of the ICU_BRKITR_NAME constant on the ICU data class; used
# as the prefix of the resource name in #rbbi_data_for.
def brkiter_name
  return @brkiter_name if @brkiter_name

  @brkiter_name = icu_data.const_get(:ICU_BRKITR_NAME)
end

# Memoized value of the ICU_BRKITR_BASE_NAME constant on the ICU data class;
# passed as the base name when fetching bundles in #bundle_for.
def brkiter_base_name
  return @brkiter_base_name if @brkiter_base_name

  @brkiter_base_name = icu_data.const_get(:ICU_BRKITR_BASE_NAME)
end

# Memoized value of OpenType::LOCALE_ROOT, looked up on the resource bundle
# class; passed as the last argument to getBundleInstance in #bundle_for.
def locale_root
  return @locale_root if @locale_root

  open_type = resource_bundle.const_get(:OpenType)
  @locale_root = open_type.const_get(:LOCALE_ROOT)
end

# Memoized handle to com.ibm.icu.impl.RBBIDataWrapper, obtained from the
# ICU requirement.
def rbbi_data_wrapper
  return @rbbi_data_wrapper if @rbbi_data_wrapper

  icu = requirements[:icu]
  @rbbi_data_wrapper = icu.get_class('com.ibm.icu.impl.RBBIDataWrapper')
end

# Memoized handle to com.ibm.icu.impl.ICUBinary, obtained from the ICU
# requirement.
def icu_binary
  return @icu_binary if @icu_binary

  icu = requirements[:icu]
  @icu_binary = icu.get_class('com.ibm.icu.impl.ICUBinary')
end

# Memoized handle to com.ibm.icu.impl.ICUData, obtained from the ICU
# requirement.
def icu_data
  return @icu_data if @icu_data

  icu = requirements[:icu]
  @icu_data = icu.get_class('com.ibm.icu.impl.ICUData')
end

# Memoized handle to com.ibm.icu.impl.ICUResourceBundle, obtained from the
# ICU requirement. The memo lives in @bundle_class.
def resource_bundle
  return @bundle_class if @bundle_class

  icu = requirements[:icu]
  @bundle_class = icu.get_class('com.ibm.icu.impl.ICUResourceBundle')
end

# Memoized handle to com.ibm.icu.util.ULocale, obtained from the ICU
# requirement.
def ulocale_class
  return @ulocale_class if @ulocale_class

  icu = requirements[:icu]
  @ulocale_class = icu.get_class('com.ibm.icu.util.ULocale')
end

# Directory the generated files are written under, read from the importer's
# params on every call (nil when the :output_path entry is absent).
def output_path
  destination = params[:output_path]
  destination
end

end
end
end
4 changes: 3 additions & 1 deletion lib/twitter_cldr/resources/segment_tests_importer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ class SegmentTestsImporter < Importer

TEST_FILES = [
'ucd/auxiliary/WordBreakTest.txt',
'ucd/auxiliary/SentenceBreakTest.txt'
'ucd/auxiliary/SentenceBreakTest.txt',
'ucd/auxiliary/GraphemeBreakTest.txt',
'ucd/auxiliary/LineBreakTest.txt'
]

requirement :unicode, Versions.unicode_version, TEST_FILES
Expand Down
12 changes: 0 additions & 12 deletions lib/twitter_cldr/resources/uli.rb

This file was deleted.

Loading

0 comments on commit ea16c16

Please sign in to comment.