Skip to content

Commit

Permalink
Add dictionary-based segmentation support (#230)
Browse files Browse the repository at this point in the history
  • Loading branch information
camertron committed Jan 5, 2020
2 parents ea16c16 + 68937b9 commit 8bfecbc
Show file tree
Hide file tree
Showing 111 changed files with 60,755 additions and 253 deletions.
7 changes: 4 additions & 3 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@ rvm:
- 2.1.10
- 2.2.10
- 2.3.8
- 2.4.6
- 2.5.5
- 2.6.3
- 2.4.9
- 2.5.7
- 2.6.5
- 2.7.0
- jruby-head
matrix:
allow_failures:
Expand Down
9 changes: 8 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,17 @@
# TwitterCldr Changelog

### 5.3.0 (January 4th, 2020)
* Adds support for Lao, Khmer, and Burmese.
* Adds support for dictionary-based word segmentation.
- Scripts that don't use spaces as word delimiters have to be segmented using a dictionary.
- Supported scripts now include Chinese, Japanese, Korean, Lao, Thai, Khmer, and Burmese.
* Adds Ruby 2.7 to the build matrix.

### 5.2.0 (December 1st, 2019)
* Improve performance of the text segmentation algorithm.
- Break engine now uses state tables from ICU instead of regular expressions.
- It was... embarrassing how slow it was before.
* Added support for line and grapheme cluster segmentation.
* Adds support for line and grapheme cluster segmentation.

### 5.1.0 (November 21st, 2019)
* Upgrade to CLDR v36, ICU 65.1, and Emoji 12.1.
Expand Down
4 changes: 0 additions & 4 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,6 @@ group :development do
gem 'ruby-cldr', github: 'camertron/ruby-cldr', branch: 'mapzones' # 'svenfuchs/ruby-cldr'
gem 'i18n'
gem 'cldr-plurals', '~> 1.0'

gem 'rest-client', '~> 1.8'

gem 'parallel'
end

group :test do
Expand Down
27 changes: 19 additions & 8 deletions Rakefile
Original file line number Diff line number Diff line change
Expand Up @@ -57,16 +57,22 @@ task :update do
end

task :add_locale, :locale do |_, args|
locales = [args[:locale]] + args.extras

File.write(
TwitterCldr::SUPPORTED_LOCALES_FILE,
YAML.dump(
(TwitterCldr::SUPPORTED_LOCALES + [args[:locale]]).map(&:to_sym).uniq.sort
(TwitterCldr::SUPPORTED_LOCALES + locales).map(&:to_sym).uniq.sort
)
)

klasses = TwitterCldr::Resources.locale_based_importer_classes_for_ruby_engine
instances = klasses.map { |klass| klass.new(locales: [args[:locale]]) }
TwitterCldr::Resources::ImportResolver.new(instances).import
instances = klasses.map { |klass| klass.new(locales: locales) }
resolver = TwitterCldr::Resources::ImportResolver.new(
instances, allow_missing_dependencies: true
)

resolver.import
end

# add_locale and update_locale do the same thing
Expand Down Expand Up @@ -160,6 +166,16 @@ namespace :update do
TwitterCldr::Resources::SegmentRulesImporter.new.import
end

desc 'Import segmentation dictionaries'
task :segment_dictionaries do
TwitterCldr::Resources::SegmentDictionariesImporter.new.import
end

desc 'Import segment tests'
task :segment_tests do
TwitterCldr::Resources::SegmentTestsImporter.new.import
end

desc 'Import (generate) bidi tests (should be executed using JRuby 1.7 in 1.9 mode)'
task :bidi_tests do
TwitterCldr::Resources::BidiTestImporter.new.import
Expand Down Expand Up @@ -200,11 +216,6 @@ namespace :update do
TwitterCldr::Resources::TransformTestsImporter.new.import
end

desc 'Import segment tests'
task :segment_tests do
TwitterCldr::Resources::SegmentTestsImporter.new.import
end

desc 'Import hyphenation dictionaries'
task :hyphenation_dictionaries do
TwitterCldr::Resources::HyphenationImporter.new.import
Expand Down
23 changes: 18 additions & 5 deletions lib/twitter_cldr/normalization.rb
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,25 @@ module Normalization
class << self

# Normalizes +string+ by delegating to Eprun.normalize.
#
# The form is taken from options[:using] by extract_form_from and checked
# by validate_form before Eprun is called, so an unsupported form never
# reaches Eprun.
#
# string  - the text to normalize.
# options - Hash; :using names the normalization form.
#
# Returns whatever Eprun.normalize returns for the string and form.
def normalize(string, options = {})
  form = extract_form_from(options)
  validate_form(form)
  Eprun.normalize(string, form)
end

# Asks Eprun whether +string+ is already normalized in the requested form.
#
# The form comes from options[:using] (see extract_form_from) and is
# checked with validate_form before Eprun is consulted.
#
# Returns the result of Eprun.normalized?.
def normalized?(string, options = {})
  requested_form = extract_form_from(options)
  validate_form(requested_form)
  Eprun.normalized?(string, requested_form)
end

private

# Reads the normalization form from options[:using], falling back to
# DEFAULT_NORMALIZER, and canonicalizes it to a lowercase symbol so that
# "NFC", :NFC and :nfc are all treated alike.
def extract_form_from(options)
  requested = options.fetch(:using, DEFAULT_NORMALIZER)
  requested.to_s.downcase.to_sym
end

if VALID_NORMALIZERS.include?(form)
Eprun.normalize(string, form)
else
raise ArgumentError.new("#{form.inspect} is not a valid normalizer (valid normalizers are #{VALID_NORMALIZERS.join(', ')})")
# Ensures +form+ is one of VALID_NORMALIZERS.
#
# Returns nil when the form is valid.
# Raises ArgumentError naming the rejected form and listing the valid ones.
def validate_form(form)
  return if VALID_NORMALIZERS.include?(form)

  message = "#{form.inspect} is not a valid normalizer " \
            "(valid normalizers are #{VALID_NORMALIZERS.join(', ')})"
  raise ArgumentError, message
end

Expand Down
4 changes: 3 additions & 1 deletion lib/twitter_cldr/resources.rb
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,13 @@ module Resources
autoload :NumberFormatsImporter, 'twitter_cldr/resources/number_formats_importer'
autoload :PostalCodesImporter, 'twitter_cldr/resources/postal_codes_importer'
autoload :Properties, 'twitter_cldr/resources/properties'
autoload :SegmentDictionariesImporter, 'twitter_cldr/resources/segment_dictionaries_importer'
autoload :SegmentRulesImporter, 'twitter_cldr/resources/segment_rules_importer'
autoload :SegmentTestsImporter, 'twitter_cldr/resources/segment_tests_importer'
autoload :RbnfTestImporter, 'twitter_cldr/resources/rbnf_test_importer'
autoload :ReadmeRenderer, 'twitter_cldr/resources/readme_renderer'
autoload :RegexpAstGenerator, 'twitter_cldr/resources/regexp_ast_generator'
autoload :Requirements, 'twitter_cldr/resources/requirements'
autoload :SegmentTestsImporter, 'twitter_cldr/resources/segment_tests_importer'
autoload :TailoringImporter, 'twitter_cldr/resources/tailoring_importer'
autoload :TerritoriesImporter, 'twitter_cldr/resources/territories_importer'
autoload :TimezonesImporter, 'twitter_cldr/resources/timezones_importer'
Expand Down Expand Up @@ -60,6 +61,7 @@ def standard_importer_classes
NumberFormatsImporter,
PostalCodesImporter,
RbnfTestImporter,
SegmentDictionariesImporter,
SegmentRulesImporter,
SegmentTestsImporter,
TailoringImporter,
Expand Down
14 changes: 11 additions & 3 deletions lib/twitter_cldr/resources/import_resolver.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@ module Resources
class ImportResolver
include TSort

attr_reader :importers
attr_reader :importers, :options

def initialize(importers = Resources.importer_classes_for_ruby_engine)
def initialize(importers = Resources.importer_classes_for_ruby_engine, options = {})
@importers = importers
@options = options
end

def import
Expand All @@ -28,7 +29,12 @@ def tsort_each_node(&block)

# TSort hook: yields the importer instance for each dependency class of
# +instance+ (as reported by deps_for).
#
# A dependency class with no matching entry in +instances+ is an error
# unless options[:allow_missing_dependencies] is set, in which case it is
# silently skipped. A dependency that IS found must never raise, and must
# be yielded exactly once.
#
# Raises RuntimeError when a dependency is missing and missing
# dependencies are not allowed.
def tsort_each_child(instance, &block)
  deps_for(instance).each do |dep_class|
    dep = instances.find { |ins| ins.class == dep_class }

    if dep
      yield dep
    elsif !options[:allow_missing_dependencies]
      raise "Could not find dependency #{dep_class.name}"
    end
  end
end

Expand All @@ -39,6 +45,8 @@ def check_unmet_deps
end

def check_unmet_instance_deps(instance)
return if options[:allow_missing_dependencies]

unmet_deps = unmet_deps_for(instance)

unless unmet_deps.empty?
Expand Down
23 changes: 22 additions & 1 deletion lib/twitter_cldr/resources/loader.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ class ResourceLoadError < StandardError; end

class Loader

VALID_EXTS = %w(.yml .dump).freeze

# Looks up a resource by its path segments.
#
# The segments are turned into a resource file path by resource_file_path,
# and that path is used as the key into resources_cache.
def get_resource(*path)
  cache_key = resource_file_path(path)
  resources_cache[cache_key]
end
Expand Down Expand Up @@ -75,7 +77,7 @@ def preload_all_resources

# Builds the relative resource file path for the given path segments.
#
# Segments are stringified and joined with File.join. If the result does
# not already end in one of VALID_EXTS, '.yml' is appended; paths that
# already carry a valid extension (e.g. '.dump') are returned unchanged.
#
# path - Array of path segments (strings or symbols).
#
# Returns the path as a String.
def resource_file_path(path)
  file = File.join(*path.map(&:to_s))
  VALID_EXTS.include?(File.extname(file)) ? file : "#{file}.yml"
end

Expand All @@ -92,6 +94,17 @@ def resources_cache
end

# Loads the resource at +path+, choosing the loader from the file
# extension: '.yml' goes to the YAML loader, '.dump' to the marshalled
# loader, and anything else is returned raw. +merge_custom+ is passed
# through to whichever loader is selected.
def load_resource(path, merge_custom = true)
  extension = File.extname(path)

  if extension == '.yml'
    load_yaml_resource(path, merge_custom)
  elsif extension == '.dump'
    load_marshalled_resource(path, merge_custom)
  else
    load_raw_resource(path, merge_custom)
  end
end

def load_yaml_resource(path, merge_custom = true)
base = YAML.load(read_resource_file(path))
custom_path = File.join("custom", path)

Expand All @@ -102,6 +115,14 @@ def load_resource(path, merge_custom = true)
base
end

# Reads the resource file at +path+ and deserializes it with Marshal.
# The second argument is accepted only so the signature matches the other
# loaders; it is ignored.
#
# NOTE(review): Marshal.load can instantiate arbitrary objects, so this
# presumably relies on resource files being trusted — confirm that
# untrusted paths can never reach this loader.
def load_marshalled_resource(path, _merge_custom = :unused)
  serialized = read_resource_file(path)
  Marshal.load(serialized)
end

# Returns the contents of the resource file at +path+ exactly as
# read_resource_file provides them, with no parsing. The second argument
# is accepted only because load_resource passes merge_custom to every
# loader; it is ignored here.
def load_raw_resource(path, _merge_custom = :unused)
read_resource_file(path)
end

def custom_resource_exists?(custom_path)
File.exist?(
File.join(TwitterCldr::RESOURCES_DIR, custom_path)
Expand Down
9 changes: 0 additions & 9 deletions lib/twitter_cldr/resources/locales_resources_importer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -56,13 +56,6 @@ def output_path
params.fetch(:output_path)
end

def move_segments_root_file
old_file_path = File.join(output_path, *%w(shared segments_root.yml))
new_file_path = File.join(output_path, *%w(shared segments segments_root.yml))
FileUtils.mkdir_p(File.dirname(new_file_path))
FileUtils.move(old_file_path, new_file_path)
end

def import_components
locales = Set.new

Expand Down Expand Up @@ -100,8 +93,6 @@ def import_components
Cldr::Export.export(export_args) do |component, locale, path|
deep_symbolize(path)
end

move_segments_root_file
end

def components_for(locale)
Expand Down
42 changes: 19 additions & 23 deletions lib/twitter_cldr/resources/postal_codes_importer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0

require 'rest-client'
require 'json'
require 'open-uri'
require 'set'
require 'yaml'

Expand All @@ -21,38 +21,36 @@ class PostalCodesImporter < Importer
private

# Fetches the postal code data, serializes it to YAML and writes it once
# to postal_codes.yml inside output_path.
#
# The trailing puts terminates the "\r"-rewritten progress line that
# fetch_data prints while it runs.
def execute
  data = YAML.dump(fetch_data)
  File.write(File.join(output_path, 'postal_codes.yml'), data)
  puts
end

# Directory this importer writes into, read from the :output_path entry
# of params. Uses fetch rather than [], so a missing entry is reported
# instead of yielding nil (presumably a KeyError, assuming params is a
# Hash — confirm against the Importer base class).
def output_path
params.fetch(:output_path)
end

def load
def fetch_data
territories = Set.new

each_territory.each_with_object({}) do |territory, ret|
next unless regex = get_regex_for(territory)

ret[territory] = {
regex: Regexp.compile(regex),
ast: TwitterCldr::Utils::RegexpAst.dump(
RegexpAstGenerator.generate(regex)
)
}
if regex = get_regex_for(territory)
ret[territory] = {
regex: Regexp.compile(regex),
ast: TwitterCldr::Utils::RegexpAst.dump(
RegexpAstGenerator.generate(regex)
)
}
end

territories.add(territory)
STDOUT.write("\rImported postal codes for #{territory}, #{territories.size} of #{territory_count} total")
end

puts
end

# Downloads the postal code record for +territory+ and returns its 'zip'
# entry, or nil when the record has none.
#
# The request URL is BASE_URL followed by the upper-cased territory code.
# Only open-uri is used; the block form of URI.open closes the handle as
# soon as the body has been read.
#
# NOTE(review): URI.open is not available on the oldest Rubies in the
# build matrix (2.1-2.4) — confirm this importer only runs on newer ones.
def get_regex_for(territory)
  body = URI.open("#{BASE_URL}#{territory.to_s.upcase}", &:read)
  JSON.parse(body)['zip']
end

Expand All @@ -61,12 +59,10 @@ def territory_count
end

# Yields each territory key from TwitterCldr::Shared::Territories.all.
#
# Without a block, returns an Enumerator over the same keys so callers
# can chain (e.g. each_territory.each_with_object).
def each_territory
  return to_enum(__method__) unless block_given?

  TwitterCldr::Shared::Territories.all.each_pair do |territory, _|
    yield territory
  end
end

Expand Down
Loading

0 comments on commit 8bfecbc

Please sign in to comment.