Add dictionary-based segmentation support (#230)

twitter · Jan 5, 2020 · 8bfecbc · 8bfecbc
2 parents ea16c16 + 68937b9
commit 8bfecbc
Show file tree

Hide file tree

Showing 111 changed files with 60,755 additions and 253 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -6,9 +6,10 @@ rvm:
   - 2.1.10
   - 2.2.10
   - 2.3.8
-  - 2.4.6
-  - 2.5.5
-  - 2.6.3
+  - 2.4.9
+  - 2.5.7
+  - 2.6.5
+  - 2.7.0
   - jruby-head
 matrix:
   allow_failures:

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,10 +1,17 @@
 # TwitterCldr Changelog
 
+### 5.3.0 (January 4th, 2020)
+* Adds support for Lao, Khmer, and Burmese.
+* Adds support for dictionary-based word segmentation.
+  - Scripts that don't use spaces as word delimiters have to be segmented using a dictionary.
+  - Supported scripts now include Chinese, Japanese, Korean, Lao, Thai, Khmer, and Burmese.
+* Adds Ruby 2.7 to the build matrix.
+
 ### 5.2.0 (December 1st, 2019)
 * Improve performance of the text segmentation algorithm.
   - Break engine now uses state tables from ICU instead of regular expressions.
   - It was... embarrassing how slow it was before.
-* Added support for line and grapheme cluster segmentation.
+* Adds support for line and grapheme cluster segmentation.
 
 ### 5.1.0 (November 21st, 2019)
 * Upgrade to CLDR v36, ICU 65.1, and Emoji 12.1.

diff --git a/Gemfile b/Gemfile
@@ -23,10 +23,6 @@ group :development do
   gem 'ruby-cldr', github: 'camertron/ruby-cldr', branch: 'mapzones' # 'svenfuchs/ruby-cldr'
   gem 'i18n'
   gem 'cldr-plurals', '~> 1.0'
-
-  gem 'rest-client', '~> 1.8'
-
-  gem 'parallel'
 end
 
 group :test do

diff --git a/Rakefile b/Rakefile
@@ -57,16 +57,22 @@ task :update do
 end
 
 task :add_locale, :locale do |_, args|
+  locales = [args[:locale]] + args.extras
+
   File.write(
     TwitterCldr::SUPPORTED_LOCALES_FILE,
     YAML.dump(
-      (TwitterCldr::SUPPORTED_LOCALES + [args[:locale]]).map(&:to_sym).uniq.sort
+      (TwitterCldr::SUPPORTED_LOCALES + locales).map(&:to_sym).uniq.sort
     )
   )
 
   klasses = TwitterCldr::Resources.locale_based_importer_classes_for_ruby_engine
-  instances = klasses.map { |klass| klass.new(locales: [args[:locale]]) }
-  TwitterCldr::Resources::ImportResolver.new(instances).import
+  instances = klasses.map { |klass| klass.new(locales: locales) }
+  resolver = TwitterCldr::Resources::ImportResolver.new(
+    instances, allow_missing_dependencies: true
+  )
+
+  resolver.import
 end
 
 # add_locale and update_locale do the same thing
@@ -160,6 +166,16 @@ namespace :update do
     TwitterCldr::Resources::SegmentRulesImporter.new.import
   end
 
+  desc 'Import segmentation dictionaries'
+  task :segment_dictionaries do
+    TwitterCldr::Resources::SegmentDictionariesImporter.new.import
+  end
+
+  desc 'Import segment tests'
+  task :segment_tests do
+    TwitterCldr::Resources::SegmentTestsImporter.new.import
+  end
+
   desc 'Import (generate) bidi tests (should be executed using JRuby 1.7 in 1.9 mode)'
   task :bidi_tests do
     TwitterCldr::Resources::BidiTestImporter.new.import
@@ -200,11 +216,6 @@ namespace :update do
     TwitterCldr::Resources::TransformTestsImporter.new.import
   end
 
-  desc 'Import segment tests'
-  task :segment_tests do
-    TwitterCldr::Resources::SegmentTestsImporter.new.import
-  end
-
   desc 'Import hyphenation dictionaries'
   task :hyphenation_dictionaries do
     TwitterCldr::Resources::HyphenationImporter.new.import

diff --git a/lib/twitter_cldr/normalization.rb b/lib/twitter_cldr/normalization.rb
@@ -14,12 +14,25 @@ module Normalization
     class << self
 
       def normalize(string, options = {})
-        form = options.fetch(:using, DEFAULT_NORMALIZER).to_s.downcase.to_sym
+        validate_form(form = extract_form_from(options))
+        Eprun.normalize(string, form)
+      end
+
+      def normalized?(string, options = {})
+        validate_form(form = extract_form_from(options))
+        Eprun.normalized?(string, form)
+      end
+
+      private
+
+      def extract_form_from(options)
+        options.fetch(:using, DEFAULT_NORMALIZER).to_s.downcase.to_sym
+      end
 
-        if VALID_NORMALIZERS.include?(form)
-          Eprun.normalize(string, form)
-        else
-          raise ArgumentError.new("#{form.inspect} is not a valid normalizer (valid normalizers are #{VALID_NORMALIZERS.join(', ')})")
+      def validate_form(form)
+        unless VALID_NORMALIZERS.include?(form)
+          raise ArgumentError.new("#{form.inspect} is not a valid normalizer "\
+            "(valid normalizers are #{VALID_NORMALIZERS.join(', ')})")
         end
       end
 

diff --git a/lib/twitter_cldr/resources.rb b/lib/twitter_cldr/resources.rb
@@ -22,12 +22,13 @@ module Resources
     autoload :NumberFormatsImporter,          'twitter_cldr/resources/number_formats_importer'
     autoload :PostalCodesImporter,            'twitter_cldr/resources/postal_codes_importer'
     autoload :Properties,                     'twitter_cldr/resources/properties'
+    autoload :SegmentDictionariesImporter,    'twitter_cldr/resources/segment_dictionaries_importer'
     autoload :SegmentRulesImporter,           'twitter_cldr/resources/segment_rules_importer'
+    autoload :SegmentTestsImporter,           'twitter_cldr/resources/segment_tests_importer'
     autoload :RbnfTestImporter,               'twitter_cldr/resources/rbnf_test_importer'
     autoload :ReadmeRenderer,                 'twitter_cldr/resources/readme_renderer'
     autoload :RegexpAstGenerator,             'twitter_cldr/resources/regexp_ast_generator'
     autoload :Requirements,                   'twitter_cldr/resources/requirements'
-    autoload :SegmentTestsImporter,           'twitter_cldr/resources/segment_tests_importer'
     autoload :TailoringImporter,              'twitter_cldr/resources/tailoring_importer'
     autoload :TerritoriesImporter,            'twitter_cldr/resources/territories_importer'
     autoload :TimezonesImporter,              'twitter_cldr/resources/timezones_importer'
@@ -60,6 +61,7 @@ def standard_importer_classes
           NumberFormatsImporter,
           PostalCodesImporter,
           RbnfTestImporter,
+          SegmentDictionariesImporter,
           SegmentRulesImporter,
           SegmentTestsImporter,
           TailoringImporter,

diff --git a/lib/twitter_cldr/resources/import_resolver.rb b/lib/twitter_cldr/resources/import_resolver.rb
@@ -5,10 +5,11 @@ module Resources
     class ImportResolver
       include TSort
 
-      attr_reader :importers
+      attr_reader :importers, :options
 
-      def initialize(importers = Resources.importer_classes_for_ruby_engine)
+      def initialize(importers = Resources.importer_classes_for_ruby_engine, options = {})
         @importers = importers
+        @options = options
       end
 
       def import
@@ -28,7 +29,12 @@ def tsort_each_node(&block)
 
       def tsort_each_child(instance, &block)
         deps_for(instance).map do |dep_class|
-          yield instances.find { |ins| ins.class == dep_class }
+          dep = instances.find { |ins| ins.class == dep_class } # nil when no importer of that class was supplied
+          if dep
+            yield dep
+          elsif !options[:allow_missing_dependencies] # only a genuinely absent dependency is an error
+            raise "Could not find dependency #{dep_class.name}"
+          end
         end
       end
 
@@ -39,6 +45,8 @@ def check_unmet_deps
       end
 
       def check_unmet_instance_deps(instance)
+        return if options[:allow_missing_dependencies]
+
         unmet_deps = unmet_deps_for(instance)
 
         unless unmet_deps.empty?

diff --git a/lib/twitter_cldr/resources/loader.rb b/lib/twitter_cldr/resources/loader.rb
@@ -10,6 +10,8 @@ class ResourceLoadError < StandardError; end
 
     class Loader
 
+      VALID_EXTS = %w(.yml .dump).freeze
+
       def get_resource(*path)
         resources_cache[resource_file_path(path)]
       end
@@ -75,7 +77,7 @@ def preload_all_resources
 
       def resource_file_path(path)
         file = File.join(*path.map(&:to_s))
-        file << '.yml' unless file.end_with?('.yml')
+        file << '.yml' unless VALID_EXTS.include?(File.extname(file))
         file
       end
 
@@ -92,6 +94,17 @@ def resources_cache
       end
 
       def load_resource(path, merge_custom = true)
+        case File.extname(path)
+          when '.yml'
+            load_yaml_resource(path, merge_custom)
+          when '.dump'
+            load_marshalled_resource(path, merge_custom)
+          else
+            load_raw_resource(path, merge_custom)
+        end
+      end
+
+      def load_yaml_resource(path, merge_custom = true)
         base = YAML.load(read_resource_file(path))
         custom_path = File.join("custom", path)
 
@@ -102,6 +115,14 @@ def load_resource(path, merge_custom = true)
         base
       end
 
+      def load_marshalled_resource(path, _merge_custom = :unused)
+        Marshal.load(read_resource_file(path))
+      end
+
+      def load_raw_resource(path, _merge_custom = :unused)
+        read_resource_file(path)
+      end
+
       def custom_resource_exists?(custom_path)
         File.exist?(
           File.join(TwitterCldr::RESOURCES_DIR, custom_path)

diff --git a/lib/twitter_cldr/resources/locales_resources_importer.rb b/lib/twitter_cldr/resources/locales_resources_importer.rb
@@ -56,13 +56,6 @@ def output_path
         params.fetch(:output_path)
       end
 
-      def move_segments_root_file
-        old_file_path = File.join(output_path, *%w(shared segments_root.yml))
-        new_file_path = File.join(output_path, *%w(shared segments segments_root.yml))
-        FileUtils.mkdir_p(File.dirname(new_file_path))
-        FileUtils.move(old_file_path, new_file_path)
-      end
-
       def import_components
         locales = Set.new
 
@@ -100,8 +93,6 @@ def import_components
         Cldr::Export.export(export_args) do |component, locale, path|
           deep_symbolize(path)
         end
-
-        move_segments_root_file
       end
 
       def components_for(locale)

diff --git a/lib/twitter_cldr/resources/postal_codes_importer.rb b/lib/twitter_cldr/resources/postal_codes_importer.rb
@@ -3,8 +3,8 @@
 # Copyright 2012 Twitter, Inc
 # http://www.apache.org/licenses/LICENSE-2.0
 
-require 'rest-client'
 require 'json'
+require 'open-uri'
 require 'set'
 require 'yaml'
 
@@ -21,38 +21,36 @@ class PostalCodesImporter < Importer
       private
 
       def execute
-        File.open(File.join(output_path, 'postal_codes.yml'), 'w') do |output|
-          output.write(YAML.dump(load))
-        end
+        data = YAML.dump(fetch_data)
+        File.write(File.join(output_path, 'postal_codes.yml'), data)
+        puts
       end
 
       def output_path
         params.fetch(:output_path)
       end
 
-      def load
+      def fetch_data
         territories = Set.new
 
         each_territory.each_with_object({}) do |territory, ret|
-          next unless regex = get_regex_for(territory)
-
-          ret[territory] = {
-            regex: Regexp.compile(regex),
-            ast: TwitterCldr::Utils::RegexpAst.dump(
-              RegexpAstGenerator.generate(regex)
-            )
-          }
+          if regex = get_regex_for(territory)
+            ret[territory] = {
+              regex: Regexp.compile(regex),
+              ast: TwitterCldr::Utils::RegexpAst.dump(
+                RegexpAstGenerator.generate(regex)
+              )
+            }
+          end
 
           territories.add(territory)
           STDOUT.write("\rImported postal codes for #{territory}, #{territories.size} of #{territory_count} total")
         end
-
-        puts
       end
 
       def get_regex_for(territory)
-        result = RestClient.get("#{BASE_URL}#{territory.to_s.upcase}")
-        data = JSON.parse(result.body)
+        result = URI.parse("#{BASE_URL}#{territory.to_s.upcase}").read # open-uri's OpenRead#read; URI.open is public only on Ruby >= 2.5
+        data = JSON.parse(result)
         data['zip']
       end
 
@@ -61,12 +59,10 @@ def territory_count
       end
 
       def each_territory
-        if block_given?
-          TwitterCldr::Shared::Territories.all.each_pair do |territory, _|
-            yield territory
-          end
-        else
-          to_enum(__method__)
+        return to_enum(__method__) unless block_given?
+
+        TwitterCldr::Shared::Territories.all.each_pair do |territory, _|
+          yield territory
         end
       end