Skip to content

Commit

Permalink
Add rake task for updating canonical compositions.
Browse files Browse the repository at this point in the history
  • Loading branch information
KL-7 committed Jul 30, 2012
1 parent e45196c commit e51a2a1
Show file tree
Hide file tree
Showing 6 changed files with 83 additions and 27 deletions.
5 changes: 5 additions & 0 deletions Rakefile
Expand Up @@ -65,6 +65,11 @@ namespace :resources do
'./resources/unicode_data'
).import
end

desc 'Updates canonical compositions resource'
task :canonical_compositions do
TwitterCldr::Resources::CanonicalCompositionsUpdater.new('./resources/unicode_data').update
end
end
end

Expand Down
6 changes: 3 additions & 3 deletions lib/twitter_cldr/normalization/nfkc.rb
Expand Up @@ -80,12 +80,12 @@ def compose_normal(code_points)

unless blocked
# do a reverse-lookup for the decomposed code points
decomp_data = TwitterCldr::Shared::CodePoint.for_decomposition([code_points[starter_index], code_point])
composite = TwitterCldr::Shared::CodePoint.for_canonical_decomposition([code_points[starter_index], code_point])

# check if two code points are canonically equivalent
if decomp_data && !decomp_data.excluded_from_composition?
if composite && !composite.excluded_from_composition?
# combine the characters
code_points[starter_index] = decomp_data.code_point
code_points[starter_index] = composite.code_point
code_points.delete_at(index)
index -= 1
end
Expand Down
9 changes: 5 additions & 4 deletions lib/twitter_cldr/resources.rb
Expand Up @@ -5,9 +5,10 @@

module TwitterCldr
module Resources
autoload :Loader, 'twitter_cldr/resources/loader'
autoload :TailoringImporter, 'twitter_cldr/resources/tailoring_importer'
autoload :TriesDumper, 'twitter_cldr/resources/tries_dumper'
autoload :UnicodeDataImporter, 'twitter_cldr/resources/unicode_data_importer'
autoload :CanonicalCompositionsUpdater, 'twitter_cldr/resources/canonical_compositions_updater'
autoload :Loader, 'twitter_cldr/resources/loader'
autoload :TailoringImporter, 'twitter_cldr/resources/tailoring_importer'
autoload :TriesDumper, 'twitter_cldr/resources/tries_dumper'
autoload :UnicodeDataImporter, 'twitter_cldr/resources/unicode_data_importer'
end
end
53 changes: 53 additions & 0 deletions lib/twitter_cldr/resources/canonical_compositions_updater.rb
@@ -0,0 +1,53 @@
# encoding: UTF-8

# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0

require 'yaml'

module TwitterCldr
  module Resources

    # Regenerates the canonical compositions resource: a YAML file holding a
    # hash that maps each canonical decomposition reported by
    # TwitterCldr::Shared::CodePoint back to the code point it belongs to.
    #
    class CanonicalCompositionsUpdater

      # Last code point visited by the scan.
      CODE_POINT_MAX = 0x10FFFF

      # Arguments:
      #
      #   output_path - output directory for generated YAML file
      #
      def initialize(output_path)
        @output_path = output_path
      end

      # Scans the code points and dumps the resulting hash into
      # 'canonical_compositions.yml' inside the output directory.
      #
      # The scan runs before the file is opened for writing, so an error
      # raised halfway through does not leave a truncated resource behind.
      #
      def update
        compositions = generate_compositions

        File.open(File.join(@output_path, 'canonical_compositions.yml'), 'w') do |output|
          YAML.dump(compositions, output)
        end
      end

      private

      # Builds a hash of { decomposition => code point } for every code point
      # in 1..CODE_POINT_MAX that has a non-empty, non-compatibility
      # decomposition. If two code points share a decomposition, the higher
      # one wins because it is assigned last.
      #
      def generate_compositions
        compositions = {}

        (1..CODE_POINT_MAX).each do |code_point|
          code_point_data = TwitterCldr::Shared::CodePoint.find(code_point)

          if canonically_decomposable?(code_point_data)
            compositions[code_point_data.decomposition] = code_point
          end

          log_progress(code_point, compositions.size)
        end

        compositions
      end

      # Truthy when code point data exists, its decomposition is not a
      # compatibility one, and the decomposition is present and non-empty.
      #
      def canonically_decomposable?(code_point_data)
        code_point_data &&
          !code_point_data.compatibility_decomposition? &&
          code_point_data.decomposition &&
          !code_point_data.decomposition.empty?
      end

      # Rewrites the single-line progress indicator on $stdout.
      #
      # Output is emitted only when the rounded percentage changes, plus once
      # for the final code point, instead of once per code point. The final
      # line therefore always carries the complete count and a newline.
      #
      def log_progress(code_point, compositions_count)
        percent  = (100.0 * code_point / CODE_POINT_MAX).round
        finished = code_point == CODE_POINT_MAX

        return unless finished || percent != @last_logged_percent

        @last_logged_percent = percent

        $stdout.write("\r#{percent}% complete, found #{compositions_count} canonical compositions")
        $stdout.write("\n") if finished
      end

    end

  end
end
13 changes: 5 additions & 8 deletions lib/twitter_cldr/shared/code_point.rb
Expand Up @@ -67,11 +67,12 @@ def find(code_point)
CodePoint.new(*code_point_data) if code_point_data
end

def for_decomposition(code_points)
@decomposition_map ||= TwitterCldr.get_resource(:unicode_data, :decomposition_map)
key = code_points.map(&method(:code_point_to_string)).join(' ').to_sym
def for_canonical_decomposition(code_points)
find(canonical_compositions[code_points]) if canonical_compositions.has_key?(code_points)
end

find(@decomposition_map[key].hex) if @decomposition_map.include?(key)
def canonical_compositions
@canonical_compositions ||= TwitterCldr.get_resource(:unicode_data, :canonical_compositions)
end

def hangul_type(code_point)
Expand Down Expand Up @@ -122,10 +123,6 @@ def get_range_start(code_point, block_data)
end
end

def code_point_to_string(code_point)
code_point.to_s(16).rjust(4, '0').upcase
end

end
end
end
Expand Down
24 changes: 12 additions & 12 deletions spec/shared/code_point_spec.rb
Expand Up @@ -107,38 +107,38 @@ def test_code_points_data(test_data)
end
end

describe "#for_decomposition" do
let(:decomposition_map) { { :"AAAA 0BBB" => "ABC" } }
describe "#for_canonical_decomposition" do
let(:canonical_compositions) { { [123, 456] => 789 } }

before(:each) do
# reset the cached lookup table before each test so mocks/stubs work
CodePoint.instance_variable_set(:@decomposition_map, nil)
stub(CodePoint).find { |code_point| "I'm code point #{code_point.to_s(16).upcase}" }
CodePoint.instance_variable_set(:@canonical_compositions, nil)
stub(CodePoint).find { |code_point| "I'm code point #{code_point}" }
end

after(:each) do
after(:all) do
# reset the cached lookup table afterwards so mocks/stubs work
CodePoint.instance_variable_set(:@decomposition_map, nil)
CodePoint.instance_variable_set(:@canonical_compositions, nil)
end

context "with a stubbed decomposition map" do
before(:each) do
mock(TwitterCldr).get_resource(:unicode_data, :decomposition_map) { decomposition_map }
mock(TwitterCldr).get_resource(:unicode_data, :canonical_compositions) { canonical_compositions }
end

it "should return a code point with the correct value" do
CodePoint.for_decomposition([0xAAAA, 0xBBB]).should == "I'm code point ABC"
CodePoint.for_canonical_decomposition([123, 456]).should == "I'm code point 789"
end

it "should return nil if no decomposition mapping exists" do
CodePoint.for_decomposition([0xA0]).should be_nil
CodePoint.for_canonical_decomposition([987]).should be_nil
end
end

it "should cache the decomposition map" do
mock(TwitterCldr).get_resource(:unicode_data, :decomposition_map) { decomposition_map }.once
CodePoint.for_decomposition([0xA0]).should be_nil
CodePoint.for_decomposition([0xA0]).should be_nil
mock(TwitterCldr).get_resource(:unicode_data, :canonical_compositions) { canonical_compositions }.once
CodePoint.for_canonical_decomposition([0xA0]).should be_nil
CodePoint.for_canonical_decomposition([0xA0]).should be_nil
end
end

Expand Down

0 comments on commit e51a2a1

Please sign in to comment.