diff --git a/lib/twitter_cldr/core_ext/string.rb b/lib/twitter_cldr/core_ext/string.rb index 10f1020c6..9a786fd91 100644 --- a/lib/twitter_cldr/core_ext/string.rb +++ b/lib/twitter_cldr/core_ext/string.rb @@ -32,7 +32,7 @@ def normalize end def code_points - TwitterCldr::Normalizers::Base.string_to_code_points(@base_obj) + TwitterCldr::Utils::CodePoints.from_string(@base_obj) end def to_s diff --git a/lib/twitter_cldr/normalizers.rb b/lib/twitter_cldr/normalizers.rb index 2ef5340f3..7d2efb5be 100644 --- a/lib/twitter_cldr/normalizers.rb +++ b/lib/twitter_cldr/normalizers.rb @@ -5,7 +5,7 @@ module TwitterCldr module Normalizers - autoload :Base, 'twitter_cldr/normalizers/base' - autoload :NFD, 'twitter_cldr/normalizers/canonical/nfd' + autoload :NFD, 'twitter_cldr/normalizers/nfd' + autoload :NFKD, 'twitter_cldr/normalizers/nfkd' end end \ No newline at end of file diff --git a/lib/twitter_cldr/normalizers/base.rb b/lib/twitter_cldr/normalizers/base.rb deleted file mode 100644 index c3001b3fb..000000000 --- a/lib/twitter_cldr/normalizers/base.rb +++ /dev/null @@ -1,37 +0,0 @@ -# encoding: UTF-8 - -# Copyright 2012 Twitter, Inc -# http://www.apache.org/licenses/LICENSE-2.0 - -module TwitterCldr - module Normalizers - class Base - class << self - def code_point_to_char(code_point) - [code_point.upcase.hex].pack('U*') - end - - def char_to_code_point(char) - code_point = char.unpack('U*').first.to_s(16).upcase - code_point.rjust(4, '0') #Pad to at least 4 digits - end - - def chars_to_code_points(chars) - chars.map { |char| char_to_code_point(char) } - end - - def code_points_to_chars(code_points) - code_points.map { |code_point| code_point_to_char(code_point) } - end - - def string_to_code_points(str) - chars_to_code_points(str.chars.to_a) - end - - def code_points_to_string(code_points) - code_points.inject("") { |str, code_point| str << code_point_to_char(code_point); str } - end - end - end - end -end \ No newline at end of file diff --git a/lib/twitter_cldr/normalizers/canonical/nfd.rb b/lib/twitter_cldr/normalizers/canonical/nfd.rb deleted file mode 100644 index 715da37ee..000000000 --- a/lib/twitter_cldr/normalizers/canonical/nfd.rb +++ /dev/null @@ -1,133 +0,0 @@ -# encoding: UTF-8 - -# Copyright 2012 Twitter, Inc -# http://www.apache.org/licenses/LICENSE-2.0 - -module TwitterCldr - module Normalizers - class NFD < Base - - HANGUL_CONSTANTS = { - :SBase => 0xAC00, - :LBase => 0x1100, - :VBase => 0x1161, - :TBase => 0x11A7, - :LCount => 19, - :VCount => 21, - :TCount => 28, - :NCount => 588, # VCount * TCount - :Scount => 11172 # LCount * NCount - } - - class << self - - def normalize(string) - # Convert string to code points - code_points = string.split('').map { |char| char_to_code_point(char) } - - # Normalize code points - normalized_code_points = normalize_code_points(code_points) - - # Convert normalized code points back to string - normalized_code_points.map { |code_point| code_point_to_char(code_point) }.join - end - - def normalize_code_points(code_points) - code_points = code_points.map { |code_point| decompose code_point }.flatten - reorder(code_points) - end - - # Recursively replace the given code point with the values in its Decomposition_Mapping property. 
- def decompose(code_point) - unicode_data = TwitterCldr::Shared::UnicodeData.for_code_point(code_point) - return code_point unless unicode_data - - decomposition_mapping = unicode_data.decomposition.split - - if unicode_data.name.include?('Hangul') - decompose_hangul(code_point) - # Return the code point if compatibility mapping or if no mapping exists - elsif decomposition_mapping.first =~ /<.*>/ || decomposition_mapping.empty? - code_point - else - decomposition_mapping.map do |decomposition_code_point| - decompose(decomposition_code_point) - end.flatten - end - end - - private - - # Special decomposition for Hangul syllables. - # Documented in Section 3.12 at http://www.unicode.org/versions/Unicode6.1.0/ch03.pdf - def decompose_hangul(code_point) - s_index = code_point.hex - HANGUL_CONSTANTS[:SBase] - - l_index = s_index / HANGUL_CONSTANTS[:NCount] - v_index = (s_index % HANGUL_CONSTANTS[:NCount]) / HANGUL_CONSTANTS[:TCount] - t_index = s_index % HANGUL_CONSTANTS[:TCount] - - result = [] - - result << (HANGUL_CONSTANTS[:LBase] + l_index).to_s(16).upcase - result << (HANGUL_CONSTANTS[:VBase] + v_index).to_s(16).upcase - result << (HANGUL_CONSTANTS[:TBase] + t_index).to_s(16).upcase if t_index > 0 - - result - end - - # Swap any two adjacent code points A & B if ccc(A) > ccc(B) > 0. - def reorder(code_points) - code_points_with_cc = code_points.map { |cp| [cp, combining_class_for(cp)] } - - result = [] - accum = [] - - code_points_with_cc.each do |cp_with_cc| - if cp_with_cc[1] == 0 - unless accum.empty? - result.concat(stable_sort(accum)) - accum = [] - end - result << cp_with_cc - else - accum << cp_with_cc - end - end - - result.concat(stable_sort(accum)) unless accum.empty? - - result.map { |cp_with_cc| cp_with_cc[0] } - end - - def stable_sort(code_points_with_cc) - n = code_points_with_cc.size - 2 - - code_points_with_cc.size.times do - swapped = false - - (0..n).each do |j| - if code_points_with_cc[j][1] > code_points_with_cc[j + 1][1] - code_points_with_cc[j], code_points_with_cc[j + 1] = code_points_with_cc[j + 1], code_points_with_cc[j] - swapped = true - end - end - - break unless swapped - n -= 1 - end - - code_points_with_cc - end - - def combining_class_for(code_point) - TwitterCldr::Shared::UnicodeData.for_code_point(code_point).combining_class.to_i - rescue NoMethodError - 0 - end - - end - - end - end -end \ No newline at end of file diff --git a/lib/twitter_cldr/normalizers/nfd.rb b/lib/twitter_cldr/normalizers/nfd.rb new file mode 100644 index 000000000..94ca78e08 --- /dev/null +++ b/lib/twitter_cldr/normalizers/nfd.rb @@ -0,0 +1,30 @@ +# encoding: UTF-8 + +# Copyright 2012 Twitter, Inc +# http://www.apache.org/licenses/LICENSE-2.0 + +module TwitterCldr + module Normalizers + + # Implements normalization of a Unicode string to Normalization Form D (NFD). + # This normalization includes only Canonical Decomposition. + # + class NFD < NFKD + + class << self + + protected + + # Returns code point's Decomposition Mapping based on its Unicode data. Returns nil if the mapping has + # compatibility type (it contains compatibility formatting tag). 
+ # + def decomposition_mapping(unicode_data) + mapping = parse_decomposition_mapping(unicode_data) + mapping unless compatibility_decomposition?(mapping) + end + + end + + end + end +end \ No newline at end of file diff --git a/lib/twitter_cldr/normalizers/nfkd.rb b/lib/twitter_cldr/normalizers/nfkd.rb new file mode 100644 index 000000000..399128595 --- /dev/null +++ b/lib/twitter_cldr/normalizers/nfkd.rb @@ -0,0 +1,163 @@ +# encoding: UTF-8 + +# Copyright 2012 Twitter, Inc +# http://www.apache.org/licenses/LICENSE-2.0 + +module TwitterCldr + # Normalizers module includes algorithm for Unicode normalization. Basic information on this topic can be found in the + # Unicode Standard Annex #15 "Unicode Normalization Forms" at http://www.unicode.org/reports/tr15/. More detailed + # description is given in the section "3.11 Normalization Forms" of the Unicode Standard core specification. The + # latest version at the moment (for Unicode 6.1) is available at http://www.unicode.org/versions/Unicode6.1.0/ch03.pdf. + # + module Normalizers + class NFKD + + class << self + + def normalize(string) + code_points = TwitterCldr::Utils::CodePoints.from_string(string) + normalized_code_points = normalize_code_points(code_points) + TwitterCldr::Utils::CodePoints.to_string(normalized_code_points) + end + + def normalize_code_points(code_points) + canonical_ordering(decomposition(code_points)) + end + + protected + + def decomposition(code_points) + code_points.map{ |code_point| decompose_recursively(code_point) }.flatten + end + + # Recursively decomposes a given code point with the values in its Decomposition Mapping property. + # + def decompose_recursively(code_point) + unicode_data = TwitterCldr::Shared::UnicodeData.for_code_point(code_point) + return code_point unless unicode_data + + if unicode_data.name.include?('Hangul') + decompose_hangul(code_point) + else + decompose_regular(code_point, decomposition_mapping(unicode_data)) + end + end + + # Decomposes regular (non-Hangul) code point. + # + def decompose_regular(code_point, mapping) + if mapping && !mapping.empty? + mapping.map{ |cp| decompose_recursively(cp) }.flatten + else + code_point + end + end + + # Returns code point's Decomposition Mapping based on its Unicode data. + # + def decomposition_mapping(unicode_data) + mapping = parse_decomposition_mapping(unicode_data) + mapping.shift if compatibility_decomposition?(mapping) # remove compatibility formatting tag + mapping + end + + def compatibility_decomposition?(mapping) + !!(COMPATIBILITY_FORMATTING_TAG_REGEXP =~ mapping.first) + end + + def parse_decomposition_mapping(unicode_data) + unicode_data.decomposition.split + end + + # Special decomposition for Hangul syllables. 
Documented in Section 3.12 at + # http://www.unicode.org/versions/Unicode6.1.0/ch03.pdf + # + def decompose_hangul(code_point) + s_index = code_point.hex - HANGUL_DECOMPOSITION_CONSTANTS[:SBase] + + l_index = s_index / HANGUL_DECOMPOSITION_CONSTANTS[:NCount] + v_index = (s_index % HANGUL_DECOMPOSITION_CONSTANTS[:NCount]) / HANGUL_DECOMPOSITION_CONSTANTS[:TCount] + t_index = s_index % HANGUL_DECOMPOSITION_CONSTANTS[:TCount] + + result = [] + + result << (HANGUL_DECOMPOSITION_CONSTANTS[:LBase] + l_index).to_s(16).upcase + result << (HANGUL_DECOMPOSITION_CONSTANTS[:VBase] + v_index).to_s(16).upcase + result << (HANGUL_DECOMPOSITION_CONSTANTS[:TBase] + t_index).to_s(16).upcase if t_index > 0 + + result + end + + # Performs the Canonical Ordering Algorithm by stable sorting of every subsequence of combining code points + # (code points that have combining class greater than zero). + # + def canonical_ordering(code_points) + code_points_with_cc = code_points.map { |cp| [cp, combining_class_for(cp)] } + + result = [] + accum = [] + + code_points_with_cc.each do |cp_with_cc| + if cp_with_cc[1] == 0 + unless accum.empty? + result.concat(stable_sort(accum)) + accum = [] + end + result << cp_with_cc + else + accum << cp_with_cc + end + end + + result.concat(stable_sort(accum)) unless accum.empty? + + result.map { |cp_with_cc| cp_with_cc[0] } + end + + # Performs stable sorting of a sequence of [code_point, combining_class] pairs. + # + def stable_sort(code_points_with_cc) + n = code_points_with_cc.size - 2 + + code_points_with_cc.size.times do + swapped = false + + (0..n).each do |j| + if code_points_with_cc[j][1] > code_points_with_cc[j + 1][1] + code_points_with_cc[j], code_points_with_cc[j + 1] = code_points_with_cc[j + 1], code_points_with_cc[j] + swapped = true + end + end + + break unless swapped + n -= 1 + end + + code_points_with_cc + end + + def combining_class_for(code_point) + TwitterCldr::Shared::UnicodeData.for_code_point(code_point).combining_class.to_i + rescue NoMethodError + 0 + end + + end + + COMPATIBILITY_FORMATTING_TAG_REGEXP = /^<.*>$/ + + HANGUL_DECOMPOSITION_CONSTANTS = { + :SBase => 0xAC00, + :LBase => 0x1100, + :VBase => 0x1161, + :TBase => 0x11A7, + :LCount => 19, + :VCount => 21, + :TCount => 28, + :NCount => 588, # VCount * TCount + :Scount => 11172 # LCount * NCount + } + + end + end +end diff --git a/lib/twitter_cldr/utils.rb b/lib/twitter_cldr/utils.rb index f6115d6c8..f38210d59 100644 --- a/lib/twitter_cldr/utils.rb +++ b/lib/twitter_cldr/utils.rb @@ -3,11 +3,11 @@ # Copyright 2012 Twitter, Inc # http://www.apache.org/licenses/LICENSE-2.0 -require 'twitter_cldr/utils/interpolation' - module TwitterCldr module Utils + autoload :CodePoints, 'twitter_cldr/utils/code_points' + class << self # adapted from: http://snippets.dzone.com/posts/show/11121 (first comment) @@ -25,4 +25,6 @@ def deep_symbolize_keys(arg) end end -end \ No newline at end of file +end + +require 'twitter_cldr/utils/interpolation' \ No newline at end of file diff --git a/lib/twitter_cldr/utils/code_points.rb b/lib/twitter_cldr/utils/code_points.rb new file mode 100644 index 000000000..185498f2e --- /dev/null +++ b/lib/twitter_cldr/utils/code_points.rb @@ -0,0 +1,40 @@ +# encoding: UTF-8 + +# Copyright 2012 Twitter, Inc +# http://www.apache.org/licenses/LICENSE-2.0 + +module TwitterCldr + module Utils + module CodePoints + + class << self + + def to_char(code_point) + [code_point.upcase.hex].pack('U*') + end + + def from_char(char) + char.unpack('U*').first.to_s(16).upcase.rjust(4, '0') + end + + def 
from_chars(chars) + chars.map { |char| from_char(char) } + end + + def to_chars(code_points) + code_points.map { |code_point| to_char(code_point) } + end + + def from_string(str) + from_chars(str.chars.to_a) + end + + def to_string(code_points) + code_points.map{ |code_point| to_char(code_point) }.join + end + + end + + end + end +end \ No newline at end of file diff --git a/spec/normalizers/base_spec.rb b/spec/normalizers/base_spec.rb deleted file mode 100644 index a38e03add..000000000 --- a/spec/normalizers/base_spec.rb +++ /dev/null @@ -1,60 +0,0 @@ -# encoding: UTF-8 - -# Copyright 2012 Twitter, Inc -# http://www.apache.org/licenses/LICENSE-2.0 - -require 'spec_helper' - -describe TwitterCldr::Normalizers::Base do - describe "#code_point_to_char" do - it "converts unicode code points to the actual character" do - TwitterCldr::Normalizers::Base.code_point_to_char("221E").should == "∞" - end - end - - describe "#char_to_code_point" do - it "converts a character to a unicode code point" do - TwitterCldr::Normalizers::Base.char_to_code_point("∞").should == "221E" - end - end - - describe "#chars_to_code_points" do - it "should handle an empty array" do - TwitterCldr::Normalizers::Base.chars_to_code_points([]).should == [] - end - - it "converts a char array to an array of unicode code points" do - TwitterCldr::Normalizers::Base.chars_to_code_points(["e", "s", "p"]).should == ["0065", "0073", "0070"] - end - end - - describe "#code_points_to_chars" do - it "should handle an empty array" do - TwitterCldr::Normalizers::Base.code_points_to_chars([]).should == [] - end - - it "converts an array of unicode code points to an array of chars" do - TwitterCldr::Normalizers::Base.code_points_to_chars(["0065", "0073", "0070"]).should == ["e", "s", "p"] - end - end - - describe "#string_to_code_points" do - it "should handle an empty string" do - TwitterCldr::Normalizers::Base.string_to_code_points("").should == [] - end - - it "converts a string into an array of unicode code points" do - TwitterCldr::Normalizers::Base.string_to_code_points("español").should == ["0065", "0073", "0070", "0061", "00F1", "006F", "006C"] - end - end - - describe "#code_points_to_string" do - it "should handle an empty array" do - TwitterCldr::Normalizers::Base.code_points_to_string([]).should == "" - end - - it "converts an array of unicode code points to a string" do - TwitterCldr::Normalizers::Base.code_points_to_string(["0065", "0073", "0070", "0061", "00F1", "006F", "006C"]).should == "español" - end - end -end \ No newline at end of file diff --git a/spec/normalizers/canonical/nfd_spec.rb b/spec/normalizers/canonical/nfd_spec.rb deleted file mode 100644 index 81f38e31b..000000000 --- a/spec/normalizers/canonical/nfd_spec.rb +++ /dev/null @@ -1,79 +0,0 @@ -# encoding: UTF-8 - -# Copyright 2012 Twitter, Inc -# http://www.apache.org/licenses/LICENSE-2.0 - -require 'spec_helper' - -require 'open-uri' - -include TwitterCldr::Normalizers - -describe NFD do - - NORMALIZERS_SPEC_PATH = File.dirname(File.dirname(__FILE__)) - - NORMALIZATION_TEST_URL = 'http://unicode.org/Public/UNIDATA/NormalizationTest.txt' - - describe "#normalize" do - NFD.normalize("庠摪饢鼢豦樄澸脧鱵礩翜艰").should == "庠摪饢鼢豦樄澸脧鱵礩翜艰" - NFD.normalize("䷙䷿").should == "䷙䷿" - NFD.normalize("ᎿᎲᎪᏨᎨᏪᎧᎵᏥ").should == "ᎿᎲᎪᏨᎨᏪᎧᎵᏥ" - NFD.normalize("ᆙᅓᆼᄋᇶ").should == "ᆙᅓᆼᄋᇶ" - NFD.normalize("…‾⁋
⁒‒′‾⁖").should == "…‾⁋
⁒‒′‾⁖" - NFD.normalize("ⶾⷕⶱⷀ").should == "ⶾⷕⶱⷀ" - end - - describe "#decompose" do - it "does not decompose a character with no decomposition mapping" do - code_points = %w[0EB8 041F 0066 1F52C A2D6] - code_points.each do |code_point| - NFD.decompose(code_point).should == code_point - end - end - - it "does not decompose a character with compatibility decomposition mapping" do - code_points = %w[A770 FB02 FC35 FD20 00BC] - code_points.each do |code_point| - NFD.decompose(code_point).should == code_point - end - end - end - - describe "#normalize_code_points" do - it "passes all the tests in NormalizersTestShort.txt" do - open(File.join(NORMALIZERS_SPEC_PATH, 'NormalizationTestShort.txt'), 'r:UTF-8') do |file| - run_normalization_test(file) - end - end - - it "passes all the tests in NormalizersTest.txt", :slow => true do - file_path = File.join(NORMALIZERS_SPEC_PATH, 'NormalizationTest.txt') - - unless File.file?(file_path) - print ' Downloading NormalizationTest.txt ... ' - open(file_path, 'w') { |file| file.write(open(NORMALIZATION_TEST_URL).read) } - puts 'done.' - end - - open(file_path, 'r:UTF-8') do |file| - run_normalization_test(file) - end - end - end - - def run_normalization_test(file) - file.each do |line| - next if line[0,1] =~ /(@|#)/ || line.empty? - - c1, c2, c3, c4, c5 = line.split(';')[0...5].map { |cps| cps.split } - - NFD.normalize_code_points(c1).should == c3 - NFD.normalize_code_points(c2).should == c3 - NFD.normalize_code_points(c3).should == c3 - NFD.normalize_code_points(c4).should == c5 - NFD.normalize_code_points(c5).should == c5 - end - end - -end \ No newline at end of file diff --git a/spec/normalizers/nfd_spec.rb b/spec/normalizers/nfd_spec.rb new file mode 100644 index 000000000..3d00a9c19 --- /dev/null +++ b/spec/normalizers/nfd_spec.rb @@ -0,0 +1,21 @@ +# encoding: UTF-8 + +# Copyright 2012 Twitter, Inc +# http://www.apache.org/licenses/LICENSE-2.0 + +require 'spec_helper' + +include TwitterCldr::Normalizers + +describe NFD do + + describe "#normalize" do + NFD.normalize("庠摪饢鼢豦樄澸脧鱵礩翜艰").should == "庠摪饢鼢豦樄澸脧鱵礩翜艰" + NFD.normalize("䷙䷿").should == "䷙䷿" + NFD.normalize("ᎿᎲᎪᏨᎨᏪᎧᎵᏥ").should == "ᎿᎲᎪᏨᎨᏪᎧᎵᏥ" + NFD.normalize("ᆙᅓᆼᄋᇶ").should == "ᆙᅓᆼᄋᇶ" + NFD.normalize("…‾⁋
⁒‒′‾⁖").should == "…‾⁋
⁒‒′‾⁖" + NFD.normalize("ⶾⷕⶱⷀ").should == "ⶾⷕⶱⷀ" + end + +end \ No newline at end of file diff --git a/spec/normalizers/normalization_spec.rb b/spec/normalizers/normalization_spec.rb new file mode 100644 index 000000000..c605304dd --- /dev/null +++ b/spec/normalizers/normalization_spec.rb @@ -0,0 +1,96 @@ +# encoding: UTF-8 + +# Copyright 2012 Twitter, Inc +# http://www.apache.org/licenses/LICENSE-2.0 + +require 'spec_helper' + +require 'open-uri' + +include TwitterCldr::Normalizers + +describe 'Unicode Normalization Algorithms' do + + NORMALIZERS_SPEC_PATH = File.dirname(__FILE__) + SHORT_TEST_PATH = File.join(NORMALIZERS_SPEC_PATH, 'NormalizationTestShort.txt') + FULL_TEST_PATH = File.join(NORMALIZERS_SPEC_PATH, 'NormalizationTest.txt') + + NORMALIZATION_TEST_URL = 'http://unicode.org/Public/UNIDATA/NormalizationTest.txt' + + shared_examples_for 'a normalization algorithm' do + it 'passes all the tests in NormalizersTestShort.txt' do + run_normalization_test(described_class, invariants, SHORT_TEST_PATH) + end + + it 'passes all the tests in NormalizersTest.txt', :slow => true do + prepare_full_test + run_normalization_test(described_class, invariants, FULL_TEST_PATH) + end + end + + describe NFD do + let(:invariants) { { 3 => [1, 2, 3], 5 => [4, 5] } } + it_behaves_like 'a normalization algorithm' + end + + describe NFKD do + let(:invariants) { { 5 => [1, 2, 3, 4, 5] } } + it_behaves_like 'a normalization algorithm' + end + + # Runs standard Unicode normalization tests from `file_path` for a given `normalizer`. Expected invariants are + # specified via `invariants` hash. + # + # E.g., if `invariants` is { 2 => [1, 2, 3], 4 => [4, 5] } than the following invariants are expected to be true: + # + # c2 == normalized(c1) == normalized(c2) == normalized(c3) + # c4 == normalized(c4) == normalized(c5) + # + # where (c1, c2,...) are columns of the normalization test separated by semicolons and normalized() is the + # normalization function. Note, how expectation and tests columns indexes match the numbers in the `invariants` hash. + # + def run_normalization_test(normalizer, invariants, file_path) + open(file_path, 'r:UTF-8') do |file| + file.each do |line| + next if line.empty? || line =~ /^(@|#)/ + + data = line.split(';')[0...5].map { |cps| cps.split } + + invariants.each do |expected_index, tests| + expected = data[expected_index - 1] + + tests.each do |test_index| + test = data[test_index - 1] + + normalized = normalizer.normalize_code_points(test) + + message = normalization_error_message(line, test, expected, normalized, test_index, expected_index) + normalized.should(eq(expected), message) + end + end + end + end + end + + # Generates helpful error message for normalization test failure. + # + def normalization_error_message(line, test, expected, normalized, test_index, expected_index) + <<-END +Test: "#{line.strip}" +Invariant: normalized(c#{test_index}) == c#{expected_index} +Expected: normalized(#{test.inspect}) == #{expected.inspect} +Got: #{normalized.inspect} + END + end + + # Downloads full Unicode normalization tests suit if necessary. + # + def prepare_full_test + return if File.file?(FULL_TEST_PATH) + + print ' Downloading NormalizationTest.txt ... ' + open(FULL_TEST_PATH, 'w') { |file| file.write(open(NORMALIZATION_TEST_URL).read) } + puts 'done.' 
+ end + +end \ No newline at end of file diff --git a/spec/utils/code_point_spec.rb b/spec/utils/code_point_spec.rb new file mode 100644 index 000000000..b4f63d087 --- /dev/null +++ b/spec/utils/code_point_spec.rb @@ -0,0 +1,62 @@ +# encoding: UTF-8 + +# Copyright 2012 Twitter, Inc +# http://www.apache.org/licenses/LICENSE-2.0 + +require 'spec_helper' + +describe TwitterCldr::Utils::CodePoints do + + describe '#to_char' do + it 'converts unicode code points to the actual character' do + TwitterCldr::Utils::CodePoints.to_char('221E').should == '∞' + end + end + + describe '#from_char' do + it 'converts a character to a unicode code point' do + TwitterCldr::Utils::CodePoints.from_char('∞').should == '221E' + end + end + + describe '#to_chars' do + it 'should handle an empty array' do + TwitterCldr::Utils::CodePoints.to_chars([]).should == [] + end + + it 'converts an array of unicode code points to an array of chars' do + TwitterCldr::Utils::CodePoints.to_chars(%w[0065 0073 0070]).should == %w[e s p] + end + end + + describe '#from_chars' do + it 'should handle an empty array' do + TwitterCldr::Utils::CodePoints.from_chars([]).should == [] + end + + it 'converts an array of chars to an array of unicode code points' do + TwitterCldr::Utils::CodePoints.from_chars(%w[e s p]).should == %w[0065 0073 0070] + end + end + + describe '#to_string' do + it 'should handle an empty array' do + TwitterCldr::Utils::CodePoints.to_string([]).should == '' + end + + it 'converts an array of unicode code points to a string' do + TwitterCldr::Utils::CodePoints.to_string(%w[0065 0073 0070 0061 00F1 006F 006C]).should == 'español' + end + end + + describe '#from_string' do + it 'should handle an empty string' do + TwitterCldr::Utils::CodePoints.from_string('').should == [] + end + + it 'converts a string into an array of unicode code points' do + TwitterCldr::Utils::CodePoints.from_string('español').should == %w[0065 0073 0070 0061 00F1 006F 006C] + end + end + +end \ No newline at end of file
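
A usage sketch of the reorganized API (reviewer notes, not part of the patch). The CodePoints return values mirror the specs added in this diff; the Å/ﬁ and combining-mark expectations below are standard UAX #15 behavior and are assumptions about output rather than values taken from the patch itself.

```ruby
require 'twitter_cldr'

# Code point helpers moved from Normalizers::Base to Utils::CodePoints
# (uppercase hex strings, left-padded to 4 digits).
TwitterCldr::Utils::CodePoints.from_string('español')
# => ["0065", "0073", "0070", "0061", "00F1", "006F", "006C"]
TwitterCldr::Utils::CodePoints.to_string(%w[0065 0073 0070])
# => "esp"

# NFD (canonical decomposition only) is now a thin subclass of NFKD, which
# additionally applies compatibility mappings such as the fi ligature.
TwitterCldr::Normalizers::NFD.normalize("\u00C5")   # Å => "A\u030A" (A + combining ring above)
TwitterCldr::Normalizers::NFD.normalize("\uFB01")   # ﬁ => "\uFB01" (compatibility-only mapping, left intact)
TwitterCldr::Normalizers::NFKD.normalize("\uFB01")  # ﬁ => "fi"

# Both forms reorder combining marks by canonical combining class, so
# dot above (ccc 230) sorts after dot below (ccc 220).
TwitterCldr::Normalizers::NFD.normalize("q\u0307\u0323")  # => "q\u0323\u0307"
```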
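
For reference, a standalone sketch of the Hangul arithmetic used in decompose_hangul, using the same constants as HANGUL_DECOMPOSITION_CONSTANTS in the patch (Unicode 6.1, section 3.12). The helper name and the sample syllables are illustrative only, not part of the library.

```ruby
# Hangul syllables decompose arithmetically rather than via a Decomposition Mapping.
S_BASE, L_BASE, V_BASE, T_BASE = 0xAC00, 0x1100, 0x1161, 0x11A7
N_COUNT, T_COUNT = 588, 28  # NCount = VCount * TCount

def decompose_hangul_example(code_point)
  s_index = code_point.hex - S_BASE

  l_index = s_index / N_COUNT               # leading consonant jamo (L)
  v_index = (s_index % N_COUNT) / T_COUNT   # vowel jamo (V)
  t_index = s_index % T_COUNT               # optional trailing consonant jamo (T)

  result = [L_BASE + l_index, V_BASE + v_index]
  result << (T_BASE + t_index) if t_index > 0
  result.map { |cp| cp.to_s(16).upcase }
end

decompose_hangul_example('AC00')  # 가 => ["1100", "1161"]
decompose_hangul_example('D55C')  # 한 => ["1112", "1161", "11AB"]
```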
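
The invariants hashes in normalization_spec.rb read as "expected column => columns that must normalize to it". Below is a worked example against one representative NormalizationTest.txt entry (columns are source; NFC; NFD; NFKC; NFKD); the specific line is reconstructed in the published test file's format and is illustrative, not taken from this patch.

```ruby
# Representative entry: U+1E0A LATIN CAPITAL LETTER D WITH DOT ABOVE.
line = '1E0A;1E0A;0044 0307;1E0A;0044 0307;'
c1, c2, c3, c4, c5 = line.split(';')[0...5].map { |cps| cps.split }

# The NFD invariants { 3 => [1, 2, 3], 5 => [4, 5] } assert that
#   c3 == NFD(c1) == NFD(c2) == NFD(c3) and c5 == NFD(c4) == NFD(c5).
TwitterCldr::Normalizers::NFD.normalize_code_points(c1)   # => ["0044", "0307"] == c3
TwitterCldr::Normalizers::NFD.normalize_code_points(c4)   # => ["0044", "0307"] == c5

# For NFKD the single invariant { 5 => [1, 2, 3, 4, 5] } requires every column
# to normalize to c5.
TwitterCldr::Normalizers::NFKD.normalize_code_points(c2)  # => ["0044", "0307"] == c5
```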