NFKD implementation #32

Merged
merged 11 commits into the base branch from the feature branch on May 21, 2012
@@ -32,7 +32,7 @@ def normalize
end
def code_points
- TwitterCldr::Normalizers::Base.string_to_code_points(@base_obj)
+ TwitterCldr::Utils::CodePoints.from_string(@base_obj)
end
def to_s
@@ -5,7 +5,7 @@
module TwitterCldr
module Normalizers
- autoload :Base, 'twitter_cldr/normalizers/base'
- autoload :NFD, 'twitter_cldr/normalizers/canonical/nfd'
+ autoload :NFD, 'twitter_cldr/normalizers/nfd'
+ autoload :NFKD, 'twitter_cldr/normalizers/nfkd'
end
end
@@ -1,37 +0,0 @@
-# encoding: UTF-8
-
-# Copyright 2012 Twitter, Inc
-# http://www.apache.org/licenses/LICENSE-2.0
-
-module TwitterCldr
- module Normalizers
- class Base
- class << self
- def code_point_to_char(code_point)
- [code_point.upcase.hex].pack('U*')
- end
-
- def char_to_code_point(char)
- code_point = char.unpack('U*').first.to_s(16).upcase
- code_point.rjust(4, '0') #Pad to at least 4 digits
- end
-
- def chars_to_code_points(chars)
- chars.map { |char| char_to_code_point(char) }
- end
-
- def code_points_to_chars(code_points)
- code_points.map { |code_point| code_point_to_char(code_point) }
- end
-
- def string_to_code_points(str)
- chars_to_code_points(str.chars.to_a)
- end
-
- def code_points_to_string(code_points)
- code_points.inject("") { |str, code_point| str << code_point_to_char(code_point); str }
- end
- end
- end
- end
-end
@@ -1,133 +0,0 @@
-# encoding: UTF-8
-
-# Copyright 2012 Twitter, Inc
-# http://www.apache.org/licenses/LICENSE-2.0
-
-module TwitterCldr
- module Normalizers
- class NFD < Base
-
- HANGUL_CONSTANTS = {
- :SBase => 0xAC00,
- :LBase => 0x1100,
- :VBase => 0x1161,
- :TBase => 0x11A7,
- :LCount => 19,
- :VCount => 21,
- :TCount => 28,
- :NCount => 588, # VCount * TCount
- :Scount => 11172 # LCount * NCount
- }
-
- class << self
-
- def normalize(string)
- # Convert string to code points
- code_points = string.split('').map { |char| char_to_code_point(char) }
-
- # Normalize code points
- normalized_code_points = normalize_code_points(code_points)
-
- # Convert normalized code points back to string
- normalized_code_points.map { |code_point| code_point_to_char(code_point) }.join
- end
-
- def normalize_code_points(code_points)
- code_points = code_points.map { |code_point| decompose code_point }.flatten
- reorder(code_points)
- end
-
- # Recursively replace the given code point with the values in its Decomposition_Mapping property.
- def decompose(code_point)
- unicode_data = TwitterCldr::Shared::UnicodeData.for_code_point(code_point)
- return code_point unless unicode_data
-
- decomposition_mapping = unicode_data.decomposition.split
-
- if unicode_data.name.include?('Hangul')
- decompose_hangul(code_point)
- # Return the code point if compatibility mapping or if no mapping exists
- elsif decomposition_mapping.first =~ /<.*>/ || decomposition_mapping.empty?
- code_point
- else
- decomposition_mapping.map do |decomposition_code_point|
- decompose(decomposition_code_point)
- end.flatten
- end
- end
-
- private
-
- # Special decomposition for Hangul syllables.
- # Documented in Section 3.12 at http://www.unicode.org/versions/Unicode6.1.0/ch03.pdf
- def decompose_hangul(code_point)
- s_index = code_point.hex - HANGUL_CONSTANTS[:SBase]
-
- l_index = s_index / HANGUL_CONSTANTS[:NCount]
- v_index = (s_index % HANGUL_CONSTANTS[:NCount]) / HANGUL_CONSTANTS[:TCount]
- t_index = s_index % HANGUL_CONSTANTS[:TCount]
-
- result = []
-
- result << (HANGUL_CONSTANTS[:LBase] + l_index).to_s(16).upcase
- result << (HANGUL_CONSTANTS[:VBase] + v_index).to_s(16).upcase
- result << (HANGUL_CONSTANTS[:TBase] + t_index).to_s(16).upcase if t_index > 0
-
- result
- end
-
- # Swap any two adjacent code points A & B if ccc(A) > ccc(B) > 0.
- def reorder(code_points)
- code_points_with_cc = code_points.map { |cp| [cp, combining_class_for(cp)] }
-
- result = []
- accum = []
-
- code_points_with_cc.each do |cp_with_cc|
- if cp_with_cc[1] == 0
- unless accum.empty?
- result.concat(stable_sort(accum))
- accum = []
- end
- result << cp_with_cc
- else
- accum << cp_with_cc
- end
- end
-
- result.concat(stable_sort(accum)) unless accum.empty?
-
- result.map { |cp_with_cc| cp_with_cc[0] }
- end
-
- def stable_sort(code_points_with_cc)
- n = code_points_with_cc.size - 2
-
- code_points_with_cc.size.times do
- swapped = false
-
- (0..n).each do |j|
- if code_points_with_cc[j][1] > code_points_with_cc[j + 1][1]
- code_points_with_cc[j], code_points_with_cc[j + 1] = code_points_with_cc[j + 1], code_points_with_cc[j]
- swapped = true
- end
- end
-
- break unless swapped
- n -= 1
- end
-
- code_points_with_cc
- end
-
- def combining_class_for(code_point)
- TwitterCldr::Shared::UnicodeData.for_code_point(code_point).combining_class.to_i
- rescue NoMethodError
- 0
- end
-
- end
-
- end
- end
-end
@@ -0,0 +1,30 @@
+# encoding: UTF-8
+
+# Copyright 2012 Twitter, Inc
+# http://www.apache.org/licenses/LICENSE-2.0
+
+module TwitterCldr
+ module Normalizers
+
+ # Implements normalization of a Unicode string to Normalization Form D (NFD).
+ # This normalization includes only Canonical Decomposition.
+ #
+ class NFD < NFKD
+
+ class << self
+
+ protected
+
+ # Returns code point's Decomposition Mapping based on its Unicode data. Returns nil if the mapping has
+ # compatibility type (it contains compatibility formatting tag).
+ #
+ def decomposition_mapping(unicode_data)

This comment has been minimized.

Show comment Hide comment
@camertron

camertron May 21, 2012

Collaborator

Ah ok. Just so I understand completely: this method does not handle compatibility mapping as NFKD does, so it simply returns nil if a compatibility mapping is required.

@camertron

camertron May 21, 2012

Collaborator

Ah ok. Just so I understand completely: this method does not handle compatibility mapping as NFKD does, so it simply returns nil if a compatibility mapping is required.

This comment has been minimized.

Show comment Hide comment
@KL-7

KL-7 May 21, 2012

Contributor

Yes, when NFD comes across compatibility mapping it skips it, while NFKD removes compatibility tag and goes further.

@KL-7

KL-7 May 21, 2012

Contributor

Yes, when NFD comes across compatibility mapping it skips it, while NFKD removes compatibility tag and goes further.

+ mapping = parse_decomposition_mapping(unicode_data)
+ mapping unless compatibility_decomposition?(mapping)
+ end
+
+ end
+
+ end
+ end
+end
@@ -0,0 +1,163 @@
+# encoding: UTF-8
+
+# Copyright 2012 Twitter, Inc
+# http://www.apache.org/licenses/LICENSE-2.0
+
+module TwitterCldr
+ # Normalizers module includes algorithm for Unicode normalization. Basic information on this topic can be found in the
+ # Unicode Standard Annex #15 "Unicode Normalization Forms" at http://www.unicode.org/reports/tr15/. More detailed
+ # description is given in the section "3.11 Normalization Forms" of the Unicode Standard core specification. The
+ # latest version at the moment (for Unicode 6.1) is available at http://www.unicode.org/versions/Unicode6.1.0/ch03.pdf.
+ #
+ module Normalizers
+ class NFKD
+
+ class << self
+
+ def normalize(string)
+ code_points = TwitterCldr::Utils::CodePoints.from_string(string)
+ normalized_code_points = normalize_code_points(code_points)
+ TwitterCldr::Utils::CodePoints.to_string(normalized_code_points)
+ end
+
+ def normalize_code_points(code_points)
+ canonical_ordering(decomposition(code_points))
+ end
+
+ protected
+
+ def decomposition(code_points)
+ code_points.map{ |code_point| decompose_recursively(code_point) }.flatten
+ end
+
+ # Recursively decomposes a given code point with the values in its Decomposition Mapping property.
+ #
+ def decompose_recursively(code_point)
+ unicode_data = TwitterCldr::Shared::UnicodeData.for_code_point(code_point)
+ return code_point unless unicode_data
+
+ if unicode_data.name.include?('Hangul')
+ decompose_hangul(code_point)
+ else
+ decompose_regular(code_point, decomposition_mapping(unicode_data))
+ end
+ end
+
+ # Decomposes regular (non-Hangul) code point.
+ #
+ def decompose_regular(code_point, mapping)
+ if mapping && !mapping.empty?
+ mapping.map{ |cp| decompose_recursively(cp) }.flatten
+ else
+ code_point
+ end
+ end
+
+ # Returns code point's Decomposition Mapping based on its Unicode data.
+ #
+ def decomposition_mapping(unicode_data)
+ mapping = parse_decomposition_mapping(unicode_data)
+ mapping.shift if compatibility_decomposition?(mapping) # remove compatibility formatting tag
+ mapping
+ end
+
+ def compatibility_decomposition?(mapping)
+ !!(COMPATIBILITY_FORMATTING_TAG_REGEXP =~ mapping.first)
+ end
+
+ def parse_decomposition_mapping(unicode_data)
+ unicode_data.decomposition.split
+ end
+
+ # Special decomposition for Hangul syllables. Documented in Section 3.12 at
+ # http://www.unicode.org/versions/Unicode6.1.0/ch03.pdf
+ #
+ def decompose_hangul(code_point)
+ s_index = code_point.hex - HANGUL_DECOMPOSITION_CONSTANTS[:SBase]
+
+ l_index = s_index / HANGUL_DECOMPOSITION_CONSTANTS[:NCount]
+ v_index = (s_index % HANGUL_DECOMPOSITION_CONSTANTS[:NCount]) / HANGUL_DECOMPOSITION_CONSTANTS[:TCount]
+ t_index = s_index % HANGUL_DECOMPOSITION_CONSTANTS[:TCount]
+
+ result = []
+
+ result << (HANGUL_DECOMPOSITION_CONSTANTS[:LBase] + l_index).to_s(16).upcase
+ result << (HANGUL_DECOMPOSITION_CONSTANTS[:VBase] + v_index).to_s(16).upcase
+ result << (HANGUL_DECOMPOSITION_CONSTANTS[:TBase] + t_index).to_s(16).upcase if t_index > 0
+
+ result
+ end
+
+ # Performs the Canonical Ordering Algorithm by stable sorting of every subsequence of combining code points
+ # (code points that have combining class greater than zero).
+ #
+ def canonical_ordering(code_points)
+ code_points_with_cc = code_points.map { |cp| [cp, combining_class_for(cp)] }
+
+ result = []
+ accum = []
+
+ code_points_with_cc.each do |cp_with_cc|
+ if cp_with_cc[1] == 0
+ unless accum.empty?
+ result.concat(stable_sort(accum))
+ accum = []
+ end
+ result << cp_with_cc
+ else
+ accum << cp_with_cc
+ end
+ end
+
+ result.concat(stable_sort(accum)) unless accum.empty?
+
+ result.map { |cp_with_cc| cp_with_cc[0] }
+ end
+
+ # Performs stable sorting of a sequence of [code_point, combining_class] pairs.

This comment has been minimized.

Show comment Hide comment
@camertron

camertron May 21, 2012

Collaborator

Could you add a link to the document (if you or Andrew used one) that explains this algorithm?

@camertron

camertron May 21, 2012

Collaborator

Could you add a link to the document (if you or Andrew used one) that explains this algorithm?

This comment has been minimized.

Show comment Hide comment
@KL-7

KL-7 May 21, 2012

Contributor

It's nothing more than a regular bubble sort with a small optimization that stops the algorithm if nothing was swapped during the iteration.

@KL-7

KL-7 May 21, 2012

Contributor

It's nothing more than a regular bubble sort with a small optimization that stops the algorithm if nothing was swapped during the iteration.

This comment has been minimized.

Show comment Hide comment
@KL-7

KL-7 May 21, 2012

Contributor

Ah, @camertron, do you mean a document explaining NFD & NFKD algorithms themselves?

@KL-7

KL-7 May 21, 2012

Contributor

Ah, @camertron, do you mean a document explaining NFD & NFKD algorithms themselves?

This comment has been minimized.

Show comment Hide comment
@camertron

camertron May 21, 2012

Collaborator

It would be great to add a comment at the top of this file with a link to the NFKD spec, but I was referring to a description of the stable sort algorithm you used. If you mention it's just a bubble sort that maintains the order of equal elements and exits early if no swap was performed, that should be fine.

@camertron

camertron May 21, 2012

Collaborator

It would be great to add a comment at the top of this file with a link to the NFKD spec, but I was referring to a description of the stable sort algorithm you used. If you mention it's just a bubble sort that maintains the order of equal elements and exits early if no swap was performed, that should be fine.

+ #
+ def stable_sort(code_points_with_cc)
+ n = code_points_with_cc.size - 2
+
+ code_points_with_cc.size.times do
+ swapped = false
+
+ (0..n).each do |j|
+ if code_points_with_cc[j][1] > code_points_with_cc[j + 1][1]
+ code_points_with_cc[j], code_points_with_cc[j + 1] = code_points_with_cc[j + 1], code_points_with_cc[j]
+ swapped = true
+ end
+ end
+
+ break unless swapped
+ n -= 1
+ end
+
+ code_points_with_cc
+ end
+
+ def combining_class_for(code_point)
+ TwitterCldr::Shared::UnicodeData.for_code_point(code_point).combining_class.to_i
+ rescue NoMethodError
+ 0
+ end
+
+ end
+
+ COMPATIBILITY_FORMATTING_TAG_REGEXP = /^<.*>$/
+
+ HANGUL_DECOMPOSITION_CONSTANTS = {
+ :SBase => 0xAC00,
+ :LBase => 0x1100,
+ :VBase => 0x1161,
+ :TBase => 0x11A7,
+ :LCount => 19,
+ :VCount => 21,
+ :TCount => 28,
+ :NCount => 588, # VCount * TCount
+ :Scount => 11172 # LCount * NCount
+ }
+
+ end
+ end
+end
Oops, something went wrong.