twitter · camertron · Apr 27, 2012 · Apr 13, 2012 · Apr 13, 2012 · Apr 13, 2012
diff --git a/lib/normalizers/base.rb b/lib/normalizers/base.rb
@@ -0,0 +1,18 @@
+# encoding: UTF-8
+
+module TwitterCldr
+  module Normalizers
+    class Base
+      class << self
+        def code_point_to_char(code_point)
+          [code_point.upcase.hex].pack('U*')
+        end
+        def char_to_code_point(char)
+          code_point = char.unpack('U*').first.to_s(16).upcase
+          #Pad to atleast 4 digits
+          code_point.rjust(4, '0')
+        end
+      end
+    end
+  end
+end
diff --git a/lib/normalizers/canonical/nfd.rb b/lib/normalizers/canonical/nfd.rb
@@ -0,0 +1,81 @@
+# encoding: UTF-8
+
+module TwitterCldr
+  module Normalizers
+    class NFD < Base
+      @@hangul_constants = {:SBase => "AC00".hex, :LBase => "1100".hex, :VBase => "1161".hex, :TBase => "11A7".hex,
+                            :Scount => 11172, :LCount => 19, :VCount => 21, :TCount => 28, :NCount => 588, :Scount => 1172}
+      class << self
+        def normalize(string)       
+          #Convert string to code points
+          code_points = string.split('').map { |char| char_to_code_point(char) }
+
+          #Normalize code points
+          normalized_code_points = normalize_code_points(code_points)
+
+          #Convert normalized code points back to string
+          normalized_code_points.map { |code_point| code_point_to_char(code_point) }.join
+        end
+
+        def normalize_code_points(code_points)          
+          code_points = code_points.map { |code_point| decompose code_point }.flatten
+          reorder code_points
+          code_points
+        end
+
+        #Recursively replace the given code point with the values in its Decomposition_Mapping property
+        def decompose(code_point)
+          unicode_data = TwitterCldr::Shared::UnicodeData.for_code_point(code_point)
+          return code_point unless unicode_data
+          decomposition_mapping = unicode_data[5].split
+
+          # Special decomposition for Hangul syllables.
+          # Documented in Section 3.12 at http://www.unicode.org/versions/Unicode6.1.0/ch03.pdf
+          if unicode_data[1].include? 'Hangul'
+            sIndex = code_point.hex - @@hangul_constants[:SBase]
+
+            lIndex = sIndex / @@hangul_constants[:NCount]
+            vIndex = (sIndex % @@hangul_constants[:NCount]) / @@hangul_constants[:TCount]
+            tIndex = sIndex % @@hangul_constants[:TCount]
+
+            lPart = (@@hangul_constants[:LBase] + lIndex).to_s(16).upcase
+            vPart = (@@hangul_constants[:VBase] + vIndex).to_s(16).upcase
+            tPart = (@@hangul_constants[:TBase] + tIndex).to_s(16).upcase if tIndex > 0
+
+            [lPart, vPart, tPart].compact
+
+          #Return the code point if compatibility mapping or if no mapping exists
+          elsif decomposition_mapping.first =~ /<.*>/ || decomposition_mapping.empty?
+            code_point
+          else
+            decomposition_mapping.map do |decomposition_code_point|
+              decompose(decomposition_code_point)
+            end.flatten
+          end
+        end
+
+        #Swap any two adjacent code points A & B if ccc(A) > ccc(B) > 0
+        def reorder(code_points)
+          (code_points.size).times do
+            code_points.each_with_index do |cp, i|
+              unless cp == code_points.last
+                ccc_a, ccc_b = combining_class_for(cp), combining_class_for(code_points[i+1])
+                if (ccc_a > ccc_b) && (ccc_b > 0)
+                  code_points[i], code_points[i+1] = code_points[i+1], code_points[i]
+                end
+              end
+            end
+          end
+        end
+
+        def combining_class_for(code_point)
+          begin
+            unicode_data = TwitterCldr::Shared::UnicodeData.for_code_point(code_point)[3].to_i
+          rescue NoMethodError
+            0
+          end          
+        end
+      end
+    end
+  end
+end
diff --git a/lib/shared/unicode_data.rb b/lib/shared/unicode_data.rb
@@ -12,8 +12,25 @@ def for_code_point(code_point)
             range.include? code_point.to_i(16)
           end 
 
-          TwitterCldr.get_resource("unicode_data", target.first)[code_point.to_sym] if target
-        end   
+          if target
+            block_data = TwitterCldr.get_resource("unicode_data", target.first)          
+            block_data.fetch(code_point.to_sym) { |code_point_sym| get_range_start(code_point_sym, block_data) }
+          end
+        end
+
+        private
+        # Check whether the block represents a code point range. The first code point of a range has a name
+        # enclosed in <> that ends with ', First', e.g. <CJK Ideograph Extension A, First>
+        # http://unicode.org/reports/tr44/#Code_Point_Ranges
+        def get_range_start(code_point, block_data)
+          start_code_point = block_data.keys.sort_by { |key| key.to_s.to_i(16) }.first
+          start_data = block_data[start_code_point].clone
+          if start_data[1] =~ /<.*, First>/
+            start_data[0] = code_point.to_s
+            start_data[1] = start_data[1].sub(', First', '')
+            start_data
+          end
+        end
       end
     end
   end

diff --git a/lib/twitter_cldr.rb b/lib/twitter_cldr.rb
@@ -114,3 +114,7 @@ def self.supported_locale?(locale)
 require 'formatters/numbers/helpers/base'
 require 'formatters/numbers/helpers/fraction'
 require 'formatters/numbers/helpers/integer'
+
+# all normalizers
+require 'normalizers/base'
+require 'normalizers/canonical/nfd'