-
-
Notifications
You must be signed in to change notification settings - Fork 409
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add support for transliteration to ASCII.
- Loading branch information
Showing
8 changed files
with
271 additions
and
11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
# encoding: utf-8 | ||
module I18n
  module Backend
    # Transliteration support: turns strings containing non-ASCII characters
    # into ASCII approximations, driven either by a Hash of per-character
    # replacements or by a Proc that rewrites the whole string.
    module Transliterator

      # Substituted for a non-ASCII character that has no known approximation
      # when the caller does not supply a replacement of their own.
      DEFAULT_REPLACEMENT_CHAR = "?".freeze

      # Get a transliterator instance.
      #
      # rule - nil/false (use only the default approximations), a Hash of
      #        additional character => replacement pairs, or a Proc that
      #        receives the string and returns the transliterated result.
      #
      # Returns a HashTransliterator or a ProcTransliterator.
      # Raises I18n::ArgumentError for any other kind of rule.
      def self.get(rule = nil)
        if !rule || rule.kind_of?(Hash)
          HashTransliterator.new(rule)
        elsif rule.kind_of? Proc
          ProcTransliterator.new(rule)
        else
          raise I18n::ArgumentError, "Transliteration rule must be a proc or a hash."
        end
      end

      # A transliterator which accepts a Proc as its transliteration rule.
      class ProcTransliterator

        def initialize(rule)
          @rule = rule
        end

        # Calls the rule with +string+ and returns whatever it returns.
        # +replacement+ is accepted only so the signature matches
        # HashTransliterator#transliterate; it is not passed to the rule.
        def transliterate(string, replacement = nil)
          @rule.call(string)
        end

      end

      # A transliterator which accepts a Hash of characters as its translation
      # rule.
      class HashTransliterator

        # Built-in approximations for the characters U+00C0..U+017E (Latin
        # letters with diacritics, ligatures, and the multiplication sign).
        # Frozen because every instance reads from this one shared table.
        DEFAULT_APPROXIMATIONS = {
          "À"=>"A", "Á"=>"A", "Â"=>"A", "Ã"=>"A", "Ä"=>"A", "Å"=>"A", "Æ"=>"AE",
          "Ç"=>"C", "È"=>"E", "É"=>"E", "Ê"=>"E", "Ë"=>"E", "Ì"=>"I", "Í"=>"I",
          "Î"=>"I", "Ï"=>"I", "Ð"=>"D", "Ñ"=>"N", "Ò"=>"O", "Ó"=>"O", "Ô"=>"O",
          "Õ"=>"O", "Ö"=>"O", "×"=>"x", "Ø"=>"O", "Ù"=>"U", "Ú"=>"U", "Û"=>"U",
          "Ü"=>"U", "Ý"=>"Y", "Þ"=>"Th", "ß"=>"ss", "à"=>"a", "á"=>"a", "â"=>"a",
          "ã"=>"a", "ä"=>"a", "å"=>"a", "æ"=>"ae", "ç"=>"c", "è"=>"e", "é"=>"e",
          "ê"=>"e", "ë"=>"e", "ì"=>"i", "í"=>"i", "î"=>"i", "ï"=>"i", "ð"=>"d",
          "ñ"=>"n", "ò"=>"o", "ó"=>"o", "ô"=>"o", "õ"=>"o", "ö"=>"o", "ø"=>"o",
          "ù"=>"u", "ú"=>"u", "û"=>"u", "ü"=>"u", "ý"=>"y", "þ"=>"th", "ÿ"=>"y",
          "Ā"=>"A", "ā"=>"a", "Ă"=>"A", "ă"=>"a", "Ą"=>"A", "ą"=>"a", "Ć"=>"C",
          "ć"=>"c", "Ĉ"=>"C", "ĉ"=>"c", "Ċ"=>"C", "ċ"=>"c", "Č"=>"C", "č"=>"c",
          "Ď"=>"D", "ď"=>"d", "Đ"=>"D", "đ"=>"d", "Ē"=>"E", "ē"=>"e", "Ĕ"=>"E",
          "ĕ"=>"e", "Ė"=>"E", "ė"=>"e", "Ę"=>"E", "ę"=>"e", "Ě"=>"E", "ě"=>"e",
          "Ĝ"=>"G", "ĝ"=>"g", "Ğ"=>"G", "ğ"=>"g", "Ġ"=>"G", "ġ"=>"g", "Ģ"=>"G",
          "ģ"=>"g", "Ĥ"=>"H", "ĥ"=>"h", "Ħ"=>"H", "ħ"=>"h", "Ĩ"=>"I", "ĩ"=>"i",
          "Ī"=>"I", "ī"=>"i", "Ĭ"=>"I", "ĭ"=>"i", "Į"=>"I", "į"=>"i", "İ"=>"I",
          "ı"=>"i", "Ĳ"=>"IJ", "ĳ"=>"ij", "Ĵ"=>"J", "ĵ"=>"j", "Ķ"=>"K", "ķ"=>"k",
          "ĸ"=>"k", "Ĺ"=>"L", "ĺ"=>"l", "Ļ"=>"L", "ļ"=>"l", "Ľ"=>"L", "ľ"=>"l",
          "Ŀ"=>"L", "ŀ"=>"l", "Ł"=>"L", "ł"=>"l", "Ń"=>"N", "ń"=>"n", "Ņ"=>"N",
          "ņ"=>"n", "Ň"=>"N", "ň"=>"n", "ŉ"=>"'n", "Ŋ"=>"NG", "ŋ"=>"ng",
          "Ō"=>"O", "ō"=>"o", "Ŏ"=>"O", "ŏ"=>"o", "Ő"=>"O", "ő"=>"o", "Œ"=>"OE",
          "œ"=>"oe", "Ŕ"=>"R", "ŕ"=>"r", "Ŗ"=>"R", "ŗ"=>"r", "Ř"=>"R", "ř"=>"r",
          "Ś"=>"S", "ś"=>"s", "Ŝ"=>"S", "ŝ"=>"s", "Ş"=>"S", "ş"=>"s", "Š"=>"S",
          "š"=>"s", "Ţ"=>"T", "ţ"=>"t", "Ť"=>"T", "ť"=>"t", "Ŧ"=>"T", "ŧ"=>"t",
          "Ũ"=>"U", "ũ"=>"u", "Ū"=>"U", "ū"=>"u", "Ŭ"=>"U", "ŭ"=>"u", "Ů"=>"U",
          "ů"=>"u", "Ű"=>"U", "ű"=>"u", "Ų"=>"U", "ų"=>"u", "Ŵ"=>"W", "ŵ"=>"w",
          "Ŷ"=>"Y", "ŷ"=>"y", "Ÿ"=>"Y", "Ź"=>"Z", "ź"=>"z", "Ż"=>"Z", "ż"=>"z",
          "Ž"=>"Z", "ž"=>"z"
        }.freeze

        # rule - optional Hash of extra character => replacement pairs. Keys
        #        and values may be Strings or Symbols; they are stored as
        #        Strings. Entries in +rule+ override the defaults because
        #        they are merged in after them. The Hash itself is only read.
        def initialize(rule = nil)
          @rule = rule
          add DEFAULT_APPROXIMATIONS
          add rule if rule
        end

        # Replaces every character outside the ASCII range (\x00-\x7f) with
        # its approximation. Characters without an approximation become
        # +replacement+, or DEFAULT_REPLACEMENT_CHAR when none is given.
        # ASCII characters are never touched.
        def transliterate(string, replacement = nil)
          string.gsub(/[^\x00-\x7f]/u) do |char|
            approximations[char] || replacement || DEFAULT_REPLACEMENT_CHAR
          end
        end

        private

        # Per-instance lookup table, created on first use.
        def approximations
          @approximations ||= {}
        end

        # Add transliteration rules to the approximations hash.
        #
        # Pairs are copied with both key and value converted via to_s. The
        # argument is deliberately left untouched: it may be the frozen
        # defaults table or a Hash the caller still owns.
        def add(hash)
          hash.each do |key, value|
            approximations[key.to_s] = value.to_s
          end
        end

      end
    end
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,3 @@ | ||
module I18n
  # Version string of the library.
  VERSION = "0.3.7"
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
# encoding: utf-8 | ||
$:.unshift(File.expand_path(File.dirname(__FILE__) + '/../')); $:.uniq! | ||
require 'test_helper' | ||
|
||
# Tests for I18n::Backend::Transliterator and for the I18n.transliterate /
# backend #transliterate entry points that use it.
#
# NOTE(review): the `test "..." do` macro and `store_translations` are not
# defined in this file; presumably they come from test_helper - confirm.
class I18nBackendTransliterator < Test::Unit::TestCase

  # Minimal backend that only mixes in the base backend behaviour.
  class Backend
    include I18n::Backend::Base
  end

  def setup
    I18n.backend = Backend.new
    @proc = lambda { |n| n.upcase }
    @hash = { :"ü" => "ue", :"ö" => "oe" }
    @transliterator = I18n::Backend::Transliterator.get
  end

  test "transliteration rule can be a proc" do
    store_translations(:xx, :i18n => {:transliterate => {:rule => @proc}})
    assert_equal "HELLO", I18n.backend.transliterate(:xx, "hello")
  end

  test "transliteration rule can be a hash" do
    store_translations(:xx, :i18n => {:transliterate => {:rule => @hash}})
    assert_equal "ue", I18n.backend.transliterate(:xx, "ü")
  end

  test "transliteration rule must be a proc or hash" do
    store_translations(:xx, :i18n => {:transliterate => {:rule => ""}})
    assert_raise I18n::ArgumentError do
      I18n.backend.transliterate(:xx, "ü")
    end
  end

  test "transliterator defaults to latin => ascii when no rule is given" do
    assert_equal "AEroskobing", I18n.backend.transliterate(:xx, "Ærøskøbing")
  end

  test "default transliterator should not modify ascii characters" do
    (0..127).each do |byte|
      char = [byte].pack("U")
      assert_equal char, @transliterator.transliterate(char)
    end
  end

  test "default transliterator correctly transliterates latin characters" do
    # create string with range of Unicode's western characters with
    # diacritics, excluding the division and multiplication signs which for
    # some reason or other are floating in the middle of all the letters.
    string = (0xC0..0x17E).to_a.reject {|c| [0xD7, 0xF7].include? c}.pack("U*")
    # Iterate explicitly and check each character on its own, so that a
    # single unmapped character is reported rather than hidden in the
    # result for the whole string. \A/\z anchor the entire result.
    string.split(//).each do |char|
      assert_match %r{\A[a-zA-Z']*\z}, @transliterator.transliterate(char)
    end
  end

  test "should replace non-ASCII chars not in map with a replacement char" do
    assert_equal "abc?", @transliterator.transliterate("abcſ")
  end

  test "can replace non-ASCII chars not in map with a custom replacement string" do
    assert_equal "abc#", @transliterator.transliterate("abcſ", "#")
  end

  # Only defined on Ruby 1.9+, where the expectation is that invalid UTF-8
  # input raises ArgumentError.
  if RUBY_VERSION >= "1.9"
    test "default transliterator raises errors for invalid UTF-8" do
      assert_raise ArgumentError do
        @transliterator.transliterate("a\x92b")
      end
    end
  end

  test "I18n.transliterate should transliterate using a default transliterator" do
    assert_equal "aeo", I18n.transliterate("áèö")
  end

  test "I18n.transliterate should transliterate using a locale" do
    store_translations(:xx, :i18n => {:transliterate => {:rule => @hash}})
    assert_equal "ue", I18n.transliterate("ü", :locale => :xx)
  end

  # Documents a known limitation: rules are keyed on precomposed characters,
  # so decomposed input (base letter + combining mark) does not match them.
  test "default transliterator fails with custom rules with uncomposed input" do
    char = [117, 776].pack("U*") # "ü" as ASCII "u" plus COMBINING DIAERESIS
    transliterator = I18n::Backend::Transliterator.get(@hash)
    assert_not_equal "ue", transliterator.transliterate(char)
  end

end