/
normalization_spec.rb
96 lines (75 loc) · 3.09 KB
/
normalization_spec.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# encoding: UTF-8
# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0
require 'spec_helper'
require 'open-uri'
include TwitterCldr::Normalizers
describe 'Unicode Normalization Algorithms' do
# Directory containing this spec file; the data files below are resolved relative to it.
NORMALIZERS_SPEC_PATH = File.dirname(__FILE__)
# Data file read by the regular (untagged) example.
SHORT_TEST_PATH = File.join(NORMALIZERS_SPEC_PATH, 'NormalizationTestShort.txt')
# Data file read by the :slow example; prepare_full_test downloads it when it is missing.
FULL_TEST_PATH = File.join(NORMALIZERS_SPEC_PATH, 'NormalizationTest.txt')
# Location the full data file is downloaded from.
NORMALIZATION_TEST_URL = 'http://unicode.org/Public/UNIDATA/NormalizationTest.txt'
# Examples shared by every normalizer group in this file. The including group must define
# `invariants` (see run_normalization_test for its format); `described_class` is used as the normalizer.
shared_examples_for 'a normalization algorithm' do
  # The descriptions name the data files that are actually read (SHORT_TEST_PATH / FULL_TEST_PATH).
  it 'passes all the tests in NormalizationTestShort.txt' do
    run_normalization_test(described_class, invariants, SHORT_TEST_PATH)
  end

  # Carries :slow metadata; the full data file is fetched by prepare_full_test when it is missing.
  it 'passes all the tests in NormalizationTest.txt', :slow => true do
    prepare_full_test
    run_normalization_test(described_class, invariants, FULL_TEST_PATH)
  end
end
# Invariants checked for NFD:
#   c3 == NFD(c1) == NFD(c2) == NFD(c3)
#   c5 == NFD(c4) == NFD(c5)
describe NFD do
  let(:invariants) do
    {
      3 => [1, 2, 3],
      5 => [4, 5]
    }
  end

  it_behaves_like 'a normalization algorithm'
end
# Invariant checked for NFKD:
#   c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
describe NFKD do
  let(:invariants) do
    {
      5 => [1, 2, 3, 4, 5]
    }
  end

  it_behaves_like 'a normalization algorithm'
end
# Runs standard Unicode normalization tests from `file_path` for a given `normalizer`. Expected invariants are
# specified via the `invariants` hash.
#
# E.g., if `invariants` is { 2 => [1, 2, 3], 4 => [4, 5] } then the following invariants are expected to be true:
#
#   c2 == normalized(c1) == normalized(c2) == normalized(c3)
#   c4 == normalized(c4) == normalized(c5)
#
# where (c1, c2,...) are the columns of the normalization test separated by semicolons and normalized() is the
# normalization function. Note how the expectation and test column indexes match the numbers in the `invariants` hash.
#
# normalizer - object responding to `normalize_code_points(code_points)`.
# invariants - Hash of 1-based expected column index => Array of 1-based column indexes to normalize.
# file_path  - path of the UTF-8 data file to read.
#
# Blank lines and lines starting with `@` or `#` are skipped.
#
def run_normalization_test(normalizer, invariants, file_path)
  # File.open rather than Kernel#open: the argument is always a local path and must never be
  # interpreted as a URL or a pipe command.
  File.open(file_path, 'r:UTF-8') do |file|
    file.each do |line|
      # IO#each yields lines with their trailing newline, so a blank line is "\n", not "".
      # Strip before deciding whether the line carries any data.
      record = line.strip
      next if record.empty? || record.start_with?('@', '#')

      # Only the first five columns hold code points; anything after them is a trailing comment.
      data = record.split(';')[0...5].map { |cps| cps.split }

      invariants.each do |expected_index, tests|
        expected = data[expected_index - 1]

        tests.each do |test_index|
          test = data[test_index - 1]
          normalized = normalizer.normalize_code_points(test)

          message = normalization_error_message(line, test, expected, normalized, test_index, expected_index)
          normalized.should(eq(expected), message)
        end
      end
    end
  end
end
# Builds the multi-line failure message shown when a normalization invariant does not hold.
# Each entry below is one line of the message; the message ends with a newline.
#
def normalization_error_message(line, test, expected, normalized, test_index, expected_index)
  report = [
    %(Test: "#{line.strip}"),
    "Invariant: normalized(c#{test_index}) == c#{expected_index}",
    "Expected: normalized(#{test.inspect}) == #{expected.inspect}",
    "Got: #{normalized.inspect}"
  ]

  report.join("\n") + "\n"
end
# Downloads the full Unicode normalization test suite to FULL_TEST_PATH if necessary.
#
# Nothing is downloaded when a non-empty file already exists at FULL_TEST_PATH. An empty file is
# treated as missing, because it would make the full test pass without checking anything.
#
def prepare_full_test
  return if File.file?(FULL_TEST_PATH) && !File.zero?(FULL_TEST_PATH)

  print ' Downloading NormalizationTest.txt ... '

  # Fetch the whole payload before opening the destination, so a failed download cannot leave
  # a truncated file behind. URI#read comes from open-uri and, unlike Kernel#open with a URL,
  # is available on both old and current Ruby versions.
  content = URI.parse(NORMALIZATION_TEST_URL).read
  File.open(FULL_TEST_PATH, 'w') { |file| file.write(content) }

  puts 'done.'
end
end