/
component.rb
51 lines (42 loc) · 1.25 KB
/
component.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# encoding: UTF-8
# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0
module TwitterCldr
  module Parsers
    class UnicodeRegexParser
      # Shared helpers that turn code points, ranges of code points and sets
      # of them into fragments of regular expression source text.
      # NOTE(review): presumably meant to be subclassed by the concrete regex
      # components, since every helper here is protected - confirm.
      class Component

        protected

        # Renders one code point (or an array of code points) as regex source
        # made of octal byte escapes of its UTF-8 encoding.
        #
        # codepoints - an Integer code point or an Array of them.
        #
        # Returns a String such as "\\303\\251" for 0xE9.
        def to_utf8(codepoints)
          # TODO: remove 1.8 workaround and fix tests
          # note: we do this for ruby 1.8 compatibility
          # if we didn't have to support 1.8, we could do this instead:
          # Array(codepoints).map { |cp| "\\u{#{cp.to_s(16).rjust(4, "0")}}"}.join
          #
          # Every byte is zero-padded to three octal digits. Outside of a
          # character class the regex engine treats \1..\9 as backreferences
          # (and \10..\77 too once the pattern has that many groups), so an
          # unpadded escape like "\1" would not mean "byte 0x01".
          Array(codepoints).pack("U*").bytes.map do |byte|
            "\\" + byte.to_s(8).rjust(3, "0")
          end.join
        end

        # Renders a range as regex source.
        #
        # range - an object responding to #first and #last. When its first
        #         element is an Array the whole object is handed to
        #         array_to_regex instead of being rendered as a class.
        #
        # Returns a character class String such as "[\\141-\\172]", or the
        # result of array_to_regex for the Array case.
        def range_to_regex(range)
          if range.first.is_a?(Array)
            array_to_regex(range)
          else
            "[#{to_utf8(range.first)}-#{to_utf8(range.last)}]"
          end
        end

        # Renders each element of arr as its own non-capturing group and
        # concatenates the groups, i.e. the elements must match in sequence.
        #
        # arr - an enumerable whose elements are accepted by to_utf8.
        #
        # Returns a String such as "(?:\\141)(?:\\142)".
        def array_to_regex(arr)
          arr.map { |elem| "(?:#{to_utf8(elem)})" }.join
        end

        # Renders a set as a non-capturing alternation of its members.
        #
        # set - an object responding to to_a(true).
        #       NOTE(review): presumably a RangeSet whose to_a(true) yields
        #       Ranges, Arrays and bare code points - verify against callers.
        #
        # Returns a String of the form "(?:a|b|c)".
        def set_to_regex(set)
          strs = set.to_a(true).uniq.map do |obj|
            case obj
            when Range
              range_to_regex(obj)
            when Array
              array_to_regex(obj)
            else
              to_utf8(obj)
            end
          end

          "(?:#{strs.join("|")})"
        end

      end
    end
  end
end