Skip to content

Commit

Permalink
Merge branch 'script_detector' into likely_subtags
Browse the repository at this point in the history
  • Loading branch information
camertron committed Sep 26, 2015
2 parents c64b5da + e95af1d commit 4d3a824
Show file tree
Hide file tree
Showing 7 changed files with 28 additions and 12 deletions.
2 changes: 1 addition & 1 deletion Rakefile
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ namespace :update do
task :unicode_scripts, :unicode_scripts_path do |_, args|
TwitterCldr::Resources::UnicodeScriptsImporter.new(
args[:unicode_scripts_path] || './vendor/unicode-data',
'./resources/unicode_data/scripts.yml'
'./resources/unicode_data/properties/scripts.yml'
).import
end

Expand Down
16 changes: 11 additions & 5 deletions lib/twitter_cldr/parsers/unicode_regex/character_set.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ class UnicodeRegexParser
# Can exist inside and outside of character classes
class CharacterSet < Component

include TwitterCldr::Shared

attr_reader :property, :property_value

def initialize(text)
Expand Down Expand Up @@ -37,8 +39,8 @@ def codepoints
if property
method = :"code_points_for_#{property}"

if TwitterCldr::Shared::CodePoint.respond_to?(method)
ranges = TwitterCldr::Shared::CodePoint.send(method, property_value)
if CodePoint.respond_to?(method)
ranges = CodePoint.send(method, property_value)

if ranges
TwitterCldr::Utils::RangeSet.new(ranges)
Expand All @@ -53,9 +55,13 @@ def codepoints
)
end
else
TwitterCldr::Utils::RangeSet.new(
TwitterCldr::Shared::CodePoint.code_points_for_property_value(property_value)
)
code_points = CodePoint.code_points_for_property_value(property_value)

if code_points.empty?
code_points = CodePoint.code_points_for_script(property_value) || []
end

TwitterCldr::Utils::RangeSet.new(code_points)
end
end

Expand Down
4 changes: 2 additions & 2 deletions lib/twitter_cldr/shared/code_point.rb
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ class CodePoint
]

PROPERTIES = [
:sentence_break, :line_break, :word_break
:sentence_break, :line_break, :word_break, :script
]

attr_reader :fields
Expand Down Expand Up @@ -239,4 +239,4 @@ def get_range_start(code_point, block_data)
end
end
end
end
end
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ def tokenizer
recognizers = [
# The variable name can contain letters and digits, but must start with a letter.
TokenRecognizer.new(:variable, /\$\w[\w\d]*/),
TokenRecognizer.new(:character_set, /\[:\w+:\]|\\p\{[\w=]+\}/), # [:Lu:] or \p{Lu} or \p{Sentence_Break=CF}
TokenRecognizer.new(:negated_character_set, /\[:\^\w+:\]|\\P\{[\w=]+\}/), #[:^Lu:] or \P{Lu}
TokenRecognizer.new(:character_set, /\[:[\w\s]+:\]|\\p\{[\w=]+\}/), # [:Lu:] or \p{Lu} or \p{Sentence_Break=CF}
TokenRecognizer.new(:negated_character_set, /\[:\^[\w\s]+:\]|\\P\{[\w=]+\}/), #[:^Lu:] or \P{Lu}
TokenRecognizer.new(:unicode_char, /\\u\{?[a-fA-F0-9]{1,6}\}?/),
TokenRecognizer.new(:multichar_string, /\{\w+\}/u),

Expand All @@ -49,4 +49,4 @@ def tokenizer

end
end
end
end
4 changes: 3 additions & 1 deletion lib/twitter_cldr/utils/script_detector.rb
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,9 @@ def scripts_hash
end

def resource
@resource ||= TwitterCldr.get_resource('unicode_data', 'scripts')
@resource ||= TwitterCldr.get_resource(
'unicode_data', 'properties', 'script'
)
end

end
Expand Down
File renamed without changes.
8 changes: 8 additions & 0 deletions spec/parsers/unicode_regex/character_set_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,14 @@
])
end

# Verifies that a bare script name (no "Property=Value" form) resolves to the
# code points of that script.
it "should return a set containing codepoints for the given script" do
# 'Katakana' is passed without a property name, so only a value is supplied.
char_set = UnicodeRegexParser::CharacterSet.new('Katakana')
# NOTE(review): to_a(true) presumably collapses consecutive code points into
# ranges, leaving isolated ones as plain integers - confirm against RangeSet#to_a.
expect(char_set.to_set.to_a(true)).to eq([
# Decimal code points: 12449..12538 is U+30A1..U+30FA; 110592 is U+1B000.
12449..12538, 12541..12543, 12784..12799, 13008..13054, 13056..13143,
65382..65391, 65393..65437, 110592
])
end

it "should raise an exception when given an invalid property name or value" do
expect do
UnicodeRegexParser::CharacterSet.new("Foobar=Sp").to_set
Expand Down

0 comments on commit 4d3a824

Please sign in to comment.