Skip to content

Commit

Permalink
Download missing input data in resources rake tasks.
Browse files Browse the repository at this point in the history
  • Loading branch information
KL-7 committed Jul 31, 2012
1 parent 874d787 commit eb7a954
Show file tree
Hide file tree
Showing 6 changed files with 114 additions and 47 deletions.
56 changes: 28 additions & 28 deletions Rakefile
Expand Up @@ -42,55 +42,55 @@ end

namespace :resources do
namespace :update do
desc 'Import tailoring resources from CLDR data (should be executed using JRuby 1.7 in 1.9 mode)'
task :tailoring, :cldr_data_path, :icu4j_jar_path do |_, args|
importer = TwitterCldr::Resources::TailoringImporter.new(
args[:tailoring_data_path] || '../cldr/tailoring/',
'./resources/collation/tailoring',
args[:icu4j_jar_path] ||'../icu4j-49_1.jar'
)

TwitterCldr.supported_locales.each { |locale| importer.import(locale) }
desc 'Import locales resources'
task :locales_resources, :cldr_path do |_, args|
TwitterCldr::Resources::LocalesResourcesImporter.new(
args[:cldr_path] || '../cldr',
'./resources/locales'
).import
end

desc 'Update default and tailoring tries dumps'
task :tries do
TwitterCldr::Resources::TriesDumper.update_dumps
desc 'Import custom locales resources'
task :custom_locales_resources do
TwitterCldr::Resources::CustomLocalesResourcesImporter.new('./resources/custom/locales').import
end

desc 'Import tailoring resources from CLDR data (should be executed using JRuby 1.7 in 1.9 mode)'
task :tailoring, :cldr_path, :icu4j_jar_path do |_, args|
TwitterCldr::Resources::TailoringImporter.new(
args[:cldr_path] || '../cldr',
'./resources/collation/tailoring',
args[:icu4j_jar_path] ||'../icu4j-49_1.jar'
).import(TwitterCldr.supported_locales)
end

desc 'Import Unicode data resources'
task :unicode_data, :unicode_data_path do |_, args|
TwitterCldr::Resources::UnicodeDataImporter.new(
args[:unicode_data_path] || '../cldr/unicode-data',
args[:unicode_data_path] || '../unicode-data',
'./resources/unicode_data'
).import
end

desc 'Update canonical compositions resource'
task :canonical_compositions do
TwitterCldr::Resources::CanonicalCompositionsUpdater.new('./resources/unicode_data').update
end

desc 'Import composition exclusions resource'
task :composition_exclusions do |_, args|
task :composition_exclusions, :derived_normalization_props_path do |_, args|
TwitterCldr::Resources::CompositionExclusionsImporter.new(
args[:derived_normalization_props_path] || '../cldr/DerivedNormalizationProps.txt',
args[:derived_normalization_props_path] || '../unicode-data/DerivedNormalizationProps.txt',
'./resources/unicode_data'
).import
end

desc 'Import locales resources'
task :locales_resources do |_, args|
TwitterCldr::Resources::LocalesResourcesImporter.new(
args[:cldr_data_path] || '../cldr/cldr-data',
'./resources/locales'
).import
desc 'Update default and tailoring tries dumps'
task :tries do
TwitterCldr::Resources::TriesDumper.update_dumps
end

desc 'Import custom locales resources'
task :custom_locales_resources do
TwitterCldr::Resources::CustomLocalesResourcesImporter.new('./resources/custom/locales').import
desc 'Update canonical compositions resource'
task :canonical_compositions do
TwitterCldr::Resources::CanonicalCompositionsUpdater.new('./resources/unicode_data').update
end

end
end

Expand Down
12 changes: 9 additions & 3 deletions lib/twitter_cldr/resources/composition_exclusions_importer.rb
Expand Up @@ -3,18 +3,20 @@
# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0

require 'lib/twitter_cldr/resources/download'

module TwitterCldr
module Resources

class CompositionExclusionsImporter

COMPOSITION_EXCLUSIONS_URL = 'http://www.unicode.org/Public/6.1.0/ucd/DerivedNormalizationProps.txt'
COMPOSITION_EXCLUSION_REGEXP = /^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+; Full_Composition_Exclusion #.*$/
TOTAL_CODE_POINTS_REGEXP = /^# Total code points: (\d+)$/

# Arguments:
#
# input_file - path to DerivedNormalizationProps.txt file (available at http://www.unicode.org/Public/6.1.0/ucd/DerivedNormalizationProps.txt)
#
# input_file - path to DerivedNormalizationProps.txt file
# output_path - output directory for generated YAML file
#
def initialize(input_file, output_path)
Expand All @@ -31,7 +33,7 @@ def import
private

def generate_composition_exclusions
data = File.open(@input_file) { |file| file.read }
data = File.open(composition_exclusions_file) { |file| file.read }
start_pos = data.index("# Derived Property: Full_Composition_Exclusion")
end_pos = data.index(/^#\s=*$/, start_pos)
data = data[start_pos..end_pos].split("\n")
Expand All @@ -51,6 +53,10 @@ def generate_composition_exclusions
result
end

# Resolves the DerivedNormalizationProps.txt input: hands @input_file and
# COMPOSITION_EXCLUSIONS_URL to TwitterCldr::Resources.download_if_necessary
# and returns whatever that helper returns.
def composition_exclusions_file
  source_url = COMPOSITION_EXCLUSIONS_URL
  TwitterCldr::Resources.download_if_necessary(@input_file, source_url)
end

end
end
end
41 changes: 41 additions & 0 deletions lib/twitter_cldr/resources/download.rb
@@ -0,0 +1,41 @@
# encoding: UTF-8

# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0

require 'cldr/download'

module TwitterCldr
  module Resources

    # Archive that download_cldr_if_necessary fetches when no URL is given.
    CLDR_URL = 'http://unicode.org/Public/cldr/21/core.zip'

    class << self

      # Makes sure a single input file is available locally.
      #
      # Arguments:
      #
      #   path - local path of the required file
      #   url  - location the file is fetched from (with curl) when +path+ is not an existing regular file
      #
      # Returns +path+.
      #
      # Raises RuntimeError when curl can't be started or exits with a failure status.
      #
      def download_if_necessary(path, url)
        if File.file?(path)
          puts "Using '#{path}'."
        else
          puts "Downloading '#{url}' to '#{path}'."
          FileUtils.mkdir_p(File.dirname(path))

          # Argument-list form of Kernel#system bypasses the shell, so spaces or
          # metacharacters in url/path are passed to curl verbatim. --fail makes
          # curl report HTTP errors via its exit status instead of saving the
          # server's error page as if it were the requested data.
          unless system('curl', '--fail', url, '-o', path)
            # Don't leave a partial file behind: the next run would pick it up as valid input.
            FileUtils.rm_f(path)
            raise "Failed to download '#{url}' to '#{path}'."
          end
        end

        path
      end

      # Makes sure a directory with CLDR data is available locally.
      #
      # Arguments:
      #
      #   path - local directory expected to contain CLDR data
      #   url  - location passed to Cldr.download when +path+ is not an existing directory
      #          (defaults to CLDR_URL)
      #
      # Returns +path+.
      #
      def download_cldr_if_necessary(path, url = CLDR_URL)
        if File.directory?(path)
          puts "Using CLDR data from '#{path}'."
        else
          puts "Downloading CLDR data from '#{url}' to '#{path}'."
          Cldr.download(url, path)
        end

        path
      end

    end

  end
end
10 changes: 8 additions & 2 deletions lib/twitter_cldr/resources/locales_resources_importer.rb
Expand Up @@ -3,16 +3,22 @@
# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0

require 'cldr/download'
require 'cldr/export'

require 'lib/twitter_cldr/resources/download'

module TwitterCldr
module Resources

COMPONENTS = %w[calendars languages numbers units plurals]

class LocalesResourcesImporter

# Arguments:
#
# input_path - path to a directory containing CLDR data
# output_path - output directory for imported YAML files
#
def initialize(input_path, output_path)
@input_path = input_path
@output_path = output_path
Expand All @@ -26,7 +32,7 @@ def import
private

def prepare_ruby_cldr
Cldr.download(nil, @input_path) unless File.directory?(@input_path)
TwitterCldr::Resources.download_cldr_if_necessary(@input_path)
Cldr::Export::Data.dir = File.join(@input_path, 'common')
end

Expand Down
21 changes: 12 additions & 9 deletions lib/twitter_cldr/resources/tailoring_importer.rb
Expand Up @@ -6,6 +6,8 @@
require 'nokogiri'
require 'java'

require 'lib/twitter_cldr/resources/download'

module TwitterCldr
module Resources
# This class should be used with JRuby 1.7 in 1.9 mode and ICU4J version 49.1 (available at
Expand Down Expand Up @@ -36,12 +38,8 @@ class ImportError < RuntimeError; end

# Arguments:
#
# input_path - path to a directory containing CLDR tailoring data (available at
# http://unicode.org/cldr/trac/browser/tags/release-21/common/collation/
# or as a part of CLDR release at http://cldr.unicode.org/index/downloads)
#
# input_path - path to a directory containing CLDR data
# output_path - output directory for imported YAML files
#
# icu4j_path - path to ICU4J jar file
#
def initialize(input_path, output_path, icu4j_path)
Expand All @@ -51,7 +49,14 @@ def initialize(input_path, output_path, icu4j_path)
@output_path = output_path
end

def import(locale)
def import(locales)
TwitterCldr::Resources.download_cldr_if_necessary(@input_path)
locales.each { |locale| import_locale(locale) }
end

private

def import_locale(locale)
print "Importing %8s\t--\t" % locale

if tailoring_present?(locale)
Expand All @@ -65,8 +70,6 @@ def import(locale)
puts "Error: #{e.message}"
end

private

# Writes +data+ as YAML to the file that resource_file_path gives for +locale+.
# The file is opened in 'w' mode, so existing content is replaced.
def dump(locale, data)
  target = resource_file_path(locale)

  File.open(target, 'w') do |io|
    YAML.dump(data, io)
  end
end
Expand All @@ -80,7 +83,7 @@ def translated_locale(locale)
end

def locale_file_path(locale)
File.join(@input_path, "#{translated_locale(locale)}.xml")
File.join(@input_path, 'common', 'collation', "#{translated_locale(locale)}.xml")
end

def resource_file_path(locale)
Expand Down
21 changes: 16 additions & 5 deletions lib/twitter_cldr/resources/unicode_data_importer.rb
Expand Up @@ -3,16 +3,19 @@
# Copyright 2012 Twitter, Inc
# http://www.apache.org/licenses/LICENSE-2.0

require 'lib/twitter_cldr/resources/download'

module TwitterCldr
module Resources

class UnicodeDataImporter

BLOCKS_URL = 'ftp://ftp.unicode.org/Public/UNIDATA/Blocks.txt'
UNICODE_DATA_URL = 'ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt'

# Arguments:
#
# input_path - path to a directory containing Blocks.txt (available at ftp://ftp.unicode.org/Public/UNIDATA/Blocks.txt)
# and UnicodeData.txt (available at ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt)
#
# input_path - path to a directory containing Blocks.txt and UnicodeData.txt
# output_path - output directory for imported YAML files
#
def initialize(input_path, output_path)
Expand All @@ -36,7 +39,7 @@ def import
def import_blocks
blocks = {}

File.open(File.join(@input_path, 'Blocks.txt')) do |input|
File.open(blocks_file) do |input|
input.each_line do |line|
next unless line =~ /^([0-9A-F]+)\.\.([0-9A-F]+);(.+)$/

Expand All @@ -53,7 +56,7 @@ def import_blocks
def import_unicode_data(blocks)
unicode_data = Hash.new { |hash, key| hash[key] = Hash.new { |h, k| h[k] = {} } }

File.open(File.join(@input_path, 'UnicodeData.txt')) do |input|
File.open(unicode_data_file) do |input|
input.each_line do |line|
data = line.chomp.split(';', -1)
data[0] = data[0].hex
Expand All @@ -65,6 +68,14 @@ def import_unicode_data(blocks)
unicode_data
end

# Resolves the UnicodeData.txt input: passes its expected location inside
# @input_path together with UNICODE_DATA_URL to
# TwitterCldr::Resources.download_if_necessary and returns the helper's result.
def unicode_data_file
  local_copy = File.join(@input_path, 'UnicodeData.txt')
  TwitterCldr::Resources.download_if_necessary(local_copy, UNICODE_DATA_URL)
end

# Resolves the Blocks.txt input: passes its expected location inside
# @input_path together with BLOCKS_URL to
# TwitterCldr::Resources.download_if_necessary and returns the helper's result.
def blocks_file
  local_copy = File.join(@input_path, 'Blocks.txt')
  TwitterCldr::Resources.download_if_necessary(local_copy, BLOCKS_URL)
end

# Finds the first entry of +blocks+ whose second element (the range) includes
# +code_point+. Returns that entry as yielded by the enumeration (a
# [key, range] pair when +blocks+ is a Hash) or nil when nothing matches.
def find_block(blocks, code_point)
  blocks.find do |_, code_points|
    code_points.include?(code_point)
  end
end
Expand Down

0 comments on commit eb7a954

Please sign in to comment.