Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #225 from NilsEnevoldsen/emojiflags
Add Unicode symbols (emoji flags)
- Loading branch information
Showing
7 changed files
with
356 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,251 @@ | ||
unicode.symbol,country | ||
🇦🇫,Afghanistan | ||
🇦🇱,Albania | ||
🇩🇿,Algeria | ||
🇦🇸,American Samoa | ||
🇦🇩,Andorra | ||
🇦🇴,Angola | ||
🇦🇮,Anguilla | ||
🇦🇶,Antarctica | ||
🇦🇬,Antigua and Barbuda | ||
🇦🇷,Argentina | ||
🇦🇲,Armenia | ||
🇦🇼,Aruba | ||
🇦🇺,Australia | ||
🇦🇹,Austria | ||
🇦🇿,Azerbaijan | ||
🇧🇸,Bahamas (the) | ||
🇧🇭,Bahrain | ||
🇧🇩,Bangladesh | ||
🇧🇧,Barbados | ||
🇧🇾,Belarus | ||
🇧🇪,Belgium | ||
🇧🇿,Belize | ||
🇧🇯,Benin | ||
🇧🇲,Bermuda | ||
🇧🇹,Bhutan | ||
🇧🇴,Bolivia (Plurinational State of) | ||
🇧🇶,"Bonaire, Sint Eustatius and Saba" | ||
🇧🇦,Bosnia and Herzegovina | ||
🇧🇼,Botswana | ||
🇧🇻,Bouvet Island | ||
🇧🇷,Brazil | ||
🇮🇴,British Indian Ocean Territory (the) | ||
🇧🇳,Brunei Darussalam | ||
🇧🇬,Bulgaria | ||
🇧🇫,Burkina Faso | ||
🇧🇮,Burundi | ||
🇨🇻,Cabo Verde | ||
🇰🇭,Cambodia | ||
🇨🇲,Cameroon | ||
🇨🇦,Canada | ||
🇰🇾,Cayman Islands (the) | ||
🇨🇫,Central African Republic (the) | ||
🇹🇩,Chad | ||
🇨🇱,Chile | ||
🇨🇳,China | ||
🇨🇽,Christmas Island | ||
🇨🇨,Cocos (Keeling) Islands (the) | ||
🇨🇴,Colombia | ||
🇰🇲,Comoros (the) | ||
🇨🇩,Congo (the Democratic Republic of the) | ||
🇨🇬,Congo (the) | ||
🇨🇰,Cook Islands (the) | ||
🇨🇷,Costa Rica | ||
🇭🇷,Croatia | ||
🇨🇺,Cuba | ||
🇨🇼,Curaçao | ||
🇨🇾,Cyprus | ||
🇨🇿,Czechia | ||
🇨🇮,Côte d'Ivoire | ||
🇩🇰,Denmark | ||
🇩🇯,Djibouti | ||
🇩🇲,Dominica | ||
🇩🇴,Dominican Republic (the) | ||
🇪🇨,Ecuador | ||
🇪🇬,Egypt | ||
🇸🇻,El Salvador | ||
🇬🇶,Equatorial Guinea | ||
🇪🇷,Eritrea | ||
🇪🇪,Estonia | ||
🇸🇿,Eswatini | ||
🇪🇹,Ethiopia | ||
🇫🇰,Falkland Islands (the) [Malvinas] | ||
🇫🇴,Faroe Islands (the) | ||
🇫🇯,Fiji | ||
🇫🇮,Finland | ||
🇫🇷,France | ||
🇬🇫,French Guiana | ||
🇵🇫,French Polynesia | ||
🇹🇫,French Southern Territories (the) | ||
🇬🇦,Gabon | ||
🇬🇲,Gambia (the) | ||
🇬🇪,Georgia | ||
🇩🇪,Germany | ||
🇬🇭,Ghana | ||
🇬🇮,Gibraltar | ||
🇬🇷,Greece | ||
🇬🇱,Greenland | ||
🇬🇩,Grenada | ||
🇬🇵,Guadeloupe | ||
🇬🇺,Guam | ||
🇬🇹,Guatemala | ||
🇬🇬,Guernsey | ||
🇬🇳,Guinea | ||
🇬🇼,Guinea-Bissau | ||
🇬🇾,Guyana | ||
🇭🇹,Haiti | ||
🇭🇲,Heard Island and McDonald Islands | ||
🇻🇦,Holy See (the) | ||
🇭🇳,Honduras | ||
🇭🇰,Hong Kong | ||
🇭🇺,Hungary | ||
🇮🇸,Iceland | ||
🇮🇳,India | ||
🇮🇩,Indonesia | ||
🇮🇷,Iran (Islamic Republic of) | ||
🇮🇶,Iraq | ||
🇮🇪,Ireland | ||
🇮🇲,Isle of Man | ||
🇮🇱,Israel | ||
🇮🇹,Italy | ||
🇯🇲,Jamaica | ||
🇯🇵,Japan | ||
🇯🇪,Jersey | ||
🇯🇴,Jordan | ||
🇰🇿,Kazakhstan | ||
🇰🇪,Kenya | ||
🇰🇮,Kiribati | ||
🇰🇵,Korea (the Democratic People's Republic of) | ||
🇰🇷,Korea (the Republic of) | ||
🇰🇼,Kuwait | ||
🇰🇬,Kyrgyzstan | ||
🇱🇦,Lao People's Democratic Republic (the) | ||
🇱🇻,Latvia | ||
🇱🇧,Lebanon | ||
🇱🇸,Lesotho | ||
🇱🇷,Liberia | ||
🇱🇾,Libya | ||
🇱🇮,Liechtenstein | ||
🇱🇹,Lithuania | ||
🇱🇺,Luxembourg | ||
🇲🇴,Macao | ||
🇲🇬,Madagascar | ||
🇲🇼,Malawi | ||
🇲🇾,Malaysia | ||
🇲🇻,Maldives | ||
🇲🇱,Mali | ||
🇲🇹,Malta | ||
🇲🇭,Marshall Islands (the) | ||
🇲🇶,Martinique | ||
🇲🇷,Mauritania | ||
🇲🇺,Mauritius | ||
🇾🇹,Mayotte | ||
🇲🇽,Mexico | ||
🇫🇲,Micronesia (Federated States of) | ||
🇲🇩,Moldova (the Republic of) | ||
🇲🇨,Monaco | ||
🇲🇳,Mongolia | ||
🇲🇪,Montenegro | ||
🇲🇸,Montserrat | ||
🇲🇦,Morocco | ||
🇲🇿,Mozambique | ||
🇲🇲,Myanmar | ||
🇳🇦,Namibia | ||
🇳🇷,Nauru | ||
🇳🇵,Nepal | ||
🇳🇱,Netherlands (the) | ||
🇳🇨,New Caledonia | ||
🇳🇿,New Zealand | ||
🇳🇮,Nicaragua | ||
🇳🇪,Niger (the) | ||
🇳🇬,Nigeria | ||
🇳🇺,Niue | ||
🇳🇫,Norfolk Island | ||
🇲🇰,North Macedonia | ||
🇲🇵,Northern Mariana Islands (the) | ||
🇳🇴,Norway | ||
🇴🇲,Oman | ||
🇵🇰,Pakistan | ||
🇵🇼,Palau | ||
🇵🇸,"Palestine, State of" | ||
🇵🇦,Panama | ||
🇵🇬,Papua New Guinea | ||
🇵🇾,Paraguay | ||
🇵🇪,Peru | ||
🇵🇭,Philippines (the) | ||
🇵🇳,Pitcairn | ||
🇵🇱,Poland | ||
🇵🇹,Portugal | ||
🇵🇷,Puerto Rico | ||
🇶🇦,Qatar | ||
🇷🇴,Romania | ||
🇷🇺,Russian Federation (the) | ||
🇷🇼,Rwanda | ||
🇷🇪,Réunion | ||
🇧🇱,Saint Barthélemy | ||
🇸🇭,"Saint Helena, Ascension and Tristan da Cunha" | ||
🇰🇳,Saint Kitts and Nevis | ||
🇱🇨,Saint Lucia | ||
🇲🇫,Saint Martin (French part) | ||
🇵🇲,Saint Pierre and Miquelon | ||
🇻🇨,Saint Vincent and the Grenadines | ||
🇼🇸,Samoa | ||
🇸🇲,San Marino | ||
🇸🇹,Sao Tome and Principe | ||
🇸🇦,Saudi Arabia | ||
🇸🇳,Senegal | ||
🇷🇸,Serbia | ||
🇸🇨,Seychelles | ||
🇸🇱,Sierra Leone | ||
🇸🇬,Singapore | ||
🇸🇽,Sint Maarten (Dutch part) | ||
🇸🇰,Slovakia | ||
🇸🇮,Slovenia | ||
🇸🇧,Solomon Islands | ||
🇸🇴,Somalia | ||
🇿🇦,South Africa | ||
🇬🇸,South Georgia and the South Sandwich Islands | ||
🇸🇸,South Sudan | ||
🇪🇸,Spain | ||
🇱🇰,Sri Lanka | ||
🇸🇩,Sudan (the) | ||
🇸🇷,Suriname | ||
🇸🇯,Svalbard and Jan Mayen | ||
🇸🇪,Sweden | ||
🇨🇭,Switzerland | ||
🇸🇾,Syrian Arab Republic (the) | ||
🇹🇼,Taiwan (Province of China) | ||
🇹🇯,Tajikistan | ||
🇹🇿,"Tanzania, the United Republic of" | ||
🇹🇭,Thailand | ||
🇹🇱,Timor-Leste | ||
🇹🇬,Togo | ||
🇹🇰,Tokelau | ||
🇹🇴,Tonga | ||
🇹🇹,Trinidad and Tobago | ||
🇹🇳,Tunisia | ||
🇹🇷,Turkey | ||
🇹🇲,Turkmenistan | ||
🇹🇨,Turks and Caicos Islands (the) | ||
🇹🇻,Tuvalu | ||
🇺🇬,Uganda | ||
🇺🇦,Ukraine | ||
🇦🇪,United Arab Emirates (the) | ||
🇬🇧,United Kingdom | ||
🇺🇲,United States Minor Outlying Islands (the) | ||
🇺🇸,United States of America (the) | ||
🇺🇾,Uruguay | ||
🇺🇿,Uzbekistan | ||
🇻🇺,Vanuatu | ||
🇻🇪,Venezuela (Bolivarian Republic of) | ||
🇻🇳,Viet Nam | ||
🇻🇬,Virgin Islands (British) | ||
🇻🇮,Virgin Islands (U.S.) | ||
🇼🇫,Wallis and Futuna | ||
🇪🇭,Western Sahara | ||
🇾🇪,Yemen | ||
🇿🇲,Zambia | ||
🇿🇼,Zimbabwe | ||
🇦🇽,Åland Islands | ||
🇽🇰,Kosovo |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
require(dplyr) | ||
require(tibble) | ||
require(purrr) | ||
require(stringr) | ||
require(httr) | ||
require(xml2) | ||
require(assertthat) | ||
|
||
# This script creates Unicode region subtags as pairs of Regional Indicator Symbols. Region subtags represent regions, | ||
# not flags. When composed of Regional Indicator Symbols, they are displayed as flags on most systems, but on Windows | ||
# they are displayed as pairs of letters. | ||
# https://en.wikipedia.org/wiki/Regional_Indicator_Symbol | ||
|
||
# The location specified below lists valid Unicode region subtags. They are based on ISO 3166-1 alpha-2. | ||
# https://unicode.org/reports/tr51/#Flags | ||
# 'While the syntax of a well-formed emoji flag sequence is defined in ED-14, only valid sequences are displayed as flags | ||
# by conformant implementations, where: The valid region sequences are specified by Unicode region subtags as defined in | ||
# [CLDR], with idStatus=regular, deprecated, or macroregion. For macroregions, only UN and EU are valid.' | ||
# idStatus='deprecated' isn't relevant for countrycode because they can be many-to-one. | ||
# idStatus='macroregion' isn't relevant for countrycode because countrycode doesn't track supranational bodies. | ||
|
||
# Inputs for the CLDR lookup: the archive to download, the path of the region validity file inside that archive, and
# the XPath selecting the <id> element that lists the regions with idStatus="regular".
xpath <- '/supplementalData/idValidity/id[@type="region"][@idStatus="regular"]'
region.file <- 'cldr-latest/common/validity/region.xml'
tar.url <- 'https://github.com/unicode-org/cldr/archive/latest.tar.gz'
|
||
# Exceptional reservations are valid, but aren't entities in countrycode, so track them for exclusion. | ||
# TODO: This should not be hardcoded. Ideally it would have a scraper similar to get_iso.R. | ||
|
||
# Region subtags that are accepted as valid without having a matching row in countrycode. They are added to the set of
# known codes when checking that every valid region has been assigned.
exceptional.reservations <- c(
  'AC', 'CP', 'DG',
  'EA', 'IC', 'TA'
)
|
||
# Functions to expand string ranges | ||
# https://www.unicode.org/reports/tr35/#String_Range | ||
|
||
# Convert two-letter codes to 0-based base-26 integers ('AA' -> 0, 'AB' -> 1, ..., 'ZZ' -> 675).
# pair: character vector of two-letter codes; the conversion is vectorised over it.
# Returns a numeric vector; an element is NA when either of its first two characters is not in LETTERS.
letter.pair.to.integer <- function(pair) {
  high <- match(substr(pair, 1, 1), LETTERS) - 1
  low <- match(substr(pair, 2, 2), LETTERS) - 1
  high * 26 + low
}
|
||
# Convert 0-based base-26 integers back to two-letter codes (0 -> 'AA', 25 -> 'AZ', 26 -> 'BA', 675 -> 'ZZ').
# int: numeric vector; the conversion is vectorised over it. Values are not range-checked.
# Returns a character vector of the same length.
integer.to.letter.pair <- function(int) {
  first.letter <- LETTERS[int %/% 26 + 1]
  second.letter <- LETTERS[int %% 26 + 1]
  paste0(first.letter, second.letter)
}
|
||
# Expand a whitespace-separated list of two-letter codes and string ranges into the individual two-letter codes.
#
# string.range: character vector. Each whitespace-separated token is either a two-letter code ('AI') or a range of the
#   form 'XY~Z', which stands for every code from 'XY' to 'XZ' inclusive. Empty tokens and NA elements are dropped.
# Returns a character vector of two-letter codes, in input order.
# Stops with an error naming the offending tokens if any token has a different shape.
#
# Tokens are matched against a fixed pattern and expanded arithmetically. They are deliberately not rewritten into R
# source and passed to eval(parse()), so text that comes from a download is never executed as code.
string.range.expand <- function(string.range) {
  tokens <- stringr::str_split(string.range, '[:space:]+') %>%
    unlist()
  tokens <- tokens[!is.na(tokens) & tokens != '']

  # Columns of `pieces`: whole match, two-letter start code, final letter of the range (NA for a plain code).
  pieces <- stringr::str_match(tokens, '^([A-Z]{2})(?:~([A-Z]))?$')
  malformed <- is.na(pieces[, 1])
  if (any(malformed)) {
    stop('Unrecognised string range token(s): ', paste(tokens[malformed], collapse = ', '), call. = FALSE)
  }

  # A range keeps the first letter of its start code and swaps in the letter after '~'; a plain code is a range of one.
  start.code <- pieces[, 2]
  end.code <- ifelse(is.na(pieces[, 3]), start.code, paste0(stringr::str_sub(start.code, 1, 1), pieces[, 3]))

  purrr::map2(letter.pair.to.integer(start.code), letter.pair.to.integer(end.code), seq) %>%
    unlist() %>%
    integer.to.letter.pair()
}
|
||
# Get valid regions | ||
|
||
# Download the CLDR archive, extract only the region validity file into the session temp directory, and expand the
# string ranges found in the element selected by `xpath` into the vector of valid region subtags.
tarfile <- tempfile(fileext = '.tar.gz')
# httr::GET() does not raise on 4xx/5xx responses, so fail here rather than handing an error page to untar().
httr::GET(tar.url, httr::write_disk(tarfile)) %>%
  httr::stop_for_status()
utils::untar(tarfile, files = region.file, exdir = tempdir())
valid.regions <- file.path(tempdir(), region.file) %>%
  xml2::read_xml() %>%
  xml2::xml_find_first(xpath) %>%
  xml2::xml_text() %>%
  string.range.expand()
|
||
# Get ISO 3166-1 alpha-2 codes | ||
|
||
# Load the ISO dictionary. readr is never attached by this script, so its functions are namespace-qualified like the
# other package calls in this file. Namibia's alpha-2 code is the literal string 'NA', which readr's default `na`
# setting reads as a missing value, so it is written back explicitly.
iso <- readr::read_csv('dictionary/data_iso.csv', col_types = readr::cols(), progress = FALSE) %>%
  dplyr::mutate(iso2c = dplyr::if_else(country == 'Namibia', 'NA', iso2c))
|
||
# Check that all ISO 3166-1 alpha-2 codes in countrycode have the corresponding Unicode region subtag. | ||
|
||
# Fail if any ISO code lacks a Unicode region subtag, and name the offending codes in the error message.
assertthat::assert_that(
  all(iso$iso2c %in% valid.regions),
  msg = paste0(
    'ISO 3166-1 alpha-2 codes without a Unicode region subtag: ',
    paste(setdiff(iso$iso2c, valid.regions), collapse = ', ')
  )
)
|
||
# Associate regions with country names by comparing to ISO 3166-1 alpha-2. | ||
# Note that Unicode CLDR assigns XK to Kosovo. | ||
|
||
# Copy each ISO code that is a valid region into a new unicode.symbol column (NA otherwise), then append the row for
# Kosovo, which gets the subtag 'XK'.
unicode.symbol <- tibble::add_row(
  dplyr::mutate(iso, unicode.symbol = dplyr::if_else(iso2c %in% valid.regions, iso2c, NA_character_)),
  unicode.symbol = 'XK',
  country = 'Kosovo'
)
|
||
# Check that all valid Unicode region subtags have been assigned in countrycode. | ||
|
||
# Fail if any valid region subtag is neither assigned to a row nor listed as an exceptional reservation, and name the
# unassigned subtags in the error message.
assertthat::assert_that(
  all(valid.regions %in% c(unicode.symbol$unicode.symbol, exceptional.reservations)),
  msg = paste0(
    'Valid Unicode region subtags not assigned in countrycode: ',
    paste(setdiff(valid.regions, c(unicode.symbol$unicode.symbol, exceptional.reservations)), collapse = ', ')
  )
)
|
||
# Express region subtags as Regional Indicator Symbols and generate the CSV. | ||
|
||
# Turn each two-letter region subtag into its pair of Regional Indicator Symbols and write the symbol/country table.
# Every letter is shifted by the distance between decimal 65 (hex 41, 'LATIN CAPITAL LETTER A') and decimal 127462
# (hex 1F1E6, 'REGIONAL INDICATOR SYMBOL LETTER A'). The conversion is applied per element with purrr::map_chr because
# utf8ToInt() only accepts a single string. readr is never attached by this script, so write_csv is qualified.
unicode.symbol %>%
  dplyr::mutate(
    unicode.symbol = purrr::map_chr(unicode.symbol, function(region) intToUtf8(utf8ToInt(region) - 65 + 127462))
  ) %>%
  dplyr::select(c('unicode.symbol', 'country')) %>%
  readr::write_csv('dictionary/data_unicode_symbol.csv')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
context('Emoji flags')

# Spot-check one country in both directions between its English name and its flag emoji.
test_that('converting to and from emoji works', {
  flag <- '🇦🇶'
  country <- 'Antarctica'
  expect_equal(countrycode(country, 'country.name', 'unicode.symbol'), flag)
  expect_equal(countrycode(flag, 'unicode.symbol', 'country.name'), country)
})
|
||
# Every non-missing flag in codelist must map to a country name that maps back to the same flag.
test_that('unicode.symbol-to-country.name-to-unicode.symbol is internally consistent', {
  flags <- codelist$unicode.symbol
  for (flag in flags[!is.na(flags)]) {
    country <- countrycode(flag, 'unicode.symbol', 'country.name')
    expect_equal(countrycode(country, 'country.name', 'unicode.symbol'), flag)
  }
})