Skip to content

Commit

Permalink
Merge pull request #225 from NilsEnevoldsen/emojiflags
Browse files Browse the repository at this point in the history
Add Unicode symbols (emoji flags)
  • Loading branch information
vincentarelbundock committed May 11, 2020
2 parents 0e41a9f + fc8a78f commit 06e9b01
Show file tree
Hide file tree
Showing 7 changed files with 356 additions and 1 deletion.
1 change: 1 addition & 0 deletions R/codelist.R
Expand Up @@ -28,6 +28,7 @@
#' \item p4n: Polity IV numeric country code
#' \item p4c: Polity IV character country code
#' \item un: United Nations M49 numeric codes
#' \item unicode.symbol: Region subtag (often displayed as emoji flag)
#' \item unpd: United Nations Procurement Division
#' \item vdem: Varieties of Democracy (V-Dem version 8, April 2018)
#' \item wb: World Bank (very similar but not identical to iso3c)
Expand Down
2 changes: 1 addition & 1 deletion R/countrycode.R
Expand Up @@ -88,7 +88,7 @@ countrycode <- function(sourcevar, origin, destination, warn = TRUE, nomatch = N
valid_origin = c("country.name", "country.name.de", "cowc", "cown", "dhs",
"ecb", "eurostat", "fao", "fips", "gaul", "genc2c",
"genc3c", "genc3n", "gwc", "gwn", "imf", "ioc", "iso2c", "iso3c",
"iso3n", "p4c", "p4n", "un", "un_m49", "unpd",
"iso3n", "p4c", "p4n", "un", "un_m49", "unicode.symbol", "unpd",
"vdem", "wb", "wb_api2c", "wb_api3c", "wvs",
"country.name.en.regex", "country.name.de.regex")
valid_destination <- colnames(dictionary)
Expand Down
Binary file modified data/codelist.rda
Binary file not shown.
Binary file modified data/codelist_panel.rda
Binary file not shown.
251 changes: 251 additions & 0 deletions dictionary/data_unicode_symbol.csv
@@ -0,0 +1,251 @@
unicode.symbol,country
🇦🇫,Afghanistan
🇦🇱,Albania
🇩🇿,Algeria
🇦🇸,American Samoa
🇦🇩,Andorra
🇦🇴,Angola
🇦🇮,Anguilla
🇦🇶,Antarctica
🇦🇬,Antigua and Barbuda
🇦🇷,Argentina
🇦🇲,Armenia
🇦🇼,Aruba
🇦🇺,Australia
🇦🇹,Austria
🇦🇿,Azerbaijan
🇧🇸,Bahamas (the)
🇧🇭,Bahrain
🇧🇩,Bangladesh
🇧🇧,Barbados
🇧🇾,Belarus
🇧🇪,Belgium
🇧🇿,Belize
🇧🇯,Benin
🇧🇲,Bermuda
🇧🇹,Bhutan
🇧🇴,Bolivia (Plurinational State of)
🇧🇶,"Bonaire, Sint Eustatius and Saba"
🇧🇦,Bosnia and Herzegovina
🇧🇼,Botswana
🇧🇻,Bouvet Island
🇧🇷,Brazil
🇮🇴,British Indian Ocean Territory (the)
🇧🇳,Brunei Darussalam
🇧🇬,Bulgaria
🇧🇫,Burkina Faso
🇧🇮,Burundi
🇨🇻,Cabo Verde
🇰🇭,Cambodia
🇨🇲,Cameroon
🇨🇦,Canada
🇰🇾,Cayman Islands (the)
🇨🇫,Central African Republic (the)
🇹🇩,Chad
🇨🇱,Chile
🇨🇳,China
🇨🇽,Christmas Island
🇨🇨,Cocos (Keeling) Islands (the)
🇨🇴,Colombia
🇰🇲,Comoros (the)
🇨🇩,Congo (the Democratic Republic of the)
🇨🇬,Congo (the)
🇨🇰,Cook Islands (the)
🇨🇷,Costa Rica
🇭🇷,Croatia
🇨🇺,Cuba
🇨🇼,Curaçao
🇨🇾,Cyprus
🇨🇿,Czechia
🇨🇮,Côte d'Ivoire
🇩🇰,Denmark
🇩🇯,Djibouti
🇩🇲,Dominica
🇩🇴,Dominican Republic (the)
🇪🇨,Ecuador
🇪🇬,Egypt
🇸🇻,El Salvador
🇬🇶,Equatorial Guinea
🇪🇷,Eritrea
🇪🇪,Estonia
🇸🇿,Eswatini
🇪🇹,Ethiopia
🇫🇰,Falkland Islands (the) [Malvinas]
🇫🇴,Faroe Islands (the)
🇫🇯,Fiji
🇫🇮,Finland
🇫🇷,France
🇬🇫,French Guiana
🇵🇫,French Polynesia
🇹🇫,French Southern Territories (the)
🇬🇦,Gabon
🇬🇲,Gambia (the)
🇬🇪,Georgia
🇩🇪,Germany
🇬🇭,Ghana
🇬🇮,Gibraltar
🇬🇷,Greece
🇬🇱,Greenland
🇬🇩,Grenada
🇬🇵,Guadeloupe
🇬🇺,Guam
🇬🇹,Guatemala
🇬🇬,Guernsey
🇬🇳,Guinea
🇬🇼,Guinea-Bissau
🇬🇾,Guyana
🇭🇹,Haiti
🇭🇲,Heard Island and McDonald Islands
🇻🇦,Holy See (the)
🇭🇳,Honduras
🇭🇰,Hong Kong
🇭🇺,Hungary
🇮🇸,Iceland
🇮🇳,India
🇮🇩,Indonesia
🇮🇷,Iran (Islamic Republic of)
🇮🇶,Iraq
🇮🇪,Ireland
🇮🇲,Isle of Man
🇮🇱,Israel
🇮🇹,Italy
🇯🇲,Jamaica
🇯🇵,Japan
🇯🇪,Jersey
🇯🇴,Jordan
🇰🇿,Kazakhstan
🇰🇪,Kenya
🇰🇮,Kiribati
🇰🇵,Korea (the Democratic People's Republic of)
🇰🇷,Korea (the Republic of)
🇰🇼,Kuwait
🇰🇬,Kyrgyzstan
🇱🇦,Lao People's Democratic Republic (the)
🇱🇻,Latvia
🇱🇧,Lebanon
🇱🇸,Lesotho
🇱🇷,Liberia
🇱🇾,Libya
🇱🇮,Liechtenstein
🇱🇹,Lithuania
🇱🇺,Luxembourg
🇲🇴,Macao
🇲🇬,Madagascar
🇲🇼,Malawi
🇲🇾,Malaysia
🇲🇻,Maldives
🇲🇱,Mali
🇲🇹,Malta
🇲🇭,Marshall Islands (the)
🇲🇶,Martinique
🇲🇷,Mauritania
🇲🇺,Mauritius
🇾🇹,Mayotte
🇲🇽,Mexico
🇫🇲,Micronesia (Federated States of)
🇲🇩,Moldova (the Republic of)
🇲🇨,Monaco
🇲🇳,Mongolia
🇲🇪,Montenegro
🇲🇸,Montserrat
🇲🇦,Morocco
🇲🇿,Mozambique
🇲🇲,Myanmar
🇳🇦,Namibia
🇳🇷,Nauru
🇳🇵,Nepal
🇳🇱,Netherlands (the)
🇳🇨,New Caledonia
🇳🇿,New Zealand
🇳🇮,Nicaragua
🇳🇪,Niger (the)
🇳🇬,Nigeria
🇳🇺,Niue
🇳🇫,Norfolk Island
🇲🇰,North Macedonia
🇲🇵,Northern Mariana Islands (the)
🇳🇴,Norway
🇴🇲,Oman
🇵🇰,Pakistan
🇵🇼,Palau
🇵🇸,"Palestine, State of"
🇵🇦,Panama
🇵🇬,Papua New Guinea
🇵🇾,Paraguay
🇵🇪,Peru
🇵🇭,Philippines (the)
🇵🇳,Pitcairn
🇵🇱,Poland
🇵🇹,Portugal
🇵🇷,Puerto Rico
🇶🇦,Qatar
🇷🇴,Romania
🇷🇺,Russian Federation (the)
🇷🇼,Rwanda
🇷🇪,Réunion
🇧🇱,Saint Barthélemy
🇸🇭,"Saint Helena, Ascension and Tristan da Cunha"
🇰🇳,Saint Kitts and Nevis
🇱🇨,Saint Lucia
🇲🇫,Saint Martin (French part)
🇵🇲,Saint Pierre and Miquelon
🇻🇨,Saint Vincent and the Grenadines
🇼🇸,Samoa
🇸🇲,San Marino
🇸🇹,Sao Tome and Principe
🇸🇦,Saudi Arabia
🇸🇳,Senegal
🇷🇸,Serbia
🇸🇨,Seychelles
🇸🇱,Sierra Leone
🇸🇬,Singapore
🇸🇽,Sint Maarten (Dutch part)
🇸🇰,Slovakia
🇸🇮,Slovenia
🇸🇧,Solomon Islands
🇸🇴,Somalia
🇿🇦,South Africa
🇬🇸,South Georgia and the South Sandwich Islands
🇸🇸,South Sudan
🇪🇸,Spain
🇱🇰,Sri Lanka
🇸🇩,Sudan (the)
🇸🇷,Suriname
🇸🇯,Svalbard and Jan Mayen
🇸🇪,Sweden
🇨🇭,Switzerland
🇸🇾,Syrian Arab Republic (the)
🇹🇼,Taiwan (Province of China)
🇹🇯,Tajikistan
🇹🇿,"Tanzania, the United Republic of"
🇹🇭,Thailand
🇹🇱,Timor-Leste
🇹🇬,Togo
🇹🇰,Tokelau
🇹🇴,Tonga
🇹🇹,Trinidad and Tobago
🇹🇳,Tunisia
🇹🇷,Turkey
🇹🇲,Turkmenistan
🇹🇨,Turks and Caicos Islands (the)
🇹🇻,Tuvalu
🇺🇬,Uganda
🇺🇦,Ukraine
🇦🇪,United Arab Emirates (the)
🇬🇧,United Kingdom
🇺🇲,United States Minor Outlying Islands (the)
🇺🇸,United States of America (the)
🇺🇾,Uruguay
🇺🇿,Uzbekistan
🇻🇺,Vanuatu
🇻🇪,Venezuela (Bolivarian Republic of)
🇻🇳,Viet Nam
🇻🇬,Virgin Islands (British)
🇻🇮,Virgin Islands (U.S.)
🇼🇫,Wallis and Futuna
🇪🇭,Western Sahara
🇾🇪,Yemen
🇿🇲,Zambia
🇿🇼,Zimbabwe
🇦🇽,Åland Islands
🇽🇰,Kosovo
87 changes: 87 additions & 0 deletions dictionary/get_unicode_symbol.R
@@ -0,0 +1,87 @@
require(dplyr)
require(tibble)
require(purrr)
require(stringr)
require(httr)
require(xml2)
require(assertthat)

# This script creates Unicode region subtags as pairs of Regional Indicator Symbols. Region subtags represent regions,
# not flags. When composed of Regional Indicator Symbols, they are displayed as flags on most systems, but on Windows
# they are displayed as pairs of letters.
# https://en.wikipedia.org/wiki/Regional_Indicator_Symbol

# The location specified below lists valid Unicode region subtags. They are based on ISO 3166-1 alpha-2.
# https://unicode.org/reports/tr51/#Flags
# 'While the syntax of a well-formed emoji flag sequence is defined in ED-14, only valid sequences are displayed as flags
# by conformant implementations, where: The valid region sequences are specified by Unicode region subtags as defined in
# [CLDR], with idStatus=regular, deprecated, or macroregion. For macroregions, only UN and EU are valid.'
# idStatus='deprecated' isn't relevant for countrycode because they can be many-to-one.
# idStatus='macroregion' isn't relevant for countrycode because countrycode doesn't track supranational bodies.

# Archive of the Unicode CLDR repository that is downloaded further down in this script.
tar.url <- 'https://github.com/unicode-org/cldr/archive/latest.tar.gz'
# Path of the region validity file inside that archive; only this file is extracted and read.
region.file <- 'cldr-latest/common/validity/region.xml'
# XPath selecting the <id> element that has type="region" and idStatus="regular".
xpath <- '/supplementalData/idValidity/id[@type="region"][@idStatus="regular"]'

# Exceptional reservations are valid, but aren't entities in countrycode, so track them for exclusion.
# TODO: This should not be hardcoded. Ideally it would have a scraper similar to get_iso.R.

# These codes are accepted by the final coverage check even though no row in the output table carries them.
exceptional.reservations <- c('AC','CP','DG','EA','IC','TA')

# Functions to expand string ranges
# https://www.unicode.org/reports/tr35/#String_Range

# Map a pair of uppercase ASCII letters onto its zero-based base-26 position
# ('AA' -> 0, 'AB' -> 1, ..., 'ZZ' -> 675). Vectorised over `pair`; a character
# that is not in LETTERS yields NA.
letter.pair.to.integer <- function(pair) {
  high.digit <- match(stringr::str_sub(pair, 1, 1), LETTERS) - 1
  low.digit <- match(stringr::str_sub(pair, 2, 2), LETTERS) - 1
  high.digit * 26 + low.digit
}

# Inverse of the base-26 encoding: turn a zero-based position (0..675) back
# into its two-letter uppercase code (0 -> 'AA', 675 -> 'ZZ'). Vectorised over `int`.
integer.to.letter.pair <- function(int) {
  first.letter <- LETTERS[int %/% 26 + 1]
  second.letter <- LETTERS[int %% 26 + 1]
  paste0(first.letter, second.letter)
}

# Expand a whitespace-separated list of Unicode string ranges into the individual two-letter codes.
#
# Each token is either a plain pair of uppercase ASCII letters ('AI') or a pair followed by '~' and a
# replacement for its last letter ('AC~G', meaning AC, AD, AE, AF, AG).
#
# Tokens are validated and expanded explicitly instead of being rewritten into R expressions and
# evaluated: the input comes from a downloaded file, so it must never be executed as code, and an
# unexpected token (for example a numeric region code) has to fail loudly rather than be converted
# into a bogus letter pair.
#
# @param string.range Character vector holding whitespace-separated string range tokens.
# @return Character vector of two-letter codes in input order; character(0) when there are no tokens.
#   Stops with an error if any token has an unsupported form (including NA).
string.range.expand <- function(string.range) {
  tokens <- unlist(strsplit(string.range, '\\s+', perl = TRUE), use.names = FALSE)
  # Leading whitespace produces an empty first token; NA is kept so that it is reported below.
  tokens <- tokens[is.na(tokens) | nzchar(tokens)]

  well.formed <- grepl('^[A-Z]{2}(~[A-Z])?$', tokens, perl = TRUE)
  if (!all(well.formed)) {
    stop('Unsupported string range token(s): ', paste(tokens[!well.formed], collapse = ', '), call. = FALSE)
  }

  expand.token <- function(token) {
    first.letter <- substr(token, 1, 1)
    range.start <- match(substr(token, 2, 2), LETTERS)
    # A plain pair is a range of length one.
    range.end <- if (nchar(token) == 4) match(substr(token, 4, 4), LETTERS) else range.start
    paste0(first.letter, LETTERS[range.start:range.end])
  }

  # as.character() turns the NULL produced by an empty token list into character(0).
  as.character(unlist(lapply(tokens, expand.token), use.names = FALSE))
}

# Get valid regions
# Download the CLDR archive, extract only the region validity file into the session's temporary
# directory, and expand the string ranges of the first node matched by `xpath` into individual codes.

tarfile <- tempfile(fileext = '.tar.gz')
# Fail immediately with a readable message on an HTTP error; otherwise the error body would be
# written to `tarfile` and only surface later as an obscure untar or XML failure.
httr::GET(tar.url, httr::write_disk(tarfile)) %>%
  httr::stop_for_status(task = 'download the Unicode CLDR archive')
utils::untar(tarfile, files = region.file, exdir = tempdir())
valid.regions <- xml2::read_xml(file.path(tempdir(), region.file)) %>%
  xml2::xml_find_first(xpath) %>%
  xml2::xml_text() %>%
  string.range.expand()

# Get ISO 3166-1 alpha-2 codes
# readr is called through its namespace because this script never attaches it.
# read_csv() treats the string 'NA' as a missing value by default, so Namibia's code is restored by hand.

iso <- readr::read_csv('dictionary/data_iso.csv', col_types = readr::cols(), progress = FALSE) %>%
  dplyr::mutate(iso2c = dplyr::if_else(country == 'Namibia', 'NA', iso2c))

# Check that all ISO 3166-1 alpha-2 codes in countrycode have the corresponding Unicode region subtag.

assertthat::assert_that(all(iso$iso2c %in% valid.regions))

# Build the lookup table: an ISO 3166-1 alpha-2 code that is also a valid Unicode region subtag
# becomes that country's region code; any other code is left missing.
# Kosovo is appended by hand with XK, the code Unicode CLDR assigns to it.

unicode.symbol <- tibble::add_row(
  dplyr::mutate(iso, unicode.symbol = dplyr::if_else(iso2c %in% valid.regions, iso2c, NA_character_)),
  unicode.symbol = 'XK',
  country = 'Kosovo'
)

# Every valid Unicode region subtag must now be accounted for, either in the table or among the
# exceptional reservations.

assertthat::assert_that(all(valid.regions %in% c(unicode.symbol$unicode.symbol, exceptional.reservations)))

# Express region subtags as Regional Indicator Symbols and generate the CSV.
# Each letter is shifted from the ASCII capitals onto the Regional Indicator Symbols:
#   decimal 65 (hex 41) is the Unicode codepoint for 'LATIN CAPITAL LETTER A'
#   decimal 127462 (hex 1F1E6) is the Unicode codepoint for 'REGIONAL INDICATOR SYMBOL LETTER A'
# so 'AQ' becomes the pair U+1F1E6 U+1F1F6.
# utf8ToInt() only accepts a single string, hence the element-wise vapply(); this avoids leaving the
# table in a row-wise grouped state. readr is called through its namespace because this script never
# attaches it.

unicode.symbol %>%
  dplyr::mutate(unicode.symbol = vapply(
    unicode.symbol,
    function(region.code) intToUtf8(utf8ToInt(region.code) - 65 + 127462),
    character(1),
    USE.NAMES = FALSE
  )) %>%
  dplyr::select(c('unicode.symbol','country')) %>%
  readr::write_csv('dictionary/data_unicode_symbol.csv')
16 changes: 16 additions & 0 deletions tests/testthat/test-unicode-symbols.R
@@ -0,0 +1,16 @@
context('Emoji flags')

test_that('converting to and from emoji works', {
  # Antarctica serves as the fixed example for both conversion directions.
  antarctica.flag <- '🇦🇶'
  antarctica.name <- 'Antarctica'
  expect_equal(countrycode(antarctica.name, 'country.name', 'unicode.symbol'), antarctica.flag)
  expect_equal(countrycode(antarctica.flag, 'unicode.symbol', 'country.name'), antarctica.name)
})

test_that('unicode.symbol-to-country.name-to-unicode.symbol is internally consistent', {
  # Round-trip every non-missing symbol through its country name and back, one symbol at a time.
  all.symbols <- codelist$unicode.symbol
  for (original.symbol in all.symbols[!is.na(all.symbols)]) {
    country.name <- countrycode(original.symbol, 'unicode.symbol', 'country.name')
    round.trip.symbol <- countrycode(country.name, 'country.name', 'unicode.symbol')
    expect_equal(round.trip.symbol, original.symbol)
  }
})

0 comments on commit 06e9b01

Please sign in to comment.