Merge pull request #225 from NilsEnevoldsen/emojiflags

Add Unicode symbols (emoji flags)
vincentarelbundock · May 11, 2020 · 06e9b01 · 06e9b01
2 parents 0e41a9f + fc8a78f
commit 06e9b01
Show file tree

Hide file tree

Showing 7 changed files with 356 additions and 1 deletion.
diff --git a/R/codelist.R b/R/codelist.R
@@ -28,6 +28,7 @@
 #'   \item p4n: Polity IV numeric country code
 #'   \item p4c: Polity IV character country code
 #'   \item un: United Nations M49 numeric codes
+#'   \item unicode.symbol: Region subtag (often displayed as emoji flag)
 #'   \item unpd: United Nations Procurement Division
 #'   \item vdem: Varieties of Democracy (V-Dem version 8, April 2018)
 #'   \item wb: World Bank (very similar but not identical to iso3c)

diff --git a/R/countrycode.R b/R/countrycode.R
@@ -88,7 +88,7 @@ countrycode <- function(sourcevar, origin, destination, warn = TRUE, nomatch = N
         valid_origin = c("country.name", "country.name.de", "cowc", "cown", "dhs",
                          "ecb", "eurostat", "fao", "fips", "gaul", "genc2c",
                          "genc3c", "genc3n", "gwc", "gwn", "imf", "ioc", "iso2c", "iso3c",
-                         "iso3n", "p4c", "p4n", "un", "un_m49", "unpd",
+                         "iso3n", "p4c", "p4n", "un", "un_m49", "unicode.symbol", "unpd",
                          "vdem", "wb", "wb_api2c", "wb_api3c", "wvs",
                          "country.name.en.regex", "country.name.de.regex")
         valid_destination <- colnames(dictionary)

diff --git a/data/codelist.rda b/data/codelist.rda
diff --git a/data/codelist_panel.rda b/data/codelist_panel.rda
diff --git a/dictionary/data_unicode_symbol.csv b/dictionary/data_unicode_symbol.csv
@@ -0,0 +1,251 @@
+unicode.symbol,country
+🇦🇫,Afghanistan
+🇦🇱,Albania
+🇩🇿,Algeria
+🇦🇸,American Samoa
+🇦🇩,Andorra
+🇦🇴,Angola
+🇦🇮,Anguilla
+🇦🇶,Antarctica
+🇦🇬,Antigua and Barbuda
+🇦🇷,Argentina
+🇦🇲,Armenia
+🇦🇼,Aruba
+🇦🇺,Australia
+🇦🇹,Austria
+🇦🇿,Azerbaijan
+🇧🇸,Bahamas (the)
+🇧🇭,Bahrain
+🇧🇩,Bangladesh
+🇧🇧,Barbados
+🇧🇾,Belarus
+🇧🇪,Belgium
+🇧🇿,Belize
+🇧🇯,Benin
+🇧🇲,Bermuda
+🇧🇹,Bhutan
+🇧🇴,Bolivia (Plurinational State of)
+🇧🇶,"Bonaire, Sint Eustatius and Saba"
+🇧🇦,Bosnia and Herzegovina
+🇧🇼,Botswana
+🇧🇻,Bouvet Island
+🇧🇷,Brazil
+🇮🇴,British Indian Ocean Territory (the)
+🇧🇳,Brunei Darussalam
+🇧🇬,Bulgaria
+🇧🇫,Burkina Faso
+🇧🇮,Burundi
+🇨🇻,Cabo Verde
+🇰🇭,Cambodia
+🇨🇲,Cameroon
+🇨🇦,Canada
+🇰🇾,Cayman Islands (the)
+🇨🇫,Central African Republic (the)
+🇹🇩,Chad
+🇨🇱,Chile
+🇨🇳,China
+🇨🇽,Christmas Island
+🇨🇨,Cocos (Keeling) Islands (the)
+🇨🇴,Colombia
+🇰🇲,Comoros (the)
+🇨🇩,Congo (the Democratic Republic of the)
+🇨🇬,Congo (the)
+🇨🇰,Cook Islands (the)
+🇨🇷,Costa Rica
+🇭🇷,Croatia
+🇨🇺,Cuba
+🇨🇼,Curaçao
+🇨🇾,Cyprus
+🇨🇿,Czechia
+🇨🇮,Côte d'Ivoire
+🇩🇰,Denmark
+🇩🇯,Djibouti
+🇩🇲,Dominica
+🇩🇴,Dominican Republic (the)
+🇪🇨,Ecuador
+🇪🇬,Egypt
+🇸🇻,El Salvador
+🇬🇶,Equatorial Guinea
+🇪🇷,Eritrea
+🇪🇪,Estonia
+🇸🇿,Eswatini
+🇪🇹,Ethiopia
+🇫🇰,Falkland Islands (the) [Malvinas]
+🇫🇴,Faroe Islands (the)
+🇫🇯,Fiji
+🇫🇮,Finland
+🇫🇷,France
+🇬🇫,French Guiana
+🇵🇫,French Polynesia
+🇹🇫,French Southern Territories (the)
+🇬🇦,Gabon
+🇬🇲,Gambia (the)
+🇬🇪,Georgia
+🇩🇪,Germany
+🇬🇭,Ghana
+🇬🇮,Gibraltar
+🇬🇷,Greece
+🇬🇱,Greenland
+🇬🇩,Grenada
+🇬🇵,Guadeloupe
+🇬🇺,Guam
+🇬🇹,Guatemala
+🇬🇬,Guernsey
+🇬🇳,Guinea
+🇬🇼,Guinea-Bissau
+🇬🇾,Guyana
+🇭🇹,Haiti
+🇭🇲,Heard Island and McDonald Islands
+🇻🇦,Holy See (the)
+🇭🇳,Honduras
+🇭🇰,Hong Kong
+🇭🇺,Hungary
+🇮🇸,Iceland
+🇮🇳,India
+🇮🇩,Indonesia
+🇮🇷,Iran (Islamic Republic of)
+🇮🇶,Iraq
+🇮🇪,Ireland
+🇮🇲,Isle of Man
+🇮🇱,Israel
+🇮🇹,Italy
+🇯🇲,Jamaica
+🇯🇵,Japan
+🇯🇪,Jersey
+🇯🇴,Jordan
+🇰🇿,Kazakhstan
+🇰🇪,Kenya
+🇰🇮,Kiribati
+🇰🇵,Korea (the Democratic People's Republic of)
+🇰🇷,Korea (the Republic of)
+🇰🇼,Kuwait
+🇰🇬,Kyrgyzstan
+🇱🇦,Lao People's Democratic Republic (the)
+🇱🇻,Latvia
+🇱🇧,Lebanon
+🇱🇸,Lesotho
+🇱🇷,Liberia
+🇱🇾,Libya
+🇱🇮,Liechtenstein
+🇱🇹,Lithuania
+🇱🇺,Luxembourg
+🇲🇴,Macao
+🇲🇬,Madagascar
+🇲🇼,Malawi
+🇲🇾,Malaysia
+🇲🇻,Maldives
+🇲🇱,Mali
+🇲🇹,Malta
+🇲🇭,Marshall Islands (the)
+🇲🇶,Martinique
+🇲🇷,Mauritania
+🇲🇺,Mauritius
+🇾🇹,Mayotte
+🇲🇽,Mexico
+🇫🇲,Micronesia (Federated States of)
+🇲🇩,Moldova (the Republic of)
+🇲🇨,Monaco
+🇲🇳,Mongolia
+🇲🇪,Montenegro
+🇲🇸,Montserrat
+🇲🇦,Morocco
+🇲🇿,Mozambique
+🇲🇲,Myanmar
+🇳🇦,Namibia
+🇳🇷,Nauru
+🇳🇵,Nepal
+🇳🇱,Netherlands (the)
+🇳🇨,New Caledonia
+🇳🇿,New Zealand
+🇳🇮,Nicaragua
+🇳🇪,Niger (the)
+🇳🇬,Nigeria
+🇳🇺,Niue
+🇳🇫,Norfolk Island
+🇲🇰,North Macedonia
+🇲🇵,Northern Mariana Islands (the)
+🇳🇴,Norway
+🇴🇲,Oman
+🇵🇰,Pakistan
+🇵🇼,Palau
+🇵🇸,"Palestine, State of"
+🇵🇦,Panama
+🇵🇬,Papua New Guinea
+🇵🇾,Paraguay
+🇵🇪,Peru
+🇵🇭,Philippines (the)
+🇵🇳,Pitcairn
+🇵🇱,Poland
+🇵🇹,Portugal
+🇵🇷,Puerto Rico
+🇶🇦,Qatar
+🇷🇴,Romania
+🇷🇺,Russian Federation (the)
+🇷🇼,Rwanda
+🇷🇪,Réunion
+🇧🇱,Saint Barthélemy
+🇸🇭,"Saint Helena, Ascension and Tristan da Cunha"
+🇰🇳,Saint Kitts and Nevis
+🇱🇨,Saint Lucia
+🇲🇫,Saint Martin (French part)
+🇵🇲,Saint Pierre and Miquelon
+🇻🇨,Saint Vincent and the Grenadines
+🇼🇸,Samoa
+🇸🇲,San Marino
+🇸🇹,Sao Tome and Principe
+🇸🇦,Saudi Arabia
+🇸🇳,Senegal
+🇷🇸,Serbia
+🇸🇨,Seychelles
+🇸🇱,Sierra Leone
+🇸🇬,Singapore
+🇸🇽,Sint Maarten (Dutch part)
+🇸🇰,Slovakia
+🇸🇮,Slovenia
+🇸🇧,Solomon Islands
+🇸🇴,Somalia
+🇿🇦,South Africa
+🇬🇸,South Georgia and the South Sandwich Islands
+🇸🇸,South Sudan
+🇪🇸,Spain
+🇱🇰,Sri Lanka
+🇸🇩,Sudan (the)
+🇸🇷,Suriname
+🇸🇯,Svalbard and Jan Mayen
+🇸🇪,Sweden
+🇨🇭,Switzerland
+🇸🇾,Syrian Arab Republic (the)
+🇹🇼,Taiwan (Province of China)
+🇹🇯,Tajikistan
+🇹🇿,"Tanzania, the United Republic of"
+🇹🇭,Thailand
+🇹🇱,Timor-Leste
+🇹🇬,Togo
+🇹🇰,Tokelau
+🇹🇴,Tonga
+🇹🇹,Trinidad and Tobago
+🇹🇳,Tunisia
+🇹🇷,Turkey
+🇹🇲,Turkmenistan
+🇹🇨,Turks and Caicos Islands (the)
+🇹🇻,Tuvalu
+🇺🇬,Uganda
+🇺🇦,Ukraine
+🇦🇪,United Arab Emirates (the)
+🇬🇧,United Kingdom
+🇺🇲,United States Minor Outlying Islands (the)
+🇺🇸,United States of America (the)
+🇺🇾,Uruguay
+🇺🇿,Uzbekistan
+🇻🇺,Vanuatu
+🇻🇪,Venezuela (Bolivarian Republic of)
+🇻🇳,Viet Nam
+🇻🇬,Virgin Islands (British)
+🇻🇮,Virgin Islands (U.S.)
+🇼🇫,Wallis and Futuna
+🇪🇭,Western Sahara
+🇾🇪,Yemen
+🇿🇲,Zambia
+🇿🇼,Zimbabwe
+🇦🇽,Åland Islands
+🇽🇰,Kosovo
diff --git a/dictionary/get_unicode_symbol.R b/dictionary/get_unicode_symbol.R
@@ -0,0 +1,87 @@
+require(dplyr)
+require(tibble)
+require(purrr)
+require(stringr)
+require(httr)
+require(xml2)
+require(assertthat)
+
+# This script creates Unicode region subtags as pairs of Regional Indicator Symbols. Region subtags represent regions,
+# not flags. When composed of Regional Indicator Symbols, they are displayed as flags on most systems, but on Windows
+# they are displayed as pairs of letters.
+# https://en.wikipedia.org/wiki/Regional_Indicator_Symbol
+
+# The location specified below lists valid Unicode region subtags. They are based on ISO 3166-1 alpha-2.
+# https://unicode.org/reports/tr51/#Flags
+# 'While the syntax of a well-formed emoji flag sequence is defined in ED-14, only valid sequences are displayed as flags
+# by conformant implementations, where: The valid region sequences are specified by Unicode region subtags as defined in
+# [CLDR], with idStatus=regular, deprecated, or macroregion. For macroregions, only UN and EU are valid.'
+# idStatus='deprecated' isn't relevant for countrycode because deprecated subtags can map many-to-one onto current ones.
+# idStatus='macroregion' isn't relevant for countrycode because countrycode doesn't track supranational bodies.
+
+# Tarball of the unicode-org/cldr repository at its 'latest' ref.
+tar.url <- 'https://github.com/unicode-org/cldr/archive/latest.tar.gz'
+# Path, inside the extracted tarball, of the XML file that lists valid region subtags.
+region.file <- 'cldr-latest/common/validity/region.xml'
+# Selects the <id> element whose type is 'region' and whose idStatus is 'regular'.
+xpath <- '/supplementalData/idValidity/id[@type="region"][@idStatus="regular"]'
+
+# Exceptional reservations are valid region subtags, but they are not entities in countrycode, so they are
+# listed here and excluded when checking that every valid subtag has been assigned.
+# TODO: This should not be hardcoded. Ideally it would have a scraper similar to get_iso.R.
+
+exceptional.reservations <- c(
+  'AC', 'CP', 'DG',
+  'EA', 'IC', 'TA'
+)
+
+# Functions to expand string ranges
+# https://www.unicode.org/reports/tr35/#String_Range
+
+# Map a two-letter uppercase code to a zero-based base-26 index ('AA' -> 0, 'AB' -> 1, 'BA' -> 26).
+# Vectorised over `pair`; a character that is not in LETTERS yields NA.
+letter.pair.to.integer <- function(pair) {
+    first.index <- match(substr(pair, 1, 1), LETTERS)
+    second.index <- match(substr(pair, 2, 2), LETTERS)
+    (first.index - 1) * 26 + second.index - 1
+}
+
+# Inverse of the base-26 encoding: turn a zero-based index back into its two-letter code (0 -> 'AA', 27 -> 'BB').
+# Vectorised over `int`.
+integer.to.letter.pair <- function(int) {
+    first.letter <- LETTERS[int %/% 26 + 1]
+    second.letter <- LETTERS[int %% 26 + 1]
+    paste0(first.letter, second.letter)
+}
+
+# Expand a whitespace-separated list of CLDR string ranges into the individual two-letter codes.
+#
+# Two token shapes are supported: a plain pair ('AI') and a pair whose last letter is a range ('AC~G', meaning
+# AC, AD, ..., AG). Any other token raises an error. The tokens come from a downloaded file, so they are
+# validated and expanded arithmetically rather than being passed through eval(parse()).
+#
+# string.range: character vector of range text; leading, trailing and repeated whitespace is ignored.
+# Returns a character vector of two-letter codes in input order (character(0) when there are no tokens).
+string.range.expand <- function(string.range) {
+    tokens <- unlist(strsplit(as.character(string.range), '[[:space:]]+'))
+    tokens <- tokens[!is.na(tokens) & nzchar(tokens)]
+
+    # perl = TRUE keeps [A-Z] restricted to ASCII capitals regardless of locale.
+    is.pair <- grepl('^[A-Z]{2}$', tokens, perl = TRUE)
+    is.range <- grepl('^[A-Z]{2}~[A-Z]$', tokens, perl = TRUE)
+    unsupported <- tokens[!(is.pair | is.range)]
+    if (length(unsupported) > 0) {
+        stop('Unsupported string range token(s): ', paste(unsupported, collapse = ', '), call. = FALSE)
+    }
+
+    expanded <- lapply(tokens, function(token) {
+        first <- letter.pair.to.integer(substr(token, 1, 2))
+        last <- first
+        if (nchar(token) == 4) {
+            # 'XY~Z' ends at the pair made of the shared first letter and the letter after '~'.
+            last <- letter.pair.to.integer(paste0(substr(token, 1, 1), substr(token, 4, 4)))
+        }
+        integer.to.letter.pair(seq(first, last))
+    })
+    as.character(unlist(expanded))
+}
+
+# Get valid regions
+
+# Download the CLDR tarball, extract only the region validity file from it, and expand the string ranges
+# found in the first element matched by `xpath` into a character vector of region subtags.
+tarfile <- tempfile(fileext = '.tar.gz')
+tar.response <- httr::GET(tar.url, httr::write_disk(tarfile))
+# Fail here on an HTTP error rather than handing an error page to untar().
+httr::stop_for_status(tar.response)
+utils::untar(tarfile, files = region.file, exdir = tempdir())
+valid.regions <- xml2::read_xml(file.path(tempdir(), region.file)) %>%
+  xml2::xml_find_first(xpath) %>%
+  xml2::xml_text() %>%
+  string.range.expand()
+
+# Get ISO 3166-1 alpha-2 codes
+
+# readr is not attached by the require() calls at the top of this script, so its functions are namespace-qualified.
+# readr's default `na` strings include 'NA', so Namibia's alpha-2 code is read as missing and is restored by hand.
+iso <- readr::read_csv('dictionary/data_iso.csv', col_types = readr::cols(), progress = FALSE) %>%
+  dplyr::mutate(iso2c = dplyr::if_else(country == 'Namibia', 'NA', iso2c))
+
+# Check that all ISO 3166-1 alpha-2 codes in countrycode have the corresponding Unicode region subtag.
+# On failure the message names the codes that have no subtag.
+
+assertthat::assert_that(
+  all(iso$iso2c %in% valid.regions),
+  msg = paste('ISO 3166-1 alpha-2 codes without a Unicode region subtag:',
+              paste(setdiff(iso$iso2c, valid.regions), collapse = ', '))
+)
+
+# Associate regions with country names by comparing to ISO 3166-1 alpha-2.
+# Note that Unicode CLDR assigns XK to Kosovo.
+
+unicode.symbol <- iso %>%
+  dplyr::mutate(unicode.symbol = dplyr::if_else(iso2c %in% valid.regions, iso2c, NA_character_)) %>%
+  tibble::add_row(unicode.symbol = 'XK', country = 'Kosovo')
+
+# Check that all valid Unicode region subtags have been assigned in countrycode.
+# Exceptional reservations are allowed to be unassigned; on failure the message names the unassigned subtags.
+
+assertthat::assert_that(
+  all(valid.regions %in% c(unicode.symbol$unicode.symbol, exceptional.reservations)),
+  msg = paste('Valid Unicode region subtags not assigned in countrycode:',
+              paste(setdiff(valid.regions, c(unicode.symbol$unicode.symbol, exceptional.reservations)), collapse = ', '))
+)
+
+# Express region subtags as Regional Indicator Symbols and generate the CSV.
+
+# Convert each two-letter subtag to a pair of Regional Indicator Symbols and write the dictionary CSV.
+# Decimal 65 (Hex 41) is the Unicode codepoint for 'LATIN CAPITAL LETTER A' and decimal 127462 (Hex 1F1E6) is the
+# codepoint for 'REGIONAL INDICATOR SYMBOL LETTER A', so each letter is shifted by the difference between the two.
+# utf8ToInt() accepts a single string only, hence the row-wise evaluation.
+# readr is not attached by the require() calls at the top of this script, so write_csv() is namespace-qualified.
+unicode.symbol %>%
+  dplyr::rowwise() %>%
+  dplyr::mutate(unicode.symbol = intToUtf8(utf8ToInt(unicode.symbol) - 65 + 127462)) %>%
+  dplyr::ungroup() %>%
+  dplyr::select(c('unicode.symbol','country')) %>%
+  readr::write_csv('dictionary/data_unicode_symbol.csv')
diff --git a/tests/testthat/test-unicode-symbols.R b/tests/testthat/test-unicode-symbols.R
@@ -0,0 +1,16 @@
+context('Emoji flags')
+
+# Convert one known country in each direction: name to symbol, then symbol to name.
+test_that('converting to and from emoji works', {
+    antarctica.flag <- '🇦🇶'
+    flag.from.name <- countrycode(sourcevar = 'Antarctica', origin = 'country.name', destination = 'unicode.symbol')
+    expect_equal(flag.from.name, antarctica.flag)
+    name.from.flag <- countrycode(sourcevar = antarctica.flag, origin = 'unicode.symbol', destination = 'country.name')
+    expect_equal(name.from.flag, 'Antarctica')
+})
+
+# Every non-missing symbol in codelist must survive symbol -> country name -> symbol unchanged.
+# The conversion is done on the whole vector at once rather than one symbol per call.
+test_that('unicode.symbol-to-country.name-to-unicode.symbol is internally consistent', {
+    unicode.symbol.original <- codelist$unicode.symbol[!is.na(codelist$unicode.symbol)]
+    name <- countrycode(unicode.symbol.original, 'unicode.symbol', 'country.name')
+    unicode.symbol.result <- countrycode(name, 'country.name', 'unicode.symbol')
+    expect_equal(unicode.symbol.result, unicode.symbol.original)
+})