# COVID19 data from Johns Hopkins University
Author: [Yunting Chiu](https://www.linkedin.com/in/yuntingchiu)


In [1]:
# Load the libraries
library(tidyverse)
library(tibble)
library(lubridate)

── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──

✔ ggplot2 3.3.3     ✔ purrr   0.3.4
✔ tibble  3.0.6     ✔ dplyr   1.0.4
✔ tidyr   1.1.2     ✔ stringr 1.4.0
✔ readr   1.4.0     ✔ forcats 0.5.1

── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()


Attaching package: ‘lubridate’


The following objects are masked from ‘package:base’:

    date, intersect, setdiff, union




# Web scraping from JHU
 - [CSSE at Johns Hopkins University](https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series)

In [5]:
# Base URL of the JHU CSSE time-series folder (raw GitHub content).
url_in <- "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/"

# One row per source file: build its URL, read the CSV into a list-column,
# and label the row with the case type taken from the file name
# (e.g. "confirmed_global", "deaths_US"). Only the label and the data are kept.
df <- tibble(
  file_names = c(
    "time_series_covid19_confirmed_global.csv",
    "time_series_covid19_deaths_global.csv",
    "time_series_covid19_confirmed_US.csv",
    "time_series_covid19_deaths_US.csv"
  )
) %>%
  mutate(
    url = str_c(url_in, file_names, sep = ""),
    data = map(url, function(u) read_csv(u, na = "")),
    case_types = as.factor(
      str_extract(file_names, "[:alpha:]*_[gU][:alpha:]*")
    )
  ) %>%
  select(case_types, data)
df


[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────[39m
cols(
  .default = col_double(),
  `Province/State` = [31mcol_character()[39m,
  `Country/Region` = [31mcol_character()[39m
)
[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m for the full column specifications.



[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────[39m
cols(
  .default = col_double(),
  `Province/State` = [31mcol_character()[39m,
  `Country/Region` = [31mcol_character()[39m
)
[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m for the full column specifications.



[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────[39m
cols(
  .default = col_double(),
  iso2 = [31mcol_character()[39m,
  iso3 = [31mcol_character()[39m,
  Admin2 = [31mcol_character()[39m,
  Province_State = [31mcol_charac

# Clean the raw data

In [None]:
# Keep each data frame's column names in a list-column so they are easy to
# inspect side by side.
df <- df %>%
  mutate(vars = map(data, names))

# add the names from each of the four data frames
# Rename the columns of a data frame by regex replacement.
#
# df:        data frame whose column names are rewritten.
# pattern:   regular expression matched against every column name.
# rePattern: replacement text (may use back-references such as "\\1").
#
# Returns `df` with every match of `pattern` in its column names replaced;
# the column contents are not touched. Stops if `df` is not a data frame or
# either pattern argument is not character.
fix_names <- function(df, pattern, rePattern) {
  stopifnot(is.data.frame(df), is.character(pattern), is.character(rePattern))
  renamed <- str_replace_all(names(df), pattern, rePattern)
  names(df) <- renamed
  df
}
# Harmonise the column layout of the four data frames so they share the same
# variable names, then refresh the `vars` list-column to match.
df <- df %>%
  mutate(
    # "Province/State" -> "Province_State", "Country/Region" -> "Country_Region"
    data = map(data, function(tbl) fix_names(tbl, "([ey])/", "\\1_")),
    data = map(data, function(tbl) fix_names(tbl, "Admin2", "County")),
    data = map(data, function(tbl) fix_names(tbl, "Long_", "Long")),
    # US tables: drop the identifier columns
    data = map_if(
      data,
      str_detect(case_types, "US"),
      function(tbl) {
        select(tbl, -c("UID", "iso2", "iso3",
                       "code3", "FIPS", "Combined_Key"))
      }
    ),
    # global tables: add a County column
    # NOTE(review): this stores the literal string "NA", not a missing value
    # (NA_character_) -- confirm that is intended.
    data = map_if(
      data,
      str_detect(case_types, "global"),
      function(tbl) mutate(tbl, County = "NA")
    ),
    # every table except deaths_US gets a Population column set to 0
    data = map_if(
      data,
      !str_detect(case_types, "deaths_US"),
      function(tbl) mutate(tbl, Population = 0)
    ),
    # combined country/state key; missing parts are dropped from the key
    data = map(data, function(tbl) {
      unite(tbl, "Country_State",
            c("Country_Region", "Province_State"),
            remove = FALSE, na.rm = TRUE,
            sep = "_")
    }),
    # synchronize the vars correspondingly
    vars = map(data, names)
  )
# map(df$vars, ~unlist(.)) # for checking

ERROR: ignored

# Tidy data

In [None]:
# Reshape every data frame from wide to long: each column whose name contains
# "/" (the per-day columns) becomes one row, with the column name parsed by
# mdy() into `Date` and the cell value stored in `dailyValues`.
df <- df %>%
  mutate(
    data = map(data, function(tbl) {
      pivot_longer(
        data = tbl,
        cols = contains("/"),
        names_to = "Date",
        values_to = "dailyValues",
        names_transform = list(Date = mdy)
      )
    })
  )
# df$data <- map(df$data, names) # synchronize the vars correspondingly
# map(df$vars, ~unlist(.)) # for checking

# Create a function that fixes the type of a Date column.
#
# df:       data frame (or tibble) holding the column.
# varsDate: name of the column to convert, as a string.
#
# Returns `df` with column `varsDate` replaced by the result of ymd() on it.
# NOTE(review): despite the "mdy" in the name, the column is parsed with
# ymd(), i.e. it expects year-month-day values.
mdyDate <- function(df, varsDate) {
  # stopifnot(is.data.frame(df), is.character(varsDate))
  parsed <- ymd(df[[varsDate]])
  df[[varsDate]] <- parsed
  df
}

# Apply the Date conversion to every data frame and keep the result as the
# long-format data set.
df_long <- df %>%
  mutate(data = map(data, mdyDate, varsDate = "Date"))

# str(df_long) # check the data set

ERROR: ignored

# Add Continents - using countrycode from [here](https://github.com/vincentarelbundock/countrycode/blob/main/R/countrycode.R)

In [None]:
#' Convert Country Codes
#'
#' Converts long country names into one of many different coding schemes.
#' Translates from one scheme to another. Converts country name or coding
#' scheme to the official short English country name. Creates a new variable
#' with the name of the continent or region to which each country belongs.
#'
#' @param sourcevar Vector which contains the codes or country names to be
#' converted (character or factor)
#' @param origin Coding scheme of origin (string such as "iso3c" enclosed in
#' quotes ""): type "?codelist" for a list of available codes.
#' @param destination Coding scheme of destination (string such as "iso3c"
#' enclosed in quotes ""): type `?codelist` for a list of
#' available codes.
#' @param warn Prints unique elements from sourcevar for which no match was found
#' @param nomatch When countrycode fails to find a match for the code of
#' origin, it fills-in the destination vector with nomatch. The default
#' behavior is to fill non-matching codes with NA. If nomatch = NULL,
#' countrycode tries to use the origin vector to fill-in missing values in the
#' destination vector. nomatch must be either NULL, of length 1, or of the same
#' length as sourcevar.
#' @param custom_dict A data frame which supplies a new dictionary to replace
#' the built-in country code dictionary. Each column contains a different code
#' and must include no duplicates. The data frame format should resemble
#' `countrycode::codelist`.  Warning: when `custom_dict` is used, no sanity
#' checks are conducted.
#' @param custom_match A named vector which supplies custom origin and
#' destination matches that will supersede any matching default result. The name
#' of each element will be used as the origin code, and the value of each
#' element will be used as the destination code.
#' @param origin_regex Logical: When using a custom dictionary, if TRUE then the
#' origin codes will be matched as regex, if FALSE they will be matched exactly.
#' When using the default dictionary (custom_dict = NULL), origin_regex will be ignored.
#' @return A vector of the same length as `sourcevar` holding the destination
#' code for each element; elements without a match are filled according to
#' `nomatch`.
#' @keywords countrycode
#' @note For a complete description of available country codes and languages,
#' please read the documentation for the \code{codelist} conversion
#' dictionary.  Type: \code{?codelist}.
#' @note Panel data (i.e., country-year) can pose particular problems when
#' converting codes. For instance, some countries like Vietnam or Serbia go
#' through political transitions that justify changing codes over time. This
#' can pose problems when using codes from organizations like CoW or Polity IV,
#' which produce codes in country-year format. Instead of converting codes
#' using the `countrycode` function, we recommend that users use the
#' ``countrycode::codelist_panel`` data.frame as a base into which they can
#' merge their other data. This data.frame includes most relevant code, and is
#' already "reconciled" to ensure that each political unit is only represented
#' by one row in any given year. From there, it is just a matter of using `R`'s
#' `merge` function to combine different datasets which use different codes.
#'
#' @export
#' @aliases countrycode
#' @examples
#' library(countrycode)
#'
#' # ISO to Correlates of War
#' countrycode(c('USA', 'DZA'), origin = 'iso3c', destination = 'cown')
#'
#' # English to ISO
#' countrycode('Albania', origin = 'country.name', destination = 'iso3c')
#'
#' # German to French
#' countrycode('Albanien', origin = 'country.name.de', destination = 'iso.name.fr')
#'
#' # Using custom_match to supersede default codes
#' countrycode(c('United States', 'Algeria'), 'country.name', 'iso3c')
#' countrycode(c('United States', 'Algeria'), 'country.name', 'iso3c',
#'             custom_match = c('Algeria' = 'ALG'))
#'
#' \dontrun{
#' # Using `custom_dict` to convert US States names. This dictionary is
#' # hosted on github. We use a shortened URL to load it.
#' cd <- 'https://bit.ly/2ToSrFv'
#' cd <- read.csv(cd)
#' countrycode(c('AL', 'AK'), 'abbreviation', 'state', 
#'             custom_dict = cd)
#' countrycode(c('Alabama', 'North Dakota'), 'state.regex', 'state',
#'             custom_dict = cd, origin_regex = TRUE)
#' }

countrycode <- function(sourcevar, origin, destination, warn = TRUE, nomatch = NA,
                        custom_dict = NULL, custom_match = NULL, origin_regex = FALSE) {

    # Regex naming scheme
    if (is.null(custom_dict)) { # only for default dictionary
        # English regex is default
        if (origin == 'country.name') {
            origin <- 'country.name.en'
        }
        if (destination == 'country.name') {
            destination <- 'country.name.en'
        }
        # .regex extension in dictionary colnames
        if (origin %in% c('country.name.en', 'country.name.de')) {
            origin <- paste0(origin, '.regex')
            origin_regex <- TRUE
        } else {
            origin_regex <- FALSE
        }
    }

    # Set conversion dictionary
    if (!is.null(custom_dict)) {
        dictionary <- custom_dict
        valid_origin <- colnames(dictionary)
        valid_destination <- colnames(dictionary)
    } else {
        dictionary <- countrycode::codelist
        # Modify this manually when adding codes
        valid_origin <- c("cctld", "country.name", "country.name.de", "cowc", "cown", "dhs",
                          "ecb", "eurostat", "fao", "fips", "gaul", "genc2c",
                          "genc3c", "genc3n", "gwc", "gwn", "imf", "ioc", "iso2c", "iso3c",
                          "iso3n", "p4c", "p4n", "un", "un_m49", "unicode.symbol", "unpd",
                          "vdem", "wb", "wb_api2c", "wb_api3c", "wvs",
                          "country.name.en.regex", "country.name.de.regex")
        valid_destination <- colnames(dictionary)
    }

    # Allow tibbles as conversion dictionary
    if (inherits(dictionary, 'tbl_df')) { # allow tibble
        dictionary <- as.data.frame(dictionary)
    }

    # Sanity checks
    if (missing(sourcevar)) {
        stop('sourcevar is NULL (does not exist).')
    }

    if (!mode(sourcevar) %in% c('character', 'numeric')) {
        stop('sourcevar must be a character or numeric vector. This error often
             arises when users pass a tibble (e.g., from dplyr) instead of a
             column vector from a data.frame (i.e., my_tbl[, 2] vs. my_df[, 2]
                                              vs. my_tbl[[2]])')
    }

    # scalar conditions: use the short-circuiting && rather than elementwise &
    if (!is.null(nomatch) && (length(nomatch) != 1) && (length(nomatch) != length(sourcevar))) {
        stop('nomatch needs to be NULL, or of length 1 or ', length(sourcevar), '.')
    }

    if (!origin %in% valid_origin) {
        stop('Origin code not supported by countrycode or present in the user-supplied custom_dict.')
    }

    if (!destination %in% valid_destination) {
        stop('Destination code not supported by countrycode or present in the user-supplied custom_dict.')
    }

    if(!inherits(dictionary, "data.frame")) {
        stop("Dictionary must be a data frame or tibble with codes as columns.")
    }

    if(!destination %in% colnames(dictionary)){
        stop("Destination code must correpond to a column name in the dictionary data frame.")
    }

    dups <- any(duplicated(stats::na.omit(dictionary[, origin])))
    if(dups){
        stop("Countrycode cannot accept dictionaries with duplicated origin codes")
    }

    # Copy origin_vector for later re-use
    origin_vector <- sourcevar

    # Case-insensitive matching
    if(is.null(custom_dict)){ # only for built-in dictionary
        if(inherits(origin_vector, 'character') && !grepl('country', origin)){
            origin_vector <- toupper(origin_vector)
        }
    }

    # Convert
    if (origin_regex) { # regex codes
        dict <- stats::na.omit(dictionary[, c(origin, destination)])
        sourcefctr <- factor(origin_vector)

        # match levels of sourcefctr
        matches <-
          sapply(c(levels(sourcefctr), NA), function(x) { # add NA so there's at least one item
            x <- trimws(x)
            matchidx <- sapply(dict[[origin]], function(y) grepl(y, x, perl = TRUE, ignore.case = TRUE))
            dict[matchidx, destination]
          })

        # fill elements that have zero matches with the appropriate NA
        matches[sapply(matches, length) == 0] <- `class<-`(NA, class(dict[[destination]]))

        # create destination_list with elements that have more than one match
        destination_list <- matches[sapply(matches, length) > 1]

        # add origin_vector value to beginning of match results to replicate previous behavior
        destination_list <- Map(c, names(destination_list), destination_list)

        # set elements with multiple matches to the appropriate NA
        matches[sapply(matches, length) > 1] <- `class<-`(NA, class(dict[[destination]]))

        # remove all but last match to replicate previous behavior
        matches <- sapply(matches, function(x) { x[length(x)] })

        # replace with custom matches if set
        if (!is.null(custom_match)) {
          matchidxs <- match(names(matches), names(custom_match))
          cust_matched <- !is.na(matchidxs)
          matches[cust_matched] <- custom_match[matchidxs][cust_matched]
        }

        # apply new levels to sourcefctr and unname
        destination_vector <- unname(matches[as.numeric(sourcefctr)])

    } else { # non-regex codes
        dict <- stats::na.omit(dictionary[, c(origin, destination)])
        sourcefctr <- factor(origin_vector)

        # match levels of sourcefctr
        matchidxs <- match(levels(sourcefctr), dict[[origin]])
        matches <- dict[[destination]][matchidxs]

        # replace with custom matches if set
        if (!is.null(custom_match)) {
          matchidxs <- match(levels(sourcefctr), names(custom_match))
          cust_matched <- !is.na(matchidxs)
          matches[cust_matched] <- custom_match[matchidxs][cust_matched]
        }

        # apply new levels to sourcefctr
        destination_vector <- matches[as.numeric(sourcefctr)]
    }

    # Filling-in failed matches
    sane_sourcevar <- class(sourcevar)[1] == class(destination_vector)[1]
    sane_nomatch <- class(nomatch)[1] == class(destination_vector)[1]
    idx <- is.na(destination_vector)
    if (is.null(nomatch)) {
        if (sane_sourcevar) {
            destination_vector[idx] <- sourcevar[idx]
        } else if (class(sourcevar)[1] == "factor" && class(destination_vector)[1] == "character") {
            destination_vector[idx] <- as.character(sourcevar[idx])
        } else {
            warning("The origin and destination codes are not of the same
                    class. Filling-in bad matches with NA instead.")
        }
    # && short-circuits on the length test, so is.na() is only evaluated for a
    # length-1 nomatch; with & a vector nomatch made this condition length > 1,
    # which if() rejects.
    } else if ((length(nomatch) == 1) && is.na(nomatch)) { # NA
    } else if ((length(nomatch) == 1) && sane_nomatch) { # single replacement
        destination_vector[idx] <- nomatch
    } else if ((length(nomatch) == length(sourcevar)) && sane_sourcevar) { # vector replacement
        destination_vector[idx] <- nomatch[idx]
    } else {
        warning("The argument `nomatch` must be NULL, NA, or of the same class
                as the destination vector. Filling-in bad matches with NA instead.")
    }

    # Warnings
    if(warn){
        badmatch <- sort(unique(origin_vector[is.na(destination_vector)]))
        badmatch <- badmatch[!badmatch %in% names(custom_match)]  # do not report <NA>'s that were set explicitly by custom_match
        if(length(badmatch) > 0){
            warning("Some values were not matched unambiguously: ", paste(badmatch, collapse=", "), "\n")
        }
        if(origin_regex){
           if(length(destination_list) > 0){
               destination_list <- lapply(destination_list, function(k) paste(k, collapse=','))
               destination_list <- sort(unique(do.call('c', destination_list)))
               warning("Some strings were matched more than once, and therefore set to <NA> in the result: ", paste(destination_list, collapse="; "), "\n")
           }
        }
    }
    return(destination_vector)
}

In [None]:
# Add a Continent column to every data frame, derived from Country_Region
# via countrycode().
df_long <- df_long %>%
  mutate(data = map(data, function(tbl) {
    mutate(tbl, Continent = countrycode(Country_Region,
                                        origin = "country.name",
                                        destination = "continent"))
  }))

# Manual overrides for entries assigned by hand (two cruise ships and Kosovo);
# every other row keeps the value returned by countrycode().
# Kosovo is in Europe; MS Zaandam is placed in the Americas, where it sailed.
# (The original had these two labels swapped.)
df_long <- df_long %>%
  mutate(data = map(data, function(tbl) {
    mutate(tbl, Continent = case_when(
      Country_Region == "Diamond Princess" ~ "Asia",
      Country_Region == "Kosovo" ~ "Europe",
      Country_Region == "MS Zaandam" ~ "Americas",
      TRUE ~ Continent
    ))
  }))

# Check: list the distinct continents present in each data frame.
map(df_long$data, ~unique(.$Continent))

ERROR: ignored