In [65]:
# Notebook setup: display options, package attachments, and the RNG seed.
options(scipen=999) # Strongly penalise scientific notation so numbers print in fixed notation (keeps plot labels readable)
options(repr.plot.width=25, repr.plot.height=15) # Make plots larger in Jupyter output
library(tidyverse) # Core data-analysis packages, attached in one call
library(rvest) # Scraping web pages (read_html(), html_nodes(), html_text())
library(httr) # HTTP requests, for pulling data from APIs
library(tidyr) # Functions to help keep our data tidy (normally already attached by tidyverse; kept explicit)
library(magrittr) # Piping (%>%)
library(purrr) # Functional helpers for wrangling lists and data frames (normally already attached by tidyverse)
library(glue) # String interpolation helpers
library(stringr) # More string helpers (normally already attached by tidyverse)
library(polite) # For web scraping
library(xml2) # Lets us work with XML/HTML documents obtained through scraping
library(GGally) # Produces handy scatterplot matrix with correlations between columns in a dataframe
library(leaps) # Contains regsubsets() to help us select predictors for our model
library(car) # Model-analysis helpers. NOTE(review): AIC() itself comes from base R's stats package, not car - confirm which car functions are actually needed
library("jsonlite") # Functions to help us work with JSON from API request content
library(plotly) # Lets us build interactive graphs - compatible with ggplot2
library(ggthemr) # Extra nice-looking themes for ggplot2 graphs
library(tm) # Provides removeNumbers(), used to strip digits from strings
set.seed(99670210) # Fixes the RNG seed (Luka's student ID) so the same sample is taken every time.

In [66]:
# Addresses of the two pages we scrape: a country-code reference table and an
# archived copy of an oecd.org page (used later for the list of countries to keep).
codes_url <- "https://www.worlddata.info/countrycodes.php"
countries_url <- "http://www.energybc.ca/cache/nuclear/nuclear2/www.oecd.org/document/1/0,2340,en_2649_201185_1889402_1_1_1_1,00.html"

# Download and parse the country-code page, then print its structure as a
# quick sanity check that the document was read in.
code_page <- read_html(codes_url)
glimpse(code_page)

# Same again for the countries page.
countries_page <- read_html(countries_url)
glimpse(countries_page)

List of 2
 $ node:<externalptr> 
 $ doc :<externalptr> 
 - attr(*, "class")= chr [1:2] "xml_document" "xml_node"
List of 2
 $ node:<externalptr> 
 $ doc :<externalptr> 
 - attr(*, "class")= chr [1:2] "xml_document" "xml_node"


In [67]:
# Collect the text of every element on the countries page whose class
# attribute is exactly "more".
valid_countries <- countries_page %>%
  html_nodes(xpath = '//*[@class="more"]') %>%
  html_text()

# Drop the final scraped entry (presumably a trailing non-country link - verify
# against the page). head(x, -1) is used instead of `x[1:length(x)-1]` because
# `:` binds tighter than `-`: that expression is really `x[(1:n) - 1]`, i.e.
# `x[0:(n - 1)]`, and only gives the right answer because index 0 is silently
# ignored.
valid_countries <- head(valid_countries, -1)
valid_countries

In [68]:
# Extract the text of the country-code table(s) and cut it into one chunk per
# "." character.
all_codes <- code_page %>%
  html_nodes(xpath = '//*[@class="std100 hover"]') %>%
  html_text()
country_codes <- unlist(strsplit(all_codes, "[.]"))

# Clean every chunk in one vectorised pass (substring(), nchar() and
# removeNumbers() all operate on whole character vectors), rather than growing
# result vectors one element at a time inside a loop.
#   1. drop the trailing 8 characters,
#   2. drop the leading 2 characters,
#   3. strip all digits.
# NOTE(review): the fixed widths (8 and 2) presumably correspond to the
# fixed-width code columns surrounding each country name in the scraped
# table - confirm against the page layout.
chunk_text <- substring(country_codes, 1, nchar(country_codes) - 8)
chunk_text <- substring(chunk_text, 3, nchar(chunk_text))
chunk_text <- removeNumbers(chunk_text)

# The last three remaining characters are taken as the code; everything before
# them is the country text (still carrying two trailing characters that are
# removed in a later step).
chunk_len <- nchar(chunk_text)
codes <- substring(chunk_text, chunk_len - 2, chunk_len)
countries <- substring(chunk_text, 1, chunk_len - 3)

# Discard the first two parsed chunks and the last one. This selects exactly
# the elements the earlier `x[5:length(x)-1]` slice picked from its
# NA-prefixed vector, but without relying on `:` binding tighter than `-`,
# and it degrades to an empty vector (not reversed/NA indices) on short input.
countries <- head(tail(countries, -2), -1)
codes <- head(tail(codes, -2), -1)

# NOTE(review): `output` is never filled in the visible code. It is kept as an
# empty placeholder in case a later cell refers to it; the former
# `output[4:length(output)]` slice on a length-1 array indexed 4:1 and only
# produced four NAs, so it has been removed.
output <- array()

In [69]:
# Remove the last two characters from every entry of `countries`
# (presumably a two-letter code still glued to the country name - verify).
# substring() and nchar() are vectorised, so no loop, no NA placeholder
# element and no `[2:length(x)]` clean-up are needed; an empty `countries`
# now yields an empty result instead of c(NA, NA).
countries2 <- substring(countries, 1, nchar(countries) - 2)

# Accumulators for the matching step that follows. They start with a single
# NA placeholder, which that step is expected to strip off again.
output_countries <- array()
output_codes <- array()

In [70]:
# Keep only the scraped countries that also appear in `valid_countries`,
# together with the code found at the same position in `codes`.
#
# A vectorised %in% lookup replaces the index loop. This avoids:
#   * seq.int(1, length(x)) counting down (1, 0) when `countries2` is empty,
#     which made the `if` fail on a zero-length condition;
#   * growing both result vectors one element at a time;
#   * the `[2:length(x)]` placeholder strip, which returned NAs when nothing
#     matched.
# which() gives integer positions, so `codes` is indexed exactly as the loop
# indexed it (by position in `countries2`).
matched_idx <- which(countries2 %in% valid_countries)
output_countries <- countries2[matched_idx]
output_codes <- codes[matched_idx]

# Two-column lookup table: country name -> country code.
country_code_tibble <- tibble(Countries = output_countries, Codes = output_codes)
country_code_tibble

Countries,Codes
<chr>,<chr>
Australia,AUS
Austria,AUT
Belgium,BEL
Canada,CAN
Chile,CHL
Denmark,DNK
Estonia,EST
Finland,FIN
France,FRA
Germany,DEU
