In [1]:
# required libraries
# tidyverse attaches the data-wrangling packages used throughout this notebook
# (readr for read_csv, dplyr for select/mutate/arrange, purrr for map_df, ...).
library(tidyverse)
# repr controls how objects are rendered in the notebook; configured via options() at the end of this cell.
library(repr)
# tidymodels is used later for initial_split(), training() and testing().
library(tidymodels)
# display at most 6 rows when a data frame is printed, so cell outputs stay short
options(repr.matrix.max.rows = 6)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.2     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.3     [32m✔[39m [34mdplyr  [39m 1.0.2
[32m✔[39m [34mtidyr  [39m 1.1.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.5.0

“package ‘ggplot2’ was built under R version 4.0.1”
“package ‘tibble’ was built under R version 4.0.2”
“package ‘tidyr’ was built under R version 4.0.2”
“package ‘dplyr’ was built under R version 4.0.2”
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

“package ‘tidymodels’ was built under R version 4.0.2”
── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 0.1.1 ──

[32m✔

In [2]:
# Load the untidy data set, published as a CSV export of a Google Sheet.
url <- "https://docs.google.com/spreadsheets/d/e/2PACX-1vR_6ci0B-tyn-2T5ywr5cOSxgtls94IPi3zrpu5z3fNxDyK8R1985IH1Uq3zd253ufZIr8Y1tgxffnq/pub?output=csv"

countries_raw_data <- read_csv(url)

# The steps from here on tidy the data set.

# First step: make the column names syntactically valid R names
# (e.g. `GDP per Capita` -> gdp.per.capita) and lower-case them.
colnames(countries_raw_data) <- countries_raw_data %>%
    colnames() %>%
    make.names() %>%
    tolower()

Parsed with column specification:
cols(
  .default = col_double(),
  Country = [31mcol_character()[39m,
  Region = [31mcol_character()[39m,
  `GDP per Capita` = [31mcol_character()[39m,
  `Data Quality` = [31mcol_character()[39m
)

See spec(...) for full column specifications.



In [6]:
# Build the analysis table from the raw data in a single pipeline:
#   1. keep only the columns of interest: the country's name, GDP per capita, and the
#      ecological footprints used in this project;
#   2. strip the "$" sign and "," from gdp.per.capita (everything that is not alphanumeric,
#      whitespace or ".") and convert the column from character to double;
#   3. remove any row that has an NA cell.
# NA rows are dropped AFTER the conversion so that a GDP value that cannot be parsed as a
# number (and therefore becomes NA) is removed too, instead of leaking into later steps.
# Inside mutate() the column is referenced directly rather than through
# countries_GDP_eco$..., so the step always operates on the rows flowing through the pipe.
countries_GDP_eco <- countries_raw_data %>%
    select(country, gdp.per.capita, cropland.footprint,
           grazing.footprint, forest.footprint,
           fish.footprint, carbon.footprint) %>%
    mutate(gdp.per.capita = as.numeric(gsub("[^[:alnum:][:space:].]", "", gdp.per.capita))) %>%
    drop_na()

# show the cleaned table (still in its original row order)
countries_GDP_eco

# the following steps are for categorizing the gdp.per.capita column into "high", "medium", and "low"

# arranging the rows by gdp.per.capita in ascending order
countries_GDP_eco <- arrange(countries_GDP_eco, gdp.per.capita)

country,gdp.per.capita,cropland.footprint,grazing.footprint,forest.footprint,fish.footprint,carbon.footprint
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Afghanistan,614.66,0.30,0.20,0.08,0.00,0.18
Albania,4534.37,0.78,0.22,0.25,0.02,0.87
Algeria,5430.57,0.60,0.16,0.17,0.01,1.14
⋮,⋮,⋮,⋮,⋮,⋮,⋮
Yemen,1302.30,0.34,0.14,0.04,0.04,0.42
Zambia,1740.64,0.19,0.18,0.33,0.01,0.24
Zimbabwe,865.91,0.20,0.32,0.29,0.01,0.53


In [7]:
# Given that there are about 163 rows, the gdp.per.capita column is categorized such that the
# lowest 54 values (i.e. 1st to 54th row) are assigned to the "low" category,
# the next 55 values (i.e. 55th to 109th row) are assigned to the "medium" category,
# and the next 54 values (i.e. 110th to 163rd row) are assigned to the "high" category,
# so that each category has roughly (163/3) elements.
# NOTE(review): the cut points 2379.44 and 10123.9 are hard-coded and presumably correspond to
# the 54th and 109th sorted GDP values -- re-check them if the data set changes.

# Create a new column named GDP_classified where the corresponding gdp.per.capita is assigned
# either "low", "medium", or "high".
# cut() builds intervals that are open on the left by default, i.e. (a, b]. The lower bound is
# therefore -Inf rather than a finite value (previously 276.68, presumably the smallest GDP):
# with a finite lower break, a country whose GDP equals that break exactly would fall outside
# every interval and be classified as NA.
countries_GDP_eco_classified <- countries_GDP_eco %>%
    mutate(GDP_classified = cut(gdp.per.capita,
                                breaks = c(-Inf, 2379.44, 10123.9, Inf),
                                labels = c("low", "medium", "high")))

In [8]:
# Generate the training and testing data sets, stratified by GDP_classified.
# 75% of the rows go to the training set and 25% to the testing set; this proportion was
# chosen because the data frame has fewer than 500 rows.

set.seed(1)  # fixed seed so the random split is reproducible
countries_split <- countries_GDP_eco_classified %>%
    initial_split(prop = 0.75, strata = GDP_classified)
countries_train <- countries_split %>% training()
countries_test <- countries_split %>% testing()

In [12]:
# Summary of the data, computed from the training set only.

# Keep just the five footprint predictors ...
countries_train_predictors <- select(
    countries_train,
    cropland.footprint, grazing.footprint, forest.footprint,
    fish.footprint, carbon.footprint
)

# ... and build a one-row table with the mean of each predictor.
mean_of_predictors <- countries_train_predictors %>% map_df(mean)

mean_of_predictors

cropland.footprint,grazing.footprint,forest.footprint,fish.footprint,carbon.footprint
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
0.5690323,0.2671774,0.3573387,0.1122581,1.814839
