In [24]:
# Required libraries (all package loads live in this one cell so a fresh kernel can run top to bottom).
library(tidyverse)   # read_csv(), the %>% pipe and the dplyr verbs (select/mutate/arrange/...) used below
library(repr)        # notebook display options for R objects
library(tidymodels)  # initial_split()/training()/testing() used for the train/test split
# Show at most 6 rows when a data frame is displayed, so the notebook output stays short.
options(repr.matrix.max.rows = 6)

In [25]:
# Load the raw (untidy) countries data set from the working directory.
countries_raw_data <- read_csv("countries.csv")

# The following steps begin tidying the data set.
#
# Rename the columns of interest so that their names are underscore-separated.
# The lookup maps each original column name to its replacement.
column_renames <- c(
    "GDP per Capita"     = "GDP_per_capita",
    "Cropland Footprint" = "cropland_footprint",
    "Grazing Footprint"  = "grazing_footprint",
    "Forest Footprint"   = "forest_footprint",
    "Fish Footprint"     = "fish_footprint"
)

# Apply the renames one at a time. A name that is not present in the data
# simply matches nothing, so that rename is skipped without an error.
for (old_name in names(column_renames)) {
    names(countries_raw_data)[names(countries_raw_data) == old_name] <- column_renames[[old_name]]
}

Parsed with column specification:
cols(
  .default = col_double(),
  Country = [31mcol_character()[39m,
  Region = [31mcol_character()[39m,
  `GDP per Capita` = [31mcol_character()[39m,
  `Data Quality` = [31mcol_character()[39m
)

See spec(...) for full column specifications.



In [26]:
# Build the tidy analysis table from the raw data in a single pipeline:
#   1. keep only the columns of interest: country name, GDP per capita, and the
#      four ecological footprints used in this project;
#   2. drop every row that has an NA in any of those columns;
#   3. strip the "$" sign and "," separators from GDP_per_capita (the regex removes
#      every character that is not alphanumeric, whitespace or "."), then convert
#      the column from character to double.
#
# Columns are referenced directly inside mutate() (data masking) instead of via
# `countries_GDP_eco$...`, so the transformation always operates on the data
# flowing through the pipe rather than on whatever the global variable holds.
countries_GDP_eco <- countries_raw_data %>%
    select(Country, GDP_per_capita, cropland_footprint,
           grazing_footprint, forest_footprint, fish_footprint) %>%
    na.omit() %>%
    mutate(GDP_per_capita = as.numeric(gsub("[^[:alnum:][:space:].]", "", GDP_per_capita)))

countries_GDP_eco

Country,GDP_per_capita,cropland_footprint,grazing_footprint,forest_footprint,fish_footprint
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Afghanistan,614.66,0.30,0.20,0.08,0.00
Albania,4534.37,0.78,0.22,0.25,0.02
Algeria,5430.57,0.60,0.16,0.17,0.01
⋮,⋮,⋮,⋮,⋮,⋮
Yemen,1302.30,0.34,0.14,0.04,0.04
Zambia,1740.64,0.19,0.18,0.33,0.01
Zimbabwe,865.91,0.20,0.32,0.29,0.01


In [27]:
# The following steps categorize the GDP_per_capita column into "low", "medium" and "high".

# Arrange the rows by GDP_per_capita in ascending order so that row positions
# correspond to GDP ranks.
countries_GDP_eco <- countries_GDP_eco %>% arrange(GDP_per_capita)

# Number of rows (countries) available for classification.
n_countries <- nrow(countries_GDP_eco)
n_countries

# Split the countries into three groups of roughly n/3 rows each. With the 163 rows
# reported for this data set that means:
#   - rows 1 to 54    -> "low"
#   - rows 55 to 109  -> "medium"
#   - rows 110 to 163 -> "high"
# The break points are read from the sorted data instead of being hard-coded, so the
# classification stays correct if the input data changes. The lowest break is -Inf so
# that no GDP value can fall below the first interval and silently become NA.
# NOTE(review): the previous hard-coded breaks (2379.44 and 10123.9) are presumably the
# 54th and 109th sorted GDP values; confirm the class counts are unchanged after re-running.
stopifnot(n_countries >= 3)
tercile_size <- n_countries %/% 3
low_upper_bound <- countries_GDP_eco$GDP_per_capita[tercile_size]
medium_upper_bound <- countries_GDP_eco$GDP_per_capita[n_countries - tercile_size]

# Create a new column GDP_classified that labels each GDP_per_capita value as
# "low", "medium" or "high". cut() uses right-closed intervals, so a value equal
# to a break point belongs to the lower class.
countries_GDP_eco_classified <- countries_GDP_eco %>%
    mutate(GDP_classified = cut(GDP_per_capita,
                                breaks = c(-Inf, low_upper_bound, medium_upper_bound, Inf),
                                labels = c("low", "medium", "high")))

countries_GDP_eco_classified

Country,GDP_per_capita,cropland_footprint,grazing_footprint,forest_footprint,fish_footprint,GDP_classified
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
Burundi,276.69,0.21,0.07,0.45,0.00,low
"Congo, Democratic Republic of",338.63,0.15,0.01,0.51,0.01,low
Ethiopia,379.38,0.31,0.12,0.46,0.00,low
⋮,⋮,⋮,⋮,⋮,⋮,⋮
Switzerland,88506.2,0.75,0.22,0.38,0.07,high
Qatar,99431.5,0.57,0.27,0.15,0.19,high
Luxembourg,114665.0,1.10,0.76,1.03,0.13,high


In [28]:
# Create the training and testing data sets: 75% of the rows go to training and 25%
# to testing. This proportion was chosen because the data frame has fewer than 500 rows.
# The split is stratified on GDP_classified, and the seed is fixed for reproducibility.

set.seed(1)
countries_split <- countries_GDP_eco_classified %>%
    initial_split(prop = 0.75, strata = GDP_classified)
countries_train <- countries_split %>% training()
countries_test <- countries_split %>% testing()

# glimpse() reports the number of rows in each set, which lets us check that the
# obtained proportions are roughly the desired 75% / 25%.
glimpse(countries_train)
glimpse(countries_test)



# Summary of the data, computed on the training set only.

# One-row table holding the mean of each predictor.
countries_train_predictors <- select(countries_train,
                                     cropland_footprint, grazing_footprint,
                                     forest_footprint, fish_footprint)
mean_of_predictors <- countries_train_predictors %>% map_df(mean)
mean_of_predictors

Rows: 124
Columns: 7
$ Country            [3m[90m<chr>[39m[23m "Burundi", "Congo, Democratic Republic of", "Ethio…
$ GDP_per_capita     [3m[90m<dbl>[39m[23m 276.69, 338.63, 379.38, 397.38, 410.91, 439.73, 45…
$ cropland_footprint [3m[90m<dbl>[39m[23m 0.21, 0.15, 0.31, 0.23, 0.67, 0.10, 0.39, 0.30, 0.…
$ grazing_footprint  [3m[90m<dbl>[39m[23m 0.07, 0.01, 0.12, 0.03, 0.48, 0.18, 0.32, 0.55, 0.…
$ forest_footprint   [3m[90m<dbl>[39m[23m 0.45, 0.51, 0.46, 0.75, 0.26, 0.06, 0.45, 0.26, 0.…
$ fish_footprint     [3m[90m<dbl>[39m[23m 0.00, 0.01, 0.00, 0.03, 0.02, 0.01, 0.05, 0.02, 0.…
$ GDP_classified     [3m[90m<fct>[39m[23m low, low, low, low, low, low, low, low, low, low, …
Rows: 39
Columns: 7
$ Country            [3m[90m<chr>[39m[23m "Madagascar", "Malawi", "Uganda", "Mali", "Tajikis…
$ GDP_per_capita     [3m[90m<dbl>[39m[23m 456.33, 493.84, 601.35, 741.22, 836.17, 1161.22, 1…
$ cropland_footprint [3m[90m<dbl>[39m[23m 0.27, 0.43, 0.34, 0.52, 0.46, 0

cropland_footprint,grazing_footprint,forest_footprint,fish_footprint
<dbl>,<dbl>,<dbl>,<dbl>
0.5690323,0.2671774,0.3573387,0.1122581


In [29]:
# Count the NA cells in the training set. This should be zero, because every row
# containing an NA was removed while tidying the data.
countries_train %>% is.na() %>% sum()

# Table with the number of training observations in each GDP class.
n_observed_based_GDP_classified <- summarise(group_by(countries_train, GDP_classified),
                                             Count = n())
n_observed_based_GDP_classified

`summarise()` ungrouping output (override with `.groups` argument)



GDP_classified,Count
<fct>,<int>
low,41
medium,42
high,41
