In [101]:
# Import packages
library(readr)
library(plyr)
library(ipred)
library(caret)
library(caTools)
library(randomForest)
library(ROSE)

In [102]:
# Read in data
data <- read.csv("process.csv")

In [103]:
# Process the data by setting the first 14 columns as [features] and the last column as the [label] # nolint: line_length_linter.
features <- data[, 1:14]
label <- data[, 15]

In [104]:
# Split the dataset into training and testing sets
set.seed(42)
split <- sample.split(label, SplitRatio = 0.7)
features_train = features[split,]  # nolint
features_test = features[!split,]  # nolint
label_train = label[split]  # nolint
label_test = label[!split]  # nolint

In [105]:
# Combine the training datasets
data_train <- features_train
data_train$label <- label_train
class_counts <- table(data_train$label)

In [106]:
# Print the initial number of each category
class_counts <- table(data_train$label)
print(paste("( Before )Data Category Counts: ", class_counts))

[1] "( Before )Data Category Counts:  490"
[2] "( Before )Data Category Counts:  148"
[3] "( Before )Data Category Counts:  23" 


In [107]:
# Handle class imbalance
# Process classes A and B
data_train_AB <- data_train  # nolint
data_train_AB <- data_train_AB[data_train_AB$label != '3',]  # nolint
data_train_AB_resampled <- ovun.sample(label ~ ., data = data_train_AB, method = "over", N = 980, seed = 1)$data  # nolint

# Process classes A and C
data_train_AC <- data_train  # nolint
data_train_AC <- data_train_AC[data_train_AC$label != '2',]  # nolint
data_train_AC_resampled <- ovun.sample(label ~ ., data = data_train_AC, method = "over", N = 980, seed = 1)$data  # nolint

# Retain records in data_train_AB_resampled where the label is '2'
data_train_AB_2 <- data_train_AB_resampled[data_train_AB_resampled$label == '2',]  # nolint

# Retain records in data_train_AC_resampled where the label is '3'
data_train_AC_3 <- data_train_AC_resampled[data_train_AC_resampled$label == '3',]  # nolint

# Retain records in both data_train_AB_resampled and data_train_AC_resampled where the label is '1' # nolint: line_length_linter.
data_train_1 <- data_train_AB_resampled[data_train_AB_resampled$label == '1',]  # nolint

# combine
data_train_combined <- rbind(data_train_1, data_train_AB_2, data_train_AC_3)

# Print the number of each category after class imbalance handling
cat("( After )Data Category Counts:\n")
print(table(data_train_combined$label))

( After )Data Category Counts:

  1   2   3 
490 490 490 


In [108]:
# Divide the features and label, and apply different normalization to the training and testing sets # nolint: line_length_linter.
features_train <- data_train_combined[, 1:14]
label <- data_train_combined[, 15]

In [109]:
# Apply Min-Max normalization to features_train and features_test
features_train <- as.data.frame(lapply(features_train, function(x) {(x-min(x))/(max(x)-min(x))}))  # nolint
features_test <- as.data.frame(lapply(features_test, function(x) {(x-min(x))/(max(x)-min(x))}))  # nolint

# Apply Min-Max normalization to features_train and features_test
features_train_1 <- as.data.frame(lapply(features_train, function(x) {(x-min(x))/(max(x)-min(x))}))  # nolint
features_test_1 <- as.data.frame(lapply(features_test, function(x) {(x-min(x))/(max(x)-min(x))}))  # nolint

# Apply Z-Score normalization to each column of features_train
features_train_2 <- as.data.frame(lapply(features_train, function(x) {(x - mean(x))/sd(x)}))  # nolint

#features_test <- as.data.frame(mapply(function(x, y) {(x - mean(y))/sd(y)}, features_test, features_train, SIMPLIFY = FALSE))  # nolint
features_test_2 <- as.data.frame(lapply(features_test, function(x) {(x - mean(x))/sd(x)}))  # nolint

# Apply Max Absolute Value normalization to the training set
features_train_3 <- as.data.frame(lapply(features_train, function(x) {x / max(abs(x))}))  # nolint
features_test_3 <- as.data.frame(lapply(features_test, function(x) {x / max(abs(x))}))  # nolint

# Apply L1 norm normalization to the training set
features_train_4 <- as.data.frame(lapply(features_train, function(x) {x / sum(abs(x))}))  # nolint
features_test_4 <- as.data.frame(lapply(features_test, function(x) {x / sum(abs(x))}))  # nolint

# Apply L2 norm normalization to the training set
features_train_5 <- as.data.frame(lapply(features_train, function(x) {x / sqrt(sum(x^2))}))  # nolint
features_test_5 <- as.data.frame(lapply(features_test, function(x) {x / sqrt(sum(x^2))}))  # nolint

In [110]:
# Training dataset after Min-Max normalization
data_train <- features_train
data_train$label <- label
class_counts <- table(data_train$label)
#print(class_counts)  # nolint

# Training dataset after Min-Max normalization
data_train_1 <- features_train_1
data_train_1$label <- label
class_counts_1 <- table(data_train_1$label)
#print(class_counts_1)  # nolint

# Training dataset after Z-Score normalization
data_train_2 <- features_train_2
data_train_2$label <- label
class_counts_2 <- table(data_train_2$label)
#print(class_counts_2)  # nolint

# Training dataset after Max Absolute Value normalization
data_train_3 <- features_train_3
data_train_3$label <- label
class_counts_3 <- table(data_train_3$label)
#print(class_counts_3)  # nolint

# Training dataset after L1 norm normalization
data_train_4 <- features_train_4
data_train_4$label <- label
class_counts_4 <- table(data_train_4$label)
#print(class_counts_4)  # nolint

# Training dataset after L2 norm normalization
data_train_5 <- features_train_5
data_train_5$label <- label
class_counts_5 <- table(data_train_5$label)
#print(class_counts_5)  # nolint

In [111]:
model_1 <- randomForest(x = data_train_1[-ncol(data_train_combined)], y = as.factor(data_train_1$label), ntree = 2)  # nolint
variable_importance_1 = importance(model_1)  # nolint
pred_comb_1 <- predict(model_1, features_test_1)

# Calculate and print the accuracy of the test set
accuracy_1 <- sum(label_test == pred_comb_1) / length(label_test)
print(paste('Accuracy of Min-Max Normalization:', accuracy_1))  # nolint

# Convert to a categorical variable
label_test_factor <- as.factor(label_test)
pred_comb_1_factor <- as.factor(pred_comb_1)

# Obtain the confusion matrix
cm <- confusionMatrix(pred_comb_1_factor, label_test_factor)

# Calculate the average recall rate (Sensitivity)
recall <- mean(c(cm$byClass["Class: 1", "Sensitivity"], cm$byClass["Class: 2", "Sensitivity"], cm$byClass["Class: 3", "Sensitivity"]))  # nolint

# Calculate the average precision rate (Positive Predictive Value)
precision <- mean(c(cm$byClass["Class: 1", "Pos Pred Value"], cm$byClass["Class: 2", "Pos Pred Value"], cm$byClass["Class: 3", "Pos Pred Value"]))  # nolint
F1 = 2 * recall * precision / ( recall + precision )  # nolint

# print results
print(paste('Recall :', recall))  # nolint
print(paste('Precision :', precision))  # nolint
print(paste('F1 :', F1))  # nolint

[1] "Accuracy of Min-Max Normalization: 0.929328621908127"
[1] "Recall : 0.871428571428571"
[1] "Precision : 0.893793535898799"
[1] "F1 : 0.882469374145375"


In [112]:
model_2 <- randomForest(x = data_train_2[-ncol(data_train_combined)], y = as.factor(data_train_2$label), ntree = 2) # nolint
variable_importance_2 = importance(model_2) # nolint
pred_comb_2 <- predict(model_2, features_test_2)

# Calculate and print the accuracy of the test set
accuracy_2 <- sum(label_test == pred_comb_2) / length(label_test)
print(paste('Z-Score Normalization Accuracy:', accuracy_2)) # nolint

# Convert to a factor
label_test_factor <- as.factor(label_test)
pred_comb_1_factor <- as.factor(pred_comb_2)

# Obtain the confusion matrix
cm <- confusionMatrix(pred_comb_1_factor, label_test_factor)

# Calculate the average recall rate (Sensitivity)
recall <- mean(c(cm$byClass["Class: 1", "Sensitivity"],  # nolint
                              cm$byClass["Class: 2", "Sensitivity"],  # nolint
                              cm$byClass["Class: 3", "Sensitivity"]))

# Calculate the average precision rate (Positive Predictive Value)
precision <- mean(c(cm$byClass["Class: 1", "Pos Pred Value"], # nolint 
                            cm$byClass["Class: 2", "Pos Pred Value"], # nolint 
                            cm$byClass["Class: 3", "Pos Pred Value"]))
F1 = 2 * recall * precision / ( recall + precision ) # nolint

# print results
print(paste('Recall :', recall))  # nolint
print(paste('Precision :', precision))  # nolint
print(paste('F1 :', F1))  # nolint

[1] "Z-Score Normalization Accuracy: 0.328621908127208"
[1] "Recall : 0.484656084656085"
[1] "Precision : 0.368749858324833"
[1] "F1 : 0.418832008431878"


In [113]:
model_3 <- randomForest(x = data_train_3[-ncol(data_train_combined)], y = as.factor(data_train_3$label), ntree = 2) # nolint
variable_importance_3 = importance(model_3) # nolint
pred_comb_3 <- predict(model_3, features_test_3)

# Calculate and print the accuracy of the test set
accuracy_3 <- sum(label_test == pred_comb_3) / length(label_test)
print(paste('Accuracy of Max absolute value normalization:', accuracy_3)) # nolint

# Convert to a factor
label_test_factor <- as.factor(label_test)
pred_comb_1_factor <- as.factor(pred_comb_3)

# Obtain the confusion matrix
cm <- confusionMatrix(pred_comb_1_factor, label_test_factor)

# Calculate the average recall rate (Sensitivity)
recall <- mean(c(cm$byClass["Class: 1", "Sensitivity"], # nolint
                              cm$byClass["Class: 2", "Sensitivity"], # nolint
                              cm$byClass["Class: 3", "Sensitivity"]))

# Calculate the average precision rate (Positive Predictive Value)
precision <- mean(c(cm$byClass["Class: 1", "Pos Pred Value"], # nolint
                            cm$byClass["Class: 2", "Pos Pred Value"], # nolint
                            cm$byClass["Class: 3", "Pos Pred Value"]))
F1 = 2 * recall * precision / ( recall + precision ) # nolint

# print results
print(paste('Recall :', recall)) # nolint
print(paste('Precision :', precision)) # nolint
print(paste('F1 :', F1)) # nolint

[1] "Accuracy of Max absolute value normalization: 0.886925795053004"


[1] "Recall : 0.798412698412698"
[1] "Precision : 0.798412698412699"
[1] "F1 : 0.798412698412698"


In [114]:
model_4 <- randomForest(x = data_train_4[-ncol(data_train_combined)], y = as.factor(data_train_4$label), ntree = 2)  # nolint
variable_importance_4 = importance(model_4) # nolint
pred_comb_4 <- predict(model_4, features_test_4)

# Calculate and print the accuracy of the test set
accuracy_4 <- sum(label_test == pred_comb_4) / length(label_test)
print(paste('Accuracy of L1 norm normalization:', accuracy_4)) # nolint

# Convert to a factor
label_test_factor <- as.factor(label_test)
pred_comb_1_factor <- as.factor(pred_comb_4)

# Obtain the confusion matrix
cm <- confusionMatrix(pred_comb_1_factor, label_test_factor)

# Calculate the average recall（Sensitivity）
recall <- mean(c(cm$byClass["Class: 1", "Sensitivity"], # nolint
                              cm$byClass["Class: 2", "Sensitivity"], # nolint
                              cm$byClass["Class: 3", "Sensitivity"]))

# Calculate the average precision（Pos Pred Value）
precision <- mean(c(cm$byClass["Class: 1", "Pos Pred Value"], # nolint
                            cm$byClass["Class: 2", "Pos Pred Value"], # nolint
                            cm$byClass["Class: 3", "Pos Pred Value"]))
F1 = 2 * recall * precision / ( recall + precision ) # nolint

# print results
print(paste('Recall :', recall)) # nolint
print(paste('Precision :', precision)) # nolint
print(paste('F1 :', F1)) # nolint

[1] "Accuracy of L1 norm normalization: 0.763250883392226"


[1] "Recall : 0.585185185185185"
[1] "Precision : 0.597558658687185"
[1] "F1 : 0.591307198349883"


In [115]:
model_5 <- randomForest(x = data_train_5[-ncol(data_train_combined)], y = as.factor(data_train_5$label), ntree = 2) # nolint
variable_importance_5 = importance(model_5) # nolint
pred_comb_5 <- predict(model_5, features_test_5)

# Calculate and print the accuracy of the test set
accuracy_5 <- sum(label_test == pred_comb_5) / length(label_test)
print(paste('Accuracy of L2 norm normalization:', accuracy_5)) # nolint

# Convert to a factor
label_test_factor <- as.factor(label_test)
pred_comb_1_factor <- as.factor(pred_comb_5)

# Obtain the confusion matrix
cm <- confusionMatrix(pred_comb_1_factor, label_test_factor)

# Calculate the average recall（Sensitivity）
recall <- mean(c(cm$byClass["Class: 1", "Sensitivity"], # nolint
                              cm$byClass["Class: 2", "Sensitivity"], # nolint
                              cm$byClass["Class: 3", "Sensitivity"]))

# Calculate the average precision（Pos Pred Value）
precision <- mean(c(cm$byClass["Class: 1", "Pos Pred Value"], # nolint
                            cm$byClass["Class: 2", "Pos Pred Value"], # nolint
                            cm$byClass["Class: 3", "Pos Pred Value"]))
F1 = 2 * recall * precision / ( recall + precision ) # nolint

# print results
print(paste('Recall :', recall)) # nolint
print(paste('Precision :', precision)) # nolint
print(paste('F1 :', F1)) # nolint

[1] "Accuracy of L2 norm normalization: 0.724381625441696"
[1] "Recall : 0.614285714285714"
[1] "Precision : 0.517725766027973"
[1] "F1 : 0.56188748527618"


In [116]:
# Obtain the number of predicted results
n <- length(pred_comb_1)

# Initialize an empty vector to store the final prediction results
final_pred <- rep(NA, n)

# Iterate over each test sample
for(i in 1:n) { # nolint
   # Get the prediction results of the five models for the i-th sample # nolint
   preds <- c(pred_comb_1[i], pred_comb_2[i], pred_comb_3[i], pred_comb_4[i], pred_comb_5[i]) # nolint

   # Select the most frequently predicted class as the final prediction result for the i-th sample # nolint
   final_pred[i] <- as.numeric(names(which.max(table(preds))))
}


importances_list <- list(variable_importance_1, variable_importance_2, variable_importance_3, variable_importance_4, variable_importance_5) # nolint
average_importance <- Reduce("+", importances_list) / length(importances_list)
print(average_importance)


# Now final_pred contains the prediction results after voting
# Calculate and print the accuracy
accuracy <- sum(label_test == final_pred) / length(label_test)
print(paste('Accuracy of Voting method:', accuracy)) # nolint

# Convert to a factor type
final_pred_factor <- as.factor(final_pred)
label_test_factor <- as.factor(label_test)

# Obtain the confusion matrix
cm_vote <- confusionMatrix(final_pred_factor, label_test_factor)

# Calculate the recall (Sensitivity) for each category
sensitivity_class1 <- cm_vote$byClass["Class: 1", "Sensitivity"]
sensitivity_class2 <- cm_vote$byClass["Class: 2", "Sensitivity"]
sensitivity_class3 <- cm_vote$byClass["Class: 3", "Sensitivity"]
recall = (sensitivity_class1 + sensitivity_class2 + sensitivity_class3) / 3 # nolint
print(paste('Recall :', recall)) # nolint

# Calculate the precision for each category
precision_class1 <- cm_vote$byClass["Class: 1", "Pos Pred Value"]
precision_class2 <- cm_vote$byClass["Class: 2", "Pos Pred Value"]
precision_class3 <- cm_vote$byClass["Class: 3", "Pos Pred Value"]
precision = (precision_class1 + precision_class2 + precision_class3) / 3 # nolint
print(paste('Precision :', precision)) # nolint

F1 = 2 * recall * precision / ( recall + precision ) # nolint
print(paste('F1 :', F1)) # nolint

                    MeanDecreaseGini
GenSpec                    29.703153
LF                          2.937468
GF                         16.170843
Fam                        47.254084
Biomes                     81.827397
Status                    400.625595
Range                     291.161674
Habitat_degradation        16.889857
Habitat_loss               40.730262
IAS                         8.648279
Other                       3.254064
Over_exploitation           4.621536
Pollution                   2.145126
Unknown                     8.869024
[1] "Accuracy of Voting method: 0.88339222614841"
[1] "Recall : 0.752380952380952"
[1] "Precision : 0.925706214689266"
[1] "F1 : 0.830092425590586"
