In [1]:
# Load up diamonds data set
library(ggplot2)

# Report the shape of the data set as (rows, columns)
c(nrow(diamonds), ncol(diamonds))

In [2]:
# encode the cut and color factors as 0/1 indicator (dummy) columns

library(dplyr, warn.conflicts = FALSE)

# One-hot encode the `cut` and `color` factors.
# Each level becomes its own numeric column holding 1 where the row has that
# level and 0 otherwise. No rows are dropped and every cut level is kept; the
# two source factor columns are removed once the indicators exist.
diamonds <- diamonds %>%
  mutate(
    # cut
    fair_cut = as.numeric(cut == "Fair"),
    good_cut = as.numeric(cut == "Good"),
    vgood_cut = as.numeric(cut == "Very Good"),
    prem_cut = as.numeric(cut == "Premium"),
    ideal_cut = as.numeric(cut == "Ideal"),

    # color
    D = as.numeric(color == "D"),
    E = as.numeric(color == "E"),
    F = as.numeric(color == "F"),
    G = as.numeric(color == "G"),
    H = as.numeric(color == "H"),
    I = as.numeric(color == "I"),
    J = as.numeric(color == "J")
  ) %>%
  # remove the factors now that they are encoded
  select(-color, -cut)

In [3]:
# Convert the clarity factor to a numeric class code in `clarity_num`.
# The codes are arbitrary class ids assigned in alphabetical label order
# (I1 = 0, IF = 1, SI1 = 2, SI2 = 3, VS1 = 4, VS2 = 5, VVS1 = 6, VVS2 = 7).
# match() returns the 1-based position of each label, so subtracting 1 gives
# the 0-based code; a label not in the list would yield NA.
diamonds$clarity_num <- match(
  as.character(diamonds$clarity),
  c("I1", "IF", "SI1", "SI2", "VS1", "VS2", "VVS1", "VVS2")
) - 1

In [4]:
# Shuffle the rows, then split them 80/20 into training and test sets.

# Fix the RNG seed so the shuffle -- and therefore the split and any metric
# computed from it -- is the same on every run.
set.seed(42)

# randomize the order
n <- nrow(diamonds)
diamonds <- diamonds[sample(n), ]

# use 80% of data to train, 20% to test
split <- floor(n * 0.80)

# Select by row position instead of 1:split / -(1:split): when split is 0,
# 1:split evaluates to c(1, 0) and the negative index would drop row 1 from the
# test set. The logical masks below give an empty train set and a full test set
# in that case.
row_id <- seq_len(n)
train <- diamonds[row_id <= split, ]
test <- diamonds[row_id > split, ]

dim(train)
dim(test)

In [5]:
# Build the numeric matrices used for training:
#   y - single-column matrix holding the clarity class code (the label)
#   X - every remaining column except the clarity factor and its code
y <- train %>%
  select(clarity_num) %>%
  data.matrix()
X <- train %>%
  select(-clarity, -clarity_num) %>%
  data.matrix()

# Show the shapes of both matrices
dim(y)
dim(X)

In [10]:
library(xgboost, warn.conflicts = FALSE)

# train
# Fit a multi-class boosted-tree model on the feature matrix `X` with the
# clarity codes in `y` as labels.
model <- xgboost(data = X,
                 label = y,
                 nthread = 4,
                 max.depth = 20,
                 alpha = 1.5,
                 lambda = 1.5,
                 nrounds = 100,
                 # softmax objective: predictions are class ids, not probabilities
                 objective = "multi:softmax",
                 # clarity_num takes the eight values 0..7
                 num_class = 8,
                 verbose = 0)

# predict
# Use the generic predict(): xgboost registers a predict method for its model
# class but does not export a function named `predict`, so `xgboost::predict`
# errors on current releases. Picking the columns of X by name keeps the test
# features identical (and identically ordered) to the training features, and
# lets this cell be re-run after `predicted` has been added to `test`.
test$predicted <- predict(model, data.matrix(test[, colnames(X)]))

# show a sample of actual vs. predicted class codes
select(test, clarity_num, predicted) %>%
  sample_n(10)

Unnamed: 0,clarity_num,predicted
1,5,5
2,3,3
3,2,2
4,3,3
5,5,2
6,5,5
7,3,3
8,4,7
9,6,4
10,2,3


In [11]:
# Accuracy: the share of test rows whose predicted class code equals the
# actual clarity code.
acc <- with(test, sum(predicted == clarity_num)) / nrow(test)

print(acc)

[1] 0.6959585
