In [1]:
library(xgboost)
library(readr)
library(stringr)
library(caret)
library(car)

set.seed(1)

Loading required package: lattice
Loading required package: ggplot2


In [2]:
ndcg5 <- function(preds, dtrain) {

  labels <- getinfo(dtrain,"label")
  num.class = 12
  pred <- matrix(preds, nrow = num.class)
  top <- t(apply(pred, 2, function(y) order(y)[num.class:(num.class-4)]-1))
  
  x <- ifelse(top==labels,1,0)
  dcg <- function(y) sum((2^y - 1)/log(2:(length(y)+1), base = 2))
  ndcg <- mean(apply(x,1,dcg))
  return(list(metric = "ndcg5", value = ndcg))
}

In [3]:
# load data
df_train = read_csv("../input/train_users_2.csv")
df_test = read_csv("../input/test_users.csv")
labels = df_train['country_destination']
df_train = df_train[-grep('country_destination', colnames(df_train))]

# combine train and test data
df_all = rbind(df_train,df_test)
# remove date_first_booking
df_all = df_all[-c(which(colnames(df_all) %in% c('date_first_booking')))]
# replace missing values
df_all[is.na(df_all)] <- -1

# split date_account_created in year, month and day
dac = as.data.frame(str_split_fixed(df_all$date_account_created, '-', 3))
df_all['dac_year'] = dac[,1]
df_all['dac_month'] = dac[,2]
df_all['dac_day'] = dac[,3]
df_all = df_all[,-c(which(colnames(df_all) %in% c('date_account_created')))]

# split timestamp_first_active in year, month and day
df_all[,'tfa_year'] = substring(as.character(df_all[,'timestamp_first_active']), 1, 4)
df_all['tfa_month'] = substring(as.character(df_all['timestamp_first_active']), 5, 6)
df_all['tfa_day'] = substring(as.character(df_all['timestamp_first_active']), 7, 8)
df_all = df_all[,-c(which(colnames(df_all) %in% c('timestamp_first_active')))]

# clean Age by removing values
df_all[df_all$age < 14 | df_all$age > 100,'age'] <- -1

In [4]:
# one-hot-encoding features
ohe_feats = c('gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser')
dummies <- dummyVars(~ gender + signup_method + signup_flow + language + affiliate_channel + affiliate_provider + first_affiliate_tracked + signup_app + first_device_type + first_browser, data = df_all)
df_all_ohe <- as.data.frame(predict(dummies, newdata = df_all))
df_all_combined <- cbind(df_all[,-c(which(colnames(df_all) %in% ohe_feats))],df_all_ohe)

In [5]:
# split train and test
X = df_all_combined[df_all_combined$id %in% df_train$id,]
y <- recode(labels$country_destination,"'NDF'=0; 'US'=1;
            'other'=2; 'FR'=3; 'CA'=4; 'GB'=5; 'ES'=6; 'IT'=7;
            'PT'=8; 'NL'=9; 'DE'=10; 'AU'=11")
X_test = df_all_combined[df_all_combined$id %in% df_test$id,]

In [None]:
# train xgboost
 
xgb <- xgboost(data = data.matrix(X[,-1]), 
               label = y, 
               eta = 0.001,
               max_depth = 10, 
               nround = 200, 
               subsample = 0.5,
               colsample_bytree = 0.5,
               seed = 1,
               eval_metric = ndcg5,
               objective = "multi:softprob",
               num_class = 12,
               nthread = 4
)

# predict values in test set
y_pred <- predict(xgb, data.matrix(X_test[,-1]))

# extract the 5 classes with highest probabilities
predictions <- as.data.frame(matrix(y_pred, nrow=12))
rownames(predictions) <- c('NDF','US','other','FR','CA','GB','ES','IT','PT','NL','DE','AU')
predictions_top5 <- as.vector(apply(predictions, 2, function(x) names(sort(x)[12:8])))


: parameter seed is ignored, please set random seed using set.seed

In [None]:
# predict values in test set
y_pred <- predict(xgb, data.matrix(X_test[,-1]))

# extract the 5 classes with highest probabilities
predictions <- as.data.frame(matrix(y_pred, nrow=12))
rownames(predictions) <- c('NDF','US','other','FR','CA','GB','ES','IT','PT','NL','DE','AU')
predictions_top5 <- as.vector(apply(predictions, 2, function(x) names(sort(x)[12:8])))

# create submission 
ids <- NULL
for (i in 1:NROW(X_test)) {
  idx <- X_test$id[i]
  ids <- append(ids, rep(idx,5))
}
submission <- NULL
submission$id <- ids
submission$country <- predictions_top5

# generate submission file
submission <- as.data.frame(submission)
write.csv(submission, "sub_feb10.csv", quote=FALSE, row.names = FALSE)

## Cross Validation

In [57]:
set.seed(123)

#dtrain <- xgb.DMatrix(data.matrix(X[,-1]), label = y)
params <- list(eta = 0.3,
               max_depth = 1, 
               subsample = 0.5,
               colsample_bytree = 0.5,
               eval_metric = ndcg5,
               objective = "multi:softprob",
               num_class = 12)

xgb.cv <- xgboost(params,
                  data = data.matrix(X[,-1]),
                  label = y,
                  nround = 8,
                  nfold = 4,
                  nthread = 4,
                  verbose = 0
)



In [56]:
num_rounds = 100
maximum_depth = 10
cat("max_depth =", maximum_depth, "| nround =", num_rounds)

max_depth = 10 | nround = 100

In [58]:
res <- xgb.cv(params = params, data = dtrain, nrounds = 8, nfold = 4, prediction = FALSE)

[0]	train-ndcg5:0.806484+0.000520	test-ndcg5:0.806500+0.000489
[1]	train-ndcg5:0.806753+0.000234	test-ndcg5:0.806684+0.000680
[2]	train-ndcg5:0.806755+0.000241	test-ndcg5:0.806726+0.000650
[3]	train-ndcg5:0.806769+0.000226	test-ndcg5:0.806721+0.000642
[4]	train-ndcg5:0.807006+0.000308	test-ndcg5:0.806916+0.000991
[5]	train-ndcg5:0.807418+0.000636	test-ndcg5:0.807302+0.001225
[6]	train-ndcg5:0.807906+0.000846	test-ndcg5:0.807873+0.001273
[7]	train-ndcg5:0.808631+0.000835	test-ndcg5:0.808643+0.000554


## Grid Search

In [None]:
xgb_grid_1 = expand.grid(nrounds = 25,
                        eta = c(0.01, 0.001, 0.0001),
                        max_depth = c(2,4, 6,8, 10, 12, 14),
                        gamma = 1)

In [None]:
xgb_train_control = trainControl(
    method = "cv",
    number = 5,
    verboseIter = TRUE,
    returnData = FALSE,
    returnResamp = "all",
    classProbs = TRUE,
    summaryFunction = twoClassSummary,
    allowParallel = TRUE
)

In [None]:
xgb_train_1 = train(
  x = data.matrix(X[,-1]), 
  y = y, 
  trControl = xgb_train_control,
  tuneGrid = xgb_grid_1,
  method = "xgbTree"
)

In [None]:
xgb_tune <- train(x = data.matrix(X[,-1]), 
                  y = y, 
                  method = "xgbTree",
                  trControl = xgb_train_control,
                  tunGrid = xgb_grid_1,
                  verbose = T,
                  subsample = 0.5,
                  colsample_bytree = 0.5,
                  seed = 1,
                  eval_metric = ndcg5,
                  objective = "multi:softprob",
                  num_class = 12,
                  nthread = 4,
)