In [None]:
library(ggplot2) # Data visualization
library(readr) # CSV file I/O, e.g. the read_csv function

# The input data files live in the "../input/" directory.
# Running this cell (click run or press Shift+Enter) lists the files found there.

system(command = "ls ../input")

library(xgboost)
library(Matrix)

library(stringr)
library(caret)
library(car)

# Print numbers in fixed rather than scientific notation, and fix the RNG seed
# so that random draws made later in the session are reproducible.
options(scipen = 999)
set.seed(seed = 1)

In [None]:
# ---------------------------------------------------
# Load
# Read the raw CSV files; character columns are kept as plain strings.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned.
orig.train <- read.csv("../input/train.csv", stringsAsFactors = FALSE)
orig.test <- read.csv("../input/test.csv", stringsAsFactors = FALSE)
sample.submission <- read.csv("../input/sample_submission.csv", stringsAsFactors = FALSE)

# ---------------------------------------------------
# Merge
# The test rows have no label: tag them with the sentinel -1 so that both sets
# can be stacked into one data frame and told apart again later.
orig.test$TARGET <- -1
merged <- rbind(orig.train, orig.test)

In [None]:
# Convert
# Real-valued predictors are rescaled by their maximum; integer predictors are
# dropped when constant and turned into factors when they have fewer than 200
# distinct values. The conversion runs on the merged data so that train and
# test share the same scaling and factor levels.
feature.train.names <- names(orig.train)[-1]
# TARGET carries the labels and the -1 test sentinel, so it must not be
# rescaled or converted along with the predictors.
for (f in setdiff(feature.train.names, "TARGET")) {
  column <- merged[[f]]
  if (is.double(column)) {
    column.max <- max(column, na.rm = TRUE)
    # A zero (or non-finite) maximum would turn the column into NaN/Inf;
    # such a column is left untouched instead.
    if (is.finite(column.max) && column.max != 0) {
      merged[[f]] <- column / column.max
    }
  } else if (is.integer(column)) {
    n.unique <- length(unique(column))
    if (n.unique == 1) {
      merged[[f]] <- NULL
    } else if (n.unique < 200) {
      merged[[f]] <- factor(column)
    }
  }
}

# Split
# Rows tagged with the -1 sentinel are the test rows; all others are train.
is.test.row <- merged$TARGET == -1
train <- merged[!is.test.row, ]
test <- merged[is.test.row, ]

# Features
# setdiff() is used instead of negative grep() indexing, which returns an
# empty vector when the pattern does not match.
feature.names <- setdiff(names(train), c("ID", "TARGET"))
feature.formula <- formula(paste0("TARGET ~ ", paste(feature.names, collapse = " + ")))

In [None]:
# Matrix
# Draw 85% of the labelled rows at random for fitting; the remaining rows are
# held out for validation.
indexes <- sample(x = seq_len(nrow(train)), size = floor(0.85 * nrow(train)))

# Encode the fitting rows, remember the encoded column names, and wrap the
# result for xgboost. The intermediate matrix is removed once it is wrapped.
fit.matrix <- sparse.model.matrix(feature.formula, data = train[indexes, ])
sparseMatrixColNamesTrain <- colnames(fit.matrix)
dtrain <- xgb.DMatrix(data = fit.matrix, label = train[indexes, "TARGET"])
rm(fit.matrix)

# The held-out rows become the validation DMatrix.
valid.matrix <- sparse.model.matrix(feature.formula, data = train[-indexes, ])
dvalid <- xgb.DMatrix(data = valid.matrix, label = train[-indexes, "TARGET"])
rm(valid.matrix)

# The test rows are only encoded; they stay a sparse matrix.
dtest <- sparse.model.matrix(feature.formula, data = test)

watchlist <- list(valid = dvalid, train = dtrain)

In [None]:
# XGBOOST
# Gradient-boosted trees on a binary target, evaluated with AUC.
params <- list(booster = "gbtree", objective = "binary:logistic",
               max_depth = 8, eta = 0.05,
               colsample_bytree = 0.65, subsample = 0.95)

# Notes on the call below:
# - `early_stopping_rounds` / `print_every_n` are the current spellings of the
#   deprecated `early.stop.round` / `print.every.n`.
# - Early stopping follows the LAST watchlist entry, so the validation set is
#   listed last here; otherwise stopping would track the training AUC.
# - `nfold` is dropped: it belongs to xgb.cv(), not xgb.train().
model <- xgb.train(params = params, data = dtrain,
                   nrounds = 500, early_stopping_rounds = 50,
                   eval_metric = "auc", maximize = TRUE,
                   watchlist = list(train = dtrain, valid = dvalid),
                   print_every_n = 50, verbose = 0)

pred <- predict(model, dtest)

## Cross Validation

In [None]:
# Predictors and labels for the grid search, both restricted to the same
# `indexes` rows so that their lengths agree. ID and TARGET are excluded from
# the predictors: keeping TARGET would leak the label into the features.
X_train <- train[indexes, setdiff(names(train), c("ID", "TARGET")), drop = FALSE]
y_train <- train[indexes, "TARGET"]

In [None]:
# Candidate hyper-parameters for caret's "xgbTree" method. The grid may only
# contain that method's tuning parameters, so `booster` and `objective` are not
# listed. NOTE(review): `min_child_weight` is required by current caret
# releases; confirm against the installed version.
xgb_grid_1 <- expand.grid(nrounds = 25,
                          eta = c(0.05, 0.01, 0.1),
                          max_depth = c(2, 4, 6),
                          gamma = 1,
                          colsample_bytree = 0.65,
                          min_child_weight = 1,
                          subsample = 0.95)


# 5-fold cross-validation that keeps class probabilities so that
# twoClassSummary can compute the ROC metric.
xgb_train_control <- trainControl(
    method = "cv",
    number = 5,
    verboseIter = TRUE,
    returnData = FALSE,
    returnResamp = "all",
    classProbs = TRUE,
    summaryFunction = twoClassSummary,
    allowParallel = TRUE
)


# Fail early with a clear message if predictors and labels are misaligned.
stopifnot(nrow(X_train) == length(y_train))

# classProbs = TRUE requires a factor outcome whose levels are valid R names.
xgb_tune_y <- factor(y_train, levels = c(0, 1), labels = c("no", "yes"))
# Never let the identifier or the label itself act as a predictor.
xgb_tune_x <- X_train[, setdiff(names(X_train), c("ID", "TARGET")), drop = FALSE]

# `method` must be given explicitly (train() defaults to a random forest) and
# the grid argument is spelled `tuneGrid`. `metric = "ROC"` selects the model
# by the AUC that twoClassSummary reports. `params`, `num_class` and `seed` are
# not forwarded: caret builds the booster parameters from the grid, and
# `num_class` does not apply to a binary logistic objective.
xgb_tune <- train(x = data.matrix(xgb_tune_x),
                  y = xgb_tune_y,
                  method = "xgbTree",
                  metric = "ROC",
                  trControl = xgb_train_control,
                  tuneGrid = xgb_grid_1,
                  verbose = TRUE,
                  eval_metric = "auc",
                  nthread = 4
)


In [None]:
# Pair each test ID with its prediction and save the table as an unquoted CSV
# without row names.
submission <- data.frame(ID = test[["ID"]], TARGET = pred)
write.csv(x = submission, file = '../output/xgboost_Mar03.csv',
          quote = FALSE, row.names = FALSE)


# Xgboost Mar06

In [1]:
library(xgboost)
library(Matrix)

set.seed(seed = 1234)

train <- read.csv(file = "../input/train.csv")
test  <- read.csv(file = "../input/test.csv")

##### Removing IDs
# The test IDs are saved before the column is dropped from both sets.
test.id <- test[["ID"]]
train[["ID"]] <- NULL
test[["ID"]] <- NULL

##### Extracting TARGET
# The labels are kept aside so that `train` holds predictors only.
train.y <- train[["TARGET"]]
train[["TARGET"]] <- NULL

##### 0 count per line
# Count how many entries of `x` are equal to zero.
#
# x: a vector (here, one row of the feature table).
# Returns a single integer. Missing values are ignored, so one NA in the row
# no longer turns the whole count into NA.
count0 <- function(x) {
    sum(x == 0, na.rm = TRUE)
}
# Add the per-row zero count as a new column `n0` on both data sets.
train[["n0"]] <- apply(X = train, MARGIN = 1, FUN = count0)
test[["n0"]] <- apply(X = test, MARGIN = 1, FUN = count0)

##### Removing constant features
cat("\n## Removing the constants features.\n")
# A column is constant when train holds a single distinct value in it; each
# such column is reported and then dropped from both train and test.
for (f in Filter(function(name) length(unique(train[[name]])) == 1, names(train))) {
    cat(f, "is constant in train. We delete it.\n")
    train[[f]] <- NULL
    test[[f]] <- NULL
}


## Removing the constants features.
ind_var2_0 is constant in train. We delete it.
ind_var2 is constant in train. We delete it.
ind_var27_0 is constant in train. We delete it.
ind_var28_0 is constant in train. We delete it.
ind_var28 is constant in train. We delete it.
ind_var27 is constant in train. We delete it.
ind_var41 is constant in train. We delete it.
ind_var46_0 is constant in train. We delete it.
ind_var46 is constant in train. We delete it.
num_var27_0 is constant in train. We delete it.
num_var28_0 is constant in train. We delete it.
num_var28 is constant in train. We delete it.
num_var27 is constant in train. We delete it.
num_var41 is constant in train. We delete it.
num_var46_0 is constant in train. We delete it.
num_var46 is constant in train. We delete it.
saldo_var28 is constant in train. We delete it.
saldo_var27 is constant in train. We delete it.
saldo_var41 is constant in train. We delete it.
saldo_var46 is constant in train. We delete it.
imp_amort_var18_hace3 i

In [2]:
##### Removing identical features
# Compare every pair of columns in train; when two columns hold the same
# values, the second one of the pair is scheduled for removal.
# combn() errors when asked for pairs from fewer than two names, hence the
# guard.
features_pair <- if (ncol(train) >= 2) {
    combn(names(train), 2, simplify = FALSE)
} else {
    list()
}
toRemove <- c()
for (pair in features_pair) {
    f1 <- pair[1]
    f2 <- pair[2]

    # Skip pairs that involve a column already scheduled for removal.
    # `&&` is the scalar, short-circuiting operator meant for `if` conditions.
    if (!(f1 %in% toRemove) && !(f2 %in% toRemove)) {
        # isTRUE() treats an NA comparison result (missing values in either
        # column) as "not identical" instead of making `if` fail.
        if (isTRUE(all(train[[f1]] == train[[f2]]))) {
            cat(f1, "and", f2, "are equals.\n")
            toRemove <- c(toRemove, f2)
        }
    }
}

feature.names <- setdiff(names(train), toRemove)

# drop = FALSE keeps a data frame even if only one feature is left.
train <- train[, feature.names, drop = FALSE]
test <- test[, feature.names, drop = FALSE]

# Re-attach the labels that were set aside earlier.
train$TARGET <- train.y

ind_var6_0 and ind_var29_0 are equals.
ind_var6 and ind_var29 are equals.
ind_var13_medio_0 and ind_var13_medio are equals.
ind_var18_0 and ind_var18 are equals.
ind_var26_0 and ind_var26 are equals.
ind_var25_0 and ind_var25 are equals.
ind_var32_0 and ind_var32 are equals.
ind_var34_0 and ind_var34 are equals.
ind_var37_0 and ind_var37 are equals.
ind_var40 and ind_var39 are equals.
num_var6_0 and num_var29_0 are equals.
num_var6 and num_var29 are equals.
num_var13_medio_0 and num_var13_medio are equals.
num_var18_0 and num_var18 are equals.
num_var26_0 and num_var26 are equals.
num_var25_0 and num_var25 are equals.
num_var32_0 and num_var32 are equals.
num_var34_0 and num_var34 are equals.
num_var37_0 and num_var37 are equals.
num_var40 and num_var39 are equals.
saldo_var6 and saldo_var29 are equals.
saldo_var13_medio and saldo_medio_var13_medio_ult1 are equals.
delta_imp_reemb_var13_1y3 and delta_num_reemb_var13_1y3 are equals.
delta_imp_reemb_var17_1y3 and delta_num_reemb_var17_1y

In [3]:
# Encode all predictors (every column except TARGET) as a sparse design
# matrix; the result replaces the `train` data frame.
train <- sparse.model.matrix(object = TARGET ~ ., data = train)

# Wrap the matrix and its labels for xgboost. The only evaluation set that is
# watched is the training data itself.
dtrain <- xgb.DMatrix(data = train, label = train.y)
watchlist <- list(train = dtrain)

In [6]:
# Booster settings: binary logistic objective, tree booster, AUC as metric.
param <- list(  objective           = "binary:logistic",
                booster             = "gbtree",
                eval_metric         = "auc",
                eta                 = 0.02,
                max_depth           = 8,
                subsample           = 0.9,
                colsample_bytree    = 0.85
)

# `nrd` (the number of boosting rounds) is not assigned anywhere in the visible
# notebook. NOTE(review): the fallback of 350 mirrors the `nrounds` of the
# cross-validation call that follows -- confirm the intended value.
if (!exists("nrd")) {
    nrd <- 350
}

# xgb.train() is called directly because the xgboost() convenience wrapper
# builds its own watchlist, so passing one to it conflicts. AUC is a
# higher-is-better metric, hence maximize = TRUE.
# NOTE(review): the result is stored under the name `xgb.cv`, which is also the
# name of the package's cross-validation function; calls such as xgb.cv(...)
# still resolve to the function, but the name is easy to misread.
xgb.cv <- xgb.train(  params              = param,
                      data                = dtrain,
                      nrounds             = nrd,
                      verbose             = 0,
                      watchlist           = watchlist,
                      maximize            = TRUE)

In [None]:
# 4-fold stratified cross-validation with early stopping after 50 rounds
# without improvement. `early_stopping_rounds` and `print_every_n` are the
# current spellings of the deprecated `early.stop.round` / `print.every.n`.
# The metric is AUC, which is higher-is-better, so maximize is set explicitly.
res <- xgb.cv(params = param, data = dtrain,
              stratified = TRUE,
              nrounds = 350, early_stopping_rounds = 50,
              maximize = TRUE,
              nfold = 4,
              print_every_n = 50,
              prediction = FALSE)