## TODO

- Select appropriate features; at the moment every column is used as a feature.
- Try a nested mixed logit model.
- Use stratified sampling for the caret cross-validation folds.

### packages

In [2]:
library(mlogit)
library(nnet)
library(glmnet)
library(randomForest)
library(rpart)
library(rpart.plot)
library(xgboost)
library(plyr)
library(dplyr)
library(methods)
library(data.table)
library(magrittr)
library(LiblineaR)
library(caret)
library(parallel)
library(doParallel)


### run all model fitting and prediction here and output results to file (long computation time)

In [None]:
## Registry of model wrapper functions. Each wrapper is called with a single
## `prob` flag (see the loop below) and returns that model's predictions.
models <- c(mnlogit, cart, rf)

# When TRUE, each model is asked for predicted classes and the results are
# collected for metamodel stacking; when FALSE, predicted probabilities are
# collected instead.
isStacked <- TRUE
stacked <- data.frame()
probs <- data.frame()


# Run every registered model.
# NOTE: this loop takes a very long time, even with 36 cores.
# seq_along() is used so that an empty `models` list runs zero iterations
# (1:length(models) would iterate over c(1, 0)).
for (m in seq_along(models)) {

    # Fit the model and predict: classes when stacking, probabilities otherwise.
    pred <- models[[m]](prob = !isStacked)

    if (isStacked) {
        # Append this model's predictions as column M<m> of the stacking table.
        stacked <- cbind.all(stacked, pred)
        colnames(stacked)[m] <- paste0("M", m)
    } else {
        probs <- cbind.all(probs, pred)
    }
}


# When stacking, attach the observed training choices as the last column so
# the metamodel has a response to fit against.
if (isStacked) {
    stacked <- cbind.all(stacked, actual.train)
    colnames(stacked)[NCOL(stacked)] <- "actualChoice"
}


In [None]:
# if (isStacked) {
#     stacked
# } else {
#     probs
# }

# Convert the collected predictions (built with cbind.all) to a data frame.
stackedDF <- as.data.frame(stacked)
str(stackedDF)

## Metamodel: multinomial regression of the observed choice on the three
## base-model predictions (M1 = mlogit, M2 = cart, M3 = rf).
MM <- multinom(actualChoice ~ M1 + M2 + M3, data = stackedDF)
p <- predict(MM, stackedDF)
stackedDF$pred <- p

# Number of rows each model classifies correctly (these are counts, not
# percentages). Labels are compared as character so that factor columns with
# different level sets do not raise an error, and na.rm = TRUE keeps NA
# comparisons from being counted as matches (indexing a data frame with an NA
# logical returns a row, which NROW() would count).
sum(as.character(stackedDF$actualChoice) == as.character(stackedDF$M1), na.rm = TRUE) # mlogit
sum(as.character(stackedDF$actualChoice) == as.character(stackedDF$M2), na.rm = TRUE) # cart
sum(as.character(stackedDF$actualChoice) == as.character(stackedDF$M3), na.rm = TRUE) # rf
sum(as.character(stackedDF$actualChoice) == as.character(stackedDF$pred), na.rm = TRUE) # stacked model

stackedDF

____________

## place all helper functions below

### fn to append to empty dataframe

In [110]:
# Multinomial log loss: the negative mean log-probability assigned to the
# observed class of each observation.
#
# pred:   matrix or data frame of class probabilities, one row per observation
#         and one column per class; columns must be in the same order as the
#         indicator columns model.matrix() builds from `actual` (factor level
#         order)
# actual: observed classes (factor or character vector)
# eps:    probabilities are clipped to [eps, 1 - eps] so log() stays finite
#
# Returns a single numeric value.
logLoss <- function(pred, actual, eps = 1e-15) {
  probs <- as.matrix(pred)
  onehot <- model.matrix(~ actual + 0)
  if (!all(dim(probs) == dim(onehot))) {
    stop("`pred` must have one row per observation and one column per class of `actual`",
         call. = FALSE)
  }

  # Pick the true-class probability directly from the indicator matrix. The
  # earlier test `onehot - pred > 0` silently dropped every observation whose
  # true class was predicted with probability exactly 1, which shrank the
  # denominator of the mean and overstated the loss.
  true.prob <- probs[onehot == 1]

  # Clip so that a true-class probability of 0 gives a large finite penalty
  # rather than Inf.
  true.prob <- pmin(pmax(true.prob, eps), 1 - eps)
  -mean(log(true.prob))
}

# Column-bind any number of objects with differing row counts.
# Every argument is coerced with as.matrix(); shorter ones are padded at the
# bottom with NA rows up to the tallest argument, then all are cbind()-ed.
# Returns a matrix.
cbind.all <- function (...) {
    mats <- lapply(list(...), as.matrix)
    target.rows <- max(sapply(mats, nrow))

    # Append as many all-NA rows as this matrix is short of the tallest one.
    pad.rows <- function(m) {
        filler <- matrix(NA, nrow = target.rows - nrow(m), ncol = ncol(m))
        rbind(m, filler)
    }

    do.call(cbind, lapply(mats, pad.rows))
}

# Enumerate every ordering of the integers 1..n.
# Returns a matrix with one permutation per row (n! rows, n columns), built
# recursively: each possible leading value is combined with every permutation
# of the remaining n - 1 values.
perm <- function(n) {
    # Base case: a single element has exactly one ordering.
    if (n == 1) {
        return(matrix(1))
    }

    smaller <- perm(n - 1)
    block <- nrow(smaller)
    out <- matrix(nrow = n * block, ncol = n)
    for (lead in 1:n) {
        target.rows <- (lead - 1) * block + 1:block
        # Bump values >= lead by one so the tail uses 1..n with `lead` left out.
        tail.vals <- smaller + (smaller >= lead)
        out[target.rows, ] <- cbind(lead, tail.vals)
    }
    return(out)
}


# Build extra copies of a dataset with alternatives 1-3 re-ordered.
# For each of the five non-identity orderings of (1, 2, 3), the attribute
# blocks (columns 4:23, 24:43, 44:63) and the indicator columns (95:97) are
# shuffled into that order and the Choice label (column 99) is rewritten to
# the chosen alternative's new position. Columns 1:3, 64:94 and 98 are copied
# unchanged. Returns the five copies stacked row-wise with the input's column
# names; the original ordering itself is not included.
gen2 <- function(data) {

    col.names <- colnames(data)

    # Every ordering of (1, 2, 3) except the first row of perm(3), the identity.
    orderings <- perm(3)
    orderings <- orderings[2:NROW(orderings), ]

    # Attribute blocks and indicator columns of alternatives 1-3.
    alt.blocks <- list(a = data[, 4:23], b = data[, 24:43], c = data[, 44:63])
    chosen.flags <- list(a = data[, 95], b = data[, 96], c = data[, 97])

    # Recorded choice as an integer: 1-3 for Ch1-Ch3, 4 for anything else.
    choice.label <- data[, 99]
    choice.idx <- ifelse(choice.label == "Ch1", 1,
                         ifelse(choice.label == "Ch2", 2,
                                ifelse(choice.label == "Ch3", 3, 4)))

    results <- data.frame()
    for (i in 1:NROW(orderings)) {
        ord <- orderings[i, ]

        # New label = position of the originally chosen alternative in `ord`;
        # a choice coded 4 always stays "Ch4".
        relabelled <- ifelse(choice.idx != 4,
                             paste("Ch", match(choice.idx, ord), sep = ""),
                             "Ch4")

        df <- data.frame(data[, 1:3],
                         alt.blocks[[ord[1]]], alt.blocks[[ord[2]]], alt.blocks[[ord[3]]],
                         data[, 64:83],
                         data[, 84:94],
                         chosen.flags[[ord[1]]], chosen.flags[[ord[2]]], chosen.flags[[ord[3]]],
                         data[, 98],
                         relabelled)
        colnames(df) <- col.names
        results <- rbind(results, df)
    }

    return(results)
}

# # NROW(data.train.small)
# haha2 <- gen(data.train.small)

### Import train/test sets

In [111]:
# Read the raw training file once and derive every split from it.
# stringsAsFactors = TRUE is set explicitly: the levels() calls below need
# factor columns, and since R 4.0.0 read.csv() no longer creates them by
# default. Without it levels() returns NULL and the re-levelling step would
# turn every `income` value into NA.
data.raw <- read.csv("csv/train.csv", stringsAsFactors = TRUE)

# training set: rows with Task <= 19
data.train <- subset(data.raw, Task <= 19)
# data.train.small <- subset(data.raw, Task <= 12)
# small training set: rows with Case <= 600
data.train.small <- subset(data.raw, Case <= 600)
# validation set: rows with Case > 600
# data.valid <- subset(data.raw, Task >= 13)
# NOTE(review): data.train is filtered on Task and data.valid on Case, so the
# two can share rows - confirm this overlap is intended.
data.valid <- subset(data.raw, Case > 600)
# test set
data.test <- read.csv("csv/test.csv", stringsAsFactors = TRUE)

# give `income` the same set of factor levels in every split
fac.income <- union(levels(data.test$income), levels(data.train$income))
data.train$income <- factor(data.train$income, levels = fac.income)
data.train.small$income <- factor(data.train.small$income, levels = fac.income)
data.valid$income <- factor(data.valid$income, levels = fac.income)
data.test$income <- factor(data.test$income, levels = fac.income)

# fill the `Ch1`-`Ch4` and `Choice` columns of the test set with placeholder
# values to prevent errors downstream
data.test[, c("Ch1", "Ch2", "Ch3", "Ch4")] <- 1
data.test[, c("Choice")] <- "Ch1"
# data.train[,c(4:83)] <- data.train[,c(4:83)] + 1
# data.valid[,c(4:83)] <- data.valid[,c(4:83)] + 1
# data.test[,c(4:83)] <- data.test[,c(4:83)] + 1

# augmentation: append the copies produced by gen2(), which re-order
# alternatives 1-3 (this is not resampling with replacement)
data.train.mod <- rbind(data.train, gen2(data.train))
data.train.small.mod <- rbind(data.train.small, gen2(data.train.small))
# data.valid <- rbind(data.valid, gen(data.valid))



# Re-order alternatives 1-3 within every row by price.
#
# rank() is ascending, so the cheapest alternative ends up in slot 1 and the
# most expensive in slot 3; equal prices keep their original relative order
# (ties.method = "first").
# NOTE(review): the previous header said "most expensive to least expensive"
# with ties broken by fewest features; the code has never done either -
# confirm which ordering is actually wanted.
#
# Column layout used (by position): 4:22 / 24:42 / 44:62 are the attribute
# columns of alternatives 1-3, 23 / 43 / 63 their prices, 95:97 their chosen
# indicators and 99 the Choice label ("Ch1".."Ch4").
#
# df: data frame with that layout
# Returns df with the three alternatives, their indicators and the Choice
# label rewritten to the new order. A Choice that is NA or not Ch1-Ch3 is
# left untouched.
mapping <- function(df) {
    for (r in seq_len(NROW(df))) {
        # Snapshot the row's alternatives before any slot is overwritten.
        alts <- list(df[r, 4:22], df[r, 24:42], df[r, 44:62])
        costs <- c(df[r, 23], df[r, 43], df[r, 63])
        ch <- c(df[r, 95], df[r, 96], df[r, 97])

        # map[k] is the slot that original alternative k moves to.
        map <- rank(costs, ties.method = "first")

        for (m in 1:3) {
            src <- match(m, map)                  # original alternative landing in slot m
            first.col <- (m - 1) * 20 + 4
            df[r, first.col:(first.col + 18)] <- alts[[src]] # attribute columns
            df[r, first.col + 19] <- costs[src]              # price
            df[r, 94 + m] <- ch[src]                         # chosen indicator
        }

        # Relabel the recorded choice to the chosen alternative's new slot.
        if (!is.na(df[r, 99])) {
            chosen <- as.integer(gsub("Ch", "", df[r, 99]))
            # %in% is FALSE for NA, so an unparsable label is skipped instead
            # of crashing the if(); 4 (and anything else) is left as is.
            if (chosen %in% 1:3) {
                df[r, 99] <- paste0("Ch", map[chosen])
            }
        }
    }
    return(df)
}
# data.train.order <- mapping(data.train)
# data.valid.order <- mapping(data.valid)
# data.test.order <- mapping(data.test)


# Observed choices for each split, kept as plain vectors for scoring.
actual.train <- data.train[, "Choice", drop = TRUE]
actual.train.small <- data.train.small[, "Choice", drop = TRUE]
actual.valid <- data.valid[, "Choice", drop = TRUE]

### sad attempt at mlogit

In [None]:
# method, tuneGrid, metric

## Convert the wide train / validation / test data frames into the format
## mlogit() expects.
##
## train, valid, test: wide data frames, one row per choice situation, with
##                     the alternative-specific variables in columns 4:83
## Returns a list with elements `train`, `valid` and `test`, each the result
## of mlogit.data() on the corresponding input.
mnlogit.preprocess <- function(train, valid, test) {

  # Identical conversion for all three sets: alternatives Ch1-Ch4, observations
  # grouped by respondent through the `Case` column.
  to.mlogit <- function(wide) {
    mlogit.data(wide,
                shape = "wide",
                choice = "Choice",
                sep = "",
                varying = c(4:83),
                alt.levels = c("Ch1", "Ch2", "Ch3", "Ch4"),
                id.var = "Case")
  }

  list(train = to.mlogit(train),
       valid = to.mlogit(valid),
       test = to.mlogit(test))
}

## Fit a random-parameter (mixed) logit model with mlogit().
## No cross-validation is performed here; the caret control object and tuning
## grid that used to be built in this function were never passed to anything
## and have been removed.
##
## data: choice data as produced by mlogit.data() (see mnlogit.preprocess)
## Returns the fitted mlogit model.
mnlogit.fit <- function(data) {

  # Earlier specification, kept for reference:
  #   M <- mlogit(Choice~NS+GN+FA+LD+BZ+FP+RP+PP+KA+SC+TS+MA+LB+HU+Price-1 |
  #                    segment+night+region+miles+age+income+ppark,
  #                    R=100,
  #                    panel=F,
  #                    data=data)
  # loss_train (mlogit):  1.164466
  # loss_valid (mlogit):  1.129114

  # Three-part formula: alternative-specific attributes without intercept |
  # individual-specific covariates | no alternative-specific-coefficient part.
  # `rpar` names the coefficients drawn from a distribution (see ?mlogit for
  # the meaning of the "n" and "t" codes); R = 100 simulation draws;
  # panel = TRUE uses the respondent id set up by mlogit.data().
  M <- mlogit(Choice~CC+GN+NS+BU+FA+LD+BZ+FP+RP+PP+KA+SC+TS+MA+LB+HU+Price-1 |
              segment+night+region+miles+age |
              0,
              rpar = c(CC = "n", GN = "n", FP = "n", RP = "n", PP = "n", LB = "n", Price = "t"),
              #reflevel = "Ch1",
              R = 100,
              #halton = NA,
              #correlation = T,
              data = data,
              panel = TRUE,
              print.level = 0)
  # loss_train (mlogit):  1.19749
  # loss_valid (mlogit):  1.153899

  # return fitted model
  return(M)
}

## Score a fitted mlogit model and return its predictions.
##
## M:    fitted model from mnlogit.fit()
## D:    list with elements `train`, `valid`, `test` from mnlogit.preprocess()
## prob: if TRUE return the matrix of predicted probabilities for the TEST
##       set; if FALSE return the predicted class labels ("Ch1".."Ch4") for
##       the TRAINING set (the stacking code pairs these with actual.train)
##
## Prints the log loss on the training and validation sets, using the global
## vectors actual.train and actual.valid as ground truth.
mnlogit.predict <- function(M, D, prob) {

  class.labels <- c("Ch1", "Ch2", "Ch3", "Ch4")

  # training set: probabilities, loss, and most likely class per row
  P.train <- predict(M, D$train)
  cat("loss_train (mlogit): ", logLoss(P.train, actual.train), "\n")
  best.col <- apply(P.train, 1, which.max)
  # translate column index to label, keeping any row names carried by best.col
  C.train <- setNames(class.labels[best.col], names(best.col))

  # validation set: loss only
  P.valid <- predict(M, D$valid)
  cat("loss_valid (mlogit): ", logLoss(P.valid, actual.valid), "\n")

  # test set: overwrite the choice-related columns with placeholders before
  # predicting
  D$test[, class.labels] <- 1
  D$test[, c("Choice")] <- TRUE
  P.test <- predict(M, D$test)

  # depending on `prob`, return either the probabilities or the predicted classes
  if (prob) {
    return(P.test)
  } else {
    return(C.train)
  }
}

## End-to-end mlogit pipeline on the global data.train / data.valid /
## data.test: preprocess, fit, predict.
##
## prob: forwarded to mnlogit.predict() (TRUE = test-set probabilities,
##       FALSE = training-set classes). The default is spelled TRUE because
##       `T` is an ordinary variable that can be reassigned.
mnlogit <- function(prob=TRUE) {
  D <- mnlogit.preprocess(data.train, data.valid, data.test)
  M <- mnlogit.fit(D$train)
  return(mnlogit.predict(M, D, prob))
}


## DEBUGGING: run the mlogit pipeline step by step so the intermediates
## (D, M, P) stay in the workspace for inspection.
D <- mnlogit.preprocess(data.train, data.valid, data.test)
M <- mnlogit.fit(D$train)
P <- mnlogit.predict(M, D, prob = TRUE)
# loss_train (mlogit):  1.181399 
# loss_valid (mlogit):  1.151575 


### sad attempt at using CART

In [None]:
# method, tuneGrid, metric

## Fit a CART model with caret, tuning the complexity parameter `cp` by
## 10-fold cross-validation on multinomial log loss.
##
## x: predictor columns
## y: outcome (caret needs a factor here because classProbs = TRUE)
## Returns the caret `train` object.
cart.fit <- function(x, y) {

    # Parallel backend: leave one core free, but never request fewer than one
    # worker (detectCores() can return 1 or NA).
    n.workers <- max(1L, detectCores() - 1L, na.rm = TRUE)
    cluster <- makeCluster(n.workers)
    # Release the workers and restore the sequential backend even when train()
    # fails; previously an error left the cluster running.
    on.exit({
        stopCluster(cluster)
        registerDoSEQ()
    }, add = TRUE)
    registerDoParallel(cluster)

    # 10-fold CV scored with mnLogLoss; keep the hold-out predictions of the
    # final model.
    control <- trainControl(method = "cv",
                            number = 10,
                            savePredictions = "final",
                            summaryFunction = mnLogLoss,
                            classProbs = TRUE,
                            allowParallel = TRUE)

    # candidate values of the rpart complexity parameter
    grid <- expand.grid(cp = seq(0, 0.001, 0.00005))

    # train the model (WARNING: LONG COMPUTATION TIME)
    #set.seed(1)
    M <- train(x = x, y = y,
               trControl = control,
               tuneGrid = grid,
               method = "rpart",
               metric = "logLoss",
               preProc = c("center", "scale"))

    # return fitted model
    return(M)
}

## Score a fitted CART model and return its predictions.
##
## M:    caret `train` object from cart.fit()
## prob: if TRUE return predicted probabilities for the global data.test;
##       if FALSE return predicted classes for the global data.train
##
## Prints the log loss on data.train and data.valid, using the global vectors
## actual.train and actual.valid as ground truth. Confusion tables and class
## predictions that were computed but never used have been dropped, which
## saves three predict() passes.
cart.predict <- function(M, prob=TRUE) {

    # training set
    P.train <- predict(M, data.train, type = "prob")
    C.train <- predict(M, data.train, type = "raw")
    cat("loss_train (CART): ", logLoss(P.train, actual.train), "\n")

    # validation set
    P.valid <- predict(M, data.valid, type = "prob")
    cat("loss_valid (CART): ", logLoss(P.valid, actual.valid), "\n")

    # test set
    P.test <- predict(M, data.test, type = "prob")

    # depending on `prob`, return either the probabilities or the predicted classes
    if (prob) {
        return(P.test)
    } else {
        return(C.train)
    }
}

## End-to-end CART pipeline: fit with cart.fit(), then score and predict with
## cart.predict().
##
## x:    predictors; defaults to columns 4:81 and 83:94 of data.train
## y:    outcome; defaults to column 99 of data.train
## prob: forwarded to cart.predict(). The default is spelled TRUE because `T`
##       is an ordinary variable that can be reassigned.
cart <- function(x=data.train[,c(4:81, 83:94)], y=data.train[,c(99)], prob=TRUE) {
    M <- cart.fit(x, y)
    return(cart.predict(M, prob))
}

## DEBUGGING
# M2 <- cart.fit(x=data.train[,c(4:81, 83:94)], y=data.train[,c(99)])
# P2 <- cart.predict(M)

### visualising cp vs logloss

In [None]:
# Plot the cross-validated `mnLogLoss` against each candidate `cp` value.
# M2 is the caret fit created by the (commented-out) CART debugging code.
options(repr.plot.width = 9, repr.plot.height = 4) # resize IRkernel plot size
with(M2$results,
     plot(cp, logLoss,
          xlab = "cp",
          ylab = "mnLogLoss"))

# tuning value(s) that gave the lowest `mnLogLoss`
M2[["bestTune"]]



# options(repr.plot.width=9, repr.plot.height=7) # resize IRkernel plot size
# prp(M2$finalModel, extra=4, type=4, branch=0)

### sad attempt at using random forests

In [112]:
## Fit a random forest with caret, tuning `mtry` over 10..20 by 10-fold
## cross-validation on multinomial log loss.
## (Open question from the authors: is CV needed at all, given that a random
## forest already provides out-of-bag error estimates?)
##
## x:         predictor columns
## y:         outcome (caret needs a factor here because classProbs = TRUE)
## n.workers: number of parallel workers; defaults to the 35 that were
##            previously hard-coded
## Returns the caret `train` object.
rf.fit <- function(x, y, n.workers = 35) {

    # Parallel backend. on.exit() guarantees the workers are released and the
    # sequential backend restored even when train() fails; previously an
    # error left the cluster running.
    cluster <- makeCluster(n.workers)
    on.exit({
        stopCluster(cluster)
        registerDoSEQ()
    }, add = TRUE)
    registerDoParallel(cluster)

    # 10-fold CV scored with mnLogLoss
    control <- trainControl(method = "cv",
                            number = 10,
                            savePredictions = "final",
                            summaryFunction = mnLogLoss,
                            classProbs = TRUE,
                            allowParallel = TRUE)

    # candidate values of `mtry`
    grid <- expand.grid(mtry = seq(10, 20, 1))

    # tune `mtry`; nodesize and ntree are passed through to randomForest
    #M <- tuneRF(x, y, stepFactor=1.2, improve=1e-5, ntreeTry=2000, doBest=T, mtryStart=10)
    M <- train(x = x, y = y,
               trControl = control,
               tuneGrid = grid,
               method = "rf",
               nodesize = 25,
               metric = "logLoss",
               ntree = 2000,
               preProc = c("center", "scale"))
#     M <- randomForest(x, y, ntree=1000)

    # return model with best-fitted `mtry` variable
    return(M)
}

## Score a fitted random forest and return its predictions.
##
## M:    fitted model whose predict() method accepts type = "prob"; the
##       notebook passes both caret `train` objects from rf.fit() and plain
##       randomForest objects
## prob: if TRUE return predicted probabilities for the global data.test;
##       if FALSE return predicted classes for the global data.train
##
## Prints accuracy and log loss on data.train and data.valid, using the
## global vectors actual.train and actual.valid as ground truth. The unused
## class prediction on the test set has been dropped (one predict() pass less).
rf.predict <- function(M, prob) {

    # training set: accuracy is the diagonal share of the confusion table
    P.train <- predict(M, data.train, type = "prob")
    C.train <- predict(M, data.train)
    T.train <- table(C.train, actual.train)
    cat("acc_train (randomforest): ", sum(diag(T.train)) / NROW(data.train), "\n")
    cat("loss_train (randomForest): ", logLoss(P.train, actual.train), "\n")

    # validation set
    P.valid <- predict(M, data.valid, type = "prob")
    C.valid <- predict(M, data.valid)
    T.valid <- table(C.valid, actual.valid)
    cat("acc_valid (randomforest): ", sum(diag(T.valid)) / NROW(data.valid), "\n")
    cat("loss_valid (randomForest): ", logLoss(P.valid, actual.valid), "\n\n")

    # test set
    P.test <- predict(M, data.test, type = "prob")

    # depending on `prob`, return either the probabilities or the predicted classes
    if (prob) {
        return(P.test)
    } else {
        return(C.train)
    }

}

## End-to-end random forest pipeline: fit with rf.fit(), then score and
## predict with rf.predict().
##
## x:    predictors; defaults to columns 4:63 and 84:94 of data.train
## y:    outcome; defaults to column 99 of data.train
## prob: forwarded to rf.predict(). The default is spelled TRUE because `T`
##       is an ordinary variable that can be reassigned.
rf <- function(x=data.train[,c(4:63, 84:94)], y=data.train[,c(99)], prob=TRUE) {
    M <- rf.fit(x, y)
    return(rf.predict(M, prob))
}


## DEBUGGING
## [MODEL_OR_PREDICT].[DATASET_SIZE].[SPLIT_FACTOR].[#_OF_FACTORS]
# M.final <- rf.fit(x=data.train.small[,c(4:63, 84:94)], y=data.train.small[,c(99)])
# M.final$bestTune
# P.final <- rf.predict(M.final, T)



# write.csv(P.large.task.allBut4, paste("RF", date()), row.names = F, col.names=F)

In [None]:
## SPLITTING BY TASK, EXCLUDE ALTERNATIVE 4 VARS
## Tune and fit on the full data.train using columns 4:63 and 84:94, then
## print train/validation scores and keep the test-set probabilities.
M.large.task.allBut4 <- rf.fit(x = data.train[, c(4:63, 84:94)], y = data.train[, c("Choice")])
P.large.task.allBut4 <- rf.predict(M.large.task.allBut4, prob = TRUE)
# str(M.small.task.allBut4$finalModel)
# options(repr.plot.width=8, repr.plot.height=15) # resize IRkernel plot size
# vu <- varUsed(M.small.task.allBut4$finalModel, count=T)
# vuSorted <- sort(vu, decreasing=F, index.return=T)
# get number of times variable was used
# dotchart(vuSorted$x, names(M.small.task.allBut4$finalModel$forest$xlevels[vuSorted$ix]))
# get importance of variable in reducing impurity
# varImpPlot(M.small.task.allBut4$finalModel, n.var=nrow(M.small.task.allBut4$finalModel$importance))
# acc_train (randomforest):  1 
# loss_train (randomForest):  0.2860298 
# acc_valid (randomforest):  0.548381 
# loss_valid (randomForest):  1.144018

## SPLITTING BY TASK, INCLUDING ALL FACTORS
# M.small.task.all <- rf.fit(x=data.train.small[,c(4:94)], y=data.train.small[,c("Choice")])
# P.small.task.all <- rf.predict(M.small.task.all, prob=T)
# str(M.small.task.all$finalModel)
# options(repr.plot.width=8, repr.plot.height=15) # resize IRkernel plot size
# vu <- varUsed(M.small.task.all$finalModel, count=T)
# vuSorted <- sort(vu, decreasing=F, index.return=T)
# get number of times variable was used
# dotchart(vuSorted$x, names(M.small.task.all$finalModel$forest$xlevels[vuSorted$ix]))
# get importance of variable in reducing impurity
# varImpPlot(M.small.task.all$finalModel, n.var=nrow(M.small.task.all$finalModel$importance))
# acc_train (randomforest):  1 
# loss_train (randomForest):  0.2860298 
# acc_valid (randomforest):  0.548381 
# loss_valid (randomForest):  1.144018

# M.large.task.all <- rf.fit(x=data.train[,c(4:94)], y=data.train[,c("Choice")])
# P.large.task.all <- rf.predict(M.large.task.all, prob=T)
# acc_train (randomforest):  1 
# loss_train (randomForest):  0.2769319 
# acc_valid (randomforest):  1 
# loss_valid (randomForest):  0.2720686 

## SPLITTING BY CASE
# M.small.case.all <- rf.fit(x=data.train.small[,c(4:94)], y=data.train.small[,c("Choice")])
# P.small.case.all <- rf.predict.small(M.small.case.all, prob=T)
# acc_train (randomforest):  1 
# loss_train (randomForest):  0.2794337 
# acc_valid (randomforest):  0.5024561 
# loss_valid (randomForest):  1.197454 

In [114]:
# M.rf <- randomForest(x=data.train[,c("income", "miles", "year", "night",
#                                      "Price1", "Price2", "Price3", "segment",
#                                      "region", "age", "ppark", "educ",
#                                      "BU1", "BU2", "BU3", "NS1",
#                                      "NS2", "NS3")],
#                      y=data.train[,c("Choice")],
#                      ntree=1500,
#                      mtry=)

# cluster <- makeCluster(35)
# registerDoParallel(cluster)

# M.10.25 <- randomForest(as.factor(Choice) ~ segment + year + miles + night + gender + age + educ + region + Urb + ppark
#                        +CC1+GN1+NS1+BU1+FA1+LD1+BZ1+FC1+FP1+RP1+PP1+KA1+SC1+TS1+NV1+MA1+LB1+AF1+HU1+Price1
#                        +CC2+GN2+NS2+BU2+FA2+LD2+BZ2+FC2+FP2+RP2+PP2+KA2+SC2+TS2+NV2+MA2+LB2+AF2+HU2+Price2
#                        +CC3+GN3+NS3+BU3+FA3+LD3+BZ3+FC3+FP3+RP3+PP3+KA3+SC3+TS3+NV3+MA3+LB3+AF3+HU3+Price3
#                        , data=data.train.small, nodesize=25, ntree=2000, mtry = 10)

# M.10.45 <- randomForest(as.factor(Choice) ~ segment + year + miles + night + gender + age + educ + region + Urb + ppark
#                        +CC1+GN1+NS1+BU1+FA1+LD1+BZ1+FC1+FP1+RP1+PP1+KA1+SC1+TS1+NV1+MA1+LB1+AF1+HU1+Price1
#                        +CC2+GN2+NS2+BU2+FA2+LD2+BZ2+FC2+FP2+RP2+PP2+KA2+SC2+TS2+NV2+MA2+LB2+AF2+HU2+Price2
#                        +CC3+GN3+NS3+BU3+FA3+LD3+BZ3+FC3+FP3+RP3+PP3+KA3+SC3+TS3+NV3+MA3+LB3+AF3+HU3+Price3
#                        , data=data.train.small, nodesize=45, ntree=2000, mtry = 10)

# M.15.25 <- randomForest(as.factor(Choice) ~ segment + year + miles + night + gender + age + educ + region + Urb + ppark
#                        +CC1+GN1+NS1+BU1+FA1+LD1+BZ1+FC1+FP1+RP1+PP1+KA1+SC1+TS1+NV1+MA1+LB1+AF1+HU1+Price1
#                        +CC2+GN2+NS2+BU2+FA2+LD2+BZ2+FC2+FP2+RP2+PP2+KA2+SC2+TS2+NV2+MA2+LB2+AF2+HU2+Price2
#                        +CC3+GN3+NS3+BU3+FA3+LD3+BZ3+FC3+FP3+RP3+PP3+KA3+SC3+TS3+NV3+MA3+LB3+AF3+HU3+Price3
#                        , data=data.train.small, nodesize=25, ntree=2000, mtry = 15)


# M.15.25.mod <- randomForest(as.factor(Choice) ~ segment + year + miles + night + gender + age + educ + region + Urb + ppark
#                        +CC1+GN1+NS1+BU1+FA1+LD1+BZ1+FC1+FP1+RP1+PP1+KA1+SC1+TS1+NV1+MA1+LB1+AF1+HU1+Price1
#                        +CC2+GN2+NS2+BU2+FA2+LD2+BZ2+FC2+FP2+RP2+PP2+KA2+SC2+TS2+NV2+MA2+LB2+AF2+HU2+Price2
#                        +CC3+GN3+NS3+BU3+FA3+LD3+BZ3+FC3+FP3+RP3+PP3+KA3+SC3+TS3+NV3+MA3+LB3+AF3+HU3+Price3
#                        , data=data.train.small.mod, nodesize=25, ntree=2000, mtry = 15)


# Random forest fitted directly with randomForest() (no caret tuning) on
# data.train.mod, i.e. data.train plus the re-ordered copies from gen2().
# Predictors: the eleven non-alternative columns listed first, followed by the
# twenty attribute columns of each of alternatives 1-3.
# Hyper-parameters are fixed: nodesize = 25, ntree = 2000, mtry = 15.
# No seed is set, so repeated runs give different forests.
M.mod <- randomForest(as.factor(Choice) ~ income + segment + year + miles + night + gender + age + educ + region + Urb + ppark
                       +CC1+GN1+NS1+BU1+FA1+LD1+BZ1+FC1+FP1+RP1+PP1+KA1+SC1+TS1+NV1+MA1+LB1+AF1+HU1+Price1
                       +CC2+GN2+NS2+BU2+FA2+LD2+BZ2+FC2+FP2+RP2+PP2+KA2+SC2+TS2+NV2+MA2+LB2+AF2+HU2+Price2
                       +CC3+GN3+NS3+BU3+FA3+LD3+BZ3+FC3+FP3+RP3+PP3+KA3+SC3+TS3+NV3+MA3+LB3+AF3+HU3+Price3
                       , data=data.train.mod, nodesize=25, ntree=2000, mtry = 15)

# M.15.45 <- randomForest(as.factor(Choice) ~ segment + year + miles + night + gender + age + educ + region + Urb + ppark
#                        +CC1+GN1+NS1+BU1+FA1+LD1+BZ1+FC1+FP1+RP1+PP1+KA1+SC1+TS1+NV1+MA1+LB1+AF1+HU1+Price1
#                        +CC2+GN2+NS2+BU2+FA2+LD2+BZ2+FC2+FP2+RP2+PP2+KA2+SC2+TS2+NV2+MA2+LB2+AF2+HU2+Price2
#                        +CC3+GN3+NS3+BU3+FA3+LD3+BZ3+FC3+FP3+RP3+PP3+KA3+SC3+TS3+NV3+MA3+LB3+AF3+HU3+Price3
#                        , data=data.train.small, nodesize=45, ntree=2000, mtry = 15)

# M.15.28 <- randomForest(as.factor(Choice) ~ segment + year + miles + night + gender + age + educ + region + Urb + ppark
#                        +CC1+GN1+NS1+BU1+FA1+LD1+BZ1+FC1+FP1+RP1+PP1+KA1+SC1+TS1+NV1+MA1+LB1+AF1+HU1+Price1
#                        +CC2+GN2+NS2+BU2+FA2+LD2+BZ2+FC2+FP2+RP2+PP2+KA2+SC2+TS2+NV2+MA2+LB2+AF2+HU2+Price2
#                        +CC3+GN3+NS3+BU3+FA3+LD3+BZ3+FC3+FP3+RP3+PP3+KA3+SC3+TS3+NV3+MA3+LB3+AF3+HU3+Price3
#                        , data=data.train.small, nodesize=28, ntree=2000, mtry = 15)

# M.15.25.full <- randomForest(as.factor(Choice) ~ segment + year + miles + night + gender + age + educ + region + Urb + ppark
#                        +CC1+GN1+NS1+BU1+FA1+LD1+BZ1+FC1+FP1+RP1+PP1+KA1+SC1+TS1+NV1+MA1+LB1+AF1+HU1+Price1
#                        +CC2+GN2+NS2+BU2+FA2+LD2+BZ2+FC2+FP2+RP2+PP2+KA2+SC2+TS2+NV2+MA2+LB2+AF2+HU2+Price2
#                        +CC3+GN3+NS3+BU3+FA3+LD3+BZ3+FC3+FP3+RP3+PP3+KA3+SC3+TS3+NV3+MA3+LB3+AF3+HU3+Price3
#                        , data=data.train, nodesize=25, ntree=2000, mtry = 15)

# M.15.45 <- randomForest(as.factor(Choice) ~ segment + year + miles + night + gender + age + educ + region + Urb + ppark
#                        +CC1+GN1+NS1+BU1+FA1+LD1+BZ1+FC1+FP1+RP1+PP1+KA1+SC1+TS1+NV1+MA1+LB1+AF1+HU1+Price1
#                        +CC2+GN2+NS2+BU2+FA2+LD2+BZ2+FC2+FP2+RP2+PP2+KA2+SC2+TS2+NV2+MA2+LB2+AF2+HU2+Price2
#                        +CC3+GN3+NS3+BU3+FA3+LD3+BZ3+FC3+FP3+RP3+PP3+KA3+SC3+TS3+NV3+MA3+LB3+AF3+HU3+Price3
#                        , data=data.train.small, nodesize=45, ntree=2000, mtry = 15)

# cluster <- makeCluster(35)
# registerDoParallel(cluster)
# M.tune <- tuneRF(data.train.small[,c(84:92, 94)], data.train.small[,c("Choice")],
#                  stepFactor=1.2, improve=1e-5, ntreeTry=1500, doBest=T, mtryStart=10)
# stopCluster(cluster)
# registerDoSEQ()




# P.10.25 <- rf.predict(M.10.25, prob=T)
# P.10.45 <- rf.predict(M.10.45, prob=T)
# P.15.25 <- rf.predict(M.15.25, prob=T)
# P.15.25.mod <- rf.predict(M.15.25.mod, prob=T)
# P.15.45 <- rf.predict(M.15.45, prob=T)

# Print train/validation scores for M.mod and keep its test-set probabilities.
P.mod <- rf.predict(M.mod, prob = TRUE)


## 25/10/small
# acc_train (randomforest):  0.695335 
# loss_train (randomForest):  0.8608042 
# acc_valid (randomforest):  0.5005254 
# loss_valid (randomForest):  1.165004 
## 45/10/small
# acc_train (randomforest):  0.6129078 
# loss_train (randomForest):  0.9432376 
# acc_valid (randomforest):  0.5022767 
# loss_valid (randomForest):  1.16119 
## 60/10/small
# acc_train (randomforest):  0.5833743 
# loss_train (randomForest):  0.9826437 
# acc_valid (randomforest):  0.5015762 
# loss_valid (randomForest):  1.163753 


acc_train (randomforest):  0.6842105 
loss_train (randomForest):  0.8392076 
acc_valid (randomforest):  0.4975439 
loss_valid (randomForest):  1.157477 

acc_train (randomforest):  0.7370526 
loss_train (randomForest):  0.765638 
acc_valid (randomforest):  0.4978947 
loss_valid (randomForest):  1.157089 

acc_train (randomforest):  0.8123509 
loss_train (randomForest):  0.6427379 
acc_valid (randomforest):  0.8315789 
loss_valid (randomForest):  0.615116 



ERROR: Error in acc_train(randomforest): could not find function "acc_train"


In [99]:
# Write the predictions to the file "final2" in the working directory.
# NOTE(review): within the visible notebook P.15.25.full is only produced by
# commented-out code - confirm it exists in the session before running this.
write.csv(x = P.15.25.full, file = "final2")

In [None]:
# Tall plot area so every variable fits (IRkernel plot size).
options(repr.plot.height = 15, repr.plot.width = 8)
# vu <- varUsed(M.10.inc, count=T)
# vuSorted <- sort(vu, decreasing=F, index.return=T)
# # number of times each variable was used
# dotchart(vuSorted$x, names(M.rf$forest$xlevels[vuSorted$ix]))
# importance plot for M.mod, showing every row of its importance matrix
varImpPlot(M.mod, n.var = NROW(M.mod$importance))

In [11]:
# fn <- getModelInfo("rfRules")[[1]]$grid
# str(data.train[,c(4:63, 84:94)])
param <- getModelInfo("rf")[[1]]$parameters
param <- rbind(param, data.frame(parameter="nodesize", class="numeric", label="Nodesize"))
# Tuning-grid generator over (mtry, nodesize), in the shape caret expects for
# the `grid` element of a custom model.
#
# Args:
#   x      predictor table; only its column count is used.
#   y      outcome; only whether it is a factor is used.
#   len    number of mtry candidates (grid) or number of random draws (random).
#   search "grid" for the full cross-product, anything else for random draws.
#
# Returns a data frame with columns `mtry` and `nodesize`, duplicates removed.
fn <- function (x, y, len = NULL, search = "grid") {

    if (search == "grid") {
        # Cross every mtry candidate with every nodesize candidate. Building
        # this with data.frame() would recycle the shorter column into
        # arbitrary pairs, or fail when the two lengths are not multiples of
        # each other.
        out <- expand.grid(mtry = caret::var_seq(p = ncol(x),
                                                 classification = is.factor(y),
                                                 len = len),
                           nodesize = seq(20, 60, 5),
                           KEEP.OUT.ATTRS = FALSE)
    }
    else {
        # `len` independent random (mtry, nodesize) pairs
        out <- data.frame(mtry = sample(seq_len(ncol(x)), size = len, replace = TRUE),
                          nodesize = sample(1:15, size = len, replace = TRUE))
    }
    out[!duplicated(out), ]
}

# Custom caret model: start from the stock "rf" definition, then swap in the
# (mtry, nodesize) grid generator and the extended parameter table.
# NOTE(review): the inherited `fit` element is not replaced here, so confirm
# that `nodesize` actually reaches randomForest() during training.
rfCustom <- getModelInfo("rf")[[1]]
rfCustom[["grid"]] <- fn
rfCustom[["parameters"]] <- param
# rfCustom$fit

In [84]:
# Scratch check: the nodesize candidates used by the tuning grid.
seq(from = 20, to = 60, by = 5)

In [None]:
# Columns kept for the reduced-feature experiments: seven column families
# (CC, FP, RP, PP, KA, SC, LB), each with suffixes 1-4, followed by five
# further columns. Defined once so the four subsets below cannot drift apart.
feature.cols <- c(paste0(rep(c("CC", "FP", "RP", "PP", "KA", "SC", "LB"), each = 4), 1:4),
                  "segment", "night", "miles", "region", "ppark")

# Same column subset taken from each split.
training <- data.train[, feature.cols]
valid <- data.valid[, feature.cols]
test <- data.test[, feature.cols]
training.small <- data.train.small[, feature.cols]

# P.small.task.all <- rf.predict.small(M.small.task.all, prob=T)

In [8]:
# Evaluate a fitted forest against the small training split and the validation
# split, printing accuracy and log-loss for each, then predict the test split.
#
# Args:
#   M    fitted model accepted by predict(..., type="prob").
#   prob TRUE  -> return the class-probability predictions for data.test
#        FALSE -> return the predicted classes for data.train.small
#
# Reads data.train.small, data.valid, data.test, actual.train.small,
# actual.valid and logLoss from the enclosing environment.
rf.predict.small <- function(M, prob) {

    # Score one labelled split, print its two metrics and hand back the
    # predicted classes.
    report <- function(tag, newdata, actual) {
        probs <- predict(M, newdata, type="prob")
        classes <- predict(M, newdata)
        confusion <- table(classes, actual) # diagonal = correct predictions
        cat(paste0("acc_", tag, " (randomforest): "), sum(diag(confusion))/NROW(newdata), "\n")
        cat(paste0("loss_", tag, " (randomForest): "), logLoss(probs, actual), "\n")
        classes
    }

    train.classes <- report("train", data.train.small, actual.train.small)
    report("valid", data.valid, actual.valid)

    # the test split has no labels to score against: predictions only
    test.probs <- predict(M, data.test, type="prob")
    test.classes <- predict(M, data.test)

    if (!prob) {
        return(train.classes)
    }
    test.probs

}

# P.small <- rf.predict.small(M.small, prob=T)

# acc_train (randomforest):  1 
# loss_train (randomForest):  0.2860298 
# acc_valid (randomforest):  0.548381 
# loss_valid (randomForest):  1.144018 

### sad attempt at using gradient-boosting trees

In [None]:
## preprocess data for the xgboost model
#
# Args:
#   train, valid, test  tabular splits (data.frame or anything
#                       as.data.frame() accepts).
#   feature.cols        positions of the predictor columns; defaults to the
#                       indices that were hard-coded before.
#
# Returns a list with
#   y.train / y.valid / y.test              labels of each split recoded from
#                                           "Ch<k>" to k - 1; NULL for a split
#                                           without the label column
#   table.train / table.valid / table.test  numeric predictor matrices
#
# The label column is the LAST column of `train`; the other splits are looked
# up by that same name. Each split's labels come from that split (previously
# y.valid and y.test were silently copies of the training labels).
xgb.preprocess <- function(train, valid, test, feature.cols = c(4:84, 86:94)) {

    label.col <- names(train)[ncol(train)]

    # "Ch1", "Ch2", ... -> 0, 1, ...
    encode.labels <- function(split) {
        if (!(label.col %in% names(split))) {
            return(NULL)
        }
        as.integer(gsub("Ch", "", as.data.frame(split)[[label.col]])) - 1
    }

    # predictor columns coerced one by one to numeric and bound into a matrix
    # NOTE(review): factor columns become their integer codes, computed per
    # split -- confirm the levels agree across train/valid/test.
    to.matrix <- function(split) {
        cols <- as.data.frame(split)[feature.cols]
        do.call(cbind, lapply(cols, as.numeric))
    }

    out <- list()
    out$y.train <- encode.labels(train)
    out$y.valid <- encode.labels(valid)
    out$y.test <- encode.labels(test)
    out$table.train <- to.matrix(train)
    out$table.valid <- to.matrix(valid)
    out$table.test <- to.matrix(test)

    # return processed data
    out

}

## fit gradient boosting model (with cross-validation)
#
# Args:
#   D          list holding `table.train` (predictor matrix) and `y.train`
#              (labels).
#   n.workers  number of worker processes for caret's parallel backend;
#              defaults to the previously hard-coded 4 - 1.
#
# Returns the object produced by caret::train().
#
# NOTE(review): y is passed exactly as supplied; if it is numeric, caret
# treats the task as regression -- confirm that is intended alongside the
# multi:softprob objective.
xgb.fit <- function(D, n.workers = 4 - 1) {

    # config parallel processing; the cluster is released and the sequential
    # backend restored on ANY exit, including an error inside train()
    cluster <- makeCluster(n.workers)
    on.exit({
        stopCluster(cluster)
        registerDoSEQ()
    }, add = TRUE)

    # 10-fold cross-validation
    control <- trainControl(method="cv",
                            number=10,
                            #savePredictions="final",
                            #returnResamp="final",
                            #summaryFunction=mnLogLoss,
                            #classProbs=T,
                            allowParallel=TRUE)

    # parameter grid: only nrounds, eta and subsample vary
    grid <- expand.grid(nrounds=seq(20, 50, 1),
                        max_depth=89,
                        eta=seq(0.1, 0.4, 0.05),
                        gamma=0,
                        colsample_bytree=1,
                        min_child_weight=1,
                        subsample=seq(0.7, 1, 0.1))

    # train the model (WARNING: LONG COMPUTATION TIME)
    registerDoParallel(cluster)
    M <- train(x=D$table.train, y=D$y.train,
               trControl=control,
               tuneGrid=grid,
               #method="xgbLinear",
               method="xgbTree",
               objective="multi:softprob",
               eval_metric="mlogloss",
               num_class=4,
               #metric="logLoss",
               preProc=c("center", "scale"))

    # return fitted model
    M
}

## predict using fitted gradient boosting model
#
# Args:
#   M    fitted model; predict(M, <matrix>) is reshaped row-wise into a
#        4-column class-probability matrix.
#   D    list holding table.train, table.valid and table.test.
#   prob TRUE  -> return the probability matrix for the test split
#        FALSE -> return the predicted class index (1-4) for the TRAIN split
#
# Prints the log-loss on the train and validation splits; reads actual.train,
# actual.valid and logLoss from the enclosing environment.
xgb.predict <- function(M, D, prob) {

    class.probs <- function(newdata) {
        matrix(predict(M, newdata), ncol = 4, byrow = TRUE)
    }

    # train split: probabilities, predicted class, log-loss
    P.train <- class.probs(D$table.train)
    C.train <- max.col(P.train)
    cat("loss_train (xgboost): ", logLoss(P.train, actual.train), "\n")

    # validation split: the log-loss is now reported (it used to be computed
    # and discarded)
    P.valid <- class.probs(D$table.valid)
    cat("loss_valid (xgboost): ", logLoss(P.valid, actual.valid), "\n")

    # test split: no labels, predictions only
    P.test <- class.probs(D$table.test)

    # depending on `prob` input parameter, return either the probabilites or the choice predicted
    if (prob) {
        return(P.test)
    }
    C.train

}

## wrapper function for xgboost: preprocess the splits, fit, then predict
xgb <- function(train=data.train, valid=data.valid, test=data.test,  prob=T) {
    prepared <- xgb.preprocess(train, valid, test)
    fitted <- xgb.fit(prepared)
    xgb.predict(fitted, prepared, prob)
}

## DEBUGGING
# run the first two pipeline stages by hand and keep the intermediates
# (D: preprocessed splits, M: fitted model) in the global environment
D <- xgb.preprocess(train = data.train, valid = data.valid, test = data.test)
# M <- suppressWarnings(xgb.fit(D))
M <- xgb.fit(D = D)

In [None]:
# Cross-validate a plain multiclass xgboost model on the training matrix.
numberOfClasses <- 4

# booster settings; num_class is taken from numberOfClasses instead of a
# second hard-coded 4
param <- list("objective" = "multi:softprob",
              "eval_metric" = "mlogloss",
              "num_class" = numberOfClasses)
#               "max_depth" = 6,
#               "eta" = 0.3)

cv.nround <- 47
cv.nfold <- 10

D <- xgb.preprocess(data.train, data.valid, data.test)
# M4 <- xgboost(param=param, data=D$table.train, label=D$y.train, nrounds=cv.nround)
# `params` is spelled out in full rather than relying on partial matching of `param`
M4 <- xgb.cv(params=param, data=D$table.train, label=D$y.train, nrounds=cv.nround, nfold=cv.nfold)

In [None]:
# NOTE(review): these predictions come from `M`, not from the xgb.cv result
# `M4` -- confirm that is the model meant to be scored here.

# get prediction accuracy on train set
P4.train <- matrix(predict(M, D$table.train), ncol = 4, byrow = TRUE)
C4.train <- max.col(P4.train)
# cbind() keeps the numeric representation of actual.train next to the
# predicted class index
T4.train <- cbind(C4.train, actual.train)
NROW(subset(T4.train, T4.train[,1] == T4.train[,2]))/NROW(T4.train)
logLoss(P4.train, actual.train)

# get prediction accuracy on validation set
P4.valid <- matrix(predict(M, D$table.valid), ncol = 4, byrow = TRUE)
C4.valid <- max.col(P4.valid)
T4.valid <- cbind(C4.valid, actual.valid)
NROW(subset(T4.valid, T4.valid[,1] == T4.valid[,2]))/NROW(T4.valid)
logLoss(P4.valid, actual.valid)

# predictions for the test set (no labels to score against);
# the classes are taken from the TEST probabilities (was P4.train)
P4.test <- matrix(predict(M, D$table.test), ncol = 4, byrow = TRUE)
C4.test <- max.col(P4.test)

In [None]:
# open the help page for createFolds
help("createFolds")

### experimental stuff

In [None]:
# First feature-selection experiment: collapse each alternative's block of
# attribute columns into one row sum (Alt<k>) and, for alternatives 1-3,
# divide that sum by the alternative's Price<k> column (Ratio<k>). Ratio4 is
# fixed at 0.
#
# `data` is addressed by position throughout: the sums use columns 4:22,
# 24:42, 44:62 and 64:82, and the returned selection assumes the eight new
# columns land at positions 100-107 (i.e. `data` has 99 columns on entry).
featureSelect1 <- function(data) {

    alt1 <- rowSums(data[,c(4:22)])
    alt2 <- rowSums(data[,c(24:42)])
    alt3 <- rowSums(data[,c(44:62)])
    alt4 <- rowSums(data[,c(64:82)])

    # appended in the order Alt1, Ratio1, Alt2, Ratio2, ... so the positional
    # selection below picks up the Ratio columns
    data$Alt1 <- alt1
    data$Ratio1 <- alt1/data$Price1
    data$Alt2 <- alt2
    data$Ratio2 <- alt2/data$Price2
    data$Alt3 <- alt3
    data$Ratio3 <- alt3/data$Price3
    data$Alt4 <- alt4
    data$Ratio4 <- 0

    keep <- c(84, 86:90, 92:94, 101, 103, 105, 107, 99)
    return(data[,keep])
}

# Second feature-selection experiment: divide every attribute column of an
# alternative by the column that closes that alternative's block (23, 43, 63,
# 83 -- presumably the price columns; verify against the data layout).
#
# NOTE(review): the returned selection references columns 101-107, which this
# function never creates, so it only succeeds when `data` already has at
# least 107 columns. The selection also excludes every column rescaled above.
# Confirm what this is meant to return.
featureSelect2 <- function(data) {

    scaled <- data

    # one entry per alternative: its attribute columns and their divisor column
    blocks <- list(list(cols = 4:22,  by = 23),
                   list(cols = 24:42, by = 43),
                   list(cols = 44:62, by = 63),
                   list(cols = 64:82, by = 83))

    for (block in blocks) {
        for (i in block$cols) {
            scaled[,i] <- scaled[,i]/scaled[block$by]
        }
    }

    return(scaled[,c(84, 86:90, 92:94, 101, 103, 105, 107, 99)])
}

# Build the reduced feature tables for all three splits with featureSelect1,
# the variant that produces the Ratio1-4 columns used by the model below.
# (The previous calls targeted `featureSelect`, which is not defined in this
# part of the file.)
try.train <- featureSelect1(data.train)
try.valid <- featureSelect1(data.valid)
try.test <- featureSelect1(data.test)
str(try.train)

In [None]:
# Wide-format mlogit data built from the reduced training table; the chosen
# alternative is read from the `Choice` column.
try.train.mlogit <- mlogit.data(
    try.train,
    shape = "wide",
    choice = "Choice"
)
#                          sep = "",
#                          varying = c(4:83),
#                          alt.levels = c("Ch1", "Ch2", "Ch3", "Ch4"),
#                          id.var = "Case")

# try.train.mlogit

# Experimental forest on the 13 reduced features: 1000 trees, and mtry equal
# to the number of predictors, so every predictor is a candidate at each split.
rf.expt <- randomForest(
    Choice ~ segment + miles + night + gender + age + educ + Urb + income +
        ppark + Ratio1 + Ratio2 + Ratio3 + Ratio4,
    data = try.train,
    mtry = 13,
    ntree = 1000
)

# rf.expt <- tuneRF(try.train[,c(1:13)], try.train[,c(14)], stepFactor=1.1, improve=1e-5, ntree=1000, doBest=T)

In [None]:
# Validation accuracy of the experimental forest: predicted classes are placed
# next to the recorded choices (cbind keeps their numeric representation) and
# the share of rows where the two columns agree is displayed.
rf.expt.p <- predict(rf.expt, try.valid, type="response")
pred <- cbind(rf.expt.p, try.valid[,"Choice"])
# pred
sum(pred[,1]==pred[,2], na.rm=TRUE)/NROW(pred)

In [None]:
# preprocessing to make all columns output same format choices:
# "Ch1".."Ch4" -> 1..4, anything else -> 0
choiceCode <- function(x) {
    ifelse(x=="Ch1", 1, ifelse(x=="Ch2", 2, ifelse(x=="Ch3", 3, ifelse(x=="Ch4", 4, 0))))
}

M3.train.choice <- choiceCode(C3.train)
M3.valid.choice <- choiceCode(C3.valid)
M3.test.choice <- choiceCode(C3.test)

# get meta dataframe for model stacking: one column per base model (M1, M3)
# next to the recorded Choice
# NOTE(review): U1.train$Choice keeps its original "Ch*" coding while
# U1.valid$Choice is recoded to 1..4 below -- confirm this is intended.
U1.train.features <- data.frame(M1=XGB.train.choices, M3=M3.train.choice)
U1.train <- cbind(train, U1.train.features)
U1.train <- U1.train[,c("Choice", "M1", "M3")]

U1.valid.features <- data.frame(M1=XGB.valid.choices, M3=M3.valid.choice)
U1.valid <- cbind(valid, U1.valid.features)
U1.valid <- U1.valid[,c("Choice", "M1", "M3")]
U1.valid$Choice <- choiceCode(U1.valid$Choice)

# the test split has no recorded choice: a placeholder factor fills the column.
# U1.test is now actually created here (it used to be assigned into without
# ever being defined, which fails with "object 'U1.test' not found").
U1.test.features <- data.frame(Choice=factor(1, levels=c(1,2,3,4)), M1=XGB.test.choices, M3=M3.test.choice)
U1.test <- U1.test.features
U1.test
# str(U1.test)
# str(U1.train)
# subset(U1.test, is.na(U1.test)==T)
# which(is.na(U1.test))
# test[16398:16416,]

# fit new model to U1, using [M1,M3] as features
library(LiblineaR)
# ?LiblineaR
U1 <- LiblineaR(data=U1.train[,2:3], target=U1.train[,1], type=6, cost=100)
# U1 <- LiblineaR(data=U1.train[,c(2:4)], target=U1.train[,1], type=6, cost=100)
# summary(U1)
# UP1.valid <- predict(U1, U1.valid, proba=T)
# UP1.test <- predict(U1, U1.test, proba=T)
# UP1.valid
# UP1.test
# UP1.test$probabilities


# initialise CART inner-model to use for cross-validation
# control <- trainControl(method="cv", number=10, savePredictions="final")

# # specify parameter grid to compute over (e.g. complexity parameter for CART)
# grid <- expand.grid(cp=seq(0, 0.005, 0.0001)) 

# # train the model (WARNING: LONG COMPUTATION TIME)
# set.seed(1)
# M2 <- train(Choice~.-Ch1-Ch2-Ch3-Ch4-1, data=train, trControl=control, tuneGrid=grid, method="rpart")