diff --git a/code.R b/code.R
new file mode 100644
index 0000000..028a694
--- /dev/null
+++ b/code.R
@@ -0,0 +1,303 @@
+###################################################################
+## Code for Workshop 4: Predictive Modeling on Data with Severe
+## Class Imbalance: Applications on Electronic Health Records.
+## The course was conducted for the International Conference on
+## Health Policy Statistics (ICHPS) on Wed, Oct 7, from
+## 10:15 AM - 12:15 PM.
+
+###################################################################
+## Example Data
+
+load("emr.RData")
+
+###################################################################
+## Training/Test Split
+
+library(caret)
+
+set.seed(1732)
+in_train <- createDataPartition(emr$Class, p = .75, list = FALSE)
+training <- emr[ in_train, ]
+testing  <- emr[-in_train, ]
+
+mean(training$Class == "event")
+mean(testing$Class == "event")
+
+table(training$Class)
+table(testing$Class)
+
+###################################################################
+## Overfitting to the Majority Class
+
+library(partykit)
+library(rpart)
+
+rpart_small <- rpart(Class ~ ., data = training,
+                     control = rpart.control(cp = 0.0062))
+
+plot(as.party(rpart_small))
+
+###################################################################
+## Subsampling for class imbalances
+
+## Define the resampling method and how we calculate performance
+
+ctrl <- trainControl(method = "repeatedcv",
+                     repeats = 5,
+                     classProbs = TRUE,
+                     savePredictions = TRUE,
+                     summaryFunction = twoClassSummary)
+
+## Tune random forest models over this grid
+mtry_grid <- data.frame(mtry = c(1:15, (4:9)*5))
+
+###################################################################
+## The basic random forest model with no adaptations
+
+set.seed(1537)
+rf_mod <- train(Class ~ .,
+                data = training,
+                method = "rf",
+                metric = "ROC",
+                tuneGrid = mtry_grid,
+                ntree = 1000,
+                trControl = ctrl)
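+###################################################################
+## (Aside) A minimal sketch of checking the resampled ROC estimate
+## against the held-out test set. It assumes "event" is the first
+## factor level of `Class`, the convention twoClassSummary relies
+## on; `rf_probs` and `rf_roc` are illustrative names.
+
+rf_probs <- predict(rf_mod, newdata = testing, type = "prob")
+
+## pROC expects levels = c(control, case), so the event level is
+## placed last
+rf_roc <- pROC::roc(response = testing$Class,
+                    predictor = rf_probs[, "event"],
+                    levels = rev(levels(testing$Class)))
+pROC::auc(rf_roc)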
+###################################################################
+## This function is used to take the out-of-sample predictions and
+## create an approximate ROC curve from them
+
+roc_train <- function(object, best_only = TRUE, ...) {
+  caret:::requireNamespaceQuietStop("pROC")
+  caret:::requireNamespaceQuietStop("plyr")
+
+  if(object$modelType != "Classification")
+    stop("ROC curves are only available for classification models")
+  if(!any(names(object$modelInfo) == "levels"))
+    stop(paste("The model's code is required to have a 'levels' module.",
+               "See http://topepo.github.io/caret/custom_models.html#Components"))
+  lvs <- object$modelInfo$levels(object$finalModel)
+  if(length(lvs) != 2)
+    stop("ROC curves are only implemented here for two class problems")
+
+  ## check for predictions
+  if(is.null(object$pred))
+    stop(paste("The out of sample predictions are required.",
+               "See the `savePredictions` argument of `trainControl`"))
+
+  if(best_only) {
+    object$pred <- merge(object$pred, object$bestTune)
+  }
+  ## find tuning parameter names
+  p_names <- as.character(object$modelInfo$parameters$parameter)
+  p_combos <- object$pred[, p_names, drop = FALSE]
+
+  ## average the class probabilities across resamples
+  object$pred <- plyr::ddply(.data = object$pred,
+                             .variables = c("obs", "rowIndex", p_names),
+                             .fun = function(dat, lvls = lvs) {
+                               out <- mean(dat[, lvls[1]])
+                               names(out) <- lvls[1]
+                               out
+                             })
+
+  make_roc <- function(x, lvls = lvs, nms = NULL, ...) {
+    out <- pROC::roc(response = x$obs,
+                     predictor = x[, lvls[1]],
+                     levels = rev(lvls))
+
+    out$model_param <- x[1, nms, drop = FALSE]
+    out
+  }
+  out <- plyr::dlply(.data = object$pred,
+                     .variables = p_names,
+                     .fun = make_roc,
+                     lvls = lvs,
+                     nms = p_names)
+  if(length(out) == 1) out <- out[[1]]
+  out
+}
+
+###################################################################
+## Some plots of the resampling results
+
+## Colors used to tell the sampling strategies apart in the plots
+## below
+mod_cols <- c("#1B9E77", "#D95F02", "#7570B3", "#E7298A")
+
+ggplot(rf_mod)
+
+plot(roc_train(rf_mod),
+     legacy.axes = TRUE,
+     print.thres = .5,
+     print.thres.pattern = " <- default %.1f threshold")
+
+plot(roc_train(rf_mod),
+     legacy.axes = TRUE,
+     print.thres.pattern = "Cutoff: %.2f (Sp = %.2f, Sn = %.2f)",
+     print.thres = "best")
+
+###################################################################
+## Internal down-sampling
+
+## `strata` and `sampsize` are passed through to randomForest() so
+## that each tree's bootstrap sample is balanced
+set.seed(1537)
+rf_down_int <- train(Class ~ .,
+                     data = training,
+                     method = "rf",
+                     metric = "ROC",
+                     strata = training$Class,
+                     sampsize = rep(sum(training$Class == "event"), 2),
+                     ntree = 1000,
+                     tuneGrid = mtry_grid,
+                     trControl = ctrl)
+
+ggplot(rf_mod$results, aes(x = mtry, y = ROC)) + geom_point() + geom_line() +
+  geom_point(data = rf_down_int$results, aes(x = mtry, y = ROC), col = mod_cols[2]) +
+  geom_line(data = rf_down_int$results, aes(x = mtry, y = ROC), col = mod_cols[2]) +
+  theme_bw() +
+  xlab("#Randomly Selected Predictors") +
+  ylab("ROC (Repeated Cross-Validation)")
+
+###################################################################
+## External down-sampling
+
+## caret down-samples the majority class inside each resample
+ctrl$sampling <- "down"
+set.seed(1537)
+rf_down <- train(Class ~ .,
+                 data = training,
+                 method = "rf",
+                 metric = "ROC",
+                 tuneGrid = mtry_grid,
+                 ntree = 1000,
+                 trControl = ctrl)
+
+ggplot(rf_mod$results, aes(x = mtry, y = ROC)) + geom_point() + geom_line() +
+  geom_point(data = rf_down$results, aes(x = mtry, y = ROC), col = mod_cols[1]) +
+  geom_line(data = rf_down$results, aes(x = mtry, y = ROC), col = mod_cols[1]) +
+  theme_bw() +
+  xlab("#Randomly Selected Predictors") +
+  ylab("ROC (Repeated Cross-Validation)")
+
+###################################################################
+## Up-sampling
+
+ctrl$sampling <- "up"
+set.seed(1537)
+rf_up <- train(Class ~ .,
+               data = training,
+               method = "rf",
+               tuneGrid = mtry_grid,
+               ntree = 1000,
+               metric = "ROC",
+               trControl = ctrl)
+
+ggplot(rf_mod$results, aes(x = mtry, y = ROC)) + geom_point() + geom_line() +
+  geom_point(data = rf_up$results, aes(x = mtry, y = ROC), col = mod_cols[3]) +
+  geom_line(data = rf_up$results, aes(x = mtry, y = ROC), col = mod_cols[3]) +
+  theme_bw() +
+  xlab("#Randomly Selected Predictors") +
+  ylab("ROC (Repeated Cross-Validation)")
+
+###################################################################
+## Up-sampling done **wrong**: the training set is up-sampled once,
+## before resampling, so copies of the same minority-class rows can
+## appear in both the modeling and holdout portions of a fold and
+## the resampled performance estimates are optimistically biased
+
+ctrl2 <- trainControl(method = "repeatedcv",
+                      repeats = 5,
+                      classProbs = TRUE,
+                      savePredictions = TRUE,
+                      summaryFunction = twoClassSummary)
+upped <- upSample(x = training[, -1], y = training$Class)
+
+set.seed(1537)
+rf_wrong <- train(Class ~ .,
+                  data = upped,
+                  method = "rf",
+                  tuneGrid = mtry_grid,
+                  ntree = 1000,
+                  metric = "ROC",
+                  trControl = ctrl2)
+
+ggplot(rf_mod$results, aes(x = mtry, y = ROC)) + geom_point() + geom_line() +
+  geom_point(data = rf_wrong$results, aes(x = mtry, y = ROC), col = mod_cols[3]) +
+  geom_line(data = rf_wrong$results, aes(x = mtry, y = ROC), col = mod_cols[3]) +
+  theme_bw() +
+  xlab("#Randomly Selected Predictors") +
+  ylab("ROC (Repeated Cross-Validation)")
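+## (Aside) A quick, illustrative check with the objects above that
+## makes the leakage concrete: upSample duplicates minority-class
+## rows until the classes are balanced, so identical rows end up on
+## both sides of a resampling split
+table(training$Class)
+table(upped$Class)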
+###################################################################
+## SMOTE
+
+ctrl$sampling <- "smote"
+set.seed(1537)
+rf_smote <- train(Class ~ .,
+                  data = training,
+                  method = "rf",
+                  tuneGrid = mtry_grid,
+                  ntree = 1000,
+                  metric = "ROC",
+                  trControl = ctrl)
+
+ggplot(rf_mod$results, aes(x = mtry, y = ROC)) + geom_point() + geom_line() +
+  geom_point(data = rf_smote$results, aes(x = mtry, y = ROC), col = mod_cols[4]) +
+  geom_line(data = rf_smote$results, aes(x = mtry, y = ROC), col = mod_cols[4]) +
+  theme_bw() +
+  xlab("#Randomly Selected Predictors") +
+  ylab("ROC (Repeated Cross-Validation)")
+
+###################################################################
+## Write a function to measure performance for cost-sensitive
+## learning; hard class predictions are used here, so accuracy,
+## Kappa, sensitivity, and specificity are reported instead of the
+## area under the ROC curve
+
+fourStats <- function(data, lev = levels(data$obs), model = NULL) {
+  accKapp <- postResample(data[, "pred"], data[, "obs"])
+  out <- c(accKapp,
+           sensitivity(data[, "pred"], data[, "obs"], lev[1]),
+           specificity(data[, "pred"], data[, "obs"], lev[2]))
+  names(out)[3:4] <- c("Sens", "Spec")
+  out
+}
+
+ctrl_cost <- trainControl(method = "repeatedcv",
+                          repeats = 5,
+                          classProbs = FALSE,
+                          savePredictions = TRUE,
+                          summaryFunction = fourStats)
+
+###################################################################
+## Set up a custom tuning grid by first fitting an rpart model and
+## getting the unique Cp values
+
+rpart_init <- rpart(Class ~ ., data = training, cp = 0)$cptable
+
+cost_grid <- expand.grid(cp = rpart_init[, "CP"],
+                         Cost = 1:5)
+set.seed(1537)
+rpart_costs <- train(Class ~ ., data = training,
+                     method = "rpartCost",
+                     tuneGrid = cost_grid,
+                     metric = "Kappa",
+                     trControl = ctrl_cost)
+
+ggplot(rpart_costs) +
+  scale_x_log10(breaks = 10^pretty(log10(rpart_costs$results$cp), n = 5)) +
+  theme(legend.position = "top")
+
+###################################################################
+## C5.0 with costs
+
+cost_grid <- expand.grid(trials = 1:3,
+                         winnow = FALSE,
+                         model = "tree",
+                         cost = seq(1, 10, by = .25))
+set.seed(1537)
+c5_costs <- train(Class ~ ., data = training,
+                  method = "C5.0Cost",
+                  tuneGrid = cost_grid,
+                  metric = "Kappa",
+                  trControl = ctrl_cost)
+
+c5_costs_res <- subset(c5_costs$results, trials <= 3)
+c5_costs_res$trials <- factor(c5_costs_res$trials)
+
+ggplot(c5_costs_res, aes(x = cost, y = Kappa, group = trials)) +
+  geom_point(aes(color = trials)) +
+  geom_line(aes(color = trials)) +
+  ylab("Kappa (Repeated Cross-Validation)") +
+  theme(legend.position = "top")
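+###################################################################
+## (Aside) A minimal sketch of a final test-set check; picking
+## `c5_costs` here is purely illustrative. `confusionMatrix` is
+## from caret and treats the first factor level ("event") as the
+## positive class when reporting sensitivity and specificity.
+
+c5_test_pred <- predict(c5_costs, newdata = testing)
+confusionMatrix(data = c5_test_pred, reference = testing$Class)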