In [None]:
library('caret')
library('gbm')
library('tidyverse')

# Load the call matrix and the clinical labels. read.delim() already defaults
# to a tab-separated file with a header row, so only row.names needs spelling
# out: the first column of the clinical file becomes its row names.
x_train <- read.delim("Train_call.txt")
y_train <- read.delim("Train_clinical.txt", row.names = 1)

# Transpose so that each original column becomes a row.
x_train <- t(x_train)

# Join labels and features on their row names. The first four rows of the
# transposed matrix are skipped -- presumably per-feature annotation columns
# rather than samples; TODO confirm against the input file layout.
data <- merge(y_train, x_train[seq(5, nrow(x_train)), ], by = "row.names")

# merge() exposes the join key as a "Row.names" column; promote it back to
# real row names and drop the helper column.
rownames(data) <- data[["Row.names"]]
data[["Row.names"]] <- NULL
# Repeated hold-out evaluation of a random forest with filter-based feature
# selection.
#
# For each of `n_splits` stratified 80/20 splits of `data`:
#   1. rank the features on the training part with caret::filterVarImp();
#   2. for every candidate feature count, tune the model with repeated
#      10-fold CV (random search) on the top-ranked features;
#   3. refit the best (mtry, feature count) combination on the whole training
#      part and score it on the held-out part.
#
# Side effects: writes the feature ranking, the tuning results and a tuning
# plot for every split into `out_dir`, and prints progress to the console.
# Result: `overall_accuracies[j]` holds the hold-out accuracy of split j
# (NA for splits that have not been run).
set.seed(123)

# Every per-split artefact is written below this directory; create it up
# front so the first write.csv() cannot fail on a missing path.
out_dir <- "random_forest"
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)

n_splits <- 100
methods <- c("rf")

# Resampling set-ups do not depend on the split, so build them once.
tune_control <- trainControl(method = "repeatedcv",
                             number = 10,
                             repeats = 3,
                             search = "random")
final_control <- trainControl(method = "none")

# Preallocated; NA marks splits that were never completed.
overall_accuracies <- rep(NA_real_, n_splits)

# outer loop
for (j in seq_len(n_splits)) {
  start_time <- Sys.time()
  msk <- createDataPartition(data[, "Subgroup"], p = 0.8, list = FALSE, times = 1)
  # split the data into training and test set
  dataTrain <- data[msk, ]  # 81 instances
  dataTest <- data[-msk, ]  # 19 instances

  # Rank the features using the training part only, so that the held-out
  # samples never influence the feature selection.
  rocVarImp <- filterVarImp(dataTrain[, -which(names(dataTrain) == "Subgroup")],
                            factor(dataTrain[, "Subgroup"]), nonpara = FALSE)

  # One importance score per feature: the sum over the per-class columns.
  rocVarImpSum <- rowSums(rocVarImp)
  rocVarImpSum_sorted <- sort(rocVarImpSum, decreasing = TRUE)
  # The names of the sorted scores are the feature column names themselves,
  # so they can be used directly to index dataTrain / dataTest.
  sorted_idx <- names(rocVarImpSum_sorted)
  write.csv(sorted_idx, file = file.path(out_dir, paste0("split", j, ".csv")))

  # ---------------------------------------------------------------------------------------
  # Tune one model per (feature count, method) pair and keep its best row.
  predictor_counts <- seq(56, nrow(dataTrain) + 20, 1)
  fits <- vector("list", length(predictor_counts) * length(methods))
  fit_no <- 0

  for (n in predictor_counts) {
    for (method in methods) {
      fit_no <- fit_no + 1
      # first n features are used for training
      temp_data <- dataTrain[, c("Subgroup", sorted_idx[seq_len(n)])]
      tuned <- train(Subgroup ~ ., data = temp_data, method = method,
                     metric = "Accuracy", trControl = tune_control)
      best_row <- tuned$results[which.max(tuned$results[, "Accuracy"]), ]
      # Record the method that was actually fitted, not a hard-coded label.
      fits[[fit_no]] <- as_tibble(c(best_row,
                                    list(split = j, predictors = n, method = method)))
    }
  }
  res <- bind_rows(fits)  # results of all models of this split
  write.csv(res, file = file.path(out_dir, paste0("results_split", j, ".csv")))

  optimal_params <- as.data.frame(
    res[which.max(res[["Accuracy"]]), c("mtry", "predictors", "method")]
  )
  print(optimal_params)
  # predictors = features

  # NOTE: the tuning grid only carries mtry, which matches method "rf"; other
  # methods added to `methods` would need their own grid here.
  tuneGrid <- expand.grid(.mtry = optimal_params$mtry)
  best_features <- sorted_idx[seq_len(optimal_params[, "predictors"])]

  # Refit the winning configuration on the full training part, no resampling.
  temp_data <- dataTrain[, c("Subgroup", best_features)]
  model2 <- train(Subgroup ~ ., data = temp_data, method = optimal_params$method,
                  trControl = final_control, tuneGrid = tuneGrid)
  print(model2)

  y_pred <- predict(model2, newdata = dataTest[, best_features])

  # Give the reference the same levels as the predictions; otherwise
  # confusionMatrix() fails when a class is missing from the hold-out set.
  y_true <- factor(dataTest[, "Subgroup"], levels = levels(y_pred))
  overall_accuracies[j] <-
    confusionMatrix(data = y_pred, reference = y_true)$overall[["Accuracy"]]

  # Bind the plot to a name and hand it to ggsave() explicitly instead of
  # relying on the implicit last_plot() lookup.
  accuracy_plot <- ggplot() +
    geom_line(data = res, aes(x = predictors, y = Accuracy, group = method, color = method)) +
    ylab(label = "Accuracy") +
    xlab("Predictors") +
    ggtitle("Optimized models using random grid search and 10-fold CV\nfor different number of significant predictors in descending order.") +
    labs(color = "Methods")
  ggsave(file.path(out_dir, paste0("training_s1_split", j, ".png")),
         plot = accuracy_plot)

  end_time <- Sys.time()
  print(end_time - start_time)
  print(paste0('Split ', j , ' done.'))
}




Loading required package: lattice

Loading required package: ggplot2



Resampling: None 
Saving 6.67 x 6.67 in image

Time difference of 9.443095 mins
[1] "Split 29 done."
  mtry predictors method
1   47         58     rf
Random Forest 

81 samples
58 predictors
 3 classes: 'HER2+', 'HR+', 'Triple Neg' 

No pre-processing
Resampling: None 
Saving 6.67 x 6.67 in image

Time difference of 9.27209 mins
[1] "Split 30 done."
  mtry predictors method
1   54         88     rf
Random Forest 

81 samples
88 predictors
 3 classes: 'HER2+', 'HR+', 'Triple Neg' 

No pre-processing
Resampling: None 
Saving 6.67 x 6.67 in image

Time difference of 9.740183 mins
[1] "Split 31 done."
  mtry predictors method
1   21         66     rf
Random Forest 

81 samples
66 predictors
 3 classes: 'HER2+', 'HR+', 'Triple Neg' 

No pre-processing
Resampling: None 
Saving 6.67 x 6.67 in image

Time difference of 9.519366 mins
[1] "Split 32 done."
  mtry predictors method
1   31         98     rf
Random Forest 

81 samples
98 predictors
 3 classes: 'HER2+', 'HR+', 'Triple Neg' 

No pre-

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=39982db8-5b4c-4eff-a4a1-5a43e95600e9' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>