-
Notifications
You must be signed in to change notification settings - Fork 4
pastebin
Toby Dylan Hocking edited this page Oct 23, 2017
·
1 revision
tdhock@recycled:~/R/PeakSegPipeline/R(master*)$ R -e PeakSegPipeline::problem.train | grep reg.type
reg.type = ifelse(nrow(features) < 20, "1sd", "min"))
tdhock@recycled:~/R/PeakSegPipeline/R(master*)$
tdhock@recycled:~/R/PeakSegPipeline/R(master*)$ R -e PeakSegPipeline::problem.train
R version 3.4.1 (2017-06-30) -- "Single Candle"
Copyright (C) 2017 The R Foundation for Statistical Computing
Platform: i686-pc-linux-gnu (32-bit)
R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.
Natural language support but running in an English locale
R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.
Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.
> PeakSegPipeline::problem.train
function (data.dir.str)
{
  # Train a penalty-prediction model from every target.tsv file found under
  # <data.dir.str>/samples/*/*/problems/*/, fit a simple normal model of
  # log10(peak size) from the correctly predicted problems, and save
  # everything to <data.dir.str>/model.RData.
  #
  # data.dir.str: path to an existing data directory; normalizePath() is
  #   called with mustWork = TRUE, so a missing path is an error.
  #
  # Called for its side effects: prints progress to the console, computes
  # any missing features.tsv files via problem.features(), and writes
  # model.RData. Stops with an error if no target.tsv files are found.

  # Bind names that are only used through non-standard evaluation below
  # (presumably to silence R CMD check notes -- confirm).
  status <- too.lo <- too.hi <- penalty <- bases <- chromEnd <- chromStart <- upper.lim <- lower.lim <- upper.bases <- lower.bases <- ..density.. <- prob <- hjust <- NULL
  data.dir <- normalizePath(data.dir.str, mustWork = TRUE)
  samples.dir <- file.path(data.dir, "samples")
  model.RData <- file.path(data.dir, "model.RData")
  glob.str <- file.path(samples.dir, "*", "*", "problems",
    "*", "target.tsv")
  cat("Searching for", glob.str, "files for training.\n")
  target.tsv.vec <- Sys.glob(glob.str)
  cat("Found", length(target.tsv.vec), "target.tsv files for training.\n")
  # Without any targets there is nothing to train on; fail here with a
  # clear message instead of an obscure error from as.matrix(NULL) below.
  if (length(target.tsv.vec) == 0) {
    stop("no target.tsv files found matching ", glob.str, call. = FALSE)
  }
  # One entry per problem directory: a row of features and a pair of
  # target limits (lower, upper) read from target.tsv.
  features.list <- list()
  targets.list <- list()
  for (target.tsv.i in seq_along(target.tsv.vec)) {
    target.tsv <- target.tsv.vec[[target.tsv.i]]
    problem.dir <- dirname(target.tsv)
    features.tsv <- file.path(problem.dir, "features.tsv")
    # Compute the features on demand when they have not been cached yet.
    if (!file.exists(features.tsv)) {
      cat(sprintf("%4d / %4d Computing %s\n", target.tsv.i,
        length(target.tsv.vec), features.tsv))
      problem.features(problem.dir)
    }
    features.list[[problem.dir]] <- fread(features.tsv)
    targets.list[[problem.dir]] <- scan(target.tsv, quiet = TRUE)
  }
  features <- as.matrix(do.call(rbind, features.list))
  targets <- do.call(rbind, targets.list)
  # Fixed seed so that any randomness in the model fitting is reproducible.
  set.seed(1)
  model <- if (nrow(features) < 10) {
    # Very few training problems: fit an unregularized model on just two
    # features and show the training data.
    some.features <- features[, c("log.quartile.100%", "log.data")]
    cat("Feature matrix:\n")
    print(some.features)
    cat("Target matrix:\n")
    print(unname(targets))
    penaltyLearning::IntervalRegressionUnregularized(some.features,
      targets)
  }
  else {
    penaltyLearning::IntervalRegressionCV(features, targets,
      verbose = 0, initial.regularization = 1e-04, min.observations = nrow(features),
      reg.type = ifelse(nrow(features) < 20, "1sd", "min"))
  }
  model$train.mean.vec <- colMeans(features)
  cat("Learned regularization parameter and weights:\n")
  print(model$pred.param.mat)
  # Classify each training prediction as below, above, or inside its
  # target interval.
  pred.log.penalty <- as.numeric(model$predict(features))
  pred.dt <- data.table(problem.dir = dirname(target.tsv.vec),
    too.lo = as.logical(pred.log.penalty < targets[, 1]),
    lower.limit = targets[, 1], pred.log.penalty, upper.limit = targets[,
      2], too.hi = as.logical(targets[, 2] < pred.log.penalty))
  pred.dt[, `:=`(status, ifelse(too.lo, "low", ifelse(too.hi,
    "high", "correct")))]
  correct.targets <- pred.dt[status == "correct", ]
  # For each correctly predicted problem whose path does not contain
  # "Input", load the peaks of the already-computed model whose log(penalty)
  # is closest to the predicted value.
  correct.peaks <- correct.targets[!grepl("Input", problem.dir),
    {
      target_models.tsv <- file.path(problem.dir, "target_models.tsv")
      target.models <- fread(target_models.tsv)
      closest <- target.models[which.min(abs(log(penalty) -
        pred.log.penalty)), ]
      coverage.bedGraph <- file.path(problem.dir, "coverage.bedGraph")
      segments.bed <- paste0(coverage.bedGraph, "_penalty=",
        closest$penalty, "_segments.bed")
      segs <- fread(segments.bed)
      setnames(segs, c("chrom", "chromStart", "chromEnd",
        "status", "mean"))
      segs[status == "peak", ]
    }, by = problem.dir]
  correct.peaks[, `:=`(bases, chromEnd - chromStart)]
  correct.peaks[, `:=`(log10.bases, log10(bases))]
  # Peak size model: mean and sd of log10(bases), with limits at
  # mean +/- times * sd on both the log10 and the base-pair scale.
  size.model <- correct.peaks[, list(mean = mean(log10.bases),
    sd = sd(log10.bases))]
  times <- 1
  size.model[, `:=`(upper.lim, mean + times * sd)]
  size.model[, `:=`(lower.lim, mean - times * sd)]
  size.model[, `:=`(upper.bases, 10^(upper.lim))]
  size.model[, `:=`(lower.bases, 10^(lower.lim))]
  cat("Train errors:\n")
  print(pred.dt[, list(targets = .N), by = status])
  # Min and max of each feature used by the model, over the training rows.
  model$train.feature.ranges <- apply(features[, model$pred.feature.names,
    drop = FALSE], 2, range)
  # The three objects below are only consumed by the disabled plot.
  log10.bases.grid <- correct.peaks[, seq(min(log10.bases),
    max(log10.bases), length.out = 100)]
  normal.dens <- data.table(log10.bases = log10.bases.grid,
    prob = size.model[, dnorm(log10.bases.grid, mean, sd)])
  base.labels <- size.model[, {
    log10.bases <- c(lower.lim, mean, upper.lim)
    data.table(log10.bases, hjust = c(1, 0.5, 0), label = scales::comma(round(10^log10.bases)))
  }]
  # Disabled diagnostic plot of the peak size distribution; this branch
  # never runs.
  if (FALSE) {
    size.plot <- ggplot() + geom_histogram(aes(log10.bases,
      ..density..), data = correct.peaks) + geom_vline(aes(xintercept = mean),
      data = size.model, size = 1, color = "red") + penaltyLearning::geom_tallrect(aes(xmin = lower.lim,
      xmax = upper.lim), data = size.model, fill = "red") +
      geom_line(aes(log10.bases, prob), data = normal.dens,
        color = "red", size = 1) + geom_text(aes(log10.bases,
      0, label = label, hjust = hjust), data = base.labels,
      vjust = 1)
  }
  cat("Writing model to", model.RData, "\n")
  save(model, features, targets, size.model, correct.peaks,
    file = model.RData)
}
<environment: namespace:PeakSegPipeline>
>
>
tdhock@recycled:~/R/PeakSegPipeline/R(master*)$