refactor: updating error checking format and adding comments to code
helen-zhu committed Jan 30, 2024
1 parent 04f7230 commit 12224d6
Showing 3 changed files with 44 additions and 8 deletions.
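
For reference, every function touched by this commit consumes the same 'quality.scores' input: per the updated documentation below, the output of accumulate.zscores, i.e. a dataframe with a 'Sample' identifier column and a numeric 'Sum' column of aggregated z-scores. A hypothetical example of such an object (values are illustrative only, not package output):

# Hypothetical quality.scores input; column names follow the updated docs.
quality.scores <- data.frame(
    Sample = paste0('Sample.', 1:100),
    Sum = rnorm(n = 100, mean = -5, sd = 2)
    );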
19 changes: 18 additions & 1 deletion R/cosine.similarity.cutoff.R
@@ -13,7 +13,7 @@
#' * 'cauchy'
#' * 'logis'
#'
#' @param quality.scores The output of accumulate.zscores
#' @param quality.scores A dataframe with columns 'Sum' (of scores) and 'Sample', i.e. the output of accumulate.zscores
#' @param no.simulations The number of datasets to simulate
#' @param trim.factor The fraction of values to trim from each tail so that parameters are estimated without using extremes
#' @param alpha.significant Alpha value for significance
@@ -27,20 +27,29 @@ cosine.similarity.cutoff <- function(
alpha.significant = 0.05
) {

# Error checking
accumulate.zscores.output.check(quality.scores);
check.valid.no.simulations(no.simulations);
distribution <- match.arg(distribution);
check.valid.trim.factor(trim.factor);
check.valid.alpha.significant(alpha.significant);

# Defining variables
no.samples <- nrow(quality.scores);
trim.num <- round(no.samples * trim.factor);

# Trimming quality scores
quality.scores.trimmed <- quality.scores[-((no.samples - trim.num + 1):no.samples), ];
quality.scores.trimmed <- quality.scores.trimmed[-(1:trim.num), ];

# Fitting distribution and extracting parameters
fit <- fitdistrplus::fitdist(-quality.scores.trimmed$Sum, distribution);
p <- ppoints(-quality.scores$Sum);
args <- as.list(fit$estimate);
args.q <- c(args, list('p' = p));
args.r <- c(args, list('n' = no.samples));

# Simulating data
simulated.distributions <- matrix(
data = NA,
nrow = no.simulations,
@@ -54,13 +63,17 @@
);
}

# Initiating empty vector for cosine similarity
cos.similarity.nulldist <- numeric(no.simulations);

# Calculating a theoretical quantile set
theoretical.data.quantile <- do.call(
what = paste0('q', distribution),
args = args.q
);

# Comparing each simulated dataset to the theoretical
# quantile set using cosine similarity
for (i in 1:no.simulations) {
simulated.data.quantile <- quantile(
x = simulated.distributions[i, ],
@@ -73,15 +86,19 @@
);
}

# Calculating quantiles of cosine similarity and
# determining threshold of cosine similarity required to achieve significance
alpha.cutoff.cos.sim <- quantile(
x = cos.similarity.nulldist,
prob = c(alpha.significant)
);

# Calculating cutoff and nominating outliers
cutoff <- calculate.cutoff(max(theoretical.data.quantile), alpha.cutoff.cos.sim);

no.outliers <- sum(-quality.scores$Sum > cutoff);

# Returning results
results <- list(
"cutoff" = cutoff,
"no.outliers" = no.outliers,
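For orientation, a hedged usage sketch of the function above: the default values of most arguments are collapsed out of this view, so everything is passed explicitly, and 'cauchy' is simply one of the distributions listed in the docstring.

# Illustrative call; argument names are taken from the signature shown above.
results <- cosine.similarity.cutoff(
    quality.scores = quality.scores,
    distribution = 'cauchy',
    no.simulations = 1000,
    trim.factor = 0.05,
    alpha.significant = 0.05
    );

# Per the diff above, the returned list contains at least the significance
# threshold on the negated aggregated z-scores and the resulting outlier count.
results$cutoff;
results$no.outliers;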
21 changes: 20 additions & 1 deletion R/cosine.similarity.iterative.R
@@ -13,7 +13,7 @@
#' * 'cauchy'
#' * 'logis'
#'
#' @param quality.scores The output of accumulate.zscores
#' @param quality.scores A dataframe with columns 'Sum' (of scores) and 'Sample', i.e. the output of accumulate.zscores
#' @param no.simulations The number of datasets to simulate
#' @param trim.factor The fraction of values to trim from each tail so that parameters are estimated without using extremes
#' @param alpha.significant Alpha value for significance
@@ -27,45 +27,59 @@ cosine.similarity.iterative <- function(
alpha.significant = 0.05
) {

# Error checking
accumulate.zscores.output.check(quality.scores);
check.valid.no.simulations(no.simulations);
distribution <- match.arg(distribution);
check.valid.trim.factor(trim.factor);
check.valid.alpha.significant(alpha.significant);

# Initializing variables
significant.pvalue <- TRUE;
no.outliers <- 0;
outlier.labels <- c();

# Taking the negative of scores
observed.data <- quality.scores[order(quality.scores$Sum, decreasing = TRUE), ];
observed.data$Sum <- -observed.data$Sum;

while (significant.pvalue) {

# Trimming data
no.samples <- nrow(observed.data);
trim.num <- round(no.samples * trim.factor);

observed.data.trimmed <- observed.data[-((no.samples - trim.num + 1):no.samples), ];
observed.data.trimmed <- observed.data.trimmed[-(1:trim.num), ];

# Fitting distribution and extracting parameters
fit <- fitdistrplus::fitdist(observed.data.trimmed$Sum, distribution);

p <- ppoints(observed.data$Sum);
args <- as.list(fit$estimate);
args.q <- c(args, list('p' = p));
args.r <- c(args, list('n' = no.samples));

# Quantiles of observed data
observed.data.quantile <- quantile(
x = observed.data$Sum,
prob = p
);

# Quantiles of theoretical data (based on fitted data)
theoretical.data.quantile <- do.call(
what = paste0('q', distribution),
args = args.q
);

# Calculating the cosine similarity between the max
# observed and theoretical quantile
cos.similarity.obs <- lsa::cosine(
x = c(max(observed.data.quantile), max(theoretical.data.quantile)),
y = c(1, 1)
);

# Simulating datasets from the theoretical distribution
simulated.distributions <- matrix(
data = NA,
nrow = no.simulations,
@@ -79,8 +93,11 @@
);
}

# Initializing cosine similarity vector
cos.similarity.nulldist <- numeric(no.simulations);

# Calculating the cosine similarity between the max simulated data point quantile
# and the max theoretical quantile
for (i in 1:no.simulations) {
simulated.data.quantile <- quantile(
x = simulated.distributions[i, ],
@@ -93,10 +110,12 @@
);
}

# Determining the number of simulated cosine similarities that are less than observed
simulated.cos.sim.smaller <- sum(cos.similarity.nulldist < cos.similarity.obs[1, 1]);
pvalue <- simulated.cos.sim.smaller / no.simulations;
significant.pvalue <- pvalue < alpha.significant;

# Updating the number of outliers
if (significant.pvalue) {
no.outliers <- no.outliers + 1;
outlier.labels <- append(
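The statistic driving the loop above compares the largest observed quantile with the largest theoretical quantile by taking the cosine similarity of the pair c(max observed, max theoretical) against the diagonal reference c(1, 1): matching maxima lie on the diagonal and score near 1, while an observed extreme far beyond the theoretical one pulls the score down, and the p-value is the fraction of simulated scores falling below the observed score. A minimal standalone sketch of just that statistic, with toy numbers not taken from the package:

library(lsa);

# lsa::cosine returns a 1x1 matrix for a pair of vectors, hence the [1, 1]
# indexing mirrored in the diff above.
cosine.to.diagonal <- function(observed.max, theoretical.max) {
    lsa::cosine(
        x = c(observed.max, theoretical.max),
        y = c(1, 1)
        )[1, 1];
    }

cosine.to.diagonal(observed.max = 4.1, theoretical.max = 4.0); # ~0.9999, maxima agree
cosine.to.diagonal(observed.max = 12, theoretical.max = 4.0);  # ~0.89, outlier-like gap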
12 changes: 6 additions & 6 deletions R/fit.and.evaluate.R
@@ -23,23 +23,22 @@ fit.and.evaluate <- function(

# Error checking
distributions <- match.arg(distributions, several.ok = TRUE);
if(!is.numeric(trim.factor) || (trim.factor > 0.5) || (trim.factor < 0)) {
stop("trim.factor must be a numeric between 0 and 0.5");
}
if(!("Sum" %in% colnames(quality.scores)) || !is.numeric(quality.scores$Sum)){
stop("quality scores do not have a valid column for aggregated zscores");
}
accumulate.zscores.output.check(quality.scores);
check.valid.trim.factor(trim.factor);

# Initializing variables
no.distributions <- length(distributions);
KS.rejected <- logical(no.distributions);
BIC.value <- numeric(no.distributions);

no.samples <- nrow(quality.scores);
trim.num <- round(no.samples * trim.factor);

# Trimming quality scores
quality.scores <- quality.scores[- ((no.samples - trim.num + 1):no.samples), ];
quality.scores <- quality.scores[- (1:trim.num), ];

# Fitting distributions
for (i in seq_len(no.distributions)){

fit <- fitdistrplus::fitdist(-quality.scores$Sum, distributions[i]);
@@ -49,6 +48,7 @@
BIC.value[i] <- fit$bic;
}

# Compiling results
results.df <- data.frame(
distribution = distributions,
KS.rejected = KS.rejected,
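The inline checks deleted above are replaced by shared validators, matching the calls added at the top of all three functions in this commit. The validator bodies are not part of this diff; a plausible reconstruction, inferred from the inline checks they replace (the real helpers elsewhere in the package may differ):

# Hypothetical reconstruction; names match the calls in this commit and
# bodies are inferred from the removed inline checks.
check.valid.trim.factor <- function(trim.factor) {
    if (!is.numeric(trim.factor) || (trim.factor > 0.5) || (trim.factor < 0)) {
        stop('trim.factor must be a numeric between 0 and 0.5');
        }
    }

accumulate.zscores.output.check <- function(quality.scores) {
    if (!('Sum' %in% colnames(quality.scores)) || !is.numeric(quality.scores$Sum)) {
        stop('quality scores do not have a valid column for aggregated zscores');
        }
    }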

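And a hedged usage sketch of fit.and.evaluate itself, with defaults collapsed out of this view passed explicitly; per the docstrings earlier in this commit, 'cauchy' and 'logis' are among the accepted distributions:

# Illustrative call, reusing the hypothetical quality.scores object above.
results.df <- fit.and.evaluate(
    quality.scores = quality.scores,
    distributions = c('cauchy', 'logis'),
    trim.factor = 0.05
    );

# Per the diff above, the result has one row per candidate distribution,
# reporting whether the goodness-of-fit test (presumably Kolmogorov-Smirnov,
# given the KS.rejected name) rejected the fit, plus the fit's BIC, to guide
# the distribution choice for the cosine.similarity functions.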