Skip to content

Commit

Permalink
adding preharmonization function for predictdb weights
Browse files Browse the repository at this point in the history
  • Loading branch information
wesleycrouse committed Sep 29, 2023
1 parent 8c9fac2 commit 47c0e3b
Show file tree
Hide file tree
Showing 4 changed files with 211 additions and 3 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ export(ctwas_summarize_parameters)
export(impute_expr)
export(impute_expr_z)
export(predict.susie)
export(preharmonize_wgt_ld)
export(preharmonize_z_ld)
export(susie)
export(susie_get_cs)
Expand Down
160 changes: 160 additions & 0 deletions R/ctwas_harmonize_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -315,3 +315,163 @@ preharmonize_z_ld <- function (z_snp, ld_R_dir, outputdir = getwd(), outname = N

return(list(z_snp = z_snp))
}

#' Harmonize PredictDB weights and LD reference
#'
#' @param weight a string, pointing to a directory with the fusion/twas format of weights, or a .db file in predictdb format.
#' A vector of multiple sets of weights in PredictDB format can also be specified; genes will have their filename appended
#' to their gene name to ensure IDs are unique.
#'
#' @param LD_R_dir a string, pointing to a directory containing all LD matrix files and variant information. Expects .RDS files which contain LD correlation matrices for a region/block.
#' For each RDS file, a file with same base name but ended with .Rvar needs to be present in the same folder. the .Rvar file has 5 required columns: "chrom", "id", "pos", "alt", "ref".
#' If using PredictDB format weights and \code{scale_by_ld_variance=T}, a 6th column is also required: "variance", which is the variance of the each SNP.
#' The order of rows needs to match the order of rows in .RDS file.
#'
#' @param outputdir a string, the directory to store output
#'
#' @param outname a string, the output name
#'
#' @param strand_ambig_action_wgt the action to take to harmonize strand ambiguous variants (A/T, G/C) between
#' the weights and LD reference. "drop" removes the ambiguous variant from the prediction models. "none" treats the variant
#' as unambiguous, flipping the weights to match the LD reference and then taking no additional action. "recover" uses a procedure
#' to recover strand ambiguous variants. This procedure compares correlations between variants in the
#' LD reference and prediction models, and it can only be used with PredictDB format prediction models, which include this
#' information.
#'
#' @importFrom logging addHandler loginfo
#' @importFrom tools file_ext
#'
#' @export
#'
preharmonize_wgt_ld <- function (weight,
ld_R_dir,
outputdir = getwd(),
outname,
strand_ambig_action_wgt = c("drop", "none", "recover")){

strand_ambig_action_wgt <- match.arg(strand_ambig_action_wgt)

dir.create(outputdir, showWarnings = F)

#read the PredictDB weights
sqlite <- RSQLite::dbDriver("SQLite")
db = RSQLite::dbConnect(sqlite, weight)
query <- function(...) RSQLite::dbGetQuery(db, ...)
weight_table <- query("select * from weights")
extra_table <- query("select * from extra")
RSQLite::dbDisconnect(db)

gnames <- unique(weight_table$gene)
loginfo("Number of genes with weights provided: %s", length(gnames))

#load ld information and subset to variants in weights
ld_Rfs <- ctwas:::write_ld_Rf(ld_R_dir, outname = outname, outputdir = outputdir)

ld_Rinfo <- lapply(ld_Rfs, data.table::fread, header=T)
ld_Rinfo <- as.data.frame(do.call(rbind, ld_Rinfo))

ld_snpinfo <- lapply(ld_Rfs, ctwas:::read_ld_Rvar)
ld_snpinfo <- as.data.frame(do.call(rbind, ld_snpinfo))
ld_snpinfo <- ld_snpinfo[ld_snpinfo$id %in% weight_table$rsid,]

loginfo("Flipping weights to match LD reference")

if (strand_ambig_action_wgt=="recover"){
R_wgt_all <- read.table(gzfile(paste0(file_path_sans_ext(weight), ".txt.gz")), header=T) #load covariances for variants in each gene (accompanies .db file)
loginfo("Harmonizing strand ambiguous weights using correlations with unambiguous variants")
}

weight_table_harmonized <- list()

for (i in 1:length(gnames)){

if (i %% 1000 == 0){
loginfo("Current gene: %s", i)
}

gname <- gnames[i]

wgt <- weight_table[weight_table$gene==gname,]
wgt.matrix <- as.matrix(wgt[, "weight", drop = F])

rsid_varID <- wgt[,c("rsid", "varID")]

rownames(wgt.matrix) <- wgt$rsid
chrpos <- do.call(rbind, strsplit(wgt$varID, "_"))

snps <- data.frame(gsub("chr", "", chrpos[, 1]), wgt$rsid,
"0", chrpos[, 2], wgt$eff_allele, wgt$ref_allele,
stringsAsFactors = F)
colnames(snps) <- c("chrom", "id", "cm", "pos", "alt", "ref")
snps$chrom <- as.integer(snps$chrom)
snps$pos <- as.integer(snps$pos)

chrom <- unique(snps$chrom)
ld_Rinfo_chrom <- ld_Rinfo[ld_Rinfo$chrom==chrom,]

if (strand_ambig_action_wgt=="recover"){
#subset R_wgt_all to current weight
R_wgt <- R_wgt_all[R_wgt_all$GENE == gname,]

#convert covariance to correlation
R_wgt_stdev <- R_wgt[R_wgt$RSID1==R_wgt$RSID2,]
R_wgt_stdev <- setNames(sqrt(R_wgt_stdev$VALUE), R_wgt_stdev$RSID1)
R_wgt$VALUE <- R_wgt$VALUE/(R_wgt_stdev[R_wgt$RSID1]*R_wgt_stdev[R_wgt$RSID2])

#discard variances
R_wgt <- R_wgt[R_wgt$RSID1!=R_wgt$RSID2,]

#fix edge case where variance=0; treat correlations with these variants as uninformative (=0) for harmonization
R_wgt$VALUE[is.nan(R_wgt$VALUE)] <- 0
} else {
R_wgt <- NULL
}

w <- ctwas:::harmonize_wgt_ld(wgt.matrix,
snps,
ld_snpinfo,
strand_ambig_action=strand_ambig_action_wgt,
ld_Rinfo=ld_Rinfo_chrom,
R_wgt=R_wgt,
wgt=wgt)

wgt.matrix <- w[["wgt"]]
snps <- w[["snps"]]

wgt.matrix <- wgt.matrix[abs(wgt.matrix[, "weight"]) > 0, , drop = F]
wgt.matrix <- wgt.matrix[complete.cases(wgt.matrix),, drop = F]

snpnames <- intersect(rownames(wgt.matrix), ld_snpinfo$id)

wgt.idx <- match(snpnames, rownames(wgt.matrix))
wgt <- wgt.matrix[wgt.idx, "weight", drop = F]

snps.idx <- match(snpnames, snps$id)
snps <- snps[snps.idx,]

if (length(snpnames)>0){
weight_table_current <- data.frame(gene=gname,
rsid=snps$id,
varID=rsid_varID$varID[match(snps$id, rsid_varID$rsid)],
ref_allele=snps$ref,
eff_allele=snps$alt,
weight=wgt[,"weight"])

weight_table_harmonized[[gname]] <- weight_table_current
}
}

weight_table_harmonized <- do.call(rbind, weight_table_harmonized)
extra_table <- extra_table[extra_table$gene %in% weight_table_harmonized$gene,]

if (file.exists(paste0(outputdir, outname, ".db"))){
invisible(file.remove(paste0(outputdir, outname, ".db")))
}

db <- RSQLite::dbConnect(sqlite, paste0(outputdir, outname, ".db"))
RSQLite::dbWriteTable(db, "extra", extra_table)
RSQLite::dbWriteTable(db, "weights", weight_table_harmonized)
RSQLite::dbDisconnect(db)

invisible(file.remove(paste0(outputdir, outname, "_ld_R_chr", 1:22, ".txt")))
}
15 changes: 12 additions & 3 deletions man/impute_expr.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

38 changes: 38 additions & 0 deletions man/preharmonize_wgt_ld.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 47c0e3b

Please sign in to comment.