update tutorials

xinhe-lab · Jun 13, 2024 · 9ac54a8 · 9ac54a8
1 parent 39c536d
commit 9ac54a8
Show file tree

Hide file tree

Showing 3 changed files with 42 additions and 43 deletions.
diff --git a/vignettes/preparing_ctwas_input_data.Rmd b/vignettes/preparing_ctwas_input_data.Rmd
@@ -157,7 +157,7 @@ We require a data frame `region_info` containing the region definitions, with th
 
 Here we use the b38 European LDetect blocks, included in the package.
 
-```{r region_info}
+```{r region_info, eval=FALSE}
 region_file <- system.file("extdata/ldetect", "EUR.b38.bed", package = "ctwas")
 region_info <- read.table(region_file, header = TRUE)
 colnames(region_info)[1:3] <- c("chrom", "start", "stop")
@@ -170,7 +170,7 @@ When running with LD, we will also need to add the column: "LD_matrix":
 "LD_matrix" stores the paths to the precomputed LD (R) matrices (`.RDS` files in our precomputed reference LD files) and corresponding variant information (`.Rvar` files). 
 
 The following paths were from the University of Chicago RCC cluster, please replace them with your own paths. 
-```{r add_LD_paths}
+```{r add_LD_paths, eval=FALSE}
 # please change this to your own directory for LD matrices.
 LD_dir <- "/project2/mstephens/wcrouse/UKB_LDR_0.1"
 
@@ -221,7 +221,7 @@ We need a list of variants as a “reference”, with the positions and allele i
 When running with LD, it takes input of `region_info`, which contains paths of LD matrices and variant information for each region, and returns processed `region_info`, `snp_info` and `LD_info`.
 
 In this tutorial, we use chr16 as an example, so we specify `chrom = 16`. In real cTWAS analysis, you should run the entire genome. 
-```{r snp_info_with_LD}
+```{r snp_info_with_LD, eval=FALSE}
 res <- preprocess_region_LD_snp_info(region_info, 
                                      chrom = 16, 
                                      use_LD = TRUE, 
@@ -236,17 +236,6 @@ LD_info <- res$LD_info
 ```
 
 
-Let's look at the `region_info`, `LD_info` and `snp_info`:
-
-```{r}
-head(region_info, 2)
-
-head(LD_info, 2)
-
-str(head(snp_info, 2))
-```
-
-
 
 When running without LD, it takes input of `region_info` and a data frame `ref_snp_info` of variant information from the entire genome, and returns processed `region_info`, `snp_info`.
 We have the lists of reference variant information from all the LD matrices in the genome in [hg38](https://uchicago.box.com/s/t089or92dkovv0epkrjvxq8r9db9ys99) and [hg19](https://uchicago.box.com/s/ufko2gjagcb693dob4khccqubuztb9pz).

diff --git a/vignettes/simple_ctwas_tutorial.Rmd b/vignettes/simple_ctwas_tutorial.Rmd
@@ -91,8 +91,7 @@ ref_snp_info <- data.table::fread(ref_snp_info_file, sep = "\t")
 res <- preprocess_region_LD_snp_info(region_info, 
                                      ref_snp_info = ref_snp_info, 
                                      chrom = example_chrom, 
-                                     use_LD = FALSE, 
-                                     ncore = 6)
+                                     use_LD = FALSE)
 region_info <- res$region_info
 snp_info <- res$snp_info
 
@@ -255,19 +254,33 @@ ctwas_parameters$convergence_plot
 
 ## Interpreting cTWAS results
 
-We need gene annotations from the Ensembl database. We use `EnsDb.Hsapiens.v86` for the example data, please choose the Ensembl database for your specific data. 
-```{r load_ens_db, message=FALSE}
-library(EnsDb.Hsapiens.v86)
-```
+To interpret and visualize the results, we first need to add gene annotations 
+(gene names and gene types) to the finemapping result.
+
+We need a user defined data frame `gene_annot` that should contain the following columns:
+'chrom', 'start', 'end', 'gene_id', 'gene_name' and 'gene_type' for each gene or molecular trait.
 
+If you only used gene expression data, you can use the following function 
+to obtain the gene annotations from the Ensembl database.
+We use `EnsDb.Hsapiens.v86` for the example data, please choose the Ensembl database for your specific data. 
 
-To interpret the results, we add gene names (from Ensembl database) to the finemapping result, and use mid-points to represent gene positions.
-```{r add_gene_names}
-# load gene info database
+```{r get_gene_annot, message=FALSE}
+library(EnsDb.Hsapiens.v86)
 ens_db <- EnsDb.Hsapiens.v86
+finemap_gene_res <- finemap_res[finemap_res$type!="SNP",]
+gene_ids <- unique(sapply(strsplit(finemap_gene_res$id, split = "[|]"), "[[", 1))
+gene_annot <- get_gene_annot_from_ens_db(ens_db, gene_ids)
+```
 
-# add gene names and use gene mid-points to represent gene positions
-finemap_res <- anno_finemap_res(finemap_res, snp_info, ens_db, use_gene_pos = "mid")
+
+We then use `anno_finemap_res()` function to add gene annotations to the finemapping result,
+and use gene mid-points to represent gene positions.
+```{r anno_finemap_res}
+# add gene annotations and use gene mid-points to represent gene positions
+finemap_res <- anno_finemap_res(finemap_res, 
+                                snp_info,
+                                gene_annot,
+                                use_gene_pos = "mid")
 ```
 
 
@@ -280,35 +293,35 @@ head(sig_finemap_res[order(-sig_finemap_res$susie_pip),])
 ```
 
 
-
 When we have multiple contexts (tissues), 
 it is useful to evaluate the combined PIP for each molecular trait across contexts:
 
 ```{r combine_pips}
-# combine PIPs across contexts (tissues)
-combined_gene_pips <- sum_pip_across_contexts(finemap_res)
+# combine gene PIPs by context
+combined_pip_by_context <- combine_gene_pips(finemap_res, 
+                                             by = "context",
+                                             filter_protein_coding_genes = TRUE)
+head(combined_pip_by_context)
 
 # select combined PIP > 0.8
-sig_gene_pips_df <- combined_gene_pips[combined_gene_pips$combined_pip > 0.8, ]
-
-# list genes with combined PIP > 0.8 or top 20 genes with highest combined PIPs
-head(combined_gene_pips, max(nrow(sig_gene_pips_df), 20))
+sig_gene_pips_df <- combined_pip_by_context[combined_pip_by_context$combined_pip > 0.8, ]
+head(sig_gene_pips_df)
 ```
 
 
-
 ## Visualizing cTWAS results
 
-We could make locus plots for regions of interest,
-and zoom in the region by specifying the `locus_range`.
+We could make locus plots for regions of interest. 
+We could zoom in the region by specifying the `locus_range`.
+
 ```{r locus_plot, fig.width=10, fig.height=8}
 make_locusplot(finemap_res,
                region_id = "16_71020125_72901251",
                weights = weights,
                ens_db = ens_db,
-               use_LD = FALSE,
-               pip_thresh = 0.8,
-               locus_range = c(71.6e6,72.4e6))
+               locus_range = c(71.6e6,72.4e6),
+               highlight_pip = 0.8,
+               legend.position = "top")
 ```
 
 

diff --git a/vignettes/summarizing_ctwas_results.Rmd b/vignettes/summarizing_ctwas_results.Rmd
@@ -147,25 +147,22 @@ We could further limit results in credible sets and/or protein coding genes.
 # combine gene PIPs by context
 combined_pip_by_context <- combine_gene_pips(finemap_res, 
                                              by = "context",
-                                             filter_cs = TRUE,
                                              filter_protein_coding_genes = TRUE)
-print(combined_pip_by_context)
+head(combined_pip_by_context)
 
 # combine gene PIPs by type
 combined_pip_by_type <- combine_gene_pips(finemap_res, 
                                           by = "type",
-                                          filter_cs = TRUE,
                                           filter_protein_coding_genes = TRUE)
 
 # combine gene PIPs by group
 combined_pip_by_group <- combine_gene_pips(finemap_res, 
                                             by = "group",
-                                            filter_cs = TRUE,
                                             filter_protein_coding_genes = TRUE)
 
 # select combined PIP > 0.8
 sig_gene_pips_df <- combined_pip_by_context[combined_pip_by_context$combined_pip > 0.8, ]
-print(sig_gene_pips_df)
+head(sig_gene_pips_df)
 ```