In [6]:
############################################
# Environment check
############################################

# Use the Tsinghua University (TUNA) mirror as the default CRAN repository.
options(repos = c(
  CRAN = "https://mirrors.tuna.tsinghua.edu.cn/CRAN/"
))

# Install `pkg` if it is missing, then attach it.
#
# pkg:  package name (character scalar).
# bioc: TRUE to install through BiocManager::install(), FALSE to install
#       from CRAN with install.packages().
#
# Stops with an error when the package is still unavailable after the
# installation attempt, instead of letting a later cell fail obscurely.
# Returns `pkg` invisibly.
ensure_package <- function(pkg, bioc = FALSE) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    if (bioc) {
      BiocManager::install(pkg)
    } else {
      install.packages(pkg)
    }
  }
  if (!requireNamespace(pkg, quietly = TRUE)) {
    stop("Package '", pkg, "' could not be installed.", call. = FALSE)
  }
  # Attach the package whether it was already present or freshly installed,
  # so the session ends up in the same state either way.
  library(pkg, character.only = TRUE, quietly = TRUE)
  invisible(pkg)
}

# Required packages, in installation/attachment order.
# Value: TRUE = install from Bioconductor, FALSE = install from CRAN.
# BiocManager must come first because the Bioconductor installs depend on it.
#
# tidyverse is optional for this check (ggplot2 alone is enough for the
# plots). NOTE(review): the package-loading cell of this notebook calls
# library(tidyverse); add `tidyverse = FALSE` here if it is not installed.
required_pkgs <- c(
  BiocManager     = FALSE,
  DBI             = FALSE,
  GO.db           = TRUE,
  clusterProfiler = TRUE,
  munsell         = FALSE,
  stringi         = FALSE,
  farver          = FALSE,
  viridis         = FALSE,
  RColorBrewer    = FALSE,
  ggplot2         = FALSE,
  # Gene annotation database:
  #   org.Hs.eg.db - human, org.Mm.eg.db - mouse
  org.Mm.eg.db    = TRUE,
  # biomaRt converts Ensembl gene IDs to Entrez gene IDs
  biomaRt         = TRUE
)

for (pkg in names(required_pkgs)) {
  ensure_package(pkg, bioc = required_pkgs[[pkg]])
}





In [1]:
############################################
# Attach packages
############################################
# Attach every package the analysis relies on, in this order (the order
# determines which package's functions mask the others). library() raises
# an error if a package is not installed.
invisible(lapply(
  c("tidyverse", "clusterProfiler", "org.Mm.eg.db", "biomaRt"),
  library,
  character.only = TRUE
))


── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.1     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors


clusterProfiler v4.10.0  For help: https://yulab-smu.top/biomedical-knowledge-mining-book/

If you use clusterProfiler in published research, please cite:
T Wu, E Hu, 

In [9]:
############################################
# User-configurable parameters
############################################

# Folder that holds the report input and outputs. Keep the trailing slash:
# every path below is built with paste0().
# report_dir <- "report/治疗vs对照/"
report_dir <- "report/治疗vs野生/"
# DMR gene table (tab-separated). The code below reads its `gene_id` and
# `regionType` columns.
DMR_File <- paste0(report_dir, "DMR_genes.tsv")
# Optional: GO IDs for an extra bar plot restricted to these terms
# (leave commented out when not needed).
# pw2show <- c("GO:0007015", "GO:0007264", "GO:1902903", "GO:0032970")


############################################
# GO/KEGG enrichment analysis
# Reference: https://lishensuo.github.io/posts/bioinfo/056clusterprofiler%E5%8C%85%E5%AF%8C%E9%9B%86%E5%88%86%E6%9E%90%E4%B8%8E%E5%8F%AF%E8%A7%86%E5%8C%96/#2gsea%e6%89%93%e5%88%86
############################################

# Create the report folder (including missing parent folders) if needed.
if (!dir.exists(report_dir)) {
  dir.create(report_dir, recursive = TRUE)
}

# Read the exported DMR table.
d <- read.csv(DMR_File, sep = "\t")

# TRUE when an enrichment result exists and has at least one reported term.
has_enrichment <- function(res) {
  !is.null(res) && nrow(as.data.frame(res)) > 0
}

# Write an enrichment result to "<report_dir><prefix><region_type>.tsv".
write_enrichment_tsv <- function(res, prefix, region_type) {
  write.table(
    as.data.frame(res),
    file = paste0(report_dir, prefix, region_type, ".tsv"),
    sep = "\t", row.names = FALSE, quote = FALSE
  )
}

# Save a plot to "<report_dir><prefix><region_type>.png". The plot is passed
# explicitly so the file never depends on ggplot2's implicit last_plot().
save_enrichment_plot <- function(plot, prefix, region_type) {
  ggsave(
    paste0(report_dir, prefix, region_type, ".png"),
    plot = plot, width = 8, height = 6
  )
}

# Bar plot of a GO enrichment result, one facet per ontology.
go_barplot <- function(res) {
  barplot(
    res,
    split = "ONTOLOGY",
    showCategory = 12,
    label_format = 50
  ) + facet_grid(ONTOLOGY ~ ., scales = "free")
}

# Mart used to convert Ensembl gene IDs to Entrez gene IDs.
# mmusculus_gene_ensembl = mouse, hsapiens_gene_ensembl = human.
# Created once: it does not depend on the region type.
ensembl <- useMart("ensembl", dataset = "mmusculus_gene_ensembl")

region_types <- c("gain", "loss", "all")
# Run the whole analysis once per region type.
for (region_type in region_types) {
  if (region_type == "all") {
    # Every gene in the table
    gene_ids <- unique(d$gene_id)
  } else {
    # Genes of this region type only; %in% never yields NA, so rows with a
    # missing regionType are excluded instead of producing NA gene IDs.
    gene_ids <- unique(d$gene_id[d$regionType %in% region_type])
  }
  gene_ids <- gene_ids[!is.na(gene_ids) & nzchar(gene_ids)]
  if (length(gene_ids) == 0) {
    message("No genes for regionType '", region_type, "'; skipping.")
    next
  }

  # ---- GO enrichment ----
  ego <- enrichGO(
    gene          = gene_ids, # input gene list
    keyType       = "ENSEMBL", # input IDs are Ensembl gene IDs
    OrgDb         = org.Mm.eg.db, # mouse annotation database
    ont           = "ALL", # GO categories: CC, BP and MF together
    pAdjustMethod = "BH", # Benjamini-Hochberg multiple-testing correction
    pvalueCutoff  = 0.01, # p-value threshold
    qvalueCutoff  = 0.05, # q-value threshold
    readable      = TRUE # report gene symbols instead of gene IDs
  )

  if (is.null(ego)) {
    message("GO enrichment returned no result for '", region_type, "'.")
  } else {
    write_enrichment_tsv(ego, "GO富集-", region_type)

    if (has_enrichment(ego)) {
      save_enrichment_plot(go_barplot(ego), "GO富集-", region_type)

      # Optional bar plot restricted to the GO terms listed in pw2show.
      if (exists("pw2show") && !is.null(pw2show)) {
        selected_terms <- ego@result$Description[
          rownames(ego@result) %in% pw2show
        ]
        if (length(selected_terms) > 0) {
          save_enrichment_plot(
            barplot(ego, showCategory = selected_terms),
            "GO富集(指定通路)-", region_type
          )
        } else {
          message(
            "None of the pw2show terms were found for '", region_type, "'."
          )
        }
      }

      # Remove redundant GO terms with clusterProfiler::simplify().
      ego_sim <- clusterProfiler::simplify(
        ego,
        cutoff = 0.7, # similarity threshold above which terms are merged
        measure = "Wang", # semantic similarity measure
        by = "p.adjust", # column used to pick the representative term
        select_fun = min # keep the term with the smallest `by` value
      )
      write_enrichment_tsv(ego_sim, "GO富集(去冗余)-", region_type)
      if (has_enrichment(ego_sim)) {
        save_enrichment_plot(
          go_barplot(ego_sim), "GO富集(去冗余)-", region_type
        )
      }
    } else {
      message("No enriched GO terms for '", region_type, "'; no plots saved.")
    }
  }

  # ---- KEGG enrichment ----
  # Convert Ensembl gene IDs to Entrez gene IDs.
  ensembl_gene <- getBM(
    attributes = c("ensembl_gene_id", "entrezgene_id"),
    filters = "ensembl_gene_id",
    values = gene_ids,
    mart = ensembl
  )
  # Drop unmapped (NA) and duplicated IDs and pass them as character strings.
  entrezgene_ids <- ensembl_gene$entrezgene_id
  entrezgene_ids <- unique(as.character(entrezgene_ids[!is.na(entrezgene_ids)]))
  if (length(entrezgene_ids) == 0) {
    message("No Entrez IDs mapped for '", region_type, "'; skipping KEGG.")
    next
  }

  ekg <- enrichKEGG(
    gene = entrezgene_ids, # input genes
    keyType = "kegg", # one of "kegg", 'ncbi-geneid', 'ncbi-proteinid', 'uniprot'
    organism = "mmu", # KEGG organism code
    pvalueCutoff = 0.05 # p-value threshold for reported pathways
  )
  if (is.null(ekg)) {
    message("KEGG enrichment returned no result for '", region_type, "'.")
    next
  }
  # Replace gene IDs with gene symbols to make the result easier to read.
  ekg <- setReadable(ekg, OrgDb = org.Mm.eg.db, keyType = "ENTREZID")

  write_enrichment_tsv(ekg, "KEGG富集-", region_type)

  # Dot plot of the KEGG result (previously the GO result `ego` was plotted
  # here by mistake and saved under the KEGG file name).
  if (has_enrichment(ekg)) {
    save_enrichment_plot(
      dotplot(ekg, showCategory = 20, label_format = 50),
      "KEGG富集-", region_type
    )
  } else {
    message("No enriched KEGG pathways for '", region_type, "'; no plot saved.")
  }
}


Reading KEGG annotation online: "https://rest.kegg.jp/link/mmu/pathway"...

Reading KEGG annotation online: "https://rest.kegg.jp/list/pathway/mmu"...

