<a href="https://colab.research.google.com/github/zqzhu0609/R/blob/main/GSEA_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Bootstrap BiocManager from CRAN when it is not already available.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

# Install the Bioconductor packages this notebook relies on, one at a time
# and in a fixed order.
for (pkg in c(
  "clusterProfiler",
  "org.Hs.eg.db",
  "msigdbr",
  "enrichplot",
  "fgsea"
)) {
  BiocManager::install(pkg)
}

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

'getOption("repos")' replaces Bioconductor standard repositories, see
'help("repositories", package = "BiocManager")' for details.
Replacement repositories:
    CRAN: https://cran.rstudio.com

Bioconductor version 3.20 (BiocManager 1.30.25), R 4.4.3 (2025-02-28)

Installing package(s) 'BiocVersion', 'clusterProfiler'

also installing the dependencies ‘zlibbioc’, ‘UCSC.utils’, ‘GenomeInfoDbData’, ‘formatR’, ‘XVector’, ‘GenomeInfoDb’, ‘lambda.r’, ‘futile.options’, ‘gridGraphics’, ‘tweenr’, ‘polyclip’, ‘RcppEigen’, ‘lazyeval’, ‘plogr’, ‘png’, ‘Biostrings’, ‘futile.logger’, ‘snow’, ‘BH’, ‘cowplot’, ‘fastmatch’, ‘ggplotify’, ‘patchwork’, ‘ggforce’, ‘ape’, ‘tidytree’, ‘treeio’, ‘R.oo’, ‘R.methodsS3’, ‘BiocGenerics’, ‘Biobase’, ‘IRanges’, ‘RSQLite’, ‘S4Vectors’, ‘KEGGREST’, ‘BiocParallel’, ‘fgsea’, ‘reshape2’, ‘aplot’, ‘ggfun’, ‘ggnewscale’, ‘ggrepel’, ‘ggtangle’, ‘scatterpie’, ‘ggtree’, ‘R.utils’, ‘AnnotationD

In [None]:

# Load the libraries
library(clusterProfiler)
library(org.Hs.eg.db)
library(msigdbr)
library(enrichplot)
library(fgsea)

In [None]:
# Show what is available in the working folder, then read the input table.
list.files("/content/")
data <- read.csv("/content/WT2.csv")

# Preview the first six rows to confirm the file was parsed as expected.
head(data, n = 6L)

In [None]:
# Build the named, ranked gene vector consumed by the enrichment calls below.
# The CSV must provide a "Gene" column (gene symbols) and a numeric "Score"
# column (the ranking metric).
stopifnot(
  all(c("Gene", "Score") %in% names(data)),
  is.numeric(data[["Score"]])
)

ranked_list <- local({
  # `[[` avoids the partial name matching that `$` performs on data frames.
  scores <- data[["Score"]]
  symbols <- trimws(as.character(data[["Gene"]]))

  # Drop rows that cannot be ranked or matched to a gene set: missing or
  # non-finite scores and missing/blank gene symbols.
  usable <- is.finite(scores) & !is.na(symbols) & nzchar(symbols)
  ranked <- scores[usable]
  names(ranked) <- symbols[usable]

  # Gene names must be unique for GSEA. When a symbol occurs more than once,
  # keep the occurrence with the largest absolute score.
  n_duplicated <- sum(duplicated(names(ranked)))
  if (n_duplicated > 0) {
    warning(
      n_duplicated,
      " duplicated gene symbol(s) found; keeping the entry with the largest ",
      "absolute score for each.",
      call. = FALSE
    )
    ranked <- ranked[order(abs(ranked), decreasing = TRUE)]
    ranked <- ranked[!duplicated(names(ranked))]
  }

  # Sort the ranked list in decreasing order.
  sort(ranked, decreasing = TRUE)
})

In [None]:
# Gene set enrichment analysis of the ranked list against Gene Ontology terms.
gsea_result <- gseGO(
  # Input: named score vector, names interpreted as gene symbols.
  geneList = ranked_list,
  keyType = "SYMBOL",
  # Annotation source: human OrgDb, Biological Process ontology.
  OrgDb = org.Hs.eg.db,
  ont = "BP",
  # Only gene sets with 10 to 500 genes are tested.
  minGSSize = 10,
  maxGSSize = 500,
  # Scoring weight and significance threshold.
  exponent = 1,
  pvalueCutoff = 0.05,
  verbose = TRUE
)

In [None]:
# Preview the first six rows of the GO enrichment result.
head(gsea_result, n = 6L)

In [None]:
# Load the MSigDB Hallmark ("H") gene sets for human.
# msigdbr >= 10 selects the collection with `collection =` and deprecates
# `category =`; older releases only know `category =`. Pick whichever the
# installed version supports so the cell runs cleanly on both.
msigdb_sets <- if ("collection" %in% names(formals(msigdbr))) {
  msigdbr(species = "Homo sapiens", collection = "H")
} else {
  msigdbr(species = "Homo sapiens", category = "H")
}

# Convert the long table into a named list: one character vector of gene
# symbols per gene set. A symbol can appear on several rows of a set, so
# de-duplicate within each set.
gene_sets <- lapply(
  split(msigdb_sets$gene_symbol, msigdb_sets$gs_name),
  unique
)

# Run GSEA with fgsea on the ranked list.
fgsea_result <- fgsea(
  pathways = gene_sets,
  stats = ranked_list,
  minSize = 10,      # skip gene sets with fewer than 10 genes in the list
  maxSize = 500,     # skip gene sets with more than 500 genes in the list
  eps = 0.0,         # no lower bound on the estimated p-values
  scoreType = "std"  # standard enrichment score
)

In [None]:
# Show the six pathways with the smallest p-values.
head(fgsea_result[order(pval)], n = 6L)

In [None]:

# Draw the running enrichment score of one Hallmark gene set along the ranked
# gene list (replace the gene set name with the one you want to inspect).
gene_set <- gene_sets[["HALLMARK_ANDROGEN_RESPONSE"]]
stopifnot(!is.null(gene_set))

# calcGseaStat() walks the statistics in the order given, so make sure they
# are sorted from highest to lowest score.
stats_sorted <- sort(ranked_list, decreasing = TRUE)

# calcGseaStat() expects `selectedStats` to be integer positions within the
# statistics vector, not gene symbols: translate the gene set into positions
# and drop genes that are absent from the ranked list.
hit_positions <- match(gene_set, names(stats_sorted))
hit_positions <- sort(unique(hit_positions[!is.na(hit_positions)]))
stopifnot(length(hit_positions) > 0)

# returnAllExtremes = TRUE additionally returns the running score just before
# ("bottoms") and at ("tops") every gene-set hit, which is what the curve is
# drawn from; without it only the single enrichment score is returned.
gsea_stat <- calcGseaStat(
  stats_sorted,
  selectedStats = hit_positions,
  returnAllExtremes = TRUE
)

# Enrichment score of the gene set (the most extreme point of the curve).
es <- gsea_stat$res

# Assemble the curve: it starts and ends at zero and, at every hit, moves from
# the "bottom" value just before the hit to the "top" value at the hit.
curve_x <- c(
  0,
  as.vector(rbind(hit_positions - 1, hit_positions)),
  length(stats_sorted) + 1
)
curve_y <- c(
  0,
  as.vector(rbind(gsea_stat$bottoms, gsea_stat$tops)),
  0
)

# Create a custom plot with 2pt line width
plot(curve_x, curve_y, type = "l", lwd = 2,
     xlab = "Position in Ranked List", ylab = "Enrichment Score",
     main = "HALLMARK_ANDROGEN_RESPONSE Enrichment Plot")
# Zero baseline for reference.
abline(h = 0, lty = 3)