# Pathway enrichment

* Use the `msigdbr` package in R to download GSEA pathway gene sets
* Start by downloading the Hallmark gene sets

In [1]:
# Switch the working directory to the Pathway-sets folder. The directory is
# shown before and after the change so the switch is visible in the output;
# relative paths used afterwards resolve against this folder.
getwd()
pathway_dir <- "/scratch/user/s4436039/scdata/Pathway-sets"
setwd(pathway_dir)
getwd()

In [2]:
library(msigdbr)

In [3]:
# Show the gene-set collections available through msigdbr: category,
# sub-category and the number of gene sets in each.
msigdbr_collections()

gs_cat,gs_subcat,num_genesets
<chr>,<chr>,<int>
C1,,299
C2,CGP,3384
C2,CP,29
C2,CP:BIOCARTA,292
C2,CP:KEGG,186
C2,CP:PID,196
C2,CP:REACTOME,1615
C2,CP:WIKIPATHWAYS,664
C3,MIR:MIRDB,2377
C3,MIR:MIR_Legacy,221


In [4]:
# Show the species available through msigdbr (scientific name plus common
# names), to pick the value passed as `species` when downloading gene sets.
msigdbr_species()

species_name,species_common_name
<chr>,<chr>
Anolis carolinensis,"Carolina anole, green anole"
Bos taurus,"bovine, cattle, cow, dairy cow, domestic cattle, domestic cow, ox, oxen"
Caenorhabditis elegans,
Canis lupus familiaris,"dog, dogs"
Danio rerio,"leopard danio, zebra danio, zebra fish, zebrafish"
Drosophila melanogaster,fruit fly
Equus caballus,"domestic horse, equine, horse"
Felis catus,"cat, cats, domestic cat"
Gallus gallus,"bantam, chicken, chickens, Gallus domesticus"
Homo sapiens,human


In [5]:
# Download the human Hallmark (category "H") gene sets and preview the first
# rows. The result has one row per gene within each gene set.
h_gene_sets <- msigdbr(
  species = "human",
  category = "H"
)
head(h_gene_sets)

gs_cat,gs_subcat,gs_name,gene_symbol,entrez_gene,ensembl_gene,human_gene_symbol,human_entrez_gene,human_ensembl_gene,gs_id,gs_pmid,gs_geoid,gs_exact_source,gs_url,gs_description
<chr>,<chr>,<chr>,<chr>,<int>,<chr>,<chr>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
H,,HALLMARK_ADIPOGENESIS,ABCA1,19,ENSG00000165029,ABCA1,19,ENSG00000165029,M5905,26771021,,,,Genes up-regulated during adipocyte differentiation (adipogenesis).
H,,HALLMARK_ADIPOGENESIS,ABCB8,11194,ENSG00000197150,ABCB8,11194,ENSG00000197150,M5905,26771021,,,,Genes up-regulated during adipocyte differentiation (adipogenesis).
H,,HALLMARK_ADIPOGENESIS,ACAA2,10449,ENSG00000167315,ACAA2,10449,ENSG00000167315,M5905,26771021,,,,Genes up-regulated during adipocyte differentiation (adipogenesis).
H,,HALLMARK_ADIPOGENESIS,ACADL,33,ENSG00000115361,ACADL,33,ENSG00000115361,M5905,26771021,,,,Genes up-regulated during adipocyte differentiation (adipogenesis).
H,,HALLMARK_ADIPOGENESIS,ACADM,34,ENSG00000117054,ACADM,34,ENSG00000117054,M5905,26771021,,,,Genes up-regulated during adipocyte differentiation (adipogenesis).
H,,HALLMARK_ADIPOGENESIS,ACADS,35,ENSG00000122971,ACADS,35,ENSG00000122971,M5905,26771021,,,,Genes up-regulated during adipocyte differentiation (adipogenesis).


In [7]:
# List the distinct gene-set names present in the downloaded table.
unique(h_gene_sets[["gs_name"]])

In [9]:
# Separate the table by gs_name: the single data frame becomes a named list
# holding one smaller data frame per gene set.
h_sets_list <- split(h_gene_sets, h_gene_sets[["gs_name"]])

# Preview one element of the list to confirm the split worked.
head(h_sets_list[["HALLMARK_ADIPOGENESIS"]])

gs_cat,gs_subcat,gs_name,gene_symbol,entrez_gene,ensembl_gene,human_gene_symbol,human_entrez_gene,human_ensembl_gene,gs_id,gs_pmid,gs_geoid,gs_exact_source,gs_url,gs_description
<chr>,<chr>,<chr>,<chr>,<int>,<chr>,<chr>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
H,,HALLMARK_ADIPOGENESIS,ABCA1,19,ENSG00000165029,ABCA1,19,ENSG00000165029,M5905,26771021,,,,Genes up-regulated during adipocyte differentiation (adipogenesis).
H,,HALLMARK_ADIPOGENESIS,ABCB8,11194,ENSG00000197150,ABCB8,11194,ENSG00000197150,M5905,26771021,,,,Genes up-regulated during adipocyte differentiation (adipogenesis).
H,,HALLMARK_ADIPOGENESIS,ACAA2,10449,ENSG00000167315,ACAA2,10449,ENSG00000167315,M5905,26771021,,,,Genes up-regulated during adipocyte differentiation (adipogenesis).
H,,HALLMARK_ADIPOGENESIS,ACADL,33,ENSG00000115361,ACADL,33,ENSG00000115361,M5905,26771021,,,,Genes up-regulated during adipocyte differentiation (adipogenesis).
H,,HALLMARK_ADIPOGENESIS,ACADM,34,ENSG00000117054,ACADM,34,ENSG00000117054,M5905,26771021,,,,Genes up-regulated during adipocyte differentiation (adipogenesis).
H,,HALLMARK_ADIPOGENESIS,ACADS,35,ENSG00000122971,ACADS,35,ENSG00000122971,M5905,26771021,,,,Genes up-regulated during adipocyte differentiation (adipogenesis).


In [14]:
# Save the gene sets to a single Excel workbook, one sheet per gene set
# (i.e. one sheet per data frame in h_sets_list).
library(openxlsx)

# Use generic sheet names ("Sheet1", "Sheet2", ...) in list order; using the
# gene-set names themselves made the sheet names too long.
numeric_names <- paste0("Sheet", seq_along(h_sets_list))

# Write the workbook into the current working directory.
write.xlsx(
  x = h_sets_list,
  file = "hallmark_sets.xlsx",
  sheetName = numeric_names
)