# Pathway enrichment

* Use the `msigdbr` package in R to download MSigDB pathway gene sets for GSEA
* Start by downloading the Hallmark gene sets

In [1]:
# Set the working directory for this notebook.
# getwd() is called before and after setwd() so the cell shows the directory
# in effect before the change and confirms the new one afterwards.
getwd()
setwd('/scratch/user/s4436039/scdata/Pathway-sets')
getwd()

In [2]:
library(msigdbr)

In [3]:
# List the gene set collections (category / sub-category) available in the
# installed msigdbr, with the number of gene sets in each.
msigdbr_collections()

gs_cat,gs_subcat,num_genesets
<chr>,<chr>,<int>
C1,,299
C2,CGP,3384
C2,CP,29
C2,CP:BIOCARTA,292
C2,CP:KEGG,186
C2,CP:PID,196
C2,CP:REACTOME,1615
C2,CP:WIKIPATHWAYS,664
C3,MIR:MIRDB,2377
C3,MIR:MIR_Legacy,221


In [4]:
# List the species (scientific and common names) available in msigdbr.
msigdbr_species()

species_name,species_common_name
<chr>,<chr>
Anolis carolinensis,"Carolina anole, green anole"
Bos taurus,"bovine, cattle, cow, dairy cow, domestic cattle, domestic cow, ox, oxen"
Caenorhabditis elegans,
Canis lupus familiaris,"dog, dogs"
Danio rerio,"leopard danio, zebra danio, zebra fish, zebrafish"
Drosophila melanogaster,fruit fly
Equus caballus,"domestic horse, equine, horse"
Felis catus,"cat, cats, domestic cat"
Gallus gallus,"bantam, chicken, chickens, Gallus domesticus"
Homo sapiens,human


In [5]:
# Download the human Hallmark gene sets (MSigDB category "H").
# The result is a long table with one row per gene-set / gene pair; the
# gs_name column identifies the gene set each gene belongs to.
h_gene_sets <- msigdbr(species = "human", category = "H")
# Preview the first rows to check the columns returned.
head(h_gene_sets)

gs_cat,gs_subcat,gs_name,gene_symbol,entrez_gene,ensembl_gene,human_gene_symbol,human_entrez_gene,human_ensembl_gene,gs_id,gs_pmid,gs_geoid,gs_exact_source,gs_url,gs_description
<chr>,<chr>,<chr>,<chr>,<int>,<chr>,<chr>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
H,,HALLMARK_ADIPOGENESIS,ABCA1,19,ENSG00000165029,ABCA1,19,ENSG00000165029,M5905,26771021,,,,Genes up-regulated during adipocyte differentiation (adipogenesis).
H,,HALLMARK_ADIPOGENESIS,ABCB8,11194,ENSG00000197150,ABCB8,11194,ENSG00000197150,M5905,26771021,,,,Genes up-regulated during adipocyte differentiation (adipogenesis).
H,,HALLMARK_ADIPOGENESIS,ACAA2,10449,ENSG00000167315,ACAA2,10449,ENSG00000167315,M5905,26771021,,,,Genes up-regulated during adipocyte differentiation (adipogenesis).
H,,HALLMARK_ADIPOGENESIS,ACADL,33,ENSG00000115361,ACADL,33,ENSG00000115361,M5905,26771021,,,,Genes up-regulated during adipocyte differentiation (adipogenesis).
H,,HALLMARK_ADIPOGENESIS,ACADM,34,ENSG00000117054,ACADM,34,ENSG00000117054,M5905,26771021,,,,Genes up-regulated during adipocyte differentiation (adipogenesis).
H,,HALLMARK_ADIPOGENESIS,ACADS,35,ENSG00000122971,ACADS,35,ENSG00000122971,M5905,26771021,,,,Genes up-regulated during adipocyte differentiation (adipogenesis).


In [7]:
# List the distinct gene set names present in the Hallmark download.
unique(h_gene_sets$gs_name)

In [9]:
# Separate the table by gs_name: the data frame becomes a named list of
# smaller data frames, one per gene set.
h_sets_list <- split(h_gene_sets, f = h_gene_sets[["gs_name"]])
# Preview one of the resulting per-gene-set data frames.
head(h_sets_list[["HALLMARK_ADIPOGENESIS"]])

gs_cat,gs_subcat,gs_name,gene_symbol,entrez_gene,ensembl_gene,human_gene_symbol,human_entrez_gene,human_ensembl_gene,gs_id,gs_pmid,gs_geoid,gs_exact_source,gs_url,gs_description
<chr>,<chr>,<chr>,<chr>,<int>,<chr>,<chr>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
H,,HALLMARK_ADIPOGENESIS,ABCA1,19,ENSG00000165029,ABCA1,19,ENSG00000165029,M5905,26771021,,,,Genes up-regulated during adipocyte differentiation (adipogenesis).
H,,HALLMARK_ADIPOGENESIS,ABCB8,11194,ENSG00000197150,ABCB8,11194,ENSG00000197150,M5905,26771021,,,,Genes up-regulated during adipocyte differentiation (adipogenesis).
H,,HALLMARK_ADIPOGENESIS,ACAA2,10449,ENSG00000167315,ACAA2,10449,ENSG00000167315,M5905,26771021,,,,Genes up-regulated during adipocyte differentiation (adipogenesis).
H,,HALLMARK_ADIPOGENESIS,ACADL,33,ENSG00000115361,ACADL,33,ENSG00000115361,M5905,26771021,,,,Genes up-regulated during adipocyte differentiation (adipogenesis).
H,,HALLMARK_ADIPOGENESIS,ACADM,34,ENSG00000117054,ACADM,34,ENSG00000117054,M5905,26771021,,,,Genes up-regulated during adipocyte differentiation (adipogenesis).
H,,HALLMARK_ADIPOGENESIS,ACADS,35,ENSG00000122971,ACADS,35,ENSG00000122971,M5905,26771021,,,,Genes up-regulated during adipocyte differentiation (adipogenesis).


In [None]:
# Save as an Excel file (each sheet will be a separate gene set / data frame)
library(openxlsx)

In [None]:
# Build one numbered sheet name per gene set ("Sheet1", "Sheet2", ...).
# Using the gene set names themselves made the sheet names too long.
numeric_names <- paste0("Sheet", seq_len(length(h_sets_list)))

In [None]:
# Write the Hallmark gene sets to an Excel workbook in the working directory,
# one data frame per sheet, using the numbered sheet names.
write.xlsx(
  x = h_sets_list,
  file = "hallmark_sets.xlsx",
  sheetName = numeric_names
)

### Next: download selected KEGG pathways
* all C2 --> CP --> CP:KEGG_MEDICUS
* KEGG_MEDICUS_REFERENCE_ANTIGEN_PROCESSING_AND_PRESENTATION_BY_MHC_CLASS_I_MOLECULES
* KEGG_MEDICUS_REFERENCE_ANTIGEN_PROCESSING_AND_PRESENTATION_BY_MHC_CLASS_II_MOLECULES
* KEGG_MEDICUS_REFERENCE_CD80_CD86_CD28_PI3K_SIGNALING_PATHWAY
* KEGG_MEDICUS_REFERENCE_CD80_CD86_CTLA4_PP2A_SIGNALING_PATHWAY
* KEGG_MEDICUS_REFERENCE_CGAS_STING_SIGNALING_PATHWAY
* KEGG_MEDICUS_REFERENCE_CHOLESTEROL_BIOSYNTHESIS
* KEGG_MEDICUS_REFERENCE_CROSSTALK_BETWEEN_EXTRINSIC_AND_INTRINSIC_APOPTOTIC_PATHWAYS
* KEGG_MEDICUS_REFERENCE_CYTOKINE_JAK_STAT_SIGNALING_PATHWAY
* KEGG_MEDICUS_REFERENCE_EARLY_ENDOSOMAL_FUSION
* KEGG_MEDICUS_REFERENCE_EGF_JAK_STAT_SIGNALING_PATHWAY
* KEGG_MEDICUS_REFERENCE_EXTRINSIC_APOPTOTIC_PATHWAY
* KEGG_MEDICUS_REFERENCE_GLYCOLYSIS
* KEGG_MEDICUS_REFERENCE_IL10_FAMILY_TO_JAK_STAT_SIGNALING_PATHWAY
* KEGG_MEDICUS_REFERENCE_IL12_23_TO_JAK_STAT_SIGNALING_PATHWAY
* KEGG_MEDICUS_REFERENCE_IL1_IL1R_JNK_SIGNALING_PATHWAY
* KEGG_MEDICUS_REFERENCE_IL1_IL1R_P38_SIGNALING_PATHWAY
* KEGG_MEDICUS_REFERENCE_IL2_FAMILY_TO_JAK_STAT_SIGNALING_PATHWAY
* KEGG_MEDICUS_REFERENCE_IL6_FAMILY_TO_JAK_STAT_SIGNALING_PATHWAY
* KEGG_MEDICUS_REFERENCE_INTRINSIC_APOPTOTIC_PATHWAY
* KEGG_MEDICUS_REFERENCE_LEWIS_X_ANTIGEN_BIOSYNTHESIS
* KEGG_MEDICUS_REFERENCE_LYSOSOMAL_CA2_RELEASE
* KEGG_MEDICUS_REFERENCE_MISMATCH_REPAIR
* KEGG_MEDICUS_REFERENCE_NOTCH_SIGNALING_PATHWAY
* KEGG_MEDICUS_REFERENCE_TGF_BETA_SIGNALING_PATHWAY
* KEGG_MEDICUS_REFERENCE_TLR1_2_4_NFKB_SIGNALING_PATHWAY
* KEGG_MEDICUS_REFERENCE_TLR2_4_MAPK_SIGNALING_PATHWAY
* KEGG_MEDICUS_REFERENCE_TLR3_IRF3_SIGNALING_PATHWAY
* KEGG_MEDICUS_REFERENCE_TLR3_IRF7_SIGNALING_PATHWAY
* KEGG_MEDICUS_REFERENCE_TLR3_NFKB_SIGNALING_PATHWAY
* KEGG_MEDICUS_REFERENCE_TLR4_IRF3_7_SIGNALING_PATHWAY
* KEGG_MEDICUS_REFERENCE_TLR5_NFKB_SIGNALING_PATHWAY
* KEGG_MEDICUS_REFERENCE_TLR7_8_9_IRF5_SIGNALING_PATHWAY
* KEGG_MEDICUS_REFERENCE_TLR7_9_IRF7_SIGNALING_PATHWAY
* KEGG_MEDICUS_REFERENCE_TNF_JNK_SIGNALING_PATHWAY
* KEGG_MEDICUS_REFERENCE_TNF_NFKB_SIGNALING_PATHWAY
* KEGG_MEDICUS_REFERENCE_TYPE_I_IFN_SIGNALING_PATHWAY
* KEGG_MEDICUS_REFERENCE_TYPE_I_INTERFERON_TO_JAK_STAT_SIGNALING_PATHWAY
* KEGG_MEDICUS_REFERENCE_TYPE_II_INTERFERON_TO_JAK_STAT_SIGNALING_PATHWAY

### Found that the installed msigdbr only has the KEGG legacy sets (186 sets)
* all C2 --> CP --> CP:KEGG_LEGACY (listed as `CP:KEGG` in `msigdbr_collections()`)
* KEGG_ADIPOCYTOKINE_SIGNALING_PATHWAY
* KEGG_ANTIGEN_PROCESSING_AND_PRESENTATION
* KEGG_APOPTOSIS
* KEGG_CALCIUM_SIGNALING_PATHWAY
* KEGG_CELL_ADHESION_MOLECULES_CAMS
* KEGG_CHEMOKINE_SIGNALING_PATHWAY
* KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION
* KEGG_CYTOSOLIC_DNA_SENSING_PATHWAY
* KEGG_ENDOCYTOSIS
* KEGG_FATTY_ACID_METABOLISM
* KEGG_FC_GAMMA_R_MEDIATED_PHAGOCYTOSIS
* KEGG_GLYCOLYSIS_GLUCONEOGENESIS
* KEGG_JAK_STAT_SIGNALING_PATHWAY
* KEGG_MAPK_SIGNALING_PATHWAY
* KEGG_NOD_LIKE_RECEPTOR_SIGNALING_PATHWAY
* KEGG_NOTCH_SIGNALING_PATHWAY
* KEGG_OXIDATIVE_PHOSPHORYLATION
* KEGG_PROTEASOME
* KEGG_PROTEIN_EXPORT
* KEGG_TGF_BETA_SIGNALING_PATHWAY
* KEGG_TOLL_LIKE_RECEPTOR_SIGNALING_PATHWAY
* KEGG_TYROSINE_METABOLISM
* KEGG_UBIQUITIN_MEDIATED_PROTEOLYSIS
* KEGG_VEGF_SIGNALING_PATHWAY


In [4]:
# List the available collections again to find the KEGG sub-category name
# and how many gene sets it contains.
msigdbr_collections()

gs_cat,gs_subcat,num_genesets
<chr>,<chr>,<int>
C1,,299
C2,CGP,3384
C2,CP,29
C2,CP:BIOCARTA,292
C2,CP:KEGG,186
C2,CP:PID,196
C2,CP:REACTOME,1615
C2,CP:WIKIPATHWAYS,664
C3,MIR:MIRDB,2377
C3,MIR:MIR_Legacy,221


In [5]:
# Download the human KEGG gene sets (category "C2", sub-category "CP:KEGG").
# The result is a long table with one row per gene-set / gene pair; the
# gs_name column identifies the pathway each gene belongs to.
kegg_sets <- msigdbr(species = "human", category = "C2", subcategory = "CP:KEGG")
# Preview the first rows to check the columns returned.
head(kegg_sets)

gs_cat,gs_subcat,gs_name,gene_symbol,entrez_gene,ensembl_gene,human_gene_symbol,human_entrez_gene,human_ensembl_gene,gs_id,gs_pmid,gs_geoid,gs_exact_source,gs_url,gs_description
<chr>,<chr>,<chr>,<chr>,<int>,<chr>,<chr>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
C2,CP:KEGG,KEGG_ABC_TRANSPORTERS,ABCA1,19,ENSG00000165029,ABCA1,19,ENSG00000165029,M11911,,,hsa02010,http://www.genome.jp/kegg/pathway/hsa/hsa02010.html,ABC transporters
C2,CP:KEGG,KEGG_ABC_TRANSPORTERS,ABCA10,10349,ENSG00000154263,ABCA10,10349,ENSG00000154263,M11911,,,hsa02010,http://www.genome.jp/kegg/pathway/hsa/hsa02010.html,ABC transporters
C2,CP:KEGG,KEGG_ABC_TRANSPORTERS,ABCA12,26154,ENSG00000144452,ABCA12,26154,ENSG00000144452,M11911,,,hsa02010,http://www.genome.jp/kegg/pathway/hsa/hsa02010.html,ABC transporters
C2,CP:KEGG,KEGG_ABC_TRANSPORTERS,ABCA13,154664,ENSG00000179869,ABCA13,154664,ENSG00000179869,M11911,,,hsa02010,http://www.genome.jp/kegg/pathway/hsa/hsa02010.html,ABC transporters
C2,CP:KEGG,KEGG_ABC_TRANSPORTERS,ABCA2,20,ENSG00000107331,ABCA2,20,ENSG00000107331,M11911,,,hsa02010,http://www.genome.jp/kegg/pathway/hsa/hsa02010.html,ABC transporters
C2,CP:KEGG,KEGG_ABC_TRANSPORTERS,ABCA3,21,ENSG00000167972,ABCA3,21,ENSG00000167972,M11911,,,hsa02010,http://www.genome.jp/kegg/pathway/hsa/hsa02010.html,ABC transporters


In [6]:
# List the distinct gene set names present in the KEGG download.
unique(kegg_sets$gs_name)

In [7]:
# Separate the table by gs_name: the data frame becomes a named list of
# smaller data frames, one per gene set.
kegg_sets_list <- split(kegg_sets, f = kegg_sets[["gs_name"]])
# Preview one of the resulting per-gene-set data frames.
head(kegg_sets_list[["KEGG_ABC_TRANSPORTERS"]])

gs_cat,gs_subcat,gs_name,gene_symbol,entrez_gene,ensembl_gene,human_gene_symbol,human_entrez_gene,human_ensembl_gene,gs_id,gs_pmid,gs_geoid,gs_exact_source,gs_url,gs_description
<chr>,<chr>,<chr>,<chr>,<int>,<chr>,<chr>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
C2,CP:KEGG,KEGG_ABC_TRANSPORTERS,ABCA1,19,ENSG00000165029,ABCA1,19,ENSG00000165029,M11911,,,hsa02010,http://www.genome.jp/kegg/pathway/hsa/hsa02010.html,ABC transporters
C2,CP:KEGG,KEGG_ABC_TRANSPORTERS,ABCA10,10349,ENSG00000154263,ABCA10,10349,ENSG00000154263,M11911,,,hsa02010,http://www.genome.jp/kegg/pathway/hsa/hsa02010.html,ABC transporters
C2,CP:KEGG,KEGG_ABC_TRANSPORTERS,ABCA12,26154,ENSG00000144452,ABCA12,26154,ENSG00000144452,M11911,,,hsa02010,http://www.genome.jp/kegg/pathway/hsa/hsa02010.html,ABC transporters
C2,CP:KEGG,KEGG_ABC_TRANSPORTERS,ABCA13,154664,ENSG00000179869,ABCA13,154664,ENSG00000179869,M11911,,,hsa02010,http://www.genome.jp/kegg/pathway/hsa/hsa02010.html,ABC transporters
C2,CP:KEGG,KEGG_ABC_TRANSPORTERS,ABCA2,20,ENSG00000107331,ABCA2,20,ENSG00000107331,M11911,,,hsa02010,http://www.genome.jp/kegg/pathway/hsa/hsa02010.html,ABC transporters
C2,CP:KEGG,KEGG_ABC_TRANSPORTERS,ABCA3,21,ENSG00000167972,ABCA3,21,ENSG00000167972,M11911,,,hsa02010,http://www.genome.jp/kegg/pathway/hsa/hsa02010.html,ABC transporters


In [8]:
# Save as an Excel file (each sheet will be a separate gene set / data frame)
library(openxlsx)

In [9]:
# Build one numbered sheet name per gene set ("Sheet1", "Sheet2", ...).
# Using the gene set names themselves made the sheet names too long.
numeric_names <- paste0("Sheet", seq_len(length(kegg_sets_list)))

In [10]:
# Write the KEGG gene sets to an Excel workbook in the working directory,
# one data frame per sheet, using the numbered sheet names.
write.xlsx(
  x = kegg_sets_list,
  file = "kegg_sets.xlsx",
  sheetName = numeric_names
)