# Rank-based enrichment analysis of genes per topic

## Load required libraries

In [1]:
import pandas as pd
import gseapy as gp
import matplotlib.pyplot as plt
from pathlib import Path
from tqdm import tqdm

In [2]:
# Display the installed gseapy version so the run environment is recorded with the results.
gp.__version__

'0.10.8'

## Select appropriate databases from the list of databases available in Enrichr

In [3]:
# Ask gseapy for the names of the Enrichr gene-set libraries available for human.
human = gp.get_library_name(organism='Human')
# Show how many libraries exist and list them; the `[:10]` slice is commented
# out, so the complete list is displayed rather than only the first ten names.
len(human), human#[:10]

(197,
 ['ARCHS4_Cell-lines',
  'ARCHS4_IDG_Coexp',
  'ARCHS4_Kinases_Coexp',
  'ARCHS4_TFs_Coexp',
  'ARCHS4_Tissues',
  'Achilles_fitness_decrease',
  'Achilles_fitness_increase',
  'Aging_Perturbations_from_GEO_down',
  'Aging_Perturbations_from_GEO_up',
  'Allen_Brain_Atlas_10x_scRNA_2021',
  'Allen_Brain_Atlas_down',
  'Allen_Brain_Atlas_up',
  'Azimuth_Cell_Types_2021',
  'BioCarta_2013',
  'BioCarta_2015',
  'BioCarta_2016',
  'BioPlanet_2019',
  'BioPlex_2017',
  'CCLE_Proteomics_2020',
  'CORUM',
  'COVID-19_Related_Gene_Sets',
  'COVID-19_Related_Gene_Sets_2021',
  'Cancer_Cell_Line_Encyclopedia',
  'CellMarker_Augmented_2021',
  'ChEA_2013',
  'ChEA_2015',
  'ChEA_2016',
  'Chromosome_Location',
  'Chromosome_Location_hg19',
  'ClinVar_2019',
  'DSigDB',
  'Data_Acquisition_Method_Most_Popular_Genes',
  'DepMap_WG_CRISPR_Screens_Broad_CellLines_2019',
  'DepMap_WG_CRISPR_Screens_Sanger_CellLines_2019',
  'Descartes_Cell_Types_and_Tissue_2021',
  'DisGeNET',
  'Disease_Perturb

## Load gene and rank data for each topic

In [4]:
# Load the per-topic ranking table. Each topic occupies a pair of columns:
# `topic_N` (gene symbols) and `topic_N_prop` (the score used for ranking).
# NOTE(review): the first CSV column is read as "Unnamed: 0" (see the preview
# below); presumably it is a saved row index - confirm, and consider
# `index_col=0` if so. Downstream cells only use the topic columns.
rank_by_topic = pd.read_csv("Results/Rank_by_topic.csv")

# Preview the first rows to check the column layout.
rank_by_topic.head()

Unnamed: 0,topic_0,topic_0_prop,topic_1,topic_1_prop,topic_2,topic_2_prop,topic_3,topic_3_prop,topic_4,topic_4_prop,topic_5,topic_5_prop,topic_6,topic_6_prop,topic_7,topic_7_prop,topic_8,topic_8_prop,topic_9,topic_9_prop
0,FTL,0.045098,CD74,0.043416,IGKC,0.357901,KRT14,0.068934,MALAT1,0.03837,S100A8,0.061923,MALAT1,0.401556,B2M,0.020044,RPL41,0.020805,KRT10,0.045418
1,FTH1,0.043526,TMSB4X,0.033774,IGLC2,0.110024,KRT5,0.014258,B2M,0.030612,S100A9,0.037184,NEAT1,0.014969,MALAT1,0.01962,RPLP1,0.018459,KRT1,0.039266
2,B2M,0.020449,HLA-DRA,0.033032,IGLC3,0.06918,B2M,0.013784,TMSB4X,0.025853,KRT16,0.024092,MT-CO1,0.0089,FTH1,0.012619,RPS18,0.01831,DMKN,0.017053
3,TMSB4X,0.017277,B2M,0.022879,IGHG3,0.067601,MALAT1,0.010009,ACTB,0.013113,KRT17,0.022191,RPS27,0.005407,LGALS1,0.006454,RPL10,0.018259,LY6D,0.011399
4,MALAT1,0.015034,ACTB,0.021814,IGHG1,0.047243,RPLP1,0.009915,HLA-B,0.007554,KRT14,0.020755,MT-CO3,0.005208,S100A6,0.00621,RPL34,0.018172,NEAT1,0.011235


In [5]:
# Build one two-column frame per topic: [gene column, score column].
# A score column is any column whose name contains "prop"; the matching gene
# column has the same name with the "_prop" suffix removed.
rkn_DF_list = [
    rank_by_topic[[score_col.replace("_prop", ""), score_col]]
    for score_col in rank_by_topic.columns
    if "prop" in score_col
]


## Enrichment function

In [6]:
%%time
Outdir = "Results/Enrichment/"

def Enrich_topic(rkn_topic_DF_list, Gene_set, Print=False, permutation_num=10, top_n=5):
    """Run pre-ranked GSEA for every topic against one gene-set library.

    For each frame in ``rkn_topic_DF_list`` this calls ``gp.prerank`` and
    writes the first ``top_n`` rows of the result table (``res2d``) to
    ``<Outdir>/<Gene_set>/<topic>.tsv`` as tab-separated text, where
    ``<topic>`` is the name of the frame's first column.

    Parameters
    ----------
    rkn_topic_DF_list : iterable of pandas.DataFrame
        One ranking frame per topic, passed unchanged to ``gp.prerank`` as
        ``rnk``. In this notebook each frame holds a gene column followed by
        its ``*_prop`` score column.
    Gene_set : str
        Library name passed to ``gp.prerank`` as ``gene_sets``; also used as
        the name of the output sub-folder.
    Print : bool, optional
        Currently unused; kept so existing calls that pass it keep working.
    permutation_num : int, optional
        Number of permutations passed to ``gp.prerank``. The default of 10 is
        a deliberately small value to speed up testing.
        NOTE(review): raise it for results that will be reported.
    top_n : int, optional
        Number of leading result rows to save. Defaults to 5, which is what a
        bare ``DataFrame.head()`` returns.

    Returns
    -------
    None
        Results are written to disk. On success the library name is printed.

    Notes
    -----
    Processing stops at the first topic that fails: the error is printed
    (library, topic and exception) and the remaining topics are skipped.
    Only ``Exception`` subclasses are handled this way, so an interrupt from
    the user still stops the run.
    """
    # The output folder depends only on the library, so create it once.
    out_folder = Path(Outdir) / Gene_set
    out_folder.mkdir(parents=True, exist_ok=True)

    for topic_rkn in rkn_topic_DF_list:
        # The first column name (e.g. "topic_0") names the output file.
        topic_name = list(topic_rkn)[0]
        out_file = out_folder / f"{topic_name}.tsv"
        try:
            pre_res = gp.prerank(rnk=topic_rkn, gene_sets=Gene_set,
                                 processes=4,
                                 permutation_num=permutation_num,
                                 outdir='temp/temp', format='png', seed=6)
            pre_res.res2d.head(top_n).to_csv(out_file, sep="\t")
        except Exception as exc:
            # Report what failed and why, then give up on this library.
            print(f"Error:{Gene_set} (topic {topic_name}): {exc!r}")
            return

    print(Gene_set)

    
# Enrich_topic(rkn_DF_list, 'Azimuth_Cell_Types_2021')

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 11.4 µs


## Enrichment analysis

In [7]:
# Enrich every topic against the Azimuth cell-type library; results are
# written under Results/Enrichment/Azimuth_Cell_Types_2021/.
Enrich_topic(rkn_DF_list, 'Azimuth_Cell_Types_2021')

In [8]:
# Enrich every topic against the NIH-grants gene library; results are
# written under Results/Enrichment/Genes_Associated_with_NIH_Grants/.
Enrich_topic(rkn_DF_list, 'Genes_Associated_with_NIH_Grants')

In [10]:
# Enrich every topic against the CellMarker library; results are
# written under Results/Enrichment/CellMarker_Augmented_2021/.
Enrich_topic(rkn_DF_list, 'CellMarker_Augmented_2021')

# END