# Run GSEA on DE output

In [None]:
import pandas as pd
import numpy as np
import glob
from pybiomart import Dataset
import gseapy as gp

In [None]:
# Number of threads passed to the GSEA run.
NUM_CPUS = 8

# Dataset directory to analyse; swap with the commented path to switch dataset.
# DATA_PATH = '/data/expression_atlas/v1/GSE162828/'
DATA_PATH = '/data/expression_atlas/v1/GSE122459/'

# Prefix shared by the result files of this dataset: 'results/' followed by
# the last component of DATA_PATH (trailing slash ignored).
RESULTS_PATH = 'results/%s' % DATA_PATH.rstrip('/').rsplit('/', 1)[-1]

In [None]:
# Collect the paths of the pydeseq2 output tables for this dataset:
# transcript-level files first, then gene-level files.

de_transcript_files, de_gene_files = (
    glob.glob(pattern % RESULTS_PATH)
    for pattern in ('%s_transcript*.csv', '%s_gene*.csv')
)
de_transcript_files, de_gene_files

In [None]:
# Read every DE result file into a DataFrame keyed by its path; the first
# CSV column is used as the index.

de_transcript_dfs = {
    csv_path: pd.read_csv(csv_path, index_col=0)
    for csv_path in de_transcript_files
}
de_gene_dfs = {
    csv_path: pd.read_csv(csv_path, index_col=0)
    for csv_path in de_gene_files
}

In [None]:
# Keep only rows that belong to an Ensembl gene (id starts with 'ENSG').
# Gene-level tables carry the id in their index; transcript-level tables
# carry it in the 'gene_id' column.

gene_prefix = 'ENSG'

# na=False treats a missing id as "not an Ensembl gene". Without it the mask
# contains NaN for such rows and boolean indexing raises instead of filtering.
for k, table in de_gene_dfs.items():
    de_gene_dfs[k] = table[table.index.str.startswith(gene_prefix, na=False)]

for k, table in de_transcript_dfs.items():
    de_transcript_dfs[k] = table[table['gene_id'].str.startswith(gene_prefix, na=False)]


In [None]:
# Download the Ensembl gene id -> external gene name table from BioMart and
# rename its columns to 'gene_id' / 'gene_name'.

dataset = Dataset(name='hsapiens_gene_ensembl', host='http://www.ensembl.org')

external_gene_mapping = dataset.query(
    attributes=['ensembl_gene_id', 'external_gene_name'],
).rename(columns={'Gene stable ID': 'gene_id', 'Gene name': 'gene_name'})

external_gene_mapping

In [None]:
# Strip the version suffix from Ensembl gene ids (everything after the first
# '.') and attach the external gene name from the BioMart mapping.
# Tables that already have a 'gene_name' column are skipped, so this cell can
# be re-run without merging twice.

for k, table in de_gene_dfs.items():
    if 'gene_name' in table.columns:
        continue

    table.index = table.index.str.split('.').str[0]

    # Joining the left index against a right column takes the result index
    # from the mapping, so the gene id is restored as the index afterwards.
    de_gene_dfs[k] = table.merge(
        external_gene_mapping,
        left_index=True,
        right_on='gene_id',
    ).set_index('gene_id')


# Index the mapping by gene id once. Joining a left column against the right
# index keeps each transcript table's own index, whereas a column-on-column
# merge would replace it with a fresh 0..n-1 range.
gene_name_by_id = external_gene_mapping.set_index('gene_id')

for k, table in de_transcript_dfs.items():
    if 'gene_name' in table.columns:
        continue

    # assign() builds a new frame instead of writing into a filtered one.
    unversioned = table.assign(gene_id=table['gene_id'].str.split('.').str[0])

    de_transcript_dfs[k] = unversioned.merge(
        gene_name_by_id,
        left_on='gene_id',
        right_index=True,
    )


In [None]:
# Discard rows whose 'gene_name' is missing, gene-level tables first and then
# transcript-level tables.

for k, table in de_gene_dfs.items():
    de_gene_dfs[k] = table[table['gene_name'].notnull()]

for k, table in de_transcript_dfs.items():
    de_transcript_dfs[k] = table[table['gene_name'].notnull()]

In [None]:
# Build one rank table per gene-level DE result: gene name plus Wald test
# statistic, sorted by the statistic in ascending order. Each entry is a
# two-slot list [rank_table, None]; the second slot is filled with the GSEA
# result by the prerank step.

de_gene_ranks = {}
for k, table in de_gene_dfs.items():
    rank_table = table.reset_index()[['gene_name', 'stat']].sort_values(by='stat')
    de_gene_ranks[k] = [rank_table, None]

In [None]:
# Run GSEA prerank on every rank table with the chosen gene-set library and
# store the result object in slot 1 of its [rank_table, result] entry.
# The first 10 result rows are printed per input file.

for k, rank_entry in de_gene_ranks.items():
    rank_entry[1] = gp.prerank(
        rnk=rank_entry[0],
        # Alternative gene-set libraries:
        # gene_sets='KEGG_2016',
        # gene_sets='GO_Biological_Process_2013',
        # gene_sets='ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X',
        gene_sets='Reactome_2016',
        threads=NUM_CPUS,
        min_size=5,
        max_size=1000,
        # Spelled 'permuation_num' before, which is not the name of gseapy's
        # permutation-count parameter.
        permutation_num=1000,
        outdir=None,
        seed=42,
        verbose=True,
    )

    print(k)
    print(rank_entry[1].res2d.head(10).to_string())


In [None]:
# Plot the first 20 terms of each prerank result table.

for k in de_gene_dfs:
    prerank_result = de_gene_ranks[k][1]
    terms = prerank_result.res2d.Term
    axs = prerank_result.plot(terms=terms[:20])