In [None]:
import pandas as pd
import numpy as np
import scanpy as sc
import pathlib as pl
from sklearn.preprocessing import StandardScaler

import seaborn as sns
import matplotlib.pyplot as plt

import signaturescoring as ssc

from tqdm.notebook import tqdm

from statannotations.Annotator import Annotator

In [None]:
from scipy.stats import fisher_exact

In [None]:
def pretty_ax(ax):
    """Restyle an axes object in place for a cleaner, less boxed-in look.

    The right and top spines are hidden, the bottom and left spines are
    drawn with a line width of 1.5, and ``tick_params`` is called for both
    axes and both major/minor ticks so that bottom tick marks are on, top
    and left tick marks are off, and bottom/left tick labels are shown.

    Parameters
    ----------
    ax
        Object exposing ``spines`` (indexable by side name) and a
        ``tick_params`` method, e.g. a Matplotlib ``Axes``. Modified in place.

    Returns
    -------
    None
    """
    # Remove the box edges that carry no axis.
    for hidden_side in ("right", "top"):
        ax.spines[hidden_side].set_visible(False)

    tick_settings = {
        "axis": "both",
        "which": "both",
        "bottom": True,
        "top": False,
        "left": False,
        "labelbottom": True,
        "labelleft": True,
    }
    ax.tick_params(**tick_settings)

    # Emphasise the two remaining axis lines.
    for emphasised_side in ("bottom", "left"):
        ax.spines[emphasised_side].set_linewidth(1.5)

In [None]:
# Folder with one table per signature (presumably cNMF malignant-program gene
# rankings, judging by the directory name — confirm).
# NOTE(review): absolute, user-specific path; consider a configurable data root.
signature_dir = pl.Path("/Users/josephineyates/Documents/VisitingPhd_VanAllenLab/cNMF_malignant_genes_new_cosine")

# Maps file stem -> table indexed by the first CSV column, with index entries
# starting with "MT-", "RPS" or "RPL" removed (prefixes conventionally used
# for mitochondrial and ribosomal-protein genes).
# The whole table is kept (not truncated to a top-N gene list) because it is
# passed as `rnk` to the preranked GSEA further down.
full_sigs = {}
# sorted() makes the load order (and dict order) deterministic across machines.
for s in sorted(signature_dir.iterdir()):
    # Skip hidden files such as macOS ".DS_Store": they are not signature
    # tables and would make read_csv fail (the science_sigs loader has the
    # same guard).
    if s.name.startswith("."):
        continue
    sig = s.stem
    sig_table = pd.read_csv(s, index_col=0)
    # Row count before filtering; overwritten on every iteration, so after
    # the loop it holds the value for the last file read.
    _N_GENES = sig_table.shape[0]
    full_sigs[sig] = sig_table[~sig_table.index.str.startswith(("MT-","RPS","RPL"))]

In [None]:
# Folder of externally sourced gene-list tables; every file in it except
# ".DS_Store" is read into `science_sigs` in the following cell.
# NOTE(review): absolute, user-specific path — confirm it exists on the
# machine running the notebook, or make it configurable.
science_path = pl.Path("/Users/josephineyates/Documents/VisitingPhd_VanAllenLab/auxiliary_data/ScienceNowickiFitzgerald_Barretts_data")

In [None]:
# Read every file in `science_path` into a table keyed by its file stem,
# using the first CSV column as the index. The macOS ".DS_Store" metadata
# file is ignored.
science_sigs = {
    entry.stem: pd.read_csv(entry, index_col=0)
    for entry in science_path.iterdir()
    if entry.stem != ".DS_Store"
}

In [None]:
from scipy.stats import fisher_exact

In [None]:
from matplotlib_venn import venn2

In [None]:
import gseapy as gp

In [None]:
# Build the {gene-set name: 1-D array of gene names} mapping handed to GSEA.
# Where the gene names live depends on the table:
#   * "BE_endocrine_specific" keeps them in the index,
#   * tables with a "Symbol" column use that column,
#   * all remaining tables are expected to have a "Genename" column
#     (a KeyError is raised otherwise).
gene_sets = {}
for state, science_table in science_sigs.items():
    if state == "BE_endocrine_specific":
        gene_names = science_table.index
    elif "Symbol" in science_table.columns:
        gene_names = science_table["Symbol"]
    else:
        gene_names = science_table["Genename"]
    # .to_numpy() instead of .ravel(): Series.ravel() is deprecated in recent
    # pandas and Index.ravel() no longer returns an ndarray there, whereas
    # .to_numpy() yields a plain NumPy array on both old and new versions.
    gene_sets[state] = gene_names.to_numpy()

In [None]:
from gseapy import gseaplot

In [None]:
# Run preranked GSEA for every signature table in `full_sigs` against the
# `gene_sets` mapping, keep one result table per signature (sorted by
# descending NES and tagged with the signature name in a "state" column), and
# save an enrichment plot for every term whose NES exceeds 2.5.
gsea_results = {}

# Make sure the output folder exists up front so that saving the figures
# cannot fail on a missing directory in a fresh checkout.
gsea_fig_dir = pl.Path("figures/gsea_figures")
gsea_fig_dir.mkdir(parents=True, exist_ok=True)

for state_name in sorted(full_sigs):

    pre_res = gp.prerank(rnk=full_sigs[state_name], # or rnk = rnk,
                         gene_sets=gene_sets,
                         min_size=5,
                         max_size=1000,
                         permutation_num=1000, # reduce number to speed up testing
                         outdir=None, # don't write to disk
                         seed=6,
                         verbose=True, # see what's going on behind the scenes
                        )
    # sort_values returns a new frame, so adding the "state" column below
    # does not touch pre_res.res2d itself.
    state_table = pre_res.res2d.sort_values(by="nes",ascending=False)
    state_table["state"] = state_name
    gsea_results[state_name] = state_table

    # Only strongly enriched terms get a figure (NES threshold only; the FDR
    # filter is applied later when the combined table is inspected).
    selterms = state_table[state_table.nes>2.5].index
    for term in selterms:
        gseaplot(rank_metric=pre_res.ranking, term=term, ofname=str(gsea_fig_dir / f'{state_name}_{term}.pdf'), **pre_res.results[term])

In [None]:
# Stack the per-signature GSEA result tables row-wise into a single frame;
# the "state" column added earlier identifies the originating signature.
gsea_df = pd.concat([*gsea_results.values()])

In [None]:
# Show the enrichments with NES above 2.5 and FDR below 0.05, dropping any
# row that still contains a missing value.
gsea_df.loc[gsea_df["nes"].gt(2.5) & gsea_df["fdr"].lt(0.05)].dropna()