In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import gseapy as gp
import logging

In [2]:
# Record the gseapy version this notebook was run with.
gp.__version__

'0.8.9'

In [3]:
def enrichment_score_ss(gene_set, rnkseries, weighted_score_type=0.25, scale=True, esnull=None, rs=np.random.RandomState()):
    """
    Compute the single-sample GSEA (ssGSEA) enrichment score of one gene set
    in one sample, as described by *D. Barbie et al 2009*.

    The genes are sorted by decreasing rank value. Two empirical cumulative
    distributions are then walked along that ordering: the weighted ECDF of
    the genes inside the set (:math:`P^W_G`) and the ECDF of the remaining
    genes (:math:`P_{NG}`). The enrichment score is the sum of their
    difference over all positions.

    :param gene_set: list of gene names in the gene set (e.g. from a gmt file).
                     Names missing from ``rnkseries`` are ignored; duplicates
                     are allowed.
    :param rnkseries: pd.Series of rank values indexed by gene name. It is
                      sorted in decreasing order internally.
    :param weighted_score_type: non-negative exponent applied to the absolute
                                rank value of each hit. 0 gives every hit the
                                same weight.
    :param scale: If True, divide the running score by the total number of
                  genes in ``rnkseries``.
    :param esnull: if truthy, the number of random permutations of the
                   hit/miss labels to score instead of the observed labelling.
    :param rs: np.random.RandomState used for the permutations. Note that the
               default instance is created once, when the function is defined,
               and is therefore shared by all calls that do not pass ``rs``.
    :raises ValueError: if ``weighted_score_type`` is negative.
    :returns:
             When ``esnull`` is falsy, a tuple ``(ES, hit_index, RES)``:

             ES: enrichment score, the sum of all values in the RES array.

             hit_index: positions, in the sorted gene list, of the genes that
             belong to gene_set.

             RES: list with the running enrichment score at every position of
             the sorted gene list.

             When ``esnull`` is truthy, only a list with one ES per
             permutation is returned.

             If no gene of ``rnkseries`` is in ``gene_set``, or every gene is,
             one of the ECDF denominators is zero and the scores are not finite.
    """
    # Validate first so an invalid weight fails before any work is done.
    if weighted_score_type < 0:
        raise ValueError("Using negative values of weighted_score_type, not allowed")

    # The input is not assumed to be sorted already.
    rnkseries = rnkseries.sort_values(ascending=False)
    keys_sorted = rnkseries.index.values

    # 1 where the gene at that position belongs to the gene set, else 0.
    # No uniqueness assumption is made, so duplicated names cannot corrupt the mask.
    tag_indicator = np.isin(keys_sorted, gene_set).astype(int)
    hit_ind = np.flatnonzero(tag_indicator).tolist()
    N = len(tag_indicator)

    # see: http://rowley.mit.edu/caw_web/ssGSEAProjection/ssGSEAProjection.Library.R
    if weighted_score_type == 0:
        # unweighted: every hit contributes equally
        index = np.ones(N)
    else:
        index = np.abs(rnkseries.values)

    axis = 0
    if esnull:
        # One row per permutation; the reductions below then run along axis 1.
        axis = 1
        tag_indicator = np.tile(tag_indicator, (esnull, 1))
        index = np.tile(index, (esnull, 1))

        # gene list permutation: shuffle the hit/miss labels of each row in place
        for i in range(esnull):
            rs.shuffle(tag_indicator[i])
    # indicator of the genes that are NOT in the gene set
    no_tag_indicator = 1 - tag_indicator

    # Weight of each hit. The mask is applied after the power so that misses
    # stay at 0 even when the exponent is 0 (0 ** 0 == 1).
    rank_alpha = tag_indicator * index ** weighted_score_type

    P_GW_numerator = np.cumsum(rank_alpha, axis=axis)
    P_GW_denominator = np.sum(rank_alpha, axis=axis, keepdims=True)

    P_NG_numerator = np.cumsum(no_tag_indicator, axis=axis)
    P_NG_denominator = np.sum(no_tag_indicator, axis=axis, keepdims=True)

    # running difference between the two ECDFs
    RES = P_GW_numerator / P_GW_denominator - P_NG_numerator / P_NG_denominator
    # scale es by gene numbers ?
    # https://gist.github.com/gaoce/39e0907146c752c127728ad74e123b33
    if scale:
        RES = RES / N

    es = np.sum(RES, axis=axis)

    if esnull:
        return es.tolist()

    return es.tolist(), hit_ind, RES.tolist()

In [4]:
# Load the expression table; the first column of the file becomes the row index.
gex = pd.read_table("./data/P53_resampling_data2.txt", index_col=0)

In [5]:
# Preview the first rows of the expression table.
gex.head()

Unnamed: 0_level_0,786-0,BT-549,CCRF-CEM,COLO 205,EKVX,HCC-2998,HCT-15,HOP-62,HOP-92,HS 578T,...,MCF7,MOLT-4,NCI-H460,OVCAR-4,SF-539,SK-MEL-5,SR,UACC-257,UACC-62,UO-31
NAME,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CTLA2B,111.19,86.22,121.85,75.19,208.62,130.59,124.72,324.09,242.71,89.71,...,163.76,59.5,134.12,152.09,197.46,137.79,81.53,123.37,81.41,180.78
SCARA3,460.3,558.34,183.55,37.29,158.0,43.61,80.83,300.08,1250.25,144.82,...,109.91,120.42,73.06,115.03,95.12,37.56,76.16,41.1,77.51,519.17
LOC100044683,97.25,118.94,81.17,119.51,119.88,107.73,165.57,203.97,135.43,171.09,...,222.84,124.98,114.75,141.66,170.19,147.7,157.48,152.18,98.89,118.06
CMBL,33.45,55.1,221.67,50.3,35.12,75.7,84.01,44.12,79.96,29.01,...,51.32,117.11,59.46,78.46,45.55,49.07,96.69,33.09,10.38,52.89
CLIC6,35.75,41.26,63.04,219.86,42.53,54.19,86.98,71.2,53.89,98.86,...,154.05,31.62,37.66,32.64,63.35,27.95,70.99,36.25,17.5,49.41


In [6]:
# Parse the gene-set file with gseapy; the result is indexed by gene-set name below.
gmt = gp.parser.gsea_gmt_parser("./data/randomSets.gmt")

In [7]:
# List the names of the gene sets that were parsed.
gmt.keys()

dict_keys(['random1', 'random2', 'random3', 'random4', 'random5', 'level2_rand', 'level4_rand', 'level6_rand', 'level8_rand', 'level10_RAND', 'level12_random'])

In [8]:
# Pick one gene set ('random2') to score against every sample.
gm = gmt['random2']

In [9]:
# Show the members of the selected gene set.
print(gm)

['GRB14', 'KAZALD1', 'POLR2I', 'C7orf26', 'MYOZ3', 'CRYBA4', 'C9orf85', 'PRPS1', 'C9', 'GTF2H4', 'PSME2', 'HAUS4', 'VPS16', 'SCOC', 'RHAG', 'AIF1', 'RPL41', 'C16orf5', 'LCT', 'C1orf83', 'GFAP', 'NUDCD3', 'ROGDI', 'HEATR1', 'MST1R', 'ZMPSTE24', 'HDAC1', 'NEO1', 'POLR3A', 'VPS54', 'F5', 'QKI', 'ITFG2', 'PPP2R3A', 'LIMS2', 'PCDH15', 'STOML2', 'FLT3', 'GABRR1', 'GNPDA2', 'PHLDA3', 'RARS', 'MRPS33', 'LCK', 'PTN', 'HRG', 'EIF3I', 'PMVK', 'UBOX5', 'VN2R1P', 'STAP2', 'CCNB3', 'ADAM8', 'LHCGR', 'PERP', 'COL1A2', 'ZSWIM1', 'BCAP29', 'PTP4A3', 'PIP4K2A', 'PRRX2', 'UHRF1', 'CEBPZ', 'UBE2J1', 'WFDC2', 'SGK2', 'ZBED3', 'CCDC82', 'TMOD1', 'CD2AP', 'C6orf203', 'TMEM85']


In [10]:
# Score the selected gene set in every sample (scaled ES, the function default).
names = []
es = []
hits = []
# Rank the genes within each sample (column); NaNs are ranked last.
gexrnk = gex.rank(axis=0, method='average', na_option='bottom')
# DataFrame.iteritems() was removed in pandas 2.0; items() yields the same
# (column name, column Series) pairs.
for name, ser in gexrnk.items():
    # element 0 of the returned tuple is the enrichment score
    est = enrichment_score_ss(gm, ser)[0]
    es.append(est)
    names.append(name)

In [11]:
# Print one "sample :  score" line per sample.
for sample, score in zip(names, es):
    print(sample, ": ", score)

786-0 :  0.1760402950890455
BT-549 :  0.16091361615523808
CCRF-CEM :  0.1479638547954917
COLO 205 :  0.17646880995733327
EKVX :  0.14666306342108795
HCC-2998 :  0.15656921183136224
HCT-15 :  0.140825209754476
HOP-62 :  0.1780756692423349
HOP-92 :  0.20020025733191446
HS 578T :  0.14616472250416346
HT29 :  0.13500038260592784
K-562 :  0.15385670997751538
KM12 :  0.12565788023810984
M14 :  0.2282093126844192
MDA-MB-231/ATCC :  0.15009000984448168
MDA-MB-435 :  0.19923791500995813
NCI-H23 :  0.17022115648177338
NCI-H322M :  0.14975916911201828
NCI-H522 :  0.1647017489188652
OVCAR-3 :  0.1218667591556035
OVCAR-5 :  0.09593230704728868
OVCAR-8 :  0.18930624069212929
PC-3 :  0.005282925112499934
RXF-393 :  0.12740606954804146
SF-268 :  0.14659734621740128
SF-295 :  0.2184909989493214
SK-MEL-2 :  0.21369778813076545
SN12C :  0.14884469203913359
SNB-19 :  0.19039359754056326
SNB-75 :  0.19304375609613553
SW-620 :  0.12421173989770762
T-47D :  0.21318539349281873
U251 :  0.12962389815145048
A49

In [12]:
## no scale es
# Same scoring loop as before, but without dividing by the number of genes.
names = []
es = []
hits = []
# Rank the genes within each sample (column); NaNs are ranked last.
gexrnk = gex.rank(axis=0, method='average', na_option='bottom')
# DataFrame.iteritems() was removed in pandas 2.0; items() yields the same
# (column name, column Series) pairs.
for name, ser in gexrnk.items():
    # element 0 of the returned tuple is the enrichment score
    est = enrichment_score_ss(gm, ser, scale=False)[0]
    es.append(est)
    names.append(name)

In [13]:
## no scale es values and norm by all samples
# Divide every unscaled score by the spread (max - min) observed across samples.
es = np.array(es)
es_spread = es.max() - es.min()
nes = es / es_spread
for sample, score in zip(names, nes):
    print(sample, ": ", score)

786-0 :  0.789679037132
BT-549 :  0.721823997185
CCRF-CEM :  0.663734142948
COLO 205 :  0.791601262997
EKVX :  0.657899071611
HCC-2998 :  0.702335930424
HCT-15 :  0.631711711154
HOP-62 :  0.798809289389
HOP-92 :  0.898055450108
HS 578T :  0.655663621055
HT29 :  0.605582784866
K-562 :  0.690168228415
KM12 :  0.563674321406
M14 :  1.02369806989
MDA-MB-231/ATCC :  0.673271618848
MDA-MB-435 :  0.893738588689
NCI-H23 :  0.763575628421
NCI-H322M :  0.671787538224
NCI-H522 :  0.73881674894
OVCAR-3 :  0.546668164693
OVCAR-5 :  0.430331770465
OVCAR-8 :  0.849187226124
PC-3 :  0.0236980698877
RXF-393 :  0.571516324002
SF-268 :  0.657604278319
SF-295 :  0.980103797173
SK-MEL-2 :  0.958602480659
SN12C :  0.667685390053
SNB-19 :  0.854064875918
SNB-75 :  0.865952919252
SW-620 :  0.557187245757
T-47D :  0.956303988123
U251 :  0.581465027821
A498 :  0.644970664426
A549/ATCC :  0.717646405734
ACHN :  0.702100211412
CAKI-1 :  0.626173573006
HCT-116 :  0.718349928287
LOX IMVI :  0.864177206092
MALME-3M 