In [1]:
#' Gene set enrichment scores per sample (ssGSEA or GSEA-style statistic)
#'
#' For every sample (column of `X`) the genes are ranked, then walked from the
#' highest to the lowest rank. For each gene set the rank-weighted cumulative
#' distribution of the genes inside the set is compared with the cumulative
#' distribution of the genes outside the set.
#'
#' @param X matrix. Rows are genes. Columns are samples. Row names are symbols.
#'   Anything else is coerced with `as.matrix()`.
#' @param gene_sets list. Each element is a string vector with gene symbols.
#' @param alpha numeric. Exponent applied to the ranks of the genes inside a
#'   gene set, the default is 0.25. With `alpha = 0` every gene inside the set
#'   gets the same weight.
#' @param scale logical. If TRUE, divide the scores by the number of genes
#'   (rows) in `X`.
#' @param norm logical. If TRUE, divide the scores by the difference between
#'   the largest and the smallest finite score over all gene sets and samples.
#' @param single logical. If TRUE, the score is the sum of the differences
#'   between the two cumulative distributions (ssGSEA); otherwise it is the
#'   difference with the largest absolute value (GSEA).
#'
#' @return matrix containing enrichment scores. Rows are gene sets, columns are
#'   samples. A gene set without any gene in `rownames(X)` gets `NaN` scores
#'   (and triggers a warning).
#'
#' @examples
#' \dontrun{
#' # Create a fake matrix
#' m <- 100
#' n <- 100
#' set.seed(1)
#' X <- matrix(rnorm(m * n), m, n)
#' # Assign 'gene symbols' to row names
#' rownames(X) <- 1:m
#' # Create 3 gene sets
#' gene_sets <- list(a = sample(m, 5), b = sample(m, 5), c = sample(m, 5))
#' system.time(assign("a", GSVA::gsva(X, gene_sets, method = "ssgsea")))
#' system.time(assign("b", ssgsea(X, gene_sets, scale = FALSE, norm = TRUE)))
#' identical(a, b)
#' }
ssgsea <- function(X, gene_sets, alpha = 0.25, scale = TRUE, norm = FALSE,
                   single = TRUE) {
  if (!is.matrix(X)) X <- as.matrix(X)
  num_genes <- nrow(X)
  num_samples <- ncol(X)
  num_sets <- length(gene_sets)

  # Translate gene symbols into row indices of X
  row_names <- rownames(X)
  gene_set_idx <- lapply(gene_sets, function(genes) which(row_names %in% genes))

  # A gene set without any matching row cannot be scored (0 / 0 below)
  unmatched <- which(lengths(gene_set_idx) == 0L)
  if (length(unmatched) > 0L) {
    labels <- if (is.null(names(gene_sets))) unmatched else names(gene_sets)[unmatched]
    warning(
      "No genes of gene set(s) ", paste(labels, collapse = ", "),
      " found in rownames(X); their scores are NaN.",
      call. = FALSE
    )
  }

  # Ranks for genes, computed within each sample. matrixStats is used when it
  # is installed; base rank() yields the same average ranks otherwise.
  if (requireNamespace("matrixStats", quietly = TRUE)) {
    R <- matrixStats::colRanks(X, preserveShape = TRUE, ties.method = "average")
  } else {
    R <- vapply(
      seq_len(num_samples),
      function(j) rank(unname(X[, j]), na.last = "keep", ties.method = "average"),
      numeric(num_genes)
    )
  }
  dim(R) <- c(num_genes, num_samples)

  # Enrichment scores of all gene sets for one sample (one column of R)
  score_sample <- function(R_col) {
    gene_order <- order(R_col, decreasing = TRUE)
    # Independent of the gene set, so computed once per sample
    weighted_ranks <- R_col[gene_order]^alpha

    vapply(gene_set_idx, function(idx) {
      # in_set: match (within the gene set); !in_set: non-match (outside)
      in_set <- gene_order %in% idx

      # Mask AFTER exponentiation: (rank * 0)^0 would be 1 and give weight to
      # genes outside the set when alpha is 0.
      hit_weights <- weighted_ranks * in_set

      step_cdf_pos <- cumsum(hit_weights) / sum(hit_weights)
      step_cdf_neg <- cumsum(!in_set) / sum(!in_set)
      step_cdf_diff <- step_cdf_pos - step_cdf_neg

      # Normalize by gene number
      if (scale) step_cdf_diff <- step_cdf_diff / num_genes

      # Use ssGSEA or not
      if (single) {
        sum(step_cdf_diff)
      } else {
        peak <- which.max(abs(step_cdf_diff))
        # which.max() is empty when every difference is NaN
        if (length(peak) == 0L) NaN else step_cdf_diff[peak]
      }
    }, numeric(1), USE.NAMES = FALSE)
  }

  es <- vapply(
    seq_len(num_samples),
    function(j) score_sample(R[, j]),
    numeric(num_sets)
  )
  # Always a gene-set x sample matrix, also for a single gene set or sample
  dim(es) <- c(num_sets, num_samples)

  # Normalize by absolute diff between max and min; non-finite scores are left
  # out of the range so one unscorable gene set does not wipe out the others
  if (norm) {
    finite_es <- es[is.finite(es)]
    if (length(finite_es) > 0L) es <- es / diff(range(finite_es))
  }

  # Prepare output
  rownames(es) <- names(gene_sets)
  colnames(es) <- colnames(X)
  es
}


# setwd("~/github/GSEApy/tests/data")



In [2]:
# Read the tab-separated table; its first column supplies the row names.
X2 <- read.table(
  "./data/P53_resampling_data2.txt",
  header = TRUE,
  sep = "\t",
  row.names = 1,
  stringsAsFactors = FALSE
)
# Matrix version of the table, as passed to ssgsea() below.
X3 <- as.matrix(X2)

In [3]:
# Gene symbols that make up the gene set scored below (72 entries).
gm <- c(
  "GRB14", "KAZALD1", "POLR2I", "C7orf26", "MYOZ3", "CRYBA4",
  "C9orf85", "PRPS1", "C9", "GTF2H4", "PSME2", "HAUS4",
  "VPS16", "SCOC", "RHAG", "AIF1", "RPL41", "C16orf5",
  "LCT", "C1orf83", "GFAP", "NUDCD3", "ROGDI", "HEATR1",
  "MST1R", "ZMPSTE24", "HDAC1", "NEO1", "POLR3A", "VPS54",
  "F5", "QKI", "ITFG2", "PPP2R3A", "LIMS2", "PCDH15",
  "STOML2", "FLT3", "GABRR1", "GNPDA2", "PHLDA3", "RARS",
  "MRPS33", "LCK", "PTN", "HRG", "EIF3I", "PMVK",
  "UBOX5", "VN2R1P", "STAP2", "CCNB3", "ADAM8", "LHCGR",
  "PERP", "COL1A2", "ZSWIM1", "BCAP29", "PTP4A3", "PIP4K2A",
  "PRRX2", "UHRF1", "CEBPZ", "UBE2J1", "WFDC2", "SGK2",
  "ZBED3", "CCDC82", "TMOD1", "CD2AP", "C6orf203", "TMEM85"
)

In [4]:
# Wrap the gene vector in a named list, the shape ssgsea() expects.
gene_sets <- list(raondom2 = gm)

In [5]:
# Display the gene set list to check its contents.
gene_sets

In [6]:
# Score the gene set in every sample using the default ssgsea() settings.
es <- ssgsea(X3, gene_sets)

In [7]:
# Total number of scores in es (one gene set was passed, so one per sample).
length(es)

In [8]:
# Print one line per sample: running index, sample name and the enrichment
# score of the single gene set held in the first (only) row of es.
# seq_len() keeps the loop from running when es has no columns.
for (i in seq_len(ncol(es))) {
  outv <- paste(i, colnames(es)[i], "-ES:", es[1, i], sep = " ")
  print(outv)
}

[1] "1 X786.0 -ES: 0.176040295089046"
[1] "2 BT.549 -ES: 0.160913616155238"
[1] "3 CCRF.CEM -ES: 0.147963854795492"
[1] "4 COLO.205 -ES: 0.176468809957333"
[1] "5 EKVX -ES: 0.146663063421088"
[1] "6 HCC.2998 -ES: 0.156569211831362"
[1] "7 HCT.15 -ES: 0.140825209754476"
[1] "8 HOP.62 -ES: 0.178075669242335"
[1] "9 HOP.92 -ES: 0.200200257331915"
[1] "10 HS.578T -ES: 0.146164722504163"
[1] "11 HT29 -ES: 0.135000382605928"
[1] "12 K.562 -ES: 0.153856709977515"
[1] "13 KM12 -ES: 0.12565788023811"
[1] "14 M14 -ES: 0.228209312684419"
[1] "15 MDA.MB.231.ATCC -ES: 0.150090009844482"
[1] "16 MDA.MB.435 -ES: 0.199237915009958"
[1] "17 NCI.H23 -ES: 0.170557878882356"
[1] "18 NCI.H322M -ES: 0.149759169112018"
[1] "19 NCI.H522 -ES: 0.164701748918865"
[1] "20 OVCAR.3 -ES: 0.121866759155603"
[1] "21 OVCAR.5 -ES: 0.0959323070472887"
[1] "22 OVCAR.8 -ES: 0.189306240692129"
[1] "23 PC.3 -ES: 0.00528292511249993"
[1] "24 RXF.393 -ES: 0.127406069548041"
[1] "25 SF.268 -ES: 0.146597346217401"
[1] "26 SF.295