In [1]:
# install.packages("DistMap", repos='http://cran.us.r-project.org')
# library(devtools)
# install_github("rajewsky-lab/DistMap")


In [2]:
#Sets up the enviroment for scoring
#Defines the scoring metrics

library(DistMap)
library(purrr)
library(dplyr)
library(mccr)
#library(synapser)


Loading required package: ggplot2
"package 'dplyr' was built under R version 3.6.3"
Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

"package 'mccr' was built under R version 3.6.3"

In [3]:
#Use Attila's code to donwload, load data and initialize the environment for scoring <- syn16782361
initialize <- function(){
  if(!file.exists("init.RData")){
    if(!all(file.exists(c("dge_raw.txt.gz","dge_normalized.txt.gz","binarized_bdtnp.csv.gz","bdtnp.txt.gz","geometry.txt.gz")))){
      download.file("http://bimsbstatic.mdc-berlin.de/rajewsky/DVEX/dge_raw.txt.gz",destfile = "dge_raw.txt.gz")
      download.file("http://bimsbstatic.mdc-berlin.de/rajewsky/DVEX/dge_normalized.txt.gz",destfile = "dge_normalized.txt.gz")
      download.file("http://bimsbstatic.mdc-berlin.de/rajewsky/DVEX/binarized_bdtnp.csv.gz",destfile = "binarized_bdtnp.csv.gz")
      download.file("http://bimsbstatic.mdc-berlin.de/rajewsky/DVEX/bdtnp.txt.gz",destfile = "bdtnp.txt.gz")
      download.file("http://bimsbstatic.mdc-berlin.de/rajewsky/DVEX/geometry.txt.gz",destfile = "geometry.txt.gz")
    }
    
    raw.data = read.table(gzfile("dge_raw.txt.gz",'rt'),
                          sep = "\t",
                          row.names = NULL,
                          stringsAsFactors = F,
                          quote = "")
    raw.data.genes = raw.data$V1
    raw.data$V1 = NULL
    
    raw.data.genes = gsub("'","",raw.data.genes,fixed = T)
    
    raw.data = as.matrix(raw.data)
    rownames(raw.data) = raw.data.genes
    
    normalized.data = read.table(gzfile("dge_normalized.txt.gz",'rt'),
                                 sep = "\t",
                                 row.names = NULL,
                                 stringsAsFactors = F,
                                 quote = "")
    
    normalized.data.genes = normalized.data$row.names
    normalized.data$row.names = NULL
    
    normalized.data.genes = gsub("'","",normalized.data.genes,fixed = T)
    
    normalized.data = as.matrix(normalized.data)
    rownames(normalized.data) = normalized.data.genes
    
    stopifnot(all(normalized.data.genes == raw.data.genes))
    
    insitu.matrix = read.table(gzfile("binarized_bdtnp.csv.gz",'rt'), sep = ",",header = T)
    
    insitu.genes_orig <- colnames(insitu.matrix)
    
    #this is not needed for the normalized data
    insitu.genes = gsub(".","-",insitu.genes_orig,fixed = T)
    insitu.genes = gsub("-spl-","(spl)",insitu.genes,fixed = T)
    
    insitu.matrix = as.matrix(insitu.matrix)
    colnames(insitu.matrix) = insitu.genes
    
    stopifnot(all(insitu.genes %in% raw.data.genes))
    
    geometry = read.csv(gzfile("geometry.txt.gz",'rt'),sep = " ")
    
    colnames(geometry) = c("x","y","z")
    
    #close gz properly
    #closeAllConnections()
    
    dm <<- new("DistMap",
             raw.data=raw.data,
             data=normalized.data,
             insitu.matrix=insitu.matrix,
             geometry=as.matrix(geometry))
    
    dm <<- binarizeSingleCellData(dm, seq(0.15, 0.5, 0.01))
    
    dm <<- mapCells(dm)
    #Thank you Attila!
    
    #GROUND TRUTH
    
    ground.truth <<- t(apply(dm@mcc.scores,2,order,decreasing=TRUE))[,1:10]
    ambig.locations <<- t(apply(dm@mcc.scores,2,sort,decreasing=TRUE))[,1:2]
    ambig.locations <<- which(ambig.locations[,1] == ambig.locations[,2])
    
    
    #map every cell to its d84 value
    d84 <<- seq(nrow(ground.truth)) %>% map_dbl(function(j){
      #map every position to the norm of the difference in the geometry and calculate the mean
      ground.truth[j,] %>% map_dbl(~sqrt(sum((dm@geometry[.x,] - dm@geometry[ground.truth[j,1],])^2))) %>% mean
    })
    
    save(dm,ground.truth,d84,ambig.locations,file="init.RData")
    
  } else {
    load("init.RData",envir = .GlobalEnv)
  }
}


#Scoring function. 
#Input: path to the results .csv file (character), number of subchallenge (integer)
#Output: vector of scores (s1,s2,s3)
score <- function(path,sub){
  
  if (!exists("dm")) initialize()
  
  submission <- read.csv(path,header=FALSE,stringsAsFactors = FALSE)
  
  #separate the gene names from the location predictions
  gene.lines <- (4-sub)*2
  genes <- submission %>% slice(1:gene.lines)
  locations <- submission %>% slice(-1:-gene.lines)
  
  #preprocess genes and locations, remove NAs, sort locations by cellid
  genes <- genes %>% select(-1) %>% unlist %>% as.character
  locations <- locations[order(locations[,1]),] %>% select(-1) %>% apply(2,as.numeric)

  #fix incompatibility
  genes = gsub("-",".",genes,fixed = T)
  genes = gsub("(spl)",".spl.",genes,fixed = T)
    
  #do the same mapping for the submission as for d48
  dsub <- seq(nrow(locations)) %>% map_dbl(function(j){
    vals <- locations[j,] %>% map_dbl(~sqrt(sum((dm@geometry[.x,] - dm@geometry[ground.truth[j,1],])^2))) %>% mean
  })
  
  #calculate relative precision
  pk <- d84/dsub
  
  #s1
  
  #select fluorescence data only for the submitted subset of genes
  reduced.insitu <- data.frame(dm@insitu.matrix) %>% select(genes)
  #get binarized data from distmap
  ts <- data.frame(t(dm@binarized.data))
  #select binarized data only for the submitted subset of genes
  reduced.ts <- ts %>% select(genes)
  
  #map every cell location prediction to the MCC between the ground truth location and the predicted most likely position, using the submitted subset of genes 
  mccrs <- seq(nrow(locations)) %>% map_dbl(~mccr(reduced.insitu[ground.truth[.x,1],],reduced.insitu[locations[.x,1],]))
  
  #do not take into account the cells with ambiguous locations
  s1 <- sum(((pk/sum(pk)) * mccrs)[-ambig.locations])
  
  #s2
  #do not take into account the cells with ambiguous locations
  s2<- mean(pk[-ambig.locations])
  
  #s3
  
  #comparing rnaseq and fluorescence data using true locations
  true.mccs <- seq(ncol(reduced.ts)) %>% map_dbl(~mccr(reduced.insitu[ground.truth[-ambig.locations,1],.x],reduced.ts[-ambig.locations,.x]))
  #.. using submitted locations
  competitor.mccs <- seq(ncol(reduced.ts)) %>% map_dbl(~mccr(reduced.insitu[locations[-ambig.locations,1],.x],reduced.ts[-ambig.locations,.x]))
  
  #do not take into account the cells with ambiguous locations
  s3 <- sum(((true.mccs/sum(true.mccs)) * competitor.mccs))
  
  return(c(s1,s2,s3))
}

#Scoring function with bootstraping
#Input: path to the results .csv file (character), number of subchallenge (integer), number of bootstraps (integer, optional)
#Output: data frame with scores
score.bootstrapped <- function(path,sub,nboot=1000){
  
  if (!exists("dm")) initialize()
  
  submission <- read.csv(path,header=FALSE,stringsAsFactors = FALSE)
  
  #separate the gene names from the location predictions
  gene.lines <- (4-sub)*2
  genes <- submission %>% slice(1:gene.lines)
  locations <- submission %>% slice(-1:-gene.lines)
  
  #preprocess genes and locations, remove NAs, sort locations by cellid
  genes <- genes %>% select(-1) %>% unlist %>% as.character
  locations <- locations[order(locations[,1]),] %>% select(-1) %>% apply(2,as.numeric)
  
  #fix incompatibility
  genes = gsub("-",".",genes,fixed = T)
  genes = gsub("(spl)",".spl.",genes,fixed = T)
  
  #remove ambiguous locations
  locations.n <- locations[-ambig.locations,]
  ground.truth.n <- ground.truth[-ambig.locations,]
  
  #do the same mapping for the submission as for d48
  dsub <- seq(nrow(locations.n)) %>% map_dbl(function(j){
    vals <- locations.n[j,] %>% map_dbl(~sqrt(sum((dm@geometry[.x,] - dm@geometry[ground.truth.n[j,1],])^2))) %>% mean
  })
  
  #calculate relative precision
  pk <- d84[-ambig.locations]/dsub
  
  #s1
  
  #select fluorescence data only for the submitted subset of genes
  reduced.insitu <- data.frame(dm@insitu.matrix) %>% select(genes)
  #get binarized data from distmap and remove abiguous locations
  ts <- data.frame(t(dm@binarized.data))[-ambig.locations,]
  #select binarized data only for the submitted subset of genes
  reduced.ts <- ts %>% select(genes)
  
  #map every cell location prediction to the MCC between the ground truth location and the predicted most likely position, using the submitted subset of genes 
  mccrs <- seq(nrow(locations.n)) %>% map_dbl(~mccr(reduced.insitu[ground.truth.n[.x,1],],reduced.insitu[locations.n[.x,1],]))
  
  #bootstrapping
  samples <- seq(nboot) %>% map_dfr(function(seed){
    set.seed(seed)
    bootstrap <- sample.int(nrow(locations.n),replace=TRUE)
    #s1
    s1.b <- sum((pk[bootstrap]/sum(pk[bootstrap])) * mccrs[bootstrap])
    #s2
    s2.b <- mean(pk[bootstrap])
    
    #s3
    #since we bootstrap by locations, these must be recalculated
    true.mccs.b <- seq(ncol(reduced.ts)) %>% map_dbl(~mccr(reduced.insitu[ground.truth.n[bootstrap,1],.x],reduced.ts[bootstrap,.x]))
    #submitted locations
    competitor.mccs.b <- seq(ncol(reduced.ts)) %>% map_dbl(~mccr(reduced.insitu[locations.n[bootstrap,1],.x],reduced.ts[bootstrap,.x]))
    
    #here i assumed that the denominator is the sum of true mccs
    s3.b <- sum((true.mccs.b/sum(true.mccs.b)) * competitor.mccs.b)
    
    
    data.frame(s1=s1.b,s2=s2.b,s3=s3.b)
  })
  
  return(samples)
}

#wrapper for summarising the bootstrapped scores
score.bootstrapped.summary <- function(path,sub,nboot=1000){
  score.bootstrapped(path,sub,nboot) %>% summarise(mean(s1), sd(s1), mean(s2), sd(s2), mean(s3), sd(s3)) %>% as.numeric
}

#computes the Bayes factor between two results using bootstrapped scores
bayes.bootstrap <- function(path1,path2,sub,nboot=1000){
  
  samples1 <- score.bootstrapped(path1,sub,nboot)
  samples2 <- score.bootstrapped(path2,sub,nboot)
  
  wins <- colSums(samples1 >= samples2)
  B <- wins/(nboot-wins)
  
  return(B)
}

#input: a csv file with a sid column and a team column, requires synapse login
bootstraped.ranks <- function(submissions,sub){
  
  s <- read.csv(submissions,stringsAsFactors = F)
  s$sid <- as.character(s$sid)
  
  #if not logged in synapse, load in the variable files the paths to the submissions
  files <- s$sid %>% map_chr(~synGetSubmission(.x)$filePath)
  
  #evaluate
  eval.boot <-files %>% map(~score.bootstrapped(.x,sub))
  
  #rank on each score separately and reduce to sum
  #need to make scores negative in order to properly rank them
  ranks <- seq(3) %>% map(function(score){
    ranks<- eval.boot %>% map_dfc(~-.x[,score]) %>% apply(1,rank) %>% t
    colnames(ranks) <- s$team  
    return(ranks)
  }) %>% reduce(`+`)
  
  ranks <- (ranks/3) %>% apply(1,rank) %>% t
  save(ranks,file=paste0("sc",sub,"ranks.Rdata"))
  
  #draw the boxplot
  avg.ranks <- ranks %>% colMeans %>% rank  
  ordering <- order(avg.ranks)
  pdf(paste0("sc",sub,"_final_boxplot.pdf"), width=11, height=8)
  par(mar=c(5,10.5,4,2) + 0.1)
  boxplot(ranks[,ordering],horizontal=T,las=2,at=rev(1:ncol(ranks)), xlab="Rank")
  
  factors <- map2_dfr(ordering[-length(ordering)],ordering[-1],function(c1,c2){
    win <- sum(ranks[,c1] < ranks[,c2])
    lose <- sum(ranks[,c2] < ranks[,c1])
    BF <- win/lose
    data.frame(c1=c1,c2=c2,BF=BF)
  })
  
  abh <- ncol(ranks) - factors$c1[which(factors$BF>=3)[1]] + 0.5
  abline(h=abh,lwd=2)
  
  dev.off()
  
  #report final ranking
  result <- mutate(s, rank=avg.ranks) %>% arrange(rank)
  write.csv(result,file=paste0("sc",sub,"_final_table.csv"),row.names=F)
  
  return(mutate(s, rank=avg.ranks) %>% arrange(rank))
}


In [5]:
mylist <- list()

idx = 1
for ( sc in c(1,2,3) )
    for (mt in c('direct','indirect'))
        for (fs in c('variance','ndfs','mcfs'))
            {
            filepath = paste('result/sc', sc, '_', mt, '_', fs,'.csv',sep = '')
            print(filepath)
            zz = score(filepath,sc)
            print(zz)
            mylist[[idx]] <- c(filepath,t(zz))
            idx = idx + 1
        }

df <- do.call("rbind",mylist) 
colnames(df) <- c('filename', 'score1', 'score2', 'score3')
write.csv(df, "result_no_boostrap.csv", row.names = FALSE)
df

[1] "result/sc1_direct_variance.csv"


Note: Using an external vector in selections is ambiguous.
i Use `all_of(genes)` instead of `genes` to silence this message.
i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
This message is displayed once per session.


[1] 0.7759676 3.0637105 0.6178299
[1] "result/sc1_direct_ndfs.csv"
[1] 0.7673072 2.8692227 0.6188529
[1] "result/sc1_direct_mcfs.csv"
[1] 0.7483966 2.2721754 0.6149195
[1] "result/sc1_indirect_variance.csv"
[1] 0.5756041 1.4738038 0.4629017
[1] "result/sc1_indirect_ndfs.csv"
[1] 0.5804928 1.4444996 0.4708535
[1] "result/sc1_indirect_mcfs.csv"
[1] 0.6005294 1.4041676 0.4796624
[1] "result/sc2_direct_variance.csv"
[1] 0.7473617 2.0109448 0.6642984
[1] "result/sc2_direct_ndfs.csv"
[1] 0.7056568 1.6311321 0.6976080
[1] "result/sc2_direct_mcfs.csv"
[1] 0.6877590 1.5133258 0.6542699
[1] "result/sc2_indirect_variance.csv"
[1] 0.5274171 1.1704136 0.4579261
[1] "result/sc2_indirect_ndfs.csv"
[1] 0.5954217 1.2334418 0.5344893
[1] "result/sc2_indirect_mcfs.csv"
[1] 0.5507527 1.0540712 0.4694800
[1] "result/sc3_direct_variance.csv"
[1] 0.6915142 1.2285287 0.7909811
[1] "result/sc3_direct_ndfs.csv"
[1] 0.6751146 1.0465283 0.7855393
[1] "result/sc3_direct_mcfs.csv"
[1] 0.5923099 0.7580489 0.7169723


filename,score1,score2,score3
result/sc1_direct_variance.csv,0.775967620710635,3.06371048910158,0.617829882931241
result/sc1_direct_ndfs.csv,0.767307202182195,2.86922266106953,0.618852919199016
result/sc1_direct_mcfs.csv,0.748396596730157,2.27217541258191,0.614919535632311
result/sc1_indirect_variance.csv,0.575604127141242,1.47380378220889,0.462901673242202
result/sc1_indirect_ndfs.csv,0.58049280960212,1.44449956410753,0.470853481084604
result/sc1_indirect_mcfs.csv,0.600529386773452,1.40416757929284,0.479662440910638
result/sc2_direct_variance.csv,0.747361706724322,2.01094479669411,0.664298382639925
result/sc2_direct_ndfs.csv,0.705656844310317,1.63113209199578,0.697608036217779
result/sc2_direct_mcfs.csv,0.687759009084475,1.51332580745647,0.654269899330788
result/sc2_indirect_variance.csv,0.52741707975278,1.17041364290453,0.457926118050043


In [6]:
mylist <- list()

idx = 1
for ( sc in c(1,2,3) )
    for (mt in c('direct','indirect'))
        for (fs in c('variance','ndfs','mcfs'))
            {
            filepath = paste('result/sc', sc, '_', mt, '_', fs,'.csv',sep = '')
            print(filepath)
            zz = score.bootstrapped.summary(filepath,sc)
            print(zz)
            mylist[[idx]] <- c(filepath,t(zz))
            idx = idx + 1
        }

df <- do.call("rbind",mylist)
colnames(df) <- c('filename', 'score1', 'std1', 'score2', 'std2', 'score3', 'std3')
write.csv(df, "result_boostrap.csv", row.names = FALSE)
df

[1] "result/sc1_direct_variance.csv"
[1] 0.953658534 0.003187285 3.069521683 0.104077971 0.618563444 0.003750978
[1] "result/sc1_direct_ndfs.csv"
[1] 0.942135404 0.003556940 2.867463749 0.098185042 0.619506785 0.003810686
[1] "result/sc1_direct_mcfs.csv"
[1] 0.921263151 0.004794512 2.273684931 0.094568860 0.616184266 0.004520454
[1] "result/sc1_indirect_variance.csv"
[1] 0.725844391 0.008365821 1.474714079 0.047742341 0.463495188 0.005443894
[1] "result/sc1_indirect_ndfs.csv"
[1] 0.727817050 0.006863704 1.443847150 0.039715068 0.471518340 0.005596722
[1] "result/sc1_indirect_mcfs.csv"
[1] 0.759778744 0.009376668 1.402807490 0.055700511 0.480778491 0.006231210
[1] "result/sc2_direct_variance.csv"
[1] 0.902498947 0.006084409 2.011914969 0.090091781 0.664715472 0.003482337
[1] "result/sc2_direct_ndfs.csv"
[1] 0.881356391 0.006613658 1.631056012 0.076395515 0.697902156 0.004089665
[1] "result/sc2_direct_mcfs.csv"
[1] 0.864112005 0.006915663 1.515802549 0.072362304 0.655071827 0.005524278
[

filename,score1,std1,score2,std2,score3,std3
result/sc1_direct_variance.csv,0.953658533832665,0.0031872853950667,3.06952168287307,0.104077971008099,0.618563444417309,0.0037509783725228
result/sc1_direct_ndfs.csv,0.942135403639612,0.0035569398039616,2.86746374914652,0.0981850423855946,0.619506785063875,0.0038106861883965
result/sc1_direct_mcfs.csv,0.921263150845092,0.0047945118845101,2.27368493136849,0.0945688599324735,0.616184265679547,0.0045204538136314
result/sc1_indirect_variance.csv,0.725844391246788,0.0083658205244943,1.47471407886254,0.0477423413033079,0.463495188201846,0.0054438942878971
result/sc1_indirect_ndfs.csv,0.727817049526372,0.0068637041103877,1.44384714958943,0.0397150676938629,0.471518339838633,0.0055967215084458
result/sc1_indirect_mcfs.csv,0.759778743510364,0.0093766679091175,1.40280748998638,0.0557005106497099,0.480778490687756,0.006231209528312
result/sc2_direct_variance.csv,0.902498946651435,0.0060844090735539,2.01191496949287,0.0900917809462589,0.664715471614672,0.0034823367655514
result/sc2_direct_ndfs.csv,0.881356391483041,0.0066136581724364,1.63105601182157,0.0763955149058557,0.69790215604845,0.0040896649016838
result/sc2_direct_mcfs.csv,0.864112005179998,0.0069156626529461,1.51580254936282,0.0723623035070967,0.655071826560043,0.0055242784997252
result/sc2_indirect_variance.csv,0.6593138581969,0.0096265125482206,1.16837365034699,0.0467081094790096,0.458116629958327,0.0059144546432979
