fixing export selected, #27379, and scanpy import problems reported b…

…y pablo
ucscGenomeBrowser · Apr 30, 2021 · f9506e1 · f9506e1
1 parent f83fd2b
commit f9506e1
Show file tree

Hide file tree

Showing 3 changed files with 59 additions and 14 deletions.
diff --git a/src/cbPyLib/cellbrowser/cbWeb/js/cbData.js b/src/cbPyLib/cellbrowser/cbWeb/js/cbData.js
@@ -963,6 +963,14 @@ function CbDbFile(url) {
         return self.geneOffsets;
     };
 
+    this.getCellIdMeta = function() {
+    /* return the cell ID meta field */
+        for (let metaInfo of self.conf.metaFields)
+            if (!metaInfo.isCustom)
+                return metaInfo;
+    }
+
+
     this.loadCellIds = function(idxArray, onDone, onProgress) {
         /* Get the cellId strings, the first meta field, for the integer IDs of cells in idxArray.
          * Calls onDone with an array of strings.
@@ -989,6 +997,7 @@ function CbDbFile(url) {
             onDone(mapIdxToId(idxArray));
         }
 
+        // start of loadCellIds
         if (self.cellIds===undefined) {
             // if we haven't loaded them yet, trigger the cellId load
             let cellIdMeta = self.getCellIdMeta();

diff --git a/src/cbPyLib/cellbrowser/cellbrowser.py b/src/cbPyLib/cellbrowser/cellbrowser.py
@@ -4091,6 +4091,8 @@ def anndataMatrixToTsv(ad, matFname, usePandas=False, useRaw=False):
         mat = mat.tocsr() # makes writing to a file ten times faster, thanks Alex Wolf!
 
     if usePandas:
+        # This code is currently not used anywhere by this code. It's still there
+        # so external code that calls this function can use it, if the code below does not work.
         logging.info("Converting anndata to pandas dataframe")
         data_matrix=pd.DataFrame(mat, index=var.index.tolist(), columns=ad.obs.index.tolist())
         logging.info("Writing pandas dataframe to file (slow?)")
@@ -4153,7 +4155,7 @@ def makeDictDefaults(inVar, defaults):
 
 def runSafeRankGenesGroups(adata, clusterField, minCells=5):
     " run scanpy's rank_genes_groups in a way that hopefully doesn't crash "
-    importScanpy()
+    sc = importScanpy()
 
     adata.obs[clusterField] = adata.obs[clusterField].astype("category") # if not category, rank_genes will crash
     sc.pp.filter_genes(adata, min_cells=minCells) # rank_genes_groups crashes on zero-value genes
@@ -4787,7 +4789,7 @@ def cbBuildCli():
 
 def readMatrixAnndata(matrixFname, samplesOnRows=False, genome="hg38"):
     " read an expression matrix and return an adata object. Supports .mtx, .h5 and .tsv (not .tsv.gz) "
-    importScanpy()
+    sc = importScanpy()
 
     if matrixFname.endswith(".mtx.gz"):
         errAbort("For cellranger3-style .mtx files, please specify the directory, not the .mtx.gz file name")
@@ -5368,7 +5370,7 @@ def getObsmKeys(adata):
 def cbScanpy(matrixFname, inMeta, inCluster, confFname, figDir, logFname):
     """ run expr matrix through scanpy, output a cellbrowser.conf, a matrix and the meta data.
     Return an adata object. Optionally keeps a copy of the raw matrix in adata.raw """
-    importScanpy()
+    sc = importScanpy()
 
     import pandas as pd
     import numpy as np
@@ -5391,7 +5393,8 @@ def cbScanpy(matrixFname, inMeta, inCluster, confFname, figDir, logFname):
     sys.stdout = Tee(logFname) # we want our log messages and also scanpy messages into one file
 
     pipeLog("cbScanpy $Id$")
-    pipeLog("Input file: %s" % matrixFname)
+    pipeLog("Command: %s" % " ".join(sys.argv))
+    pipeLog("Matrix input file: %s" % matrixFname)
 
     pipeLog("Restricting OPENBLAS to 4 threads")
     os.environ["OPENBLAS_NUM_THREADS"] = "4" # export OPENBLAS_NUM_THREADS=4 
@@ -5812,6 +5815,7 @@ def importScanpy():
         print("$ conda install -c conda-forge python-igraph leiden")
         print("Then re-run this command.")
         sys.exit(1)
+    return sc
 
 def cbScanpyCli():
     " command line interface for cbScanpy "

diff --git a/src/cbPyLib/cellbrowser/genes.py b/src/cbPyLib/cellbrowser/genes.py
@@ -41,7 +41,7 @@ def cbGenes_parseArgs():
     parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
     (options, args) = parser.parse_args()
 
-    if args==[]:
+    if args==[] or (args[0]=="guess" and len(args)==1):
         parser.print_help()
         exit(1)
 
@@ -61,18 +61,27 @@ def parseSignatures(org, geneIdType):
         if line.startswith("#"):
             continue
         version, geneIds = line.rstrip("\n").split('\t')
-        geneIds = geneIds.split("|")
+        geneIds = set(geneIds.split("|"))
         verToGenes[version] = geneIds
+
     return verToGenes
 
 def guessGeneIdType(genes):
     " return tuple organism / identifier type "
+    logging.debug("Trying to guess organism and identifier type (syms or ids)")
     gene1 = list(genes)[0]
     if gene1.startswith("ENSG"):
         return "human", "ids"
     if gene1.startswith("ENSMUS"):
         return "mouse", "ids"
-    if gene1.upper()==gene1:
+
+    upCount = 0
+    for g in genes:
+        if g.isupper():
+            upCount += 1
+
+    logging.debug("%d of %d genes are uppercase" % (upCount, len(genes)))
+    if upCount/float(len(genes)) > 0.8:
         return "human", "syms"
     else:
         return "mouse", "syms"
@@ -96,18 +105,28 @@ def parseGenes(fname):
 
 def guessGencodeVersion(fileGenes, signGenes):
     logging.info("Number of genes that are only in a gene model release:")
-    diffs = []
+    #diffs = []
+    infos = []
     for version, uniqGenes in signGenes.items():
         intersection = list(fileGenes.intersection(uniqGenes))
-        infoStr = "release "+version+": %d out of %d" % (len(intersection), len(uniqGenes))
+        share = 100.0 * (float(len(intersection)) / len(uniqGenes))
+        intLen = len(intersection)
+        geneCount = len(uniqGenes)
+        infoStr = "release "+version+": %0.2f%%, %d out of %d" % (share, len(intersection), len(uniqGenes))
         if len(intersection)!=0:
             expStr = ", ".join(intersection[:5])
             infoStr += (" e.g. "+ expStr)
-        logging.info(infoStr)
-        diffs.append((len(intersection), version))
+        #logging.info(infoStr)
+        infos.append((share, version, intLen, geneCount, infoStr))
+
+        #diffs.append((len(intersection), version))
+
+    infos.sort(reverse=True)
+    bestVersion = infos[0][1]
 
-    diffs.sort(reverse=True)
-    bestVersion = diffs[0][1]
+    for info in infos:
+        share, version, intLen, geneCount, infoStr = info
+        print(infoStr)
 
     return bestVersion
 
@@ -122,7 +141,9 @@ def guessGencode(fname, org):
     print("Best %s Gencode release\t%s" % (org, bestVersion))
 
     allIds = readGeneSymbols(bestVersion)
-    notFoundIds = set(allIds) - inGenes
+    if geneType=="syms":
+        allIds = allIds.values()
+    notFoundIds = inGenes - set(allIds)
     print("%d of the genes in the input are not part of %s" % (len(notFoundIds), bestVersion))
     print("Examples: %s" % " ".join(list(notFoundIds)[:50]))
 
@@ -354,6 +375,17 @@ def uniqueIds(org):
         allSyms[geneType] = syms
         allIds[geneType] = ids
 
+    # force refseq into this
+    syms = []
+    fname = getStaticFile("genes/entrez-%s.symbols.tsv.gz" % (org))
+    for line in openFile(fname):
+        if line.startswith("#"):
+            continue
+        geneId, sym = line.rstrip("\n").split("\t")
+        syms.append(sym)
+    #verToGenes["refseq"] = set(syms)
+    allSyms["entrez"] = set(syms)
+
     logging.info("Finding unique values")
     uniqSyms, commonSyms = keepOnlyUnique(allSyms)
     uniqIds, commonIds = keepOnlyUnique(allIds)