Skip to content

Commit

Permalink
fixing export selected, #27379, and scanpy import problems reported b…
Browse files Browse the repository at this point in the history
…y pablo
  • Loading branch information
maximilianh committed Apr 30, 2021
1 parent f83fd2b commit f9506e1
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 14 deletions.
9 changes: 9 additions & 0 deletions src/cbPyLib/cellbrowser/cbWeb/js/cbData.js
Original file line number Diff line number Diff line change
Expand Up @@ -963,6 +963,14 @@ function CbDbFile(url) {
return self.geneOffsets;
};

this.getCellIdMeta = function() {
/* return the cell ID meta field */
for (let metaInfo of self.conf.metaFields)
if (!metaInfo.isCustom)
return metaInfo;
}


this.loadCellIds = function(idxArray, onDone, onProgress) {
/* Get the cellId strings, the first meta field, for the integer IDs of cells in idxArray.
* Calls onDone with an array of strings.
Expand All @@ -989,6 +997,7 @@ function CbDbFile(url) {
onDone(mapIdxToId(idxArray));
}

// start of loadCellIds
if (self.cellIds===undefined) {
// if we haven't loaded them yet, trigger the cellId load
let cellIdMeta = self.getCellIdMeta();
Expand Down
12 changes: 8 additions & 4 deletions src/cbPyLib/cellbrowser/cellbrowser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4091,6 +4091,8 @@ def anndataMatrixToTsv(ad, matFname, usePandas=False, useRaw=False):
mat = mat.tocsr() # makes writing to a file ten times faster, thanks Alex Wolf!

if usePandas:
# This code is currently not used anywhere by this code. It's still there
# so external code that calls this function can use it, if the code below does not work.
logging.info("Converting anndata to pandas dataframe")
data_matrix=pd.DataFrame(mat, index=var.index.tolist(), columns=ad.obs.index.tolist())
logging.info("Writing pandas dataframe to file (slow?)")
Expand Down Expand Up @@ -4153,7 +4155,7 @@ def makeDictDefaults(inVar, defaults):

def runSafeRankGenesGroups(adata, clusterField, minCells=5):
" run scanpy's rank_genes_groups in a way that hopefully doesn't crash "
importScanpy()
sc = importScanpy()

adata.obs[clusterField] = adata.obs[clusterField].astype("category") # if not category, rank_genes will crash
sc.pp.filter_genes(adata, min_cells=minCells) # rank_genes_groups crashes on zero-value genes
Expand Down Expand Up @@ -4787,7 +4789,7 @@ def cbBuildCli():

def readMatrixAnndata(matrixFname, samplesOnRows=False, genome="hg38"):
" read an expression matrix and return an adata object. Supports .mtx, .h5 and .tsv (not .tsv.gz) "
importScanpy()
sc = importScanpy()

if matrixFname.endswith(".mtx.gz"):
errAbort("For cellranger3-style .mtx files, please specify the directory, not the .mtx.gz file name")
Expand Down Expand Up @@ -5368,7 +5370,7 @@ def getObsmKeys(adata):
def cbScanpy(matrixFname, inMeta, inCluster, confFname, figDir, logFname):
""" run expr matrix through scanpy, output a cellbrowser.conf, a matrix and the meta data.
Return an adata object. Optionally keeps a copy of the raw matrix in adata.raw """
importScanpy()
sc = importScanpy()

import pandas as pd
import numpy as np
Expand All @@ -5391,7 +5393,8 @@ def cbScanpy(matrixFname, inMeta, inCluster, confFname, figDir, logFname):
sys.stdout = Tee(logFname) # we want our log messages and also scanpy messages into one file

pipeLog("cbScanpy $Id$")
pipeLog("Input file: %s" % matrixFname)
pipeLog("Command: %s" % " ".join(sys.argv))
pipeLog("Matrix input file: %s" % matrixFname)

pipeLog("Restricting OPENBLAS to 4 threads")
os.environ["OPENBLAS_NUM_THREADS"] = "4" # export OPENBLAS_NUM_THREADS=4
Expand Down Expand Up @@ -5812,6 +5815,7 @@ def importScanpy():
print("$ conda install -c conda-forge python-igraph leiden")
print("Then re-run this command.")
sys.exit(1)
return sc

def cbScanpyCli():
" command line interface for cbScanpy "
Expand Down
52 changes: 42 additions & 10 deletions src/cbPyLib/cellbrowser/genes.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def cbGenes_parseArgs():
parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
(options, args) = parser.parse_args()

if args==[]:
if args==[] or (args[0]=="guess" and len(args)==1):
parser.print_help()
exit(1)

Expand All @@ -61,18 +61,27 @@ def parseSignatures(org, geneIdType):
if line.startswith("#"):
continue
version, geneIds = line.rstrip("\n").split('\t')
geneIds = geneIds.split("|")
geneIds = set(geneIds.split("|"))
verToGenes[version] = geneIds

return verToGenes

def guessGeneIdType(genes):
" return tuple organism / identifier type "
logging.debug("Trying to guess organism and identifier type (syms or ids)")
gene1 = list(genes)[0]
if gene1.startswith("ENSG"):
return "human", "ids"
if gene1.startswith("ENSMUS"):
return "mouse", "ids"
if gene1.upper()==gene1:

upCount = 0
for g in genes:
if g.isupper():
upCount += 1

logging.debug("%d of %d genes are uppercase" % (upCount, len(genes)))
if upCount/float(len(genes)) > 0.8:
return "human", "syms"
else:
return "mouse", "syms"
Expand All @@ -96,18 +105,28 @@ def parseGenes(fname):

def guessGencodeVersion(fileGenes, signGenes):
logging.info("Number of genes that are only in a gene model release:")
diffs = []
#diffs = []
infos = []
for version, uniqGenes in signGenes.items():
intersection = list(fileGenes.intersection(uniqGenes))
infoStr = "release "+version+": %d out of %d" % (len(intersection), len(uniqGenes))
share = 100.0 * (float(len(intersection)) / len(uniqGenes))
intLen = len(intersection)
geneCount = len(uniqGenes)
infoStr = "release "+version+": %0.2f%%, %d out of %d" % (share, len(intersection), len(uniqGenes))
if len(intersection)!=0:
expStr = ", ".join(intersection[:5])
infoStr += (" e.g. "+ expStr)
logging.info(infoStr)
diffs.append((len(intersection), version))
#logging.info(infoStr)
infos.append((share, version, intLen, geneCount, infoStr))

#diffs.append((len(intersection), version))

infos.sort(reverse=True)
bestVersion = infos[0][1]

diffs.sort(reverse=True)
bestVersion = diffs[0][1]
for info in infos:
share, version, intLen, geneCount, infoStr = info
print(infoStr)

return bestVersion

Expand All @@ -122,7 +141,9 @@ def guessGencode(fname, org):
print("Best %s Gencode release\t%s" % (org, bestVersion))

allIds = readGeneSymbols(bestVersion)
notFoundIds = set(allIds) - inGenes
if geneType=="syms":
allIds = allIds.values()
notFoundIds = inGenes - set(allIds)
print("%d of the genes in the input are not part of %s" % (len(notFoundIds), bestVersion))
print("Examples: %s" % " ".join(list(notFoundIds)[:50]))

Expand Down Expand Up @@ -354,6 +375,17 @@ def uniqueIds(org):
allSyms[geneType] = syms
allIds[geneType] = ids

# force refseq into this
syms = []
fname = getStaticFile("genes/entrez-%s.symbols.tsv.gz" % (org))
for line in openFile(fname):
if line.startswith("#"):
continue
geneId, sym = line.rstrip("\n").split("\t")
syms.append(sym)
#verToGenes["refseq"] = set(syms)
allSyms["entrez"] = set(syms)

logging.info("Finding unique values")
uniqSyms, commonSyms = keepOnlyUnique(allSyms)
uniqIds, commonIds = keepOnlyUnique(allIds)
Expand Down

0 comments on commit f9506e1

Please sign in to comment.