few changes and bugs. version up

rbpisupati · rbpisupati · commit 01df5ecba00a · 2018-01-26T16:08:59.000+01:00
diff --git a/README.md b/README.md
@@ -105,6 +105,7 @@ These scripts are implemented based on the *A. thaliana* genome sizes. But the g
 - 1.7.2: Stable version, 15-12-2016
 - 1.8.2: Stable version, 16-02-2017
 - 1.9.2: Stable version, 24-08-2017
+- 2.0.0: Stable version, 26-01-2018
 
 
 ## Credits
@@ -116,4 +117,3 @@ These scripts are implemented based on the *A. thaliana* genome sizes. But the g
 
 Pisupati, R. *et al.*. Verification of *Arabidopsis* stock collections using SNPmatch, a tool for genotyping high-plexed samples.  *Nature Scientific Data*  **4**, 170184 (2017).
 [doi:10.1038/sdata.2017.184](https://www.nature.com/articles/sdata2017184)
-
diff --git a/setup.py b/setup.py
@@ -10,7 +10,7 @@
 
 setup(
     name='SNPmatch',
-    version='1.9.2',
+    version='2.0.0',
     description='A simple python library to identify the most likely strain given the SNPs for a sample',
     long_description=long_description,
     url='https://github.com/Gregor-Mendel-Institute/SNPmatch',
diff --git a/snpmatch/__init__.py b/snpmatch/__init__.py
@@ -15,8 +15,8 @@
 from snpmatch.core import simulate
 import logging, logging.config
 
-__version__ = '1.9.2'
-__updated__ = "31.8.2017"
+__version__ = '2.0.0'
+__updated__ = "26.01.2018"
 __date__ = "25.10.2016"
 
 def setLog(logDebug):
@@ -47,6 +47,7 @@ def get_options(program_license,program_version_message):
   inbred_parser.add_argument("-v", "--verbose", action="store_true", dest="logDebug", default=False, help="Show verbose debugging output")
   inbred_parser.add_argument("-o", "--output", dest="outFile", help="Output file with the probability scores")
   inbred_parser.set_defaults(func=snpmatch_inbred)
+
   cross_parser = subparsers.add_parser('cross', help="SNPmatch on the crosses (F2s and F3s) of A. thaliana")
   cross_parser.add_argument("-i", "--input_file", dest="inFile", help="VCF/BED file for the variants in the sample")
   cross_parser.add_argument("-d", "--hdf5_file", dest="hdf5File", help="Path to SNP matrix given in binary hdf5 file chunked row-wise")
@@ -55,6 +56,7 @@ def get_options(program_license,program_version_message):
   cross_parser.add_argument("-v", "--verbose", action="store_true", dest="logDebug", default=False, help="Show verbose debugging output")
   cross_parser.add_argument("-o", "--output", dest="outFile", help="Output files with the probability scores and scores along windows")
   cross_parser.set_defaults(func=snpmatch_cross)
+
   genocross_parser = subparsers.add_parser('genotype_cross', help="Genotype the crosses by windows given parents")
   genocross_parser.add_argument("-i", "--input_file", dest="inFile", help="VCF file for the variants in the sample")
   genocross_parser.add_argument("-e", "--hdf5_acc_file", dest="hdf5accFile", help="Path to SNP matrix given in binary hdf5 file chunked column-wise")
@@ -69,18 +71,21 @@ def get_options(program_license,program_version_message):
   parser.add_argument("-v", "--verbose", action="store_true", dest="logDebug", default=False, help="Show verbose debugging output")
   parser.add_argument("-o", "--output", dest="outFile", help="output + .npz file is generater required for SNPmatch")
   parser.set_defaults(func=snpmatch_parser)
+
   pairparser = subparsers.add_parser('pairsnp', help="pairwise comparison of two snp files")
   pairparser.add_argument("-i", "--input_file_1", dest="inFile_1", help="VCF/BED file for the variants in the sample one")
   pairparser.add_argument("-j", "--input_file_2", dest="inFile_2", help="VCF/BED file for the variants in the sample two")
   pairparser.add_argument("-v", "--verbose", action="store_true", dest="logDebug", default=False, help="Show verbose debugging output")
   pairparser.add_argument("-o", "--output", dest="outFile", help="output json file")
   pairparser.set_defaults(func=snpmatch_paircomparions)
+
   makedbparser = subparsers.add_parser('makedb', help="Create database files from given VCF, only give biallelic SNPs")
-  makedbparser.add_argument("-i", "--input_vcf", dest="inFile", help="input VCF file for the known strains.")
+  makedbparser.add_argument("-i", "--input_vcf", dest="inFile", help="input VCF file for the known strains. You can also provide a CSV file which is an intermediate file in the process.")
   makedbparser.add_argument("-p", "--bcftools_path", dest="bcfpath", help="path to the bcftools executable. Not necessary if present in BASH PATH", default='')
   makedbparser.add_argument("-o", "--out_db_id", dest="db_id", help="output id for database files")
   makedbparser.add_argument("-v", "--verbose", action="store_true", dest="logDebug", default=False, help="Show verbose debugging output")
   makedbparser.set_defaults(func=makedb_vcf_to_hdf5)
+
   simparser = subparsers.add_parser('simulate', help="Given SNP database, check the genotyping efficiency randomly selecting 'n' number of SNPs")
   simparser.add_argument("-d", "--hdf5_file", dest="hdf5File", help="Path to SNP matrix given in binary hdf5 file chunked row-wise")
   simparser.add_argument("-e", "--hdf5_acc_file", dest="hdf5accFile", help="Path to SNP matrix given in binary hdf5 file chunked column-wise")
@@ -122,7 +127,8 @@ def snpmatch_parser(args):
   if not args['outFile']:
     if os.path.isfile(args['inFile'] + ".snpmatch.npz"):
       os.remove(args['inFile'] + ".snpmatch.npz")
-  snpmatch.parseInput(inFile = args['inFile'], logDebug =  args['logDebug'], outFile = args['outFile'])
+  from snpmatch.core import parsers
+  parsers.parseInput(inFile = args['inFile'], logDebug =  args['logDebug'], outFile = args['outFile'])
 
 def genotype_cross(args):
   #checkARGs(args)
diff --git a/snpmatch/core/csmatch.py b/snpmatch/core/csmatch.py
@@ -10,6 +10,7 @@
 import os
 import StringIO
 import snpmatch
+import parsers
 import json
 import itertools
 
@@ -92,10 +93,8 @@ def crossWindower(binLen, snpCHR, snpPOS, snpWEI, DPmean, GenotypeData, outFile)
       tempScore1 = np.sum(np.multiply(np.array(t1001SNPs == samSNPs1, dtype=int).T, matchedTarWei[j:j+chunk_size,1]).T, axis=0)
       tempScore2 = np.sum(np.multiply(np.array(t1001SNPs == samSNPs2, dtype=int).T, matchedTarWei[j:j+chunk_size,2]).T, axis=0)
       ScoreList = ScoreList + tempScore0 + tempScore1 + tempScore2
-      if(len(TarGTs0[j:j+chunk_size]) > 1):
+      if(len(TarGTs0[j:j+chunk_size]) >= 1):
         NumInfoSites = NumInfoSites + len(TarGTs0[j:j+chunk_size]) - np.sum(numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int), axis = 0)
-      elif(len(TarGTs0[j:j+chunk_size]) == 1):
-        NumInfoSites = NumInfoSites + 1 - numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int)
     TotScoreList = TotScoreList + ScoreList
     TotNumInfoSites = TotNumInfoSites + NumInfoSites
     writeBinData(out_file, i, GenotypeData, ScoreList, NumInfoSites)
@@ -197,7 +196,7 @@ def crossIdentifier(binLen, snpCHR, snpPOS, snpWEI, DPmean, GenotypeData, Genoty
     score = 0
     numinfo = 0
     NumMatSNPs = 0
-    for ind,echr in enumerate(snpmatch.parseChrName(GenotypeData.chrs)[0]):
+    for ind,echr in enumerate(parsers.parseChrName(GenotypeData.chrs)[0]):
       perchrTarPos = np.where(snpCHR == echr)[0]
       perchrtarSNPpos = snpPOS[perchrTarPos]
       start = GenotypeData.chr_regions[ind][0]
@@ -223,7 +222,7 @@ def crossIdentifier(binLen, snpCHR, snpPOS, snpWEI, DPmean, GenotypeData, Genoty
   crossInterpreter(GenotypeData, binLen, outID)
 
 def potatoCrossIdentifier(args):
-  (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = snpmatch.parseInput(inFile = args['inFile'], logDebug = args['logDebug'])
+  (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = parsers.parseInput(inFile = args['inFile'], logDebug = args['logDebug'])
   log.info("loading genotype files!")
   GenotypeData = genotype.load_hdf5_genotype_data(args['hdf5File'])
   GenotypeData_acc = genotype.load_hdf5_genotype_data(args['hdf5accFile'])
@@ -252,7 +251,7 @@ def getWindowGenotype(matchedP1, totalMarkers, lr_thres = 2.706):
 
 def crossGenotypeWindows(commonSNPsCHR, commonSNPsPOS, snpsP1, snpsP2, inFile, binLen, outFile, logDebug = True):
     ## inFile are the SNPs of the sample
-    (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = snpmatch.parseInput(inFile = inFile, logDebug = logDebug)
+    (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = parsers.parseInput(inFile = inFile, logDebug = logDebug)
     # identifying the segregating SNPs between the accessions
     # only selecting 0 or 1
     segSNPsind = np.where((snpsP1 != snpsP2) & (snpsP1 >= 0) & (snpsP2 >= 0) & (snpsP1 < 2) & (snpsP2 < 2))[0]
@@ -272,7 +271,7 @@ def crossGenotypeWindows(commonSNPsCHR, commonSNPsPOS, snpsP1, snpsP2, inFile, b
       matchedTarInd = perchrTarPosind[np.where(np.in1d(perchrTarPos, reqPOS))[0]]
       matchedTarGTs = snpGT[matchedTarInd]
       try:
-        TarGTBinary = snpmatch.parseGT(matchedTarGTs)
+        TarGTBinary = parsers.parseGT(matchedTarGTs)
         TarGTBinary[np.where(TarGTBinary == 2)[0]] = 4
         genP1 = np.subtract(TarGTBinary, snpsP1[matchedAccInd])
         genP1no = len(np.where(genP1 == 0)[0])
@@ -299,8 +298,8 @@ def crossGenotyper(args):
         log.info("input files: %s and %s" % (args['parents'], args['father']))
         if not os.path.isfile(args['parents']) and os.path.isfile(args['father']):
             die("either of the input files do not exists, please provide VCF/BED file for parent genotype information")
-        (p1snpCHR, p1snpPOS, p1snpGT, p1snpWEI, p1DPmean) = snpmatch.parseInput(inFile = args['parents'], logDebug = args['logDebug'])
-        (p2snpCHR, p2snpPOS, p2snpGT, p2snpWEI, p2DPmean) = snpmatch.parseInput(inFile = args['father'], logDebug = args['logDebug'])
+        (p1snpCHR, p1snpPOS, p1snpGT, p1snpWEI, p1DPmean) = parsers.parseInput(inFile = args['parents'], logDebug = args['logDebug'])
+        (p2snpCHR, p2snpPOS, p2snpGT, p2snpWEI, p2DPmean) = parsers.parseInput(inFile = args['father'], logDebug = args['logDebug'])
         commonCHRs_ids = np.union1d(p1snpCHR, p2snpCHR)
         commonSNPsCHR = np.zeros(0, dtype=commonCHRs_ids.dtype)
         commonSNPsPOS = np.zeros(0, dtype=int)
@@ -316,8 +315,8 @@ def crossGenotyper(args):
             perchrsnpsP2 = np.repeat(-1, len(perchrPositions)).astype('int8')
             perchrsnpsP1_inds = np.where(np.in1d(p1snpPOS[perchrP1inds], perchrPositions))[0]
             perchrsnpsP2_inds = np.where(np.in1d(p2snpPOS[perchrP2inds], perchrPositions))[0]
-            snpsP1 = np.append(snpsP1, snpmatch.parseGT(p1snpGT[perchrsnpsP1_inds]))
-            snpsP2 = np.append(snpsP2, snpmatch.parseGT(p2snpGT[perchrsnpsP2_inds]))
+            snpsP1 = np.append(snpsP1, parsers.parseGT(p1snpGT[perchrsnpsP1_inds]))
+            snpsP2 = np.append(snpsP2, parsers.parseGT(p2snpGT[perchrsnpsP2_inds]))
         log.info("done!")
     else:
         parents = args['parents']
diff --git a/snpmatch/core/makedb.py b/snpmatch/core/makedb.py
@@ -89,7 +89,7 @@ def makedb_from_vcf(args):
         log.info('done!')
     elif inType == '.csv':
         log.info("converting CSV to hdf5!")
-        makeHDF5s(args['db_id'] + '.csv', args['db_id'])
+        makeHDF5s(args['inFile'], args['db_id'])
         log.info('done!')
     else:
         die("please provide either a VCF file or a CSV!")
diff --git a/snpmatch/core/parsers.py b/snpmatch/core/parsers.py
@@ -0,0 +1,125 @@
+import pandas as pd
+import numpy as np
+import allel
+import snpmatch
+import logging
+import os
+import json
+
+log = logging.getLogger(__name__)
+
+def parseGT(snpGT):
+    first = snpGT[0]
+    snpBinary = np.zeros(len(snpGT), dtype = "int8")
+    if first.find('|') != -1:
+        ## GT is phased
+        separator = "|"
+    elif first.find('/') != -1:
+        ## GT is not phased
+        separator = "/"
+    elif np.char.isdigit(first):
+        return np.array(np.copy(snpGT), dtype = "int8")
+    else:
+        snpmatch.die("unable to parse the format of GT in vcf!")
+    hetGT = "0" + separator + "1"
+    refGT = "0" + separator + "0"
+    altGT = "1" + separator + "1"
+    nocall = "." + separator + "."
+    snpBinary[np.where(snpGT == altGT)[0]] = 1
+    snpBinary[np.where(snpGT == hetGT)[0]] = 2
+    snpBinary[np.where(snpGT == nocall)[0]] = -1
+    return snpBinary
+
+def parseChrName(targetCHR):
+    snpCHROM = np.char.replace(np.core.defchararray.lower(np.array(targetCHR, dtype="string")), "chr", "")
+    snpsREQ = np.where(np.char.isdigit(snpCHROM))[0]   ## Filtering positions from mitochondrial and chloroplast
+    snpCHR = snpCHROM[snpsREQ]
+    return (snpCHR, snpsREQ)
+
+def readBED(inFile, logDebug):
+    log.info("reading the position file")
+    targetSNPs = pd.read_table(inFile, header=None, usecols=[0,1,2])
+    (snpCHR, snpsREQ) = parseChrName(targetSNPs[0])
+    snpPOS = np.array(targetSNPs[1], dtype=int)[snpsREQ]
+    snpGT = np.array(targetSNPs[2])[snpsREQ]
+    snpBinary = parseGT(snpGT)
+    snpWEI = np.ones((len(snpCHR), 3))  ## for homo and het
+    snpWEI[np.where(snpBinary != 0),0] = 0
+    snpWEI[np.where(snpBinary != 1),2] = 0
+    snpWEI[np.where(snpBinary != 2),1] = 0
+    return (snpCHR, snpPOS, snpGT, snpWEI)
+
+def readVcf(inFile, logDebug):
+    log.info("reading the VCF file")
+    ## We read only one sample from the VCF file
+    if logDebug:
+        vcf = allel.read_vcf(inFile, samples = [0], fields = '*')
+    else:
+        import StringIO
+        import sys
+        sys.stderr = StringIO.StringIO()
+        vcf = allel.read_vcf(inFile, samples = [0], fields = '*')
+        #vcf = vcfnp.variants(inFile, cache=False).view(np.recarray)
+        #vcfD = vcfnp.calldata_2d(inFile, cache=False).view(np.recarray)
+        sys.stderr = sys.__stderr__
+    (snpCHR, snpsREQ) = parseChrName(vcf['variants/CHROM'])
+    try:
+        snpGT = allel.GenotypeArray(vcf['calldata/GT']).to_gt()[snpsREQ, 0]
+    except AttributeError:
+        snpmatch.die("input VCF file doesnt have required GT field")
+    snpsREQ = snpsREQ[np.where(snpGT != './.')[0]]
+    snpGT = allel.GenotypeArray(vcf['calldata/GT']).to_gt()[snpsREQ, 0]
+    if 'calldata/PL' in sorted(vcf.keys()):
+        snpWEI = np.copy(vcf['calldata/PL'][snpsREQ, 0]).astype('float')
+        snpWEI = snpWEI/(-10)
+        snpWEI = np.exp(snpWEI)
+
+    else:
+        snpBinary = parseGT(snpGT)
+        snpWEI = np.ones((len(snpsREQ), 3))  ## for homo and het
+        snpWEI[np.where(snpBinary != 0),0] = 0
+        snpWEI[np.where(snpBinary != 1),2] = 0
+        snpWEI[np.where(snpBinary != 2),1] = 0
+    snpCHR = snpCHR[snpsREQ]
+    DPmean = np.mean(vcf['calldata/DP'][snpsREQ,0])
+    snpPOS = np.array(vcf['variants/POS'][snpsREQ])
+    return (DPmean, snpCHR, snpPOS, snpGT, snpWEI)
+
+def parseInput(inFile, logDebug, outFile = "parser"):
+    if outFile == "parser" or not outFile:
+        outFile = inFile + ".snpmatch"
+    if os.path.isfile(inFile + ".snpmatch.npz"):
+        log.info("snpmatch parser dump found! loading %s", inFile + ".snpmatch.npz")
+        snps = np.load(inFile + ".snpmatch.npz")
+        (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = (snps['chr'], snps['pos'], snps['gt'], snps['wei'], snps['dp'])
+    else:
+        _,inType = os.path.splitext(inFile)
+        if inType == '.npz':
+            log.info("loading snpmatch parser file! %s", inFile)
+            snps = np.load(inFile)
+            (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = (snps['chr'], snps['pos'], snps['gt'], snps['wei'], snps['dp'])
+        else:
+            log.info('running snpmatch parser!')
+            if inType == '.vcf':
+                (DPmean, snpCHR, snpPOS, snpGT, snpWEI) = readVcf(inFile, logDebug)
+            elif inType == '.bed':
+                (snpCHR, snpPOS, snpGT, snpWEI) = readBED(inFile, logDebug)
+                DPmean = "NA"
+            else:
+                snpmatch.die("input file type %s not supported" % inType)
+            log.info("creating snpmatch parser file: %s", outFile + '.npz')
+            np.savez(outFile, chr = snpCHR, pos = snpPOS, gt = snpGT, wei = snpWEI, dp = DPmean)
+            NumSNPs = len(snpCHR)
+            case = 0
+            note = "Sufficient number of SNPs"
+            if NumSNPs < snpmatch.snp_thres:
+                note = "Attention: low number of SNPs provided"
+                case = 1
+            snpst = np.unique(snpCHR, return_counts=True)
+            snpdict = dict(('Chr%s' % snpst[0][i], snpst[1][i]) for i in range(len(snpst[0])))
+            statdict = {"interpretation": {"case": case, "text": note}, "snps": snpdict, "num_of_snps": NumSNPs, "depth": DPmean}
+            statdict['percent_heterozygosity'] = snpmatch.getHeterozygosity(snpGT)
+            with open(outFile + ".stats.json", "w") as out_stats:
+                out_stats.write(json.dumps(statdict))
+    log.info("done!")
+    return (snpCHR, snpPOS, snpGT, snpWEI, DPmean)
diff --git a/snpmatch/core/snpmatch.py b/snpmatch/core/snpmatch.py