Skip to content

Commit 01df5ec

Browse files
committed
few changes and bugs. version up
1 parent 31d4241 commit 01df5ec

File tree

7 files changed

+169
-156
lines changed

7 files changed

+169
-156
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ These scripts are implemented based on the *A. thaliana* genome sizes. But the g
105105
- 1.7.2: Stable version, 15-12-2016
106106
- 1.8.2: Stable version, 16-02-2017
107107
- 1.9.2: Stable version, 24-08-2017
108+
- 2.0.0: Stable version, 26-01-2018
108109

109110

110111
## Credits
@@ -116,4 +117,3 @@ These scripts are implemented based on the *A. thaliana* genome sizes. But the g
116117

117118
Pisupati, R. *et al.* Verification of *Arabidopsis* stock collections using SNPmatch, a tool for genotyping high-plexed samples. *Nature Scientific Data* **4**, 170184 (2017).
118119
[doi:10.1038/sdata.2017.184](https://www.nature.com/articles/sdata2017184)
119-

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
setup(
1212
name='SNPmatch',
13-
version='1.9.2',
13+
version='2.0.0',
1414
description='A simple python library to identify the most likely strain given the SNPs for a sample',
1515
long_description=long_description,
1616
url='https://github.com/Gregor-Mendel-Institute/SNPmatch',

snpmatch/__init__.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@
1515
from snpmatch.core import simulate
1616
import logging, logging.config
1717

18-
__version__ = '1.9.2'
19-
__updated__ = "31.8.2017"
18+
__version__ = '2.0.0'
19+
__updated__ = "26.01.2018"
2020
__date__ = "25.10.2016"
2121

2222
def setLog(logDebug):
@@ -47,6 +47,7 @@ def get_options(program_license,program_version_message):
4747
inbred_parser.add_argument("-v", "--verbose", action="store_true", dest="logDebug", default=False, help="Show verbose debugging output")
4848
inbred_parser.add_argument("-o", "--output", dest="outFile", help="Output file with the probability scores")
4949
inbred_parser.set_defaults(func=snpmatch_inbred)
50+
5051
cross_parser = subparsers.add_parser('cross', help="SNPmatch on the crosses (F2s and F3s) of A. thaliana")
5152
cross_parser.add_argument("-i", "--input_file", dest="inFile", help="VCF/BED file for the variants in the sample")
5253
cross_parser.add_argument("-d", "--hdf5_file", dest="hdf5File", help="Path to SNP matrix given in binary hdf5 file chunked row-wise")
@@ -55,6 +56,7 @@ def get_options(program_license,program_version_message):
5556
cross_parser.add_argument("-v", "--verbose", action="store_true", dest="logDebug", default=False, help="Show verbose debugging output")
5657
cross_parser.add_argument("-o", "--output", dest="outFile", help="Output files with the probability scores and scores along windows")
5758
cross_parser.set_defaults(func=snpmatch_cross)
59+
5860
genocross_parser = subparsers.add_parser('genotype_cross', help="Genotype the crosses by windows given parents")
5961
genocross_parser.add_argument("-i", "--input_file", dest="inFile", help="VCF file for the variants in the sample")
6062
genocross_parser.add_argument("-e", "--hdf5_acc_file", dest="hdf5accFile", help="Path to SNP matrix given in binary hdf5 file chunked column-wise")
@@ -69,18 +71,21 @@ def get_options(program_license,program_version_message):
6971
parser.add_argument("-v", "--verbose", action="store_true", dest="logDebug", default=False, help="Show verbose debugging output")
7072
parser.add_argument("-o", "--output", dest="outFile", help="output + .npz file is generater required for SNPmatch")
7173
parser.set_defaults(func=snpmatch_parser)
74+
7275
pairparser = subparsers.add_parser('pairsnp', help="pairwise comparison of two snp files")
7376
pairparser.add_argument("-i", "--input_file_1", dest="inFile_1", help="VCF/BED file for the variants in the sample one")
7477
pairparser.add_argument("-j", "--input_file_2", dest="inFile_2", help="VCF/BED file for the variants in the sample two")
7578
pairparser.add_argument("-v", "--verbose", action="store_true", dest="logDebug", default=False, help="Show verbose debugging output")
7679
pairparser.add_argument("-o", "--output", dest="outFile", help="output json file")
7780
pairparser.set_defaults(func=snpmatch_paircomparions)
81+
7882
makedbparser = subparsers.add_parser('makedb', help="Create database files from given VCF, only give biallelic SNPs")
79-
makedbparser.add_argument("-i", "--input_vcf", dest="inFile", help="input VCF file for the known strains.")
83+
makedbparser.add_argument("-i", "--input_vcf", dest="inFile", help="input VCF file for the known strains. You can also provide a CSV file which is an intermediate file in the process.")
8084
makedbparser.add_argument("-p", "--bcftools_path", dest="bcfpath", help="path to the bcftools executable. Not necessary if present in BASH PATH", default='')
8185
makedbparser.add_argument("-o", "--out_db_id", dest="db_id", help="output id for database files")
8286
makedbparser.add_argument("-v", "--verbose", action="store_true", dest="logDebug", default=False, help="Show verbose debugging output")
8387
makedbparser.set_defaults(func=makedb_vcf_to_hdf5)
88+
8489
simparser = subparsers.add_parser('simulate', help="Given SNP database, check the genotyping efficiency randomly selecting 'n' number of SNPs")
8590
simparser.add_argument("-d", "--hdf5_file", dest="hdf5File", help="Path to SNP matrix given in binary hdf5 file chunked row-wise")
8691
simparser.add_argument("-e", "--hdf5_acc_file", dest="hdf5accFile", help="Path to SNP matrix given in binary hdf5 file chunked column-wise")
@@ -122,7 +127,8 @@ def snpmatch_parser(args):
122127
if not args['outFile']:
123128
if os.path.isfile(args['inFile'] + ".snpmatch.npz"):
124129
os.remove(args['inFile'] + ".snpmatch.npz")
125-
snpmatch.parseInput(inFile = args['inFile'], logDebug = args['logDebug'], outFile = args['outFile'])
130+
from snpmatch.core import parsers
131+
parsers.parseInput(inFile = args['inFile'], logDebug = args['logDebug'], outFile = args['outFile'])
126132

127133
def genotype_cross(args):
128134
#checkARGs(args)

snpmatch/core/csmatch.py

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import os
1111
import StringIO
1212
import snpmatch
13+
import parsers
1314
import json
1415
import itertools
1516

@@ -92,10 +93,8 @@ def crossWindower(binLen, snpCHR, snpPOS, snpWEI, DPmean, GenotypeData, outFile)
9293
tempScore1 = np.sum(np.multiply(np.array(t1001SNPs == samSNPs1, dtype=int).T, matchedTarWei[j:j+chunk_size,1]).T, axis=0)
9394
tempScore2 = np.sum(np.multiply(np.array(t1001SNPs == samSNPs2, dtype=int).T, matchedTarWei[j:j+chunk_size,2]).T, axis=0)
9495
ScoreList = ScoreList + tempScore0 + tempScore1 + tempScore2
95-
if(len(TarGTs0[j:j+chunk_size]) > 1):
96+
if(len(TarGTs0[j:j+chunk_size]) >= 1):
9697
NumInfoSites = NumInfoSites + len(TarGTs0[j:j+chunk_size]) - np.sum(numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int), axis = 0)
97-
elif(len(TarGTs0[j:j+chunk_size]) == 1):
98-
NumInfoSites = NumInfoSites + 1 - numpy.ma.masked_less(t1001SNPs, 0).mask.astype(int)
9998
TotScoreList = TotScoreList + ScoreList
10099
TotNumInfoSites = TotNumInfoSites + NumInfoSites
101100
writeBinData(out_file, i, GenotypeData, ScoreList, NumInfoSites)
@@ -197,7 +196,7 @@ def crossIdentifier(binLen, snpCHR, snpPOS, snpWEI, DPmean, GenotypeData, Genoty
197196
score = 0
198197
numinfo = 0
199198
NumMatSNPs = 0
200-
for ind,echr in enumerate(snpmatch.parseChrName(GenotypeData.chrs)[0]):
199+
for ind,echr in enumerate(parsers.parseChrName(GenotypeData.chrs)[0]):
201200
perchrTarPos = np.where(snpCHR == echr)[0]
202201
perchrtarSNPpos = snpPOS[perchrTarPos]
203202
start = GenotypeData.chr_regions[ind][0]
@@ -223,7 +222,7 @@ def crossIdentifier(binLen, snpCHR, snpPOS, snpWEI, DPmean, GenotypeData, Genoty
223222
crossInterpreter(GenotypeData, binLen, outID)
224223

225224
def potatoCrossIdentifier(args):
226-
(snpCHR, snpPOS, snpGT, snpWEI, DPmean) = snpmatch.parseInput(inFile = args['inFile'], logDebug = args['logDebug'])
225+
(snpCHR, snpPOS, snpGT, snpWEI, DPmean) = parsers.parseInput(inFile = args['inFile'], logDebug = args['logDebug'])
227226
log.info("loading genotype files!")
228227
GenotypeData = genotype.load_hdf5_genotype_data(args['hdf5File'])
229228
GenotypeData_acc = genotype.load_hdf5_genotype_data(args['hdf5accFile'])
@@ -252,7 +251,7 @@ def getWindowGenotype(matchedP1, totalMarkers, lr_thres = 2.706):
252251

253252
def crossGenotypeWindows(commonSNPsCHR, commonSNPsPOS, snpsP1, snpsP2, inFile, binLen, outFile, logDebug = True):
254253
## inFile are the SNPs of the sample
255-
(snpCHR, snpPOS, snpGT, snpWEI, DPmean) = snpmatch.parseInput(inFile = inFile, logDebug = logDebug)
254+
(snpCHR, snpPOS, snpGT, snpWEI, DPmean) = parsers.parseInput(inFile = inFile, logDebug = logDebug)
256255
# identifying the segregating SNPs between the accessions
257256
# only selecting 0 or 1
258257
segSNPsind = np.where((snpsP1 != snpsP2) & (snpsP1 >= 0) & (snpsP2 >= 0) & (snpsP1 < 2) & (snpsP2 < 2))[0]
@@ -272,7 +271,7 @@ def crossGenotypeWindows(commonSNPsCHR, commonSNPsPOS, snpsP1, snpsP2, inFile, b
272271
matchedTarInd = perchrTarPosind[np.where(np.in1d(perchrTarPos, reqPOS))[0]]
273272
matchedTarGTs = snpGT[matchedTarInd]
274273
try:
275-
TarGTBinary = snpmatch.parseGT(matchedTarGTs)
274+
TarGTBinary = parsers.parseGT(matchedTarGTs)
276275
TarGTBinary[np.where(TarGTBinary == 2)[0]] = 4
277276
genP1 = np.subtract(TarGTBinary, snpsP1[matchedAccInd])
278277
genP1no = len(np.where(genP1 == 0)[0])
@@ -299,8 +298,8 @@ def crossGenotyper(args):
299298
log.info("input files: %s and %s" % (args['parents'], args['father']))
300299
if not os.path.isfile(args['parents']) and os.path.isfile(args['father']):
301300
die("either of the input files do not exists, please provide VCF/BED file for parent genotype information")
302-
(p1snpCHR, p1snpPOS, p1snpGT, p1snpWEI, p1DPmean) = snpmatch.parseInput(inFile = args['parents'], logDebug = args['logDebug'])
303-
(p2snpCHR, p2snpPOS, p2snpGT, p2snpWEI, p2DPmean) = snpmatch.parseInput(inFile = args['father'], logDebug = args['logDebug'])
301+
(p1snpCHR, p1snpPOS, p1snpGT, p1snpWEI, p1DPmean) = parsers.parseInput(inFile = args['parents'], logDebug = args['logDebug'])
302+
(p2snpCHR, p2snpPOS, p2snpGT, p2snpWEI, p2DPmean) = parsers.parseInput(inFile = args['father'], logDebug = args['logDebug'])
304303
commonCHRs_ids = np.union1d(p1snpCHR, p2snpCHR)
305304
commonSNPsCHR = np.zeros(0, dtype=commonCHRs_ids.dtype)
306305
commonSNPsPOS = np.zeros(0, dtype=int)
@@ -316,8 +315,8 @@ def crossGenotyper(args):
316315
perchrsnpsP2 = np.repeat(-1, len(perchrPositions)).astype('int8')
317316
perchrsnpsP1_inds = np.where(np.in1d(p1snpPOS[perchrP1inds], perchrPositions))[0]
318317
perchrsnpsP2_inds = np.where(np.in1d(p2snpPOS[perchrP2inds], perchrPositions))[0]
319-
snpsP1 = np.append(snpsP1, snpmatch.parseGT(p1snpGT[perchrsnpsP1_inds]))
320-
snpsP2 = np.append(snpsP2, snpmatch.parseGT(p2snpGT[perchrsnpsP2_inds]))
318+
snpsP1 = np.append(snpsP1, parsers.parseGT(p1snpGT[perchrsnpsP1_inds]))
319+
snpsP2 = np.append(snpsP2, parsers.parseGT(p2snpGT[perchrsnpsP2_inds]))
321320
log.info("done!")
322321
else:
323322
parents = args['parents']

snpmatch/core/makedb.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ def makedb_from_vcf(args):
8989
log.info('done!')
9090
elif inType == '.csv':
9191
log.info("converting CSV to hdf5!")
92-
makeHDF5s(args['db_id'] + '.csv', args['db_id'])
92+
makeHDF5s(args['inFile'], args['db_id'])
9393
log.info('done!')
9494
else:
9595
die("please provide either a VCF file or a CSV!")

snpmatch/core/parsers.py

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
import pandas as pd
2+
import numpy as np
3+
import allel
4+
import snpmatch
5+
import logging
6+
import os
7+
import json
8+
9+
log = logging.getLogger(__name__)
10+
11+
def parseGT(snpGT):
    """Encode an array of genotype strings as int8 codes.

    The separator ("|" or "/") is detected from the first element only and
    is then assumed for the whole array. Codes written:

        1   homozygous alternate ("1<sep>1")
        2   heterozygous ("0<sep>1" or "1<sep>0")
        -1  no call (".<sep>.")
        0   everything else, including homozygous reference

    If the first element is a digit string, the array is taken to be
    already encoded and is returned as an int8 copy.

    :param snpGT: numpy array of genotype strings
    :return: numpy int8 array of the same length as snpGT
    """
    snpBinary = np.zeros(len(snpGT), dtype = "int8")
    if len(snpGT) == 0:
        ## nothing to inspect; indexing snpGT[0] below would raise IndexError
        return snpBinary
    first = snpGT[0]
    if first.find('|') != -1:
        ## GT is phased
        separator = "|"
    elif first.find('/') != -1:
        ## GT is not phased
        separator = "/"
    elif np.char.isdigit(first):
        return np.array(np.copy(snpGT), dtype = "int8")
    else:
        ## NOTE(review): die() is expected to abort here; if it returned,
        ## separator would be undefined below -- confirm.
        snpmatch.die("unable to parse the format of GT in vcf!")
    hetGT = "0" + separator + "1"
    hetGTrev = "1" + separator + "0"
    altGT = "1" + separator + "1"
    nocall = "." + separator + "."
    snpBinary[np.where(snpGT == altGT)[0]] = 1
    snpBinary[np.where(snpGT == hetGT)[0]] = 2
    ## allele order does not change zygosity: 1|0 is as heterozygous as 0|1
    snpBinary[np.where(snpGT == hetGTrev)[0]] = 2
    snpBinary[np.where(snpGT == nocall)[0]] = -1
    return snpBinary
32+
33+
def parseChrName(targetCHR):
    """Normalise chromosome names and keep only the numeric ones.

    Names are lower-cased and every occurrence of "chr" is removed; entries
    that are not purely digits afterwards (e.g. mitochondrial and
    chloroplast contigs) are dropped.

    :param targetCHR: sequence of chromosome identifiers
    :return: tuple (snpCHR, snpsREQ) -- the retained, normalised names and
        the indices into targetCHR they came from
    """
    ## dtype=str and np.char.lower work on both Python 2 and 3; the
    ## "string" dtype alias and the np.core.defchararray path do not.
    snpCHROM = np.char.replace(np.char.lower(np.array(targetCHR, dtype=str)), "chr", "")
    snpsREQ = np.where(np.char.isdigit(snpCHROM))[0]  ## Filtering positions from mitochondrial and chloroplast
    snpCHR = snpCHROM[snpsREQ]
    return (snpCHR, snpsREQ)
38+
39+
def readBED(inFile, logDebug):
    """Read chromosome, position and genotype from a headerless table.

    Only the first three columns of inFile are read. Rows whose chromosome
    name is not numeric after parseChrName() are discarded.

    :param inFile: path to the tab-separated position file
    :param logDebug: accepted for signature parity with readVcf; unused here
    :return: tuple (snpCHR, snpPOS, snpGT, snpWEI); snpWEI has one row per
        SNP and three columns, with a single 1 in the column selected by the
        genotype code from parseGT() (code 0 -> column 0, code 2 -> column 1,
        code 1 -> column 2) and zeros elsewhere
    """
    log.info("reading the position file")
    bedTable = pd.read_table(inFile, header=None, usecols=[0,1,2])
    (snpCHR, keptRows) = parseChrName(bedTable[0])
    snpPOS = np.array(bedTable[1], dtype=int)[keptRows]
    snpGT = np.array(bedTable[2])[keptRows]
    gtCodes = parseGT(snpGT)
    ## start from all ones, then zero every column that does not match the call
    snpWEI = np.ones((len(snpCHR), 3))
    for gtCode, weightCol in ((0, 0), (1, 2), (2, 1)):
        snpWEI[gtCodes != gtCode, weightCol] = 0
    return (snpCHR, snpPOS, snpGT, snpWEI)
51+
52+
def readVcf(inFile, logDebug):
    """Read the first sample of a VCF file into SNP arrays.

    Sites on non-numeric chromosomes (see parseChrName) and sites whose
    genotype is './.' are dropped.

    :param inFile: path to the VCF file
    :param logDebug: when false, anything written to stderr while the VCF is
        being read is discarded
    :return: tuple (DPmean, snpCHR, snpPOS, snpGT, snpWEI). DPmean is the mean
        of calldata/DP over the retained sites. snpWEI is exp(-PL/10) when the
        file has a PL field; otherwise it is a three-column 0/1 matrix derived
        from the genotype codes of parseGT().
    """
    log.info("reading the VCF file")
    ## We read only one sample from the VCF file
    if logDebug:
        vcf = allel.read_vcf(inFile, samples = [0], fields = '*')
    else:
        import StringIO
        import sys
        sys.stderr = StringIO.StringIO()
        try:
            vcf = allel.read_vcf(inFile, samples = [0], fields = '*')
        finally:
            ## always give stderr back, even when reading fails
            sys.stderr = sys.__stderr__
    (snpCHR, snpsREQ) = parseChrName(vcf['variants/CHROM'])
    try:
        allGT = allel.GenotypeArray(vcf['calldata/GT']).to_gt()
    except (AttributeError, KeyError):
        ## a missing 'calldata/GT' entry surfaces as KeyError on the lookup
        snpmatch.die("input VCF file doesnt have required GT field")
    ## `called` indexes into the chromosome-filtered arrays, whereas snpsREQ
    ## holds row numbers of the whole VCF. snpCHR was already filtered by
    ## parseChrName, so it must be subset with `called`, not with snpsREQ.
    called = np.where(allGT[snpsREQ, 0] != './.')[0]
    snpCHR = snpCHR[called]
    snpsREQ = snpsREQ[called]
    snpGT = allGT[snpsREQ, 0]
    if 'calldata/PL' in vcf:
        ## NOTE(review): presumably three PL values per site (ref/het/alt) -- confirm
        snpWEI = np.copy(vcf['calldata/PL'][snpsREQ, 0]).astype('float')
        snpWEI = snpWEI/(-10)
        snpWEI = np.exp(snpWEI)
    else:
        snpBinary = parseGT(snpGT)
        snpWEI = np.ones((len(snpsREQ), 3))  ## for homo and het
        snpWEI[np.where(snpBinary != 0),0] = 0
        snpWEI[np.where(snpBinary != 1),2] = 0
        snpWEI[np.where(snpBinary != 2),1] = 0
    DPmean = np.mean(vcf['calldata/DP'][snpsREQ,0])
    snpPOS = np.array(vcf['variants/POS'][snpsREQ])
    return (DPmean, snpCHR, snpPOS, snpGT, snpWEI)
87+
88+
def _loadParserDump(npzFile):
    """Return (chr, pos, gt, wei, dp) from a snpmatch parser dump and close it."""
    ## np.load on an .npz keeps the archive open until it is closed
    with np.load(npzFile) as snps:
        return (snps['chr'], snps['pos'], snps['gt'], snps['wei'], snps['dp'])


def _writeParserStats(outFile, snpCHR, snpGT, DPmean):
    """Write per-chromosome SNP counts and summary notes to outFile + '.stats.json'."""
    NumSNPs = len(snpCHR)
    case = 0
    note = "Sufficient number of SNPs"
    if NumSNPs < snpmatch.snp_thres:
        note = "Attention: low number of SNPs provided"
        case = 1
    (chrNames, chrCounts) = np.unique(snpCHR, return_counts=True)
    ## int() keeps the counts JSON-serialisable regardless of the numpy integer type
    snpdict = dict(('Chr%s' % name, int(count)) for name, count in zip(chrNames, chrCounts))
    statdict = {"interpretation": {"case": case, "text": note}, "snps": snpdict, "num_of_snps": NumSNPs, "depth": DPmean}
    statdict['percent_heterozygosity'] = snpmatch.getHeterozygosity(snpGT)
    with open(outFile + ".stats.json", "w") as out_stats:
        out_stats.write(json.dumps(statdict))


def parseInput(inFile, logDebug, outFile = "parser"):
    """Load SNPs for a sample, reusing a parser dump when one exists.

    Resolution order:
      1. inFile + ".snpmatch.npz" exists -> load that dump;
      2. inFile itself ends in ".npz"    -> load it as a dump;
      3. otherwise parse inFile (".vcf" via readVcf, ".bed" via readBED),
         save a dump to outFile + ".npz" and write outFile + ".stats.json".

    :param inFile: path to a VCF, BED or parser-dump (.npz) file
    :param logDebug: forwarded to the VCF/BED readers
    :param outFile: prefix for the dump and stats files; "parser" or an empty
        value means inFile + ".snpmatch"
    :return: tuple (snpCHR, snpPOS, snpGT, snpWEI, DPmean); DPmean is "NA"
        for BED input
    """
    if outFile == "parser" or not outFile:
        outFile = inFile + ".snpmatch"
    dumpFile = inFile + ".snpmatch.npz"
    if os.path.isfile(dumpFile):
        log.info("snpmatch parser dump found! loading %s", dumpFile)
        (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = _loadParserDump(dumpFile)
    else:
        _,inType = os.path.splitext(inFile)
        if inType == '.npz':
            log.info("loading snpmatch parser file! %s", inFile)
            (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = _loadParserDump(inFile)
        else:
            log.info('running snpmatch parser!')
            if inType == '.vcf':
                (DPmean, snpCHR, snpPOS, snpGT, snpWEI) = readVcf(inFile, logDebug)
            elif inType == '.bed':
                (snpCHR, snpPOS, snpGT, snpWEI) = readBED(inFile, logDebug)
                DPmean = "NA"
            else:
                snpmatch.die("input file type %s not supported" % inType)
            log.info("creating snpmatch parser file: %s", outFile + '.npz')
            np.savez(outFile, chr = snpCHR, pos = snpPOS, gt = snpGT, wei = snpWEI, dp = DPmean)
            ## NOTE(review): nesting reconstructed from a whitespace-stripped
            ## listing -- the dump and stats are assumed to be written only on
            ## a fresh parse; confirm against the repository.
            _writeParserStats(outFile, snpCHR, snpGT, DPmean)
    log.info("done!")
    return (snpCHR, snpPOS, snpGT, snpWEI, DPmean)

0 commit comments

Comments
 (0)