|
| 1 | +import pandas as pd |
| 2 | +import numpy as np |
| 3 | +import allel |
| 4 | +import snpmatch |
| 5 | +import logging |
| 6 | +import os |
| 7 | +import json |
| 8 | + |
| 9 | +log = logging.getLogger(__name__) |
| 10 | + |
| 11 | +def parseGT(snpGT): |
| 12 | + first = snpGT[0] |
| 13 | + snpBinary = np.zeros(len(snpGT), dtype = "int8") |
| 14 | + if first.find('|') != -1: |
| 15 | + ## GT is phased |
| 16 | + separator = "|" |
| 17 | + elif first.find('/') != -1: |
| 18 | + ## GT is not phased |
| 19 | + separator = "/" |
| 20 | + elif np.char.isdigit(first): |
| 21 | + return np.array(np.copy(snpGT), dtype = "int8") |
| 22 | + else: |
| 23 | + snpmatch.die("unable to parse the format of GT in vcf!") |
| 24 | + hetGT = "0" + separator + "1" |
| 25 | + refGT = "0" + separator + "0" |
| 26 | + altGT = "1" + separator + "1" |
| 27 | + nocall = "." + separator + "." |
| 28 | + snpBinary[np.where(snpGT == altGT)[0]] = 1 |
| 29 | + snpBinary[np.where(snpGT == hetGT)[0]] = 2 |
| 30 | + snpBinary[np.where(snpGT == nocall)[0]] = -1 |
| 31 | + return snpBinary |
| 32 | + |
| 33 | +def parseChrName(targetCHR): |
| 34 | + snpCHROM = np.char.replace(np.core.defchararray.lower(np.array(targetCHR, dtype="string")), "chr", "") |
| 35 | + snpsREQ = np.where(np.char.isdigit(snpCHROM))[0] ## Filtering positions from mitochondrial and chloroplast |
| 36 | + snpCHR = snpCHROM[snpsREQ] |
| 37 | + return (snpCHR, snpsREQ) |
| 38 | + |
| 39 | +def readBED(inFile, logDebug): |
| 40 | + log.info("reading the position file") |
| 41 | + targetSNPs = pd.read_table(inFile, header=None, usecols=[0,1,2]) |
| 42 | + (snpCHR, snpsREQ) = parseChrName(targetSNPs[0]) |
| 43 | + snpPOS = np.array(targetSNPs[1], dtype=int)[snpsREQ] |
| 44 | + snpGT = np.array(targetSNPs[2])[snpsREQ] |
| 45 | + snpBinary = parseGT(snpGT) |
| 46 | + snpWEI = np.ones((len(snpCHR), 3)) ## for homo and het |
| 47 | + snpWEI[np.where(snpBinary != 0),0] = 0 |
| 48 | + snpWEI[np.where(snpBinary != 1),2] = 0 |
| 49 | + snpWEI[np.where(snpBinary != 2),1] = 0 |
| 50 | + return (snpCHR, snpPOS, snpGT, snpWEI) |
| 51 | + |
| 52 | +def readVcf(inFile, logDebug): |
| 53 | + log.info("reading the VCF file") |
| 54 | + ## We read only one sample from the VCF file |
| 55 | + if logDebug: |
| 56 | + vcf = allel.read_vcf(inFile, samples = [0], fields = '*') |
| 57 | + else: |
| 58 | + import StringIO |
| 59 | + import sys |
| 60 | + sys.stderr = StringIO.StringIO() |
| 61 | + vcf = allel.read_vcf(inFile, samples = [0], fields = '*') |
| 62 | + #vcf = vcfnp.variants(inFile, cache=False).view(np.recarray) |
| 63 | + #vcfD = vcfnp.calldata_2d(inFile, cache=False).view(np.recarray) |
| 64 | + sys.stderr = sys.__stderr__ |
| 65 | + (snpCHR, snpsREQ) = parseChrName(vcf['variants/CHROM']) |
| 66 | + try: |
| 67 | + snpGT = allel.GenotypeArray(vcf['calldata/GT']).to_gt()[snpsREQ, 0] |
| 68 | + except AttributeError: |
| 69 | + snpmatch.die("input VCF file doesnt have required GT field") |
| 70 | + snpsREQ = snpsREQ[np.where(snpGT != './.')[0]] |
| 71 | + snpGT = allel.GenotypeArray(vcf['calldata/GT']).to_gt()[snpsREQ, 0] |
| 72 | + if 'calldata/PL' in sorted(vcf.keys()): |
| 73 | + snpWEI = np.copy(vcf['calldata/PL'][snpsREQ, 0]).astype('float') |
| 74 | + snpWEI = snpWEI/(-10) |
| 75 | + snpWEI = np.exp(snpWEI) |
| 76 | + |
| 77 | + else: |
| 78 | + snpBinary = parseGT(snpGT) |
| 79 | + snpWEI = np.ones((len(snpsREQ), 3)) ## for homo and het |
| 80 | + snpWEI[np.where(snpBinary != 0),0] = 0 |
| 81 | + snpWEI[np.where(snpBinary != 1),2] = 0 |
| 82 | + snpWEI[np.where(snpBinary != 2),1] = 0 |
| 83 | + snpCHR = snpCHR[snpsREQ] |
| 84 | + DPmean = np.mean(vcf['calldata/DP'][snpsREQ,0]) |
| 85 | + snpPOS = np.array(vcf['variants/POS'][snpsREQ]) |
| 86 | + return (DPmean, snpCHR, snpPOS, snpGT, snpWEI) |
| 87 | + |
| 88 | +def parseInput(inFile, logDebug, outFile = "parser"): |
| 89 | + if outFile == "parser" or not outFile: |
| 90 | + outFile = inFile + ".snpmatch" |
| 91 | + if os.path.isfile(inFile + ".snpmatch.npz"): |
| 92 | + log.info("snpmatch parser dump found! loading %s", inFile + ".snpmatch.npz") |
| 93 | + snps = np.load(inFile + ".snpmatch.npz") |
| 94 | + (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = (snps['chr'], snps['pos'], snps['gt'], snps['wei'], snps['dp']) |
| 95 | + else: |
| 96 | + _,inType = os.path.splitext(inFile) |
| 97 | + if inType == '.npz': |
| 98 | + log.info("loading snpmatch parser file! %s", inFile) |
| 99 | + snps = np.load(inFile) |
| 100 | + (snpCHR, snpPOS, snpGT, snpWEI, DPmean) = (snps['chr'], snps['pos'], snps['gt'], snps['wei'], snps['dp']) |
| 101 | + else: |
| 102 | + log.info('running snpmatch parser!') |
| 103 | + if inType == '.vcf': |
| 104 | + (DPmean, snpCHR, snpPOS, snpGT, snpWEI) = readVcf(inFile, logDebug) |
| 105 | + elif inType == '.bed': |
| 106 | + (snpCHR, snpPOS, snpGT, snpWEI) = readBED(inFile, logDebug) |
| 107 | + DPmean = "NA" |
| 108 | + else: |
| 109 | + snpmatch.die("input file type %s not supported" % inType) |
| 110 | + log.info("creating snpmatch parser file: %s", outFile + '.npz') |
| 111 | + np.savez(outFile, chr = snpCHR, pos = snpPOS, gt = snpGT, wei = snpWEI, dp = DPmean) |
| 112 | + NumSNPs = len(snpCHR) |
| 113 | + case = 0 |
| 114 | + note = "Sufficient number of SNPs" |
| 115 | + if NumSNPs < snpmatch.snp_thres: |
| 116 | + note = "Attention: low number of SNPs provided" |
| 117 | + case = 1 |
| 118 | + snpst = np.unique(snpCHR, return_counts=True) |
| 119 | + snpdict = dict(('Chr%s' % snpst[0][i], snpst[1][i]) for i in range(len(snpst[0]))) |
| 120 | + statdict = {"interpretation": {"case": case, "text": note}, "snps": snpdict, "num_of_snps": NumSNPs, "depth": DPmean} |
| 121 | + statdict['percent_heterozygosity'] = snpmatch.getHeterozygosity(snpGT) |
| 122 | + with open(outFile + ".stats.json", "w") as out_stats: |
| 123 | + out_stats.write(json.dumps(statdict)) |
| 124 | + log.info("done!") |
| 125 | + return (snpCHR, snpPOS, snpGT, snpWEI, DPmean) |
0 commit comments