/
vcfFilter_cris.pl
117 lines (108 loc) · 2.72 KB
/
vcfFilter_cris.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/usr/bin/perl
use warnings;
use strict;
# this program filters a vcf file based on overall sequence coverage, number of non-reference reads, number of alleles, and reverse orientation reads
# usage vcfFilter.pl infile.vcf
#
# change the marked variables below to adjust settings
#
#### stringency variables, edits as desired
## 205 inds, 2x
my $minCoverage = 410; # minimum number of sequences; DP
my $minAltRds = 10; # minimum number of sequences with the alternative allele; AC
my $notFixed = 1.0; # removes loci fixed for alt; AF
my $zs = 3; # absolute z score cut off for bias tests
my $mq = 30; # minimum mapping quality; MQ
my $miss = 41; # maximum number of individuals with no data
##### this set is for GBS
my $d;
my @line;
my $in = shift(@ARGV);
open (IN, $in) or die "Could not read the infile = $in\n";
$in =~ m/^([a-zA-Z_0-9\-]+\.vcf)$/ or die "Failed to match the variant file\n";
open (OUT, "> filtered2x_$1") or die "Could not write the outfile\n";
my $flag = 0;
my $cnt = 0;
while (<IN>){
chomp;
$flag = 1;
if (m/^\#/){ ## header row, always write
$flag = 1;
if(m/\#CHROM/){ ## simplify ids
s/aln_[a-z]+_([a-zA-Z0-9]+)\.sorted\.bam/$1/g;
}
}
elsif (m/^Sc/){ ## this is a sequence line, you migh need to edit this reg. expr.
$flag = 1;
$d = () = (m/\d\/\d:0,0,0:0/g); ## for bcftools call
if ($d >= $miss){
$flag = 0;
##print "fail missing : ";
}
if (m/[ACTGN]\,[ACTGN]/){ ## two alternative alleles identified
$flag = 0;
#print "fail allele : ";
}
@line = split(/\s+/,$_);
if(length($line[3]) > 1 or length($line[4]) > 1){
$flag = 0;
#print "fail INDEL : ";
}
m/DP=(\d+)/ or die "Syntax error, DP not found\n";
if ($1 < $minCoverage){
$flag = 0;
#print "fail DP : ";
}
m/DP4=\d+,\d+,(\d+),(\d+)/ or die "Syntax error DP4 not found\n";
if(($1 + $2) < $minAltRds){
$flag = 0;
}
m/AF1*=([0-9\.e\-]+)/ or die "Syntax error, AF not found\n";
if ($1 == $notFixed){
$flag = 0;
# print "fail AF : ";
}
# z scores
if(m/BQBZ=([0-9e\-\.]*)/){
if (abs($1) > $zs){
$flag = 0;
# print "fail BQRS : ";
}
}
if(m/MQBZ=([0-9e\-\.]*)/){
if (abs($1) > $zs){
$flag = 0;
# print "fail MQRS : ";
}
}
if(m/RPBZ=([0-9e\-\.]*)/){
if (abs($1) > $zs){
$flag = 0;
# print "fail RPRS : ";
}
}
if(m/MQ=([0-9\.]+)/){
if ($1 < $mq){
$flag = 0;
# print "fail MQ : ";
}
}
else{
$flag = 0;
print "faile no MQ : ";
}
if ($flag == 1){
$cnt++; ## this is a good SNV
}
}
else{
print "Warning, failed to match the chromosome or scaffold name regular expression for this line\n$_\n";
$flag = 0;
}
if ($flag == 1){
print OUT "$_\n";
}
}
close (IN);
close (OUT);
print "Finished filtering $in\nRetained $cnt variable loci\n";