/
removeRepeatedLowerQualSites.r
executable file
·74 lines (58 loc) · 2.44 KB
/
removeRepeatedLowerQualSites.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/usr/bin/env Rscript
# This script takes a VCF file as input and removes one of two different variants that are at a same site, keeping the one that
# shows the highest QUAL value. The input VCF could be one generated by the concatenation of two VCFs with `bcftools concat`.
# The ouput VCF is sorted by postitions and conpressed.
#
# The input parameters are (following the right order):
# * `INPUT_VCF`, path of the input VCF file;
# * `OUTPUT_VCF`, path of the output VCF file. If the file extension .vcf.gz is missing, it is added automatically.
#
# To run removeRepeatedLowerQualSites:
# Rscript $PATH_TO_REPO/removeRepeatedLowerQualSites.r \
# $INPUT_VCF \
# $OUTPUT_VCF
### input variables
INPUT_VCF=commandArgs(TRUE)[1]
OUTPUT_VCF=commandArgs(TRUE)[2]
### load packages
library(vcfR)
### make sure that the input VCF is sorted by positions
sorted_vcf <- tempfile(fileext=".vcf.gz")
cmd <- gettextf("bcftools sort -O z %s > %s", INPUT_VCF, sorted_vcf)
system(cmd)
### load the sorted VCF file
vcf <- read.vcfR(sorted_vcf)
unlink(sorted_vcf)
### find repeated sites
pos <- paste(vcf@fix[,1], vcf@fix[,2])
k <- duplicated(pos)
repeated_sites_label <- pos[k]
is_repeated_site <- pos %in% repeated_sites_label
### get QUAL of repeated sites
k <- colnames(vcf@fix) == "QUAL"
quals <- as.numeric( vcf@fix[is_repeated_site ,k] )
### separete QUALs of same sites between two columns (take advantage that the VCF is sorted by position)
repeated_quals_matrix <- matrix(quals, ncol=2, byrow=TRUE)
### make a similar matrix but using site indexes
which_is_repeated_site_matrix <- matrix( which(is_repeated_site), ncol=2, byrow=TRUE )
### remove the site with the lowest QUAL
take_first <- (repeated_quals_matrix[,1] - repeated_quals_matrix[,2]) > 0
k <- which_is_repeated_site_matrix[take_first, 1]
k <- c( k, which_is_repeated_site_matrix[!take_first, 2] )
is_repeated_site[k] <- FALSE
vcf <- vcf[!is_repeated_site]
### write the output VCF file, and make sure its extension is .vcf.gz
if( !grepl("\\.vcf.gz$", OUTPUT_VCF) ){
if( grepl("\\.vcf$", OUTPUT_VCF) ){
OUTPUT_VCF <- paste0(OUTPUT_VCF, ".gz")
}else{
OUTPUT_VCF <- paste0(OUTPUT_VCF, ".vcf.gz")
}
}
write.vcf(vcf, file=OUTPUT_VCF)
### for some reason, bcftools can't read VCF files produced by vcfR::write.vcf
### by decompressing and compressing it again, this problem can be solved!!!
cmd <- gettextf("gunzip %s", OUTPUT_VCF)
system(cmd)
cmd <- gettextf( "bgzip %s", sub("\\.gz$", "", OUTPUT_VCF) )
system(cmd)