Skip to content
Permalink
master
Switch branches/tags
Go to file
 
 
Cannot retrieve contributors at this time
executable file 172 lines (143 sloc) 5.92 KB
#!/bin/sh
#
# gene-cutter - excise sequence sections matching a template sequence
# Copyright (C) 2016 Marco van Zwetselaar <io@zwets.it>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Part of http://io.zwets.it/blast-galley
# Defaults
PERC_ID=90.0
E_VALUE=0.01
QCOV_PC=60
STATS=1
TAB="$(printf '\t')"
# Function to emit information to standard error if VERBOSE is set
emit() {
[ -z "$VERBOSE" ] || echo "$(basename "$0"): $*" >&2
}
# Function to exit this script with an error message on stderr
err_exit() {
echo "$(basename "$0"): $*" >&2
exit 1
}
# BLAST output format specifier and column offsets
BLAST_OUTFMT="6 qacc qstart qend qlen sacc sstart send length evalue pident bitscore qcovs qcovhsp nident gapopen gaps mismatch stitle sseq"
# $ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
# Function to perform the BLAST query; args $* are added to its end
blast_query() {
if [ -z "$PROTEIN" ]; then
blastn -task blastn ${DATABASE:+-num_threads $(nproc)} \
-evalue "$E_VALUE" -perc_identity "$PERC_ID" -qcov_hsp_perc "$QCOV_PC" \
-query "$TEMPLATE_FILE" -outfmt "$BLAST_OUTFMT" $*
else
tblastn -task tblastn ${DATABASE:+-num_threads $(nproc)} \
-evalue "$E_VALUE" -qcov_hsp_perc "$QCOV_PC" \
-query "$TEMPLATE_FILE" -outfmt "$BLAST_OUTFMT" $*
fi
}
# Function to process the BLAST output into FASTA
process_output() {
sort -t "$TAB" -k11,11nr |
if [ $BEST ]; then head -1; else cat; fi |
awk -F "$TAB" -v INFO="${INFO:-Segment}" -v STATS=$STATS '{
printf ">lcl|cut%04d %s from %s %s matching %s", NR, INFO, $5, $18, $1
if (STATS) printf " (len %d/%d %.1f%, pos %d..%d; e=%.3f, p=%.1f%, b=%s; i=%d, g=%d)", $8, $4, $13, $6, $7, $9, $10, $11, $14, $15
printf "\n"
# We need to strip the dashed deletions
gsub("-","",$NF)
print $NF
}'
}
# Function to show usage information and exit
usage_exit() {
echo "
Usage: $(basename $0) [OPTIONS] TEMPLATE [TARGET ...]
Excise from each TARGET all sections that match TEMPLATE.
TEMPLATE is a FASTA file. TARGETs are either FASTA files or sequence
identifiers in BLAST database DB. When no TARGET is specified, reads
FASTA from stdin, or with option --database searches all of DB.
Matching segments are written to stdout as FASTA, in decreasting order
of bitscore. With option --best, only the top scoring match is output.
Note that matching segments that straddle two or more contigs will not
be found. Read the README at https://github.com/zwets/blast-galley.
The TEMPLATE can be a nucleotide (default) or amino acid sequence (use
option --translate), or multiple sequences. The TARGET must currently
be nucleotides.
OPTIONS
-d, --database DB match against sequences in DB rather than in files
-p, --perc-ident N percentage identity threshold (default: $PERC_ID)
-e, --e-value N e-value threshold (default: $E_VALUE)
-c, --coverage N query coverage percentage threshold (default: $QCOV_PC)
-t, --protein do a protein search against nucleotide database (tblastn)
-b, --best output only the best match (by bitscore)
-i, --info TEXT add TEXT to FASTA headers of generated sequences
-n, --no-stats omit match statistics from the FASTA headers
-v, --verbose emit progress messages to standard error
The generated FASTA headers can be customised using options -i and -n.
Options -p, -e, and -c specify the matching accuracy parameters.
"
exit ${1:-1}
}
# Parse options
unset DATABASE INFO VERBOSE PROTEIN BEST
while [ $# -ne 0 -a "$(expr "$1" : '\(.[0-9]*\).*')" = "-" ]; do
case $1 in
--db=*) DATABASE="${1#--db=}" ;;
-d|-db|--db) shift; DATABASE="$1" ;;
-n|--no-stats) STATS=0 ;;
-i|--info) shift; INFO=$1 ;;
--info=*) INFO="${1#--info=}" ;;
-p|--perc-id*) shift; PERC_ID=$1 ;;
--perc-ident=*) PERC_ID="${1#--perc-ident=}" ;;
-e|--e-value) shift; E_VALUE=$1 ;;
--e-value=*) E_VALUE="${1#--e-value=}" ;;
-c|--cov*) shift; QCOV_PC=$1 ;;
--coverage=*) QCOV_PC="${1#--coverage=}" ;;
-b|--best) BEST=1 ;;
-t|--protein) PROTEIN=1 ;;
-v|--verbose) VERBOSE=1 ;;
-h|--help) usage_exit 0 ;;
*) usage_exit ;;
esac
shift || usage_exit
done
# Check the arguments
[ $# -ge 1 ] || usage_exit
TEMPLATE_FILE="$1"
shift
[ -r "$TEMPLATE_FILE" ] || err_exit "cannot read file: $TEMPLATE_FILE"
# Perform the blast(s)
if [ -n "$DATABASE" ]; then
# When DATABASE is specified, optional TARGETs are the sequence identifiers
if [ $# -ne 0 ]; then
IDS_FILE="/tmp/$(basename "$0").$$.tmp"
while [ $# -ne 0 ]; do echo $1; shift; done >"$IDS_FILE"
fi
emit "searching database: $DATABASE"
blast_query -db $DATABASE ${IDS_FILE:+-seqidlist "$IDS_FILE"} | process_output
[ -z "$IDS_FILE" ] || rm -f "$IDS_FILE"
elif [ $# -eq 0 ]; then
# When no DATABASE and no arguments, then FASTA expected on standard input
emit "reading FASTA from standard input"
blast_query -subject "/dev/stdin" | process_output
else
# Otherwise, arguments are names of FASTA files processed in sequence
while [ $# -ne 0 ]; do
emit "processing: $1"
blast_query -subject "$1"
shift
done | process_output
fi
# vim: sts=4:sw=4:ai:si:et: