Skip to content
Permalink
Branch: master
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
executable file 187 lines (159 sloc) 5.47 KB
#!/bin/sh
#
# gene-cutter - excise sequence sections matching a template sequence
# Copyright (C) 2016 Marco van Zwetselaar <io@zwets.it>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Part of http://io.zwets.it/blast-galley
# Defaults
PERC_ID=90.0
E_VALUE=0.01
QCOV_PC=60
STATS=1
# Function to emit information to standard error if VERBOSE is set
emit() {
[ -z "$VERBOSE" ] || echo "$(basename "$0"): $*" >&2
}
# Function to exit this script with an error message on stderr
err_exit() {
echo "$(basename "$0"): $*" >&2
exit 1
}
# BLAST output format specifier and column offsets
BLAST_OUTFMT="6 qacc qstart qend qlen sacc sstart send length evalue pident bitscore qcovs qcovhsp nident gapopen gaps mismatch stitle sseq"
# $ 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
# Function to perform the BLAST query; args $* are added to its end
blast_query() {
blastn -task blastn ${DATABASE:+-num_threads $(nproc)} -parse_deflines \
-evalue "$E_VALUE" -perc_identity "$PERC_ID" -qcov_hsp_perc "$QCOV_PC" \
-query "$TEMPLATE_FILE" -outfmt "$BLAST_OUTFMT" $*
}
# Function to process the BLAST output into FASTA
process_output() {
awk -b -O -F '\t' -v INFO="${INFO:-Segment}" -v STATS=$STATS '{
printf ">lcl|cut%04d %s from %s %s matching %s", NR, INFO, $5, $18, $1
if (STATS) printf " (len %d/%d %.1f%, pos %d..%d; e=%.3f, p=%.1f%, b=%s; i=%d, g=%d)", $8, $4, $13, $6, $7, $9, $10, $11, $14, $15
printf "\n"
# We need to strip the dashed deletions
print gensub("-","","g",$NF)
}'
}
# Function to show usage information and exit
usage_exit() {
echo "
Usage: $(basename $0) [OPTIONS] TEMPLATE [TARGET ...]
Excise from each TARGET all sections that match TEMPLATE.
TEMPLATE is a FASTA file. TARGETs are either FASTA files or sequence
identifiers in BLAST database DB. When no TARGET is specified, reads
FASTA from standard input, or with option --database searches all
sequences in DB.
Matching segments are written to stdout as FASTA. Note that matching
segments that straddle two or more contigs will not be found. See the
README at github.com/zwets/blast-galley for more information.
OPTIONS
-d, --database DB match against sequences in DB rather than in files
-p, --perc-ident N percentage identity threshold (default: $PERC_ID)
-e, --e-value N e-value threshold (default: $E_VALUE)
-c, --coverage N query coverage percentage threshold (default: $QCOV_PC)
-i, --info TEXT add TEXT to FASTA headers of generated sequences
-n, --no-stats omit match statistics from the FASTA headers
-v, --verbose emit progress messages to standard error
The generated FASTA headers can be customised using options -i and -n.
Options -p, -e, and -c specify the matching accuracy parameters.
"
exit ${1:-1}
}
# Parse options
unset DATABASE INFO VERBOSE
while [ $# -ne 0 -a "$(expr "$1" : '\(.[0-9]*\).*')" = "-" ]; do
case $1 in
--db=*)
DATABASE="${1#--db=}"
;;
-d|-db|--db)
shift
DATABASE="$1"
;;
-n|--no-stats)
STATS=0
;;
-i|--info)
shift
INFO=$1
;;
--info=*)
INFO="${1#--info=}"
;;
-p|--perc-id*)
shift
PERC_ID=$1
;;
--perc-ident=*)
PERC_ID="${1#--perc-ident=}"
;;
-e|--e-value)
shift
E_VALUE=$1
;;
--e-value=*)
E_VALUE="${1#--e-value=}"
;;
-c|--cov*)
shift
QCOV_PC=$1
;;
--coverage=*)
QCOV_PC="${1#--coverage=}"
;;
-v|--verbose)
VERBOSE=1
;;
-h|--help)
usage_exit 0
;;
*) usage_exit
;;
esac
shift
done
# Check the arguments
[ $# -ge 1 ] || usage_exit
TEMPLATE_FILE="$1"
shift
[ -r "$TEMPLATE_FILE" ] || err_exit "cannot read file: $TEMPLATE_FILE"
[ $(grep '^>' "$TEMPLATE_FILE" | wc -l) -eq 1 ] || err_exit "file must have one sequence: $TEMPLATE_FILE"
# Perform the blast(s)
if [ -n "$DATABASE" ]; then
# When DATABASE is specified, optional TARGETs are the sequence identifiers
if [ $# -ne 0 ]; then
IDS_FILE="/tmp/$(basename "$0").$$.tmp"
while [ $# -ne 0 ]; do echo $1; shift; done >"$IDS_FILE"
fi
emit "searching database: $DATABASE"
blast_query -db $DATABASE ${IDS_FILE:+-seqidlist "$IDS_FILE"} | process_output
[ -z "$IDS_FILE" ] || rm -f "$IDS_FILE"
elif [ $# -eq 0 ]; then
# When no DATABASE and no arguments, then FASTA expected on standard input
emit "reading FASTA from standard input"
blast_query -subject "/dev/stdin" | process_output
else
# Otherwise, arguments are names of FASTA files processed in sequence
while [ $# -ne 0 ]; do
emit "processing: $1"
blast_query -subject "$1"
shift
done | process_output
fi
# vim: sts=4:sw=4:ai:si:et:
You can’t perform that action at this time.