-
Notifications
You must be signed in to change notification settings - Fork 0
/
snp_sequence_puller_auto.sh
220 lines (195 loc) · 6.19 KB
/
snp_sequence_puller_auto.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
#!/bin/bash
# Default values
length=200
# Function to display script usage
usage() {
echo
echo "###############################################"
echo "# #"
echo "# Help Information #"
echo "# #"
echo "###############################################"
echo
echo "Usage:"
echo -e "\t$0 [OPTIONS] ARGUMENT"
echo
echo "Description:"
echo -e "\tThis script will take a known reference genome (.fa) and provided single nucleotide polymorphisms (SNPs)"
echo -e "\tand report them back as ormatted SNP with flanking sequences. Input files must contain the chromosome, position"
echo -e "\treference allele, alternate allele, and id of each SNP per each line of the input file. Input files must be"
echo -e "\ttab delimited and the order of the columns is irrelivant"
echo
echo "Options:"
echo -e "\t-h, --help Display this help and exit"
echo -e "\t-v, --verbose Display text feedback (default option is false)"
echo
echo "Arguments:"
echo -e "\t-i, --input-file Input file (tab-delimited)"
echo -e "\t-o, --output-file Name of the output file (tab-delimited)"
echo -e "\t-l, --length Length in bp (default is 200)"
echo -e "\t-r, --reference-geno Reference genome file (.fa)"
echo
echo "Examples:"
echo -e "\t$0 -i 'input_file.txt' -l 200 -r 'reference_genome.fa'"
exit 1
}
# Set verbose to false automatically
verbose=false
first_row=true
# Parse command-line options
while [[ $# -gt 0 ]]; do
key="$1"
case $key in
-i|--input-file)
input_file="$2"
shift
shift
;;
-o|--output_file)
output_file="$2"
shift
shift
;;
-l|--length)
length="$2"
shift
shift
;;
-r|--reference-geno)
reference_geno="$2"
shift
shift
;;
-h|--help)
usage
;;
-v|--verbose)
verbose=true
shift
shift
;;
*)
echo "Unknown option: $1"
usage
;;
esac
done
# Pull script path
path_to_script=$(dirname "$0")
# Check to make sure dependencies exist
if [ ! -f "$path_to_script/snp_sequence_puller.sh" ]; then
echo "Error: snp_sequence_puller.sh does not exist in the script directory. Check the scripts directory."
exit 1
fi
# Check if required options are provided
if [ -z "$input_file" ] || [ -z "$reference_geno" ]; then
echo "Error: Input file and reference genome file are required."
usage
fi
# Check if input file exists
if [ ! -f "$input_file" ]; then
echo "Error: Input file '$input_file' not found."
exit 1
fi
if [ "$verbose" = true ]; then
# Print header
echo
echo "###############################################"
echo "# #"
echo "# SNP Sequence Puller v1.0 #"
echo "# #"
echo "###############################################"
echo
echo "Written by: Zachary J. Winn PhD"
echo "Contact information:"
echo -e "\tGovernment Email: zachary.winn@usda.gov"
echo -e "\tPersonal Email: zwinn@outlook.com"
echo
echo "###############################################"
echo "# WARNING: This program is not under warranty #"
echo "# Use at your own discretion! #"
echo "###############################################"
echo
fi
# Set options
first_row=true
# Read header to get column names
IFS=$'\t' read -r -a headers < "$input_file"
# Find indices of desired columns
chr_index=-1
pos_index=-1
ref_index=-1
alt_index=-1
id_index=-1
for i in "${!headers[@]}"; do
# Trim whitespace from the column name
column_name=$(echo "${headers[$i]}" | tr -d '[:space:]')
if [[ "$column_name" == "chr" ]]; then
chr_index=$i
elif [[ "$column_name" == "pos" ]]; then
pos_index=$i
elif [[ "$column_name" == "ref" ]]; then
ref_index=$i
elif [[ "$column_name" == "alt" ]]; then
alt_index=$i
elif [[ "$column_name" == "id" ]]; then
id_index=$i
fi
done
# Check if all desired columns are found
if [[ $chr_index == -1 || $pos_index == -1 || $ref_index == -1 || $alt_index == -1 || $id_index == -1 ]]; then
echo "Error: Not all required columns (chr, pos, ref, alt, id) found in the input file."
exit 1
fi
if [ "$verbose" = true ]; then
echo "###############################"
echo "# Header read in succesfully! #"
echo "###############################"
echo
fi
if [ "$verbose" = true ]; then
echo "#############################################"
echo "# Pulling SNP sequences from input files... #"
echo "#############################################"
echo
fi
# Open the file for reading
while IFS=$'\t' read -r -a row; do
# Check if it's the first row and skip it
if $first_row; then
first_row=false
# Add header to the output file
echo -e "chr\tpos\tid\tref\talt\tsnp_sequence" > "$output_file"
continue
fi
# Extract data from the desired columns
chr="${row[$chr_index]}"
pos="${row[$pos_index]}"
ref="${row[$ref_index]}"
alt="${row[$alt_index]}"
id="${row[$id_index]}"
# Remove white space
alt="${alt%"${alt##*[![:space:]]}"}"
alt="${alt#"${alt%%[![:space:]]*}"}"
# Remove white space
ref="${ref%"${ref##*[![:space:]]}"}"
ref="${ref#"${ref%%[![:space:]]*}"}"
# Run command and capture output
sequence_output=$(bash "$path_to_script/snp_sequence_puller.sh" \
-f "$reference_geno" \
-p "$pos" \
-l "$length" \
-c "$chr" \
-a "$alt" \
-r "$ref")
# Append to output file with tab delimiter
echo -e "$chr\t$pos\t$id\t$ref\t$alt\t$sequence_output" >> "$output_file"
done < "$input_file"
if [ "$verbose" = true ]; then
# Print message
echo "#########"
echo "# Done! #"
echo "#########"
echo
fi
exit 0