-
Notifications
You must be signed in to change notification settings - Fork 0
/
cpDNA_mappingMETA.sh
151 lines (139 loc) · 5.75 KB
/
cpDNA_mappingMETA.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#!/bin/bash
#----------------MetaCentrum----------------
#PBS -l walltime=24:00:00
#PBS -l select=1:ncpus=1:mem=1gb:scratch_local=1gb
#PBS -j oe
#PBS -N cpDNAmapping
#PBS -m abe
#-------------------HYDRA-------------------
#$ -S /bin/bash
#$ -q sThC.q
#$ -l mres=1G
#$ -cwd
#$ -j y
#$ -N cpDNAmapping
#$ -o cpDNAmapping.log
#------------------------------------------------------------------------------------
#Tomas Fer, 2018, 2019
#Department of Botany, Charles University, Prague, Czech Republic
#tomas.fer@natur.cuni.cz
#------------------------------------------------------------------------------------
#Maps reads to the full plastome reference (integrated within HybPhyloMaker)
#(1) BWA mapping of filtered reads to cpDNA reference (using filtered reads without duplicates)
#(2) Consensus call with kindel
#(3) Combine sequences to single FASTA file
#(4) Calculate proportion of missing data per samples
#USAGE: ./cpDNA_mappingMETA.sh (or qsub cpDNA_mappingMETA.sh)
#Requirements:
#BWA
#samtools
#kindel v.0.1.4 (pip3 install 'kindel==0.1.4')
if [[ $PBS_O_HOST == *".cz" ]]; then
echo -e "\ncpDNAmapping is running on MetaCentrum..."
#settings for MetaCentrum
#Move to scratch
cd $SCRATCHDIR
#Copy file with settings from home and set variables from settings.cfg
cp $PBS_O_WORKDIR/settings.cfg .
. settings.cfg
. /packages/run/modules-2.0/init/bash
path=/storage/$server/home/$LOGNAME/$data
source=/storage/$server/home/$LOGNAME/HybSeqSource
#Add necessary modules
module add bwa-0.7.15
module add samtools-1.3
module add python34-modules-gcc #adds also kindel
elif [[ $HOSTNAME == compute-*-*.local ]]; then
echo -e "\ncpDNAmapping is running on Hydra..."
#settings for Hydra
#set variables from settings.cfg
. settings.cfg
path=../$data
source=../HybSeqSource
#Make and enter work directory
mkdir cpDNAmapping
cd cpDNAmapping
#Add necessary modules
module load bioinformatics/bwa/0.7.12
module load bioinformatics/samtools/1.3
module load bioinformatics/anaconda3/5.1 #adds also kindel
else
echo -e "\nHybPhyloMaker9 is running locally..."
#settings for local run
#set variables from settings.cfg
. settings.cfg
path=../$data
source=../HybSeqSource
#Make and enter work directory
mkdir cpDNAmapping
cd cpDNAmapping
fi
#Copy data (all deduplicated FASTQ files from from all samples)
find $path/20filtered/ -name '*no-dups*' -type f -exec cp '{}' . \;
#Copy reference
cp $source/$cpDNA .
#Make folder for results
mkdir $path/fullplastome
#Index reference
bwa index $cpDNA #index reference
ls *-1P_no-dups* | cut -d"-" -f1,2 > list.txt
for file in $(cat list.txt); do
echo -e "Processing $file"
echo -e "...mapping"
bwa mem $cpDNA ${file}-1P_no-dups.fastq.gz ${file}-2P_no-dups.fastq.gz > ${file}.sam #mapping
echo -e "...converting"
samtools view -bS -o ${file}.bam ${file}.sam #convert to bam
nrmapped=$(samtools view -F 0x904 -c ${file}.bam) #number of mapped reads in BAM
nrall=$(samtools view -c ${file}.bam) #number of all reads in BAM
percmapped=$(echo -e "scale=5;100 * ($nrmapped / $nrall)" | bc) #calculate percentage of mapped reads
echo -e "$file\t$nrall\t$nrmapped\t$percmapped" >> mapping_summary.txt
cp mapping_summary.txt $path/fullplastome
echo -e "...sorting"
samtools sort ${file}.bam -o ${file}_sorted.bam
echo -e "...indexing"
samtools index ${file}_sorted.bam
cp ${file}_sorted.bam $path/fullplastome/${file}.bam
cp ${file}_sorted.bam.bai $path/fullplastome/${file}.bam.bai
kindel -m $mincov -t $majthres ${file}_sorted.bam > ${file}.fasta
#change name in fasta file
sed -i '1d' ${file}.fasta #delete first line
echo ">$file" > header.txt
cat header.txt ${file}.fasta > tmp && mv tmp ${file}.fasta
rm header.txt ${file}.sam ${file}.bam #cleaning
#Remove line breaks from fasta file
awk '!/^>/ { printf "%s", $0; n = "\n" } /^>/ { print n $0; n = "" } END { printf "%s", n }' ${file}.fasta > tmp && mv tmp ${file}.fasta
cp ${file}.fasta $path/fullplastome/${file}.fasta
done
#combine fasta files
cat *.fasta > consensus.fasta
#Remove line breaks from fasta file
awk '!/^>/ { printf "%s", $0; n = "\n" } /^>/ { print n $0; n = "" } END { printf "%s", n }' consensus.fasta > tmp && mv tmp consensus.fasta
cp consensus.fasta $path/fullplastome/fullplastomes_consensus.fasta
#Calculate percentage of missing data per accession
#Calculate length of alignment: 1. get second line and count length, 2. decrease value by one (because previous command also counted LF)
length=$(cat consensus.fasta | head -n 2 | tail -n 1 | wc -c)
length=`expr $length - 1`
#Replace newline with ' ' if line starts with '>' (i.e., merge headers with data into single line separated by space)
cat consensus.fasta | sed '/^>/{N; s/\n/ /;}' > consensus.modif.fasta
#Cut first part until space, i.e. header, and remove '>'
cat consensus.modif.fasta | cut -f1 -d" " | sed 's/>//' > headers.txt
#Cut only part after the first space, i.e., only sequence, change all missing data (-, ?, N) to 'n', replace all other characters then 'n' by nothing and print percentage of 'n's in each sequence
cat consensus.modif.fasta | cut -f2 -d" " | sed 's/[?N-]/n/g' | sed 's/[^n]//g' | awk -v val=$length '{ print (length*100)/val }' > missingpercentage.txt
paste headers.txt missingpercentage.txt > fullplastomes_missingperc.txt
#Calculate mean of all values
echo -e "MEAN\t$(awk '{ sum += $2; n++ } END { if (n > 0) print sum / n; }' fullplastomes_missingperc.txt)" > mean.txt
cat fullplastomes_missingperc.txt mean.txt > tmp && mv tmp fullplastomes_missingperc.txt
rm consensus.modif.fasta headers.txt missingpercentage.txt mean.txt
#Copy results to home
cp fullplastomes_missingperc.txt $path/fullplastome
#Clean scratch/work directory
if [[ $PBS_O_HOST == *".cz" ]]; then
#delete scratch
if [[ ! $SCRATCHDIR == "" ]]; then
rm -rf $SCRATCHDIR/*
fi
else
cd ..
rm -r cpDNAmapping
fi
echo -e "\ncpDNA mapping finished...\n"