## Process all the data using TPP

In [22]:
%%bash
#SET THESE VARIABLES FOR YOUR LOCAL FILE STRUCTURE:
PATH_TO_DATA=input #Note: Any fastq files located here will be processed for alignment with tpp
OUT_DIR=output
REFS=input/Genome/MAC109.fa
GENBANK_FILE=input/Genome/MAC109.gb


#Definition of variables for processing
PYTHON2=$(which python2)
BWA=$(which bwa)
BWA_ALG="aln"

REPLICON_ID="CP029332,CP029333,CP029334"
FASTQ_DIR=$PATH_TO_DATA

PREFIXES_OUTFILE=$OUT_DIR/`basename $FASTQ_DIR`_prefixes.txt

# These are used for creating a CSV file
CSV_OUTFILE=$OUT_DIR/`basename $FASTQ_DIR`.csv
UNIQUE_FIELDS="locus_tag"
FIELDS="product regulatory_class bound_moiety"

#Parameter settings for tpp
PRIMER=AACCTGTTA
MISMATCHES=2
WINDOW_SIZE=6

###################################################################

#Process raw fastq files using tpp
COUNTER=0
INITIAL_START_TIME=$SECONDS
#for FASTQ in $FASTQ_DIR/*_1.fastq; do
for FASTQ in $FASTQ_DIR/*TCGGAA_1.fastq; do
 (( COUNTER += 1 ))
 echo "******** Run $COUNTER: $FASTQ ********"
 READS1=$FASTQ
 READS2=${FASTQ/_1.fastq/_2.fastq}

 OUTNAME=$(basename $FASTQ)
 OUTNAME=${OUTNAME/_1.fastq/}
 ITERATION_START_TIME=$SECONDS
  tpp -himar1 -bwa $BWA -bwa-alg $BWA_ALG -ref $REFS -replicon-ids $REPLICON_ID -reads1 $READS1 -reads2 $READS2 -window-size $WINDOW_SIZE -primer $PRIMER -mismatches $MISMATCHES -output $OUT_DIR/$OUTNAME &
 ITERATION_END_TIME=$SECONDS
 (( ITERATION_TIME = ITERATION_END_TIME - ITERATION_START_TIME ))

 (( TOTAL_RUN_TIME = SECONDS - INITIAL_START_TIME )) 
 (( CURRENT_AVG = TOTAL_RUN_TIME / COUNTER ))
 echo "******** TPP finished in $ITERATION_TIME seconds! Average iteration time over $COUNTER iterations:  $CURRENT_AVG seconds. ********"
done

wait
echo "Creating prefixes file with all prefixes from all runs..."
basename -a $OUT_DIR/*.wig | cut -c-16 | uniq > $PREFIXES_OUTFILE
echo "Created '$PREFIXES_OUTFILE'."
echo ""
echo "Creating CSV file with all samples processed by TPP..."
$PYTHON2 scripts/wig_gb_to_csv.py -l $PREFIXES_OUTFILE -g $GENBANK_FILE -u $UNIQUE_FIELDS -f $FIELDS -o $CSV_OUTFILE
echo "Created '$CSV_OUTFILE'."
echo ""
echo "********** TPP driver script finished in a total of $TOTAL_RUN_TIME seconds **********"

******** Run 1: input/HJKK5BCX2_TCGGAA_1.fastq ********
******** TPP finished in 0 seconds! Average iteration time over 1 iterations:  0 seconds. ********
# title: Tn-Seq Pre-Processor
# date: 08/16/2020 16:46:03
# command: python /home/will/src_and_bin/Other_Software/miniconda3/envs/tnseq_avium_abx_fordistro/bin/tpp -himar1 -bwa /home/will/src_and_bin/Other_Software/miniconda3/envs/tnseq_avium_abx_fordistro/bin/bwa -bwa-alg aln -ref input/Genome/MAC109.fa -replicon-ids CP029332,CP029333,CP029334 -reads1 input/HJKK5BCX2_TCGGAA_1.fastq -reads2 input/HJKK5BCX2_TCGGAA_2.fastq -window-size 6 -primer AACCTGTTA -mismatches 2 -output output/HJKK5BCX2_TCGGAA
# transposon type: Himar1
# protocol type: Sassetti
# bwa flags:
# read1: input/HJKK5BCX2_TCGGAA_1.fastq
# read2: input/HJKK5BCX2_TCGGAA_2.fastq
# ref_genome: input/Genome/MAC109.fa
# replicon_ids: CP029332,CP029333,CP029334
# total_reads (or read pairs): 5185840
# trimmed_reads (reads with valid Tn prefix, and insert size>20bp): 4739037
#

[tn_preprocess] running pre-processing on input/HJKK5BCX2_TCGGAA_1.fastq
[tn_preprocess] protocol: Sassetti
[tn_preprocess] transposon type: Himar1
[tn_preprocess] One reference genome specified: input/Genome/MAC109.fa, containing 3 records.
[tn_preprocess] extracting reads...
[tn_preprocess] fastq2reads: input/HJKK5BCX2_TCGGAA_1.fastq -> output/HJKK5BCX2_TCGGAA.reads1
[tn_preprocess] 1000000 reads processed
[tn_preprocess] 2000000 reads processed
[tn_preprocess] 3000000 reads processed
[tn_preprocess] 4000000 reads processed
[tn_preprocess] 5000000 reads processed
[tn_preprocess] fastq2reads: input/HJKK5BCX2_TCGGAA_2.fastq -> output/HJKK5BCX2_TCGGAA.reads2
[tn_preprocess] 1000000 reads processed
[tn_preprocess] 2000000 reads processed
[tn_preprocess] 3000000 reads processed
[tn_preprocess] 4000000 reads processed
[tn_preprocess] 5000000 reads processed
[tn_preprocess] fixing headers of paired reads for bwa...
[tn_preprocess] 1000000 reads processed
[tn_preprocess] 2000000 reads proces

In [None]:
%%