## Process all the data using TPP

In [10]:
%%bash
#SET THESE VARIABLES FOR YOUR LOCAL FILE STRUCTURE:
PATH_TO_DATA=input/fastq #Note: Any fastq files located here will be processed for alignment with tpp
OUT_DIR=output
TPP_OUT_DIR=tpp_out
REFS=input/Genome/GCF_000195955.2_ASM19595v2_genomic.fna
GENBANK_FILE=input/Genome/GCF_000195955.2_ASM19595v2_genomic.gbff

#Definition of variables for processing
PYTHON2=$(which python2)
BWA=$(which bwa)
BWA_ALG="aln"

REPLICON_ID="NC_000962.3"
FASTQ_DIR=$PATH_TO_DATA

PREFIXES_OUTFILE=$OUT_DIR/$TPP_OUT_DIR/`basename $FASTQ_DIR`_prefixes.txt

# These are used for creating a CSV file
CSV_OUTFILE=$OUT_DIR/`basename $FASTQ_DIR`.csv
UNIQUE_FIELDS="locus_tag"
FIELDS="product regulatory_class bound_moiety"

#Parameter settings for tpp
PRIMER=AACCTGTTA
MISMATCHES=2
WINDOW_SIZE=6

###################################################################

#Process raw fastq files using tpp
COUNTER=0
INITIAL_START_TIME=$SECONDS
for FASTQ in $FASTQ_DIR/*_1.fastq; do
    echo "******** Run $COUNTER: $FASTQ ********"
    READS1=$FASTQ
    READS2=${FASTQ/_1.fastq/_2.fastq}

    OUTNAME=$(basename $FASTQ)
    OUTNAME=${OUTNAME/_1.fastq/}
    tpp -himar1 -bwa $BWA -bwa-alg $BWA_ALG -ref $REFS -replicon-ids $REPLICON_ID -reads1 $READS1 -reads2 $READS2 \
       -window-size $WINDOW_SIZE -primer $PRIMER -mismatches $MISMATCHES -output $OUT_DIR/$TPP_OUT_DIR/$OUTNAME &
done
wait

echo "Creating prefixes file with all prefixes from all runs..."
basename -a $OUT_DIR/$TPP_OUT_DIR/*.wig | rev | cut -c-4 --complement | rev | uniq > $PREFIXES_OUTFILE
echo "Created '$PREFIXES_OUTFILE'."
echo ""
echo "Creating CSV file with all samples processed by TPP..."
$PYTHON2 scripts/wig_gb_to_csv.py -l $PREFIXES_OUTFILE -g $GENBANK_FILE -u $UNIQUE_FIELDS -f $FIELDS -o $CSV_OUTFILE
echo "Created '$CSV_OUTFILE'."
echo ""
(( TOTAL_RUN_TIME = SECONDS - INITIAL_START_TIME )) 
echo "********** TPP driver script finished in a total of $TOTAL_RUN_TIME seconds **********"

******** Run 0: input/fastq/SRR23906901_1.fastq ********
******** Run 0: input/fastq/SRR23906902_1.fastq ********
Creating prefixes file with all prefixes from all runs...
Created 'output/tpp_out/fastq_prefixes.txt'.

Creating CSV file with all samples processed by TPP...
[+] scripts/wig_gb_to_csv.py starting with parameters:
....Wiggle prefix(es)         output/tpp_out/SRR23906901, output/tpp_out/SRR23906902
....Record ID(s)              NC_000962.3
....GenBank file:             input/Genome/GCF_000195955.2_ASM19595v2_genomic.gbff
....Unique identifier fields: locus_tag, record_id_start_end_strand
....Fields:                   product, regulatory_class, bound_moiety
....Output file:              output/fastq.csv
[+] Combining Wiggles with prefix '['output/tpp_out/SRR23906901', 'output/tpp_out/SRR23906902']' with Record ID 'NC_000962.3' from 'input/Genome/GCF_000195955.2_ASM19595v2_genomic.gbff'...
[+] Finished processing Record ID 'NC_000962.3' successfully in 4.361125 seconds.
[+] Fi



In [26]:
%%bash
python3 scripts/hypersus_analysis.py 

JT test run time: 2.127500295639038
JT test run time: 2.0444419384002686


  for name,g in df.groupby(['uid'],sort=False):
  for name,g in df.groupby(['uid'],sort=False):


In [1]:
#Reorganize data for TableS1
infold = 'output/hypersus_analysis/'
outfold = 'output/hypersus_analysis/'

lfc_thresh = 0.5
padj_thresh = 0.05

import pandas as pd
import numpy as np
from math import log10, floor

def round_sig(x, sig=3):
    #Implement significant figures in python
    if np.isnan(x):
        return(np.nan)
    return round(x, sig-int(floor(log10(abs(x))))-1)

def make_tables(infilename12h, outfilename, treatment, drug_dose_ls):
    #Merges tables output from analysis and changes formatting for disribution (TableS1)
    
    abx12_df = pd.read_csv(infilename12h).rename({'pval-adj (BH)':'padj'},axis='columns')

    abx12_df[treatment+['pval','padj']] = abx12_df[treatment+['pval','padj']].applymap(round_sig)

    merged_df = abx12_df
    merged_df['Prediction'] = 'Not Significant'
    hypersus_bool = ((merged_df['padj'] < padj_thresh) & \
                (merged_df[treatment[-1]] < -lfc_thresh))
    hypertol_bool = ((merged_df['padj'] < padj_thresh) & \
                (merged_df[treatment[-1]] > lfc_thresh))

    merged_df = merged_df.rename({'uid':'Genomic Feature','product':'Genbank Annotation'}, axis=1)
    merged_df = merged_df.rename({treatment[0]:drug_dose_ls[0] + '- LFC'}, axis=1)
    
    merged_df['Prediction'][hypersus_bool]='Hypersusceptible'
    merged_df['Prediction'][hypertol_bool]='Hypertolerant'

    merged_df.to_csv(outfilename,index=False)

treatment = ['H+']
drug_dose_ls = ['Isoniazid (0.01ug/mL)']
make_tables(infold+'summary_6d_INH.csv', outfold+'TableS1_INH.csv', treatment, drug_dose_ls)

treatment = ['R+']
drug_dose_ls = ['Rifampin (0.04ug/mL)']
make_tables(infold+'summary_6d_RMP.csv', outfold+'TableS2_RMP.csv', treatment, drug_dose_ls)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [2]:
%%bash
python3 scripts/make_barplots.py

Number of significant hypersusceptible mutants (INH): 2
Number of significant hypertolerant mutants (INH): 0
Number of significant hypersusceptible mutants (RMP): 126
Number of significant hypertolerant mutants (RMP): 92
          uid       LFC          pval          padj
1009   Rv0994 -3.251154  5.123703e-06  1.126248e-04
49     Rv0049 -2.987881  8.270262e-07  2.189740e-05
2191  Rv2179c -2.719441  8.150105e-07  2.174398e-05
482   Rv0472c -2.287663  9.798631e-09  4.227928e-07
1453   Rv1433 -2.083503  1.943764e-12  1.698364e-10
2063  Rv2047c -1.870000  1.238545e-21  8.657431e-19
2237  Rv2224c -1.803191  5.331919e-19  1.694096e-16
3295   Rv3267 -1.787550  1.409788e-13  1.642403e-11
2548  Rv2525c -1.775899  2.178612e-12  1.857134e-10
2202  Rv2190c -1.686927  1.865292e-14  2.414517e-12
Figure(640x480)
Figure(640x480)
