Create how-to-reproduce-benchmark-results.txt

zhaoxiaofei · Dec 8, 2018 · d4e48e3 · d4e48e3
1 parent e4e704e
commit d4e48e3
Showing 1 changed file with 93 additions and 0 deletions.
diff --git a/how-to-reproduce-benchmark-results.txt b/how-to-reproduce-benchmark-results.txt
@@ -0,0 +1,93 @@
+# This document describes how to reproduce the results in the paper.
+
+
+# Before doing anything else, please compile the source code of bindash and mash into executables. Details on how to compile them are available on GitHub.
+# After compiling, make sure that bindash and mash can be found by the shell. For example: alias bindash=$HOME/bindash/release/bindash # please modify as needed
+# Then, change the current working directory to bindash/benchmark/
+
+
+# step 1: download only the 120 reference genomes in assembly_summary_2018032200.txt
+# we can use the script download-refseq.py for step 1.
+
+
+# step 2: download all the 110426 genomes in assembly_summary_2018032200.txt into an SSD
+# we can use the script download-refseq.py for step 2, but we have to slightly modify the code such that there is no check for the keyword 'reference genome'
+
+
+# step 3: for each of the two sets of files downloaded in steps 1 and 2, create the file "${FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE}"
+# Here we assume that seqs is the directory containing all the files downloaded in step 1 or 2.
+# The next six commented lines show an example of the content of "${FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE}" (without the leading "# " of each line)
+# seqs/GCF_000003135.1_ASM313v1_genomic.fna.gz
+# seqs/GCF_000003215.1_ASM321v1_genomic.fna.gz
+# seqs/GCF_000003645.1_ASM364v1_genomic.fna.gz
+# <more lines, where each line indicates the filepath of the genomic fasta file of an assembly that we downloaded in step 1 or 2>
+# seqs/GCF_900291885.1_CF112_genomic.fna.gz
+# seqs/GCF_900291895.1_CF111_genomic.fna.gz
+
+
+# step 4: evaluate the precision computed by bindash and mash from the 120 genomes downloaded in step 1
+
+FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE=/home/zhaoxiaofei/bindash/input/assembly_bacteria_2018032201/files-refgenomes-raw.txt # please modify as needed
+SSD_DIR=/home/zhaoxiaofei/bindash/output/precision/ # please modify as needed
+
+PREFIX="${SSD_DIR}/assembly_bacteria_2018032201_refgenomes_"
+
+mkdir -p "${SSD_DIR}"
+
+# Please make sure that the system has at least 7GB of free RAM available before running the following two commands.
+# If the system does not have enough RAM, we can still use the following procedure to generate the same output file.
+#   1 - split FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE into multiple smaller files
+#   2 - sketch each smaller file, each smaller file can be used as either query-sketch or target-sketch
+#   3 - use bindash dist [options] query-sketch target-sketch for each combination of smaller query-sketch file and smaller target-sketch file
+#   4 - concatenate the result.tsv files from the previous combinations of query-sketch and target-sketch into one single output file.
+bindash sketch --listfname="${FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE}" --outfname=${PREFIX}groundtruth_nthreads8 --nthreads=8 --minhashtype=-1 # -1 means exact jaccard index
+bindash dist ${PREFIX}groundtruth_nthreads8 --nthreads=8 --mthres=1e9 > ${PREFIX}groundtruth_nthreads8_result.tsv # with --mthres=1e9, bindash will report all pairwise distances, including zeros.
+
+# 4.1: generate the root-mean-square errors (RMSEs) in Table 1 of the main text.
+
+bindash sketch --listfname="${FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE}" --outfname=${PREFIX}bindash_nthreads8 --nthreads=8 # use default --sketchsize64
+bindash dist ${PREFIX}bindash_nthreads8 --nthreads=8 --mthres=1e9 > ${PREFIX}bindash_nthreads8_result.tsv # with --mthres=1e9, bindash will report all pairwise distances, including zeros.
+
+mash sketch -l "${FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE}" -o ${PREFIX}mash_nthreads8 -p 8 # use default -s
+mash dist ${PREFIX}mash_nthreads8.msh ${PREFIX}mash_nthreads8.msh -p 8 > ${PREFIX}mash_nthreads8_result.tsv
+
+bindashRMSE=$(python evaluate-output.py ${PREFIX}groundtruth_nthreads8_result.tsv ${PREFIX}bindash_nthreads8_result.tsv | awk 'BEGIN {sumSqrErr = 0; countErr = 0; } { sumSqrErr += $2 * $2; countErr += 1 } END {print sqrt(sumSqrErr / countErr); }') # this is the RMSE (root-mean-square error) produced by bindash
+
+mashRMSE=$(python evaluate-output.py ${PREFIX}groundtruth_nthreads8_result.tsv ${PREFIX}mash_nthreads8_result.tsv | awk 'BEGIN {sumSqrErr = 0; countErr = 0; } { sumSqrErr += $2 * $2; countErr += 1 } END {print sqrt(sumSqrErr / countErr); }') # this is the RMSE (root-mean-square error) produced by mash
+
+echo "#Sketch-size Mash-RMSE BinDash-RMSE" > ${PREFIX}_table1.precision.txt
+echo "Default-param-value ${mashRMSE} ${bindashRMSE}" > ${PREFIX}_table1.precision.txt
+
+# 4.2: generate the root-mean-square errors (RMSEs) in Table 1 of the supplementary information.
+
+# Write the header line, then append one row of RMSEs per sketch size inside the loop.
+# Paths built from PREFIX are quoted so that an output directory containing whitespace still works.
+echo "#sketch-sizes Mash-RMSE BinDash-RMSE" > "${PREFIX}_suppTable1.precision.txt" # this tsv file contains data for Table 1 in the supplementary information
+for n64bits in 1 2 4 8 16 32 64; do # the sequence 1 2 ... 64 corresponds to the sequence 64 128 ... 4096 in Table 1 in the supplementary information
+
+	sketchSize=$((n64bits * 64)) # value passed to mash -s and written in the first column of the output row
+
+	# Each iteration overwrites the sketch and result files of the previous iteration.
+	bindash sketch --listfname="${FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE}" --outfname="${PREFIX}bindash_nthreads8" --nthreads=8 --sketchsize64="${n64bits}"
+	bindash dist "${PREFIX}bindash_nthreads8" --nthreads=8 --mthres=1e9 > "${PREFIX}bindash_nthreads8_result.tsv" # with --mthres=1e9, bindash will report all pairwise distances, including zeros
+
+	mash sketch -l "${FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE}" -o "${PREFIX}mash_nthreads8" -p 8 -s "${sketchSize}"
+	mash dist "${PREFIX}mash_nthreads8.msh" "${PREFIX}mash_nthreads8.msh" -p 8 > "${PREFIX}mash_nthreads8_result.tsv"
+
+	# awk squares column 2 of every line printed by evaluate-output.py and prints the square root of the mean of these squares.
+	bindashRMSE=$(python evaluate-output.py "${PREFIX}groundtruth_nthreads8_result.tsv" "${PREFIX}bindash_nthreads8_result.tsv" | awk 'BEGIN {sumSqrErr = 0; countErr = 0; } { sumSqrErr += $2 * $2; countErr += 1 } END {print sqrt(sumSqrErr / countErr); }') # this is the RMSE (root-mean-square error) produced by bindash
+
+	mashRMSE=$(python evaluate-output.py "${PREFIX}groundtruth_nthreads8_result.tsv" "${PREFIX}mash_nthreads8_result.tsv" | awk 'BEGIN {sumSqrErr = 0; countErr = 0; } { sumSqrErr += $2 * $2; countErr += 1 } END {print sqrt(sumSqrErr / countErr); }') # this is the RMSE (root-mean-square error) produced by mash
+
+	echo "${sketchSize} ${mashRMSE} ${bindashRMSE}" >> "${PREFIX}_suppTable1.precision.txt"
+done
+
+# step 5: evaluate the runtime of bindash and mash from all the 110426 genomes downloaded in step 2. Time durations measured by the "time -p" command are used in Table 1 of the main text.
+
+FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE=/home/zhaoxiaofei/bindash/input/assembly_bacteria_2018032201/files-all-raw.txt # please modify as needed
+SSD_DIR=/home/zhaoxiaofei/bindash/output/ssd/all # please modify as needed
+
+PREFIX="${SSD_DIR}/ssd_assembly_bacteria_2018032201_"
+
+mkdir -p "${SSD_DIR}"
+
+# Each timed command runs in a subshell whose stderr is redirected to a file, so that the report printed by
+# "time -p" is saved there. Anything the timed tool itself writes to stderr ends up in the same file.
+# Paths built from PREFIX are quoted so that an output directory containing whitespace still works.
+(time -p bindash sketch --listfname="${FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE}" --outfname="${PREFIX}bindash_nthreads8" --nthreads=8) 2>"${PREFIX}_table1.bindashSketchTime.txt"
+(time -p bindash dist "${PREFIX}bindash_nthreads8" --nthreads=8 > /dev/null) 2>"${PREFIX}_table1.bindashDistTime.txt"
+# stdout redirection prevents having a huge output file
+
+(time -p mash sketch -l "${FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE}" -o "${PREFIX}mash_nthreads8" -p 8) 2>"${PREFIX}_table1.mashSketchTime.txt"
+(time -p mash dist "${PREFIX}mash_nthreads8.msh" "${PREFIX}mash_nthreads8.msh" -p 8 > /dev/null) 2>"${PREFIX}_table1.mashDistTime.txt"