Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Create how-to-reproduce-benchmark-results.txt
- Loading branch information
Zhao, XiaoFei
committed
Dec 8, 2018
1 parent
e4e704e
commit d4e48e3
Showing
1 changed file
with
93 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
# This document describes how to reproduce the results in the paper. | ||
|
||
|
||
# Before doing anything else, please compile the source code of bindash and mash into executables. Details about how to compile them are available on GitHub. | ||
# After compiling, make sure that bindash and mash can be found in the system. For example, alias bindash=$HOME/bindash/release/bindash # please modify as needed | ||
# Then, change the current working directory to bindash/benchmark/ | ||
|
||
|
||
# step 1: download only the 120 reference genomes in assembly_summary_2018032200.txt | ||
# we can use the script download-refseq.py for step 1. | ||
|
||
|
||
# step 2: download all the 110426 genomes in assembly_summary_2018032200.txt into an SSD | ||
# we can use the script download-refseq.py for step 2, but we have to slightly modify the code such that there is no check for the keyword 'reference genome' | ||
|
||
|
||
# step 3: for each set of files downloaded in step 1 and 2, create the file "${FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE}" | ||
# Assuming seqs is the directory containing all the files downloaded from step 1 or 2. | ||
# The next six commented lines describe an example of the file content of "${FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE}" (the starting string "# " is excluded) | ||
# seqs/GCF_000003135.1_ASM313v1_genomic.fna.gz | ||
# seqs/GCF_000003215.1_ASM321v1_genomic.fna.gz | ||
# seqs/GCF_000003645.1_ASM364v1_genomic.fna.gz | ||
# <more lines, where each line indicates the filepath of the genomic fasta file of an assembly that we downloaded in step 1 or 2> | ||
# seqs/GCF_900291885.1_CF112_genomic.fna.gz | ||
# seqs/GCF_900291895.1_CF111_genomic.fna.gz | ||
|
||
|
||
# step 4: evaluate the precision computed by bindash and mash from the 120 genomes downloaded in step 1 | ||
|
||
# Step 4 configuration. Both paths below are machine-specific; edit them before running.
# List file with one genome FASTA path per line (the 120 reference genomes from step 1).
FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE=/home/zhaoxiaofei/bindash/input/assembly_bacteria_2018032201/files-refgenomes-raw.txt # please modify as needed
# Directory that receives the sketches, distance tables and summary tables of step 4.
SSD_DIR=/home/zhaoxiaofei/bindash/output/precision/ # please modify as needed

# Common filename prefix of every step-4 output file.
# "%/" drops one trailing slash from SSD_DIR so the prefix never contains "//".
PREFIX="${SSD_DIR%/}/assembly_bacteria_2018032201_refgenomes_"

mkdir -p -- "${SSD_DIR}"
|
||
# Ground truth for step 4: exact Jaccard indices for every pair of genomes in the list file.
# Please make sure that the system has at least 7GB of free RAM available before running the following two commands.
# If the system does not have enough RAM, we can still use the following procedure to generate the same output file.
#   1 - split FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE into multiple smaller files
#   2 - sketch each smaller file, each smaller file can be used as either query-sketch or target-sketch
#   3 - use bindash dist [options] query-sketch target-sketch for each combination of smaller query-sketch file and smaller target-sketch file
#   4 - concatenate the result.tsv files from the previous combinations of query-sketch and target-sketch into one single output file.
# All path expansions are quoted so that a PREFIX containing spaces still works.
bindash sketch --listfname="${FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE}" --outfname="${PREFIX}groundtruth_nthreads8" --nthreads=8 --minhashtype=-1 # -1 means exact jaccard index
bindash dist "${PREFIX}groundtruth_nthreads8" --nthreads=8 --mthres=1e9 > "${PREFIX}groundtruth_nthreads8_result.tsv" # with --mthres=1e9, bindash will report all pairwise distances, including zeros.
|
||
# 4.1: generate the root-mean-square errors (RMSEs) in Table 1 of the main text. | ||
|
||
# BinDash with its default sketch size: sketch all genomes, then compute all pairwise distances.
bindash sketch --listfname="${FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE}" --outfname="${PREFIX}bindash_nthreads8" --nthreads=8 # use default --sketchsize64
bindash dist "${PREFIX}bindash_nthreads8" --nthreads=8 --mthres=1e9 > "${PREFIX}bindash_nthreads8_result.tsv" # with --mthres=1e9, bindash will report all pairwise distances, including zeros.

# Mash with its default sketch size: same two stages, sketching the same list file.
mash sketch -l "${FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE}" -o "${PREFIX}mash_nthreads8" -p 8 # use default -s
mash dist "${PREFIX}mash_nthreads8.msh" "${PREFIX}mash_nthreads8.msh" -p 8 > "${PREFIX}mash_nthreads8_result.tsv"

# RMSE (root-mean-square error) of each tool against the ground-truth result file.
# evaluate-output.py compares the two result files; awk squares column 2 of every
# output line, averages the squares and prints the square root.
# NOTE(review): column 2 is presumably the per-pair error -- confirm in evaluate-output.py.
bindashRMSE=$(python evaluate-output.py "${PREFIX}groundtruth_nthreads8_result.tsv" "${PREFIX}bindash_nthreads8_result.tsv" \
  | awk 'BEGIN {sumSqrErr = 0; countErr = 0; } { sumSqrErr += $2 * $2; countErr += 1 } END {print sqrt(sumSqrErr / countErr); }') # this is the RMSE (root-mean-square error) produced by bindash

mashRMSE=$(python evaluate-output.py "${PREFIX}groundtruth_nthreads8_result.tsv" "${PREFIX}mash_nthreads8_result.tsv" \
  | awk 'BEGIN {sumSqrErr = 0; countErr = 0; } { sumSqrErr += $2 * $2; countErr += 1 } END {print sqrt(sumSqrErr / countErr); }') # this is the RMSE (root-mean-square error) produced by mash
|
||
# Write the data for Table 1 of the main text: a header line followed by one data row.
echo "#Sketch-size Mash-RMSE BinDash-RMSE" > "${PREFIX}_table1.precision.txt"
# Append with ">>" so the header written above is kept; a plain ">" would truncate the file and lose it.
echo "Default-param-value ${mashRMSE} ${bindashRMSE}" >> "${PREFIX}_table1.precision.txt"
|
||
# 4.2: generate the root-mean-square errors (RMSEs) in Table 1 of the supplementary information. | ||
|
||
# Sweep over sketch sizes and record one RMSE row per size for both tools.
# The per-size sketch and result files reuse the same names on every iteration,
# so only the summary table keeps the results of all sizes.
echo "#sketch-sizes Mash-RMSE BinDash-RMSE" > "${PREFIX}_suppTable1.precision.txt" # this tsv file contains data for Table 1 in the supplementary information
for n64bits in 1 2 4 8 16 32 64; do # the sequence 1 2 ... 64 corresponds to the sequence 64 128 ... 4096 in Table 1 in the supplementary information
  # Computed once per iteration: passed to mash -s and written as the first column of the row.
  sketch_size=$((n64bits * 64))

  bindash sketch --listfname="${FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE}" --outfname="${PREFIX}bindash_nthreads8" --nthreads=8 --sketchsize64="${n64bits}"
  bindash dist "${PREFIX}bindash_nthreads8" --nthreads=8 --mthres=1e9 > "${PREFIX}bindash_nthreads8_result.tsv" # with --mthres=1e9, bindash will report all pairwise distances, including zeros

  mash sketch -l "${FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE}" -o "${PREFIX}mash_nthreads8" -p 8 -s "${sketch_size}"
  mash dist "${PREFIX}mash_nthreads8.msh" "${PREFIX}mash_nthreads8.msh" -p 8 > "${PREFIX}mash_nthreads8_result.tsv"

  # RMSE of each tool against the ground-truth result file: awk squares column 2 of
  # every evaluate-output.py output line, averages the squares and prints the square root.
  bindashRMSE=$(python evaluate-output.py "${PREFIX}groundtruth_nthreads8_result.tsv" "${PREFIX}bindash_nthreads8_result.tsv" \
    | awk 'BEGIN {sumSqrErr = 0; countErr = 0; } { sumSqrErr += $2 * $2; countErr += 1 } END {print sqrt(sumSqrErr / countErr); }') # this is the RMSE (root-mean-square error) produced by bindash

  mashRMSE=$(python evaluate-output.py "${PREFIX}groundtruth_nthreads8_result.tsv" "${PREFIX}mash_nthreads8_result.tsv" \
    | awk 'BEGIN {sumSqrErr = 0; countErr = 0; } { sumSqrErr += $2 * $2; countErr += 1 } END {print sqrt(sumSqrErr / countErr); }') # this is the RMSE (root-mean-square error) produced by mash

  # Append so that the header and the rows of earlier iterations are kept.
  echo "${sketch_size} ${mashRMSE} ${bindashRMSE}" >> "${PREFIX}_suppTable1.precision.txt"
done
|
||
# step 5: evaluate the runtime of bindash and mash from all the 110426 genomes downloaded in step 2. Time durations measured by the "time -p" command are used in Table 1 of the main text. | ||
|
||
# Step 5 configuration. These assignments replace the step-4 values of the same variables.
# Both paths below are machine-specific; edit them before running.
# List file with one genome FASTA path per line (all genomes downloaded in step 2).
FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE=/home/zhaoxiaofei/bindash/input/assembly_bacteria_2018032201/files-all-raw.txt # please modify as needed
# Directory that receives the sketches and timing reports of step 5.
SSD_DIR=/home/zhaoxiaofei/bindash/output/ssd/all # please modify as needed

# Common filename prefix of every step-5 output file.
# "%/" drops one trailing slash from SSD_DIR (if present) so the prefix never contains "//".
PREFIX="${SSD_DIR%/}/ssd_assembly_bacteria_2018032201_"

mkdir -p -- "${SSD_DIR}"
|
||
# Time the sketch and dist stages of each tool on the full genome list.
# Each command runs inside ( ... ) so that the subshell's stderr, which carries the
# "time -p" report, can be redirected to a per-stage file. Anything the tool itself
# prints to stderr ends up in the same file.
(time -p bindash sketch --listfname="${FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE}" --outfname="${PREFIX}bindash_nthreads8" --nthreads=8) 2> "${PREFIX}_table1.bindashSketchTime.txt"
(time -p bindash dist "${PREFIX}bindash_nthreads8" --nthreads=8 > /dev/null) 2> "${PREFIX}_table1.bindashDistTime.txt"
# stdout redirection prevents having a huge output file

(time -p mash sketch -l "${FILE_CONTAINING_ONE_GENOME_FILENAME_PER_LINE}" -o "${PREFIX}mash_nthreads8" -p 8) 2> "${PREFIX}_table1.mashSketchTime.txt"
(time -p mash dist "${PREFIX}mash_nthreads8.msh" "${PREFIX}mash_nthreads8.msh" -p 8 > /dev/null) 2> "${PREFIX}_table1.mashDistTime.txt"