From 6d818435bed76adbe63031a9601b6234e9786f22 Mon Sep 17 00:00:00 2001 From: Ben J Woodcroft Date: Mon, 16 Jul 2012 23:28:38 +1000 Subject: [PATCH] script to convert sra lite files to acacia --- is_this_sra_16s.rb | 2 +- sralites_to_acacia.rb | 99 ++++++++++++++++++++++++++++++++ sralites_to_clusters.rb | 121 ---------------------------------------- 3 files changed, 100 insertions(+), 122 deletions(-) create mode 100755 sralites_to_acacia.rb delete mode 100755 sralites_to_clusters.rb diff --git a/is_this_sra_16s.rb b/is_this_sra_16s.rb index 8ff669b..e20bab3 100755 --- a/is_this_sra_16s.rb +++ b/is_this_sra_16s.rb @@ -50,7 +50,7 @@ # convert it to a fastq file, taking the first 10 sequences, convert to fasta format, save as 10seqs.fa command = 'fastq-dump -Z \''+ #dump the lite sra file to fastq format sra_lite+ - '\' |head -n 400 |tail -n 40 '+ #first 40 lines equals the first 10 sequences, and don't take it from the start since they are probably low quality + '\' |head -n 40 '+ #first 40 lines equals the first 10 sequences '|awk \'{print ">" substr($0,2);getline;print;getline;getline}\' >'+tempfile.path+ #convert to fasta format ' && blastn -query '+tempfile.path+' -outfmt 6 -max_target_seqs 1 -db '+ #blast options[:ssu_database] # against a reduced set of 16S sequences from greengenes diff --git a/sralites_to_acacia.rb b/sralites_to_acacia.rb new file mode 100755 index 0000000..4223c03 --- /dev/null +++ b/sralites_to_acacia.rb @@ -0,0 +1,99 @@ +#!/usr/bin/env ruby + +require 'optparse' +require 'bio-logger' +require 'progressbar' + +$LOAD_PATH.unshift('/home/ben/git/bioruby-sra_fastq_dumper/lib') +require 'bio-sra_fastq_dumper' + +SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','') + +# Parse command line options into the options hash +options = { + :logger => 'stdout', +} +o = OptionParser.new do |opts| + opts.banner = " + Usage: #{SCRIPT_NAME} + + Take a directory of .lite.sra files, and correct them using acacia, outputing the results in a second directory filled with directories (one for each sra lite file) \n\n" + + opts.on("-i", "--input-directory DIRECTORY", "a directory full of SRA .lite.sra files [required]") do |arg| + options[:in_directory] = File.absolute_path(arg) + end + opts.on("-o", "--output-directory DIRECTORY", "a directory to put output files in [required]") do |arg| + options[:out_directory] = File.absolute_path(arg) + end + + # logger options + opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") do |q| + Bio::Log::CLI.trace('error') + end + opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") do | name | + options[:logger] = name + end + opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG") do | s | + Bio::Log::CLI.trace(s) + end +end +o.parse! +if ARGV.length != 0 or options[:in_directory].nil? or options[:out_directory].nil? + $stderr.puts o + exit 1 +end +# Setup logging. bio-logger defaults to STDERR not STDOUT, I disagree +Bio::Log::CLI.logger(options[:logger]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME) + + +# Collect a list of the input sra lite files to process, advise +log.debug "Inspecting #{options[:in_directory]}" +sralites = Dir.entries(options[:in_directory]).select{|s| s.match(/\.lite\.sra$/)} +log.info "Found #{sralites.length} SRA lite files in #{options[:in_directory]}" + +# Setup progress bar +progress = ProgressBar.new('acacia', sralites.length) + +# For each input file, +sralites.each do |sralite| + # convert to a fastq file in a temporary directory + Bio::FastqDumper.dump_in_tmpdir(File.join options[:in_directory], sralite) do |fastq| + # mkdir and cd to an aptly named direct in the output folder + outdir = File.join(options[:out_directory], sralite) + if File.exist?(outdir) + log.info "Skipping SRA file #{sralite}, since it already appears to be processed (output directory exists)" + next + end + log.debug "Creating directory #{options[:out_directory]}" + Dir.mkdir outdir + # Dir.chdir options[:in_directory] #acacia cannot handle the fastq file not being in the current directory, I think + + # Run acacia and output the files into the working directory + f = + acacia_command = [ + 'java', + '-jar', + #'/srv/whitlam/home/projects/DATA_MODELLING/acacia-1.50.b04.jar' + '/home/ben/bioinfo/acacia/acacia-1.50.b05.jar', + %w(-DFASTA=FALSE -DFASTQ=TRUE), + "-DOUTPUT_DIR=#{outdir}", + %w(-DOUTPUT_PREFIX=acacia -DMID_FILE=null), + "-DFASTQ_LOCATION=#{File.absolute_path fastq}" + ].flatten + log.debug "Running acacia with '#{acacia_command}'" + Bio::Command.call_command_open3(acacia_command) do |stdin, stdout, stderr| + err = stderr.read + raise err if err != '' + p stdout + end + puts `ls` + + progress.inc + end +end +progress.finish + + + + + diff --git a/sralites_to_clusters.rb b/sralites_to_clusters.rb deleted file mode 100755 index cec72af..0000000 --- a/sralites_to_clusters.rb +++ /dev/null @@ -1,121 +0,0 @@ -#!/usr/bin/env ruby - -require 'optparse' -require 'bio-logger' - -$LOAD_PATH.unshift('/home/ben/git/bioruby-sra_fastq_dumper/lib') -require 'bio-sra_fastq_dumper' - -SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','') - -# Parse command line options into the options hash -options = { - :logger => 'stderr', - :sra_run_folder => '.',#"#{raise}", - :target_folder => '.',#"#{raise}" -} -o = OptionParser.new do |opts| - #TODO Fill in usage, description and option parsing below - opts.banner = " - Usage: #{SCRIPT_NAME} - - Description of what this program does...\n" - # Example option - opts.on("-e", "--eg", "description [default: #{options[:eg]}]") do |f| - options[:operation] = OVERALL - end - - # logger options - opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") do |q| - Bio::Log::CLI.trace('error') - end - opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") do | name | - options[:logger] = name - end - opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG") do | s | - Bio::Log::CLI.trace(s) - end -end -o.parse! -if ARGV.length != 0 - $stderr.puts o - exit 1 -end -# Setup logging -Bio::Log::CLI.logger(options[:logger]) #bio-logger defaults to STDERR not STDOUT, I disagree -log = Bio::Log::LoggerPlus.new(LOG_NAME) -Bio::Log::CLI.configure(LOG_NAME) - - - - - -# Iterate over each run in the run folder -Dir.foreach(options[:sra_run_folder]) do |entry| - p entry - next if %w(. ..).include?(entry) - - if entry.match(/.sra$/) - log.debug "Processing #{entry}" - else - log.debug "Ignoring non sra-file #{entry}" - next - end - - # Create a new temporary folder using the ruby plugin - Bio::FastqDumper.dump_in_tmpdir(entry) do |fastq| - # run acacia - command = [ - 'java', - '-jar', - #'/srv/whitlam/home/projects/DATA_MODELLING/acacia-1.50.b04.jar' - '/home/ben/bioinfo/acacia/acacia-1.50.b05.jar', - %w(-DFASTA=FALSE -DFASTQ=TRUE), - "-DOUTPUT_DIR=#{Dir.getwd}", - %w(-DOUTPUT_PREFIX=acacia -DMID_FILE=null), - "-DFASTQ_LOCATION=#{fastq}" - ].flatten - log.debug "Running acacia with '#{command}'" - puts `ls` - Bio::Command.call_command_open3(command) do |stdin, stdout, stderr| - err = stderr.read - raise err if err != '' - p stdout - end - puts `ls` - - # run uclust, saving the results to a new file in the target folder. - # sort the sequences using uclust - command = [ - 'usearch', - '-sort', - "acacia_#{fastq}.fa", - '-output', - 'seqs.sorted.fasta', - ] - Bio::Command.call_command_open3(command) do |stdin, stdout, stderr| - err = stderr.read - raise err if err != '' - end - puts `ls` - - # cluster the sequences with uclust - command = [ - 'usearch', - '-cluster', - 'seqs.sorted.fa', - '-output', - '/tmp/ta.uc' - ] - Bio::Command.call_command_open3(command) do |stdin, stdout, stderr| - err = stderr.read - raise err if err != '' - end - puts `ls` - end -end - - - - -