Permalink
Browse files

merge is_this_sra_16s.rb

  • Loading branch information...
2 parents 16797cc + 6d81843 commit b34b38423d7036991d5d3ccaddcbf29544b333de Ben J. Woodcroft committed Jul 17, 2012
Showing with 99 additions and 121 deletions.
  1. +99 −0 sralites_to_acacia.rb
  2. +0 −121 sralites_to_clusters.rb
View
@@ -0,0 +1,99 @@
+#!/usr/bin/env ruby
+
+require 'optparse'
+require 'bio-logger'
+require 'progressbar'
+
+$LOAD_PATH.unshift('/home/ben/git/bioruby-sra_fastq_dumper/lib')
+require 'bio-sra_fastq_dumper'
+
+SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','')
+
+# Parse command line options into the options hash
+options = {
+ :logger => 'stdout',
+}
+o = OptionParser.new do |opts|
+ opts.banner = "
+ Usage: #{SCRIPT_NAME} <arguments>
+
+ Take a directory of .lite.sra files, and correct them using acacia, outputing the results in a second directory filled with directories (one for each sra lite file) \n\n"
+
+ opts.on("-i", "--input-directory DIRECTORY", "a directory full of SRA .lite.sra files [required]") do |arg|
+ options[:in_directory] = File.absolute_path(arg)
+ end
+ opts.on("-o", "--output-directory DIRECTORY", "a directory to put output files in [required]") do |arg|
+ options[:out_directory] = File.absolute_path(arg)
+ end
+
+ # logger options
+ opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") do |q|
+ Bio::Log::CLI.trace('error')
+ end
+ opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") do | name |
+ options[:logger] = name
+ end
+ opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG") do | s |
+ Bio::Log::CLI.trace(s)
+ end
+end
+o.parse!
+if ARGV.length != 0 or options[:in_directory].nil? or options[:out_directory].nil?
+ $stderr.puts o
+ exit 1
+end
+# Setup logging. bio-logger defaults to STDERR not STDOUT, I disagree
+Bio::Log::CLI.logger(options[:logger]); log = Bio::Log::LoggerPlus.new(LOG_NAME); Bio::Log::CLI.configure(LOG_NAME)
+
+
+# Collect a list of the input sra lite files to process, advise
+log.debug "Inspecting #{options[:in_directory]}"
+sralites = Dir.entries(options[:in_directory]).select{|s| s.match(/\.lite\.sra$/)}
+log.info "Found #{sralites.length} SRA lite files in #{options[:in_directory]}"
+
+# Setup progress bar
+progress = ProgressBar.new('acacia', sralites.length)
+
+# For each input file,
+sralites.each do |sralite|
+ # convert to a fastq file in a temporary directory
+ Bio::FastqDumper.dump_in_tmpdir(File.join options[:in_directory], sralite) do |fastq|
+ # mkdir and cd to an aptly named direct in the output folder
+ outdir = File.join(options[:out_directory], sralite)
+ if File.exist?(outdir)
+ log.info "Skipping SRA file #{sralite}, since it already appears to be processed (output directory exists)"
+ next
+ end
+ log.debug "Creating directory #{options[:out_directory]}"
+ Dir.mkdir outdir
+ # Dir.chdir options[:in_directory] #acacia cannot handle the fastq file not being in the current directory, I think
+
+ # Run acacia and output the files into the working directory
+ f =
+ acacia_command = [
+ 'java',
+ '-jar',
+ #'/srv/whitlam/home/projects/DATA_MODELLING/acacia-1.50.b04.jar'
+ '/home/ben/bioinfo/acacia/acacia-1.50.b05.jar',
+ %w(-DFASTA=FALSE -DFASTQ=TRUE),
+ "-DOUTPUT_DIR=#{outdir}",
+ %w(-DOUTPUT_PREFIX=acacia -DMID_FILE=null),
+ "-DFASTQ_LOCATION=#{File.absolute_path fastq}"
+ ].flatten
+ log.debug "Running acacia with '#{acacia_command}'"
+ Bio::Command.call_command_open3(acacia_command) do |stdin, stdout, stderr|
+ err = stderr.read
+ raise err if err != ''
+ p stdout
+ end
+ puts `ls`
+
+ progress.inc
+ end
+end
+progress.finish
+
+
+
+
+
View
@@ -1,121 +0,0 @@
-#!/usr/bin/env ruby
-
-require 'optparse'
-require 'bio-logger'
-
-$LOAD_PATH.unshift('/home/ben/git/bioruby-sra_fastq_dumper/lib')
-require 'bio-sra_fastq_dumper'
-
-SCRIPT_NAME = File.basename(__FILE__); LOG_NAME = SCRIPT_NAME.gsub('.rb','')
-
-# Parse command line options into the options hash
-options = {
- :logger => 'stderr',
- :sra_run_folder => '.',#"#{raise}",
- :target_folder => '.',#"#{raise}"
-}
-o = OptionParser.new do |opts|
- #TODO Fill in usage, description and option parsing below
- opts.banner = "
- Usage: #{SCRIPT_NAME} <arguments>
-
- Description of what this program does...\n"
- # Example option
- opts.on("-e", "--eg", "description [default: #{options[:eg]}]") do |f|
- options[:operation] = OVERALL
- end
-
- # logger options
- opts.on("-q", "--quiet", "Run quietly, set logging to ERROR level [default INFO]") do |q|
- Bio::Log::CLI.trace('error')
- end
- opts.on("--logger filename",String,"Log to file [default #{options[:logger]}]") do | name |
- options[:logger] = name
- end
- opts.on("--trace options",String,"Set log level [default INFO]. e.g. '--trace debug' to set logging level to DEBUG") do | s |
- Bio::Log::CLI.trace(s)
- end
-end
-o.parse!
-if ARGV.length != 0
- $stderr.puts o
- exit 1
-end
-# Setup logging
-Bio::Log::CLI.logger(options[:logger]) #bio-logger defaults to STDERR not STDOUT, I disagree
-log = Bio::Log::LoggerPlus.new(LOG_NAME)
-Bio::Log::CLI.configure(LOG_NAME)
-
-
-
-
-
-# Iterate over each run in the run folder
-Dir.foreach(options[:sra_run_folder]) do |entry|
- p entry
- next if %w(. ..).include?(entry)
-
- if entry.match(/.sra$/)
- log.debug "Processing #{entry}"
- else
- log.debug "Ignoring non sra-file #{entry}"
- next
- end
-
- # Create a new temporary folder using the ruby plugin
- Bio::FastqDumper.dump_in_tmpdir(entry) do |fastq|
- # run acacia
- command = [
- 'java',
- '-jar',
- #'/srv/whitlam/home/projects/DATA_MODELLING/acacia-1.50.b04.jar'
- '/home/ben/bioinfo/acacia/acacia-1.50.b05.jar',
- %w(-DFASTA=FALSE -DFASTQ=TRUE),
- "-DOUTPUT_DIR=#{Dir.getwd}",
- %w(-DOUTPUT_PREFIX=acacia -DMID_FILE=null),
- "-DFASTQ_LOCATION=#{fastq}"
- ].flatten
- log.debug "Running acacia with '#{command}'"
- puts `ls`
- Bio::Command.call_command_open3(command) do |stdin, stdout, stderr|
- err = stderr.read
- raise err if err != ''
- p stdout
- end
- puts `ls`
-
- # run uclust, saving the results to a new file in the target folder.
- # sort the sequences using uclust
- command = [
- 'usearch',
- '-sort',
- "acacia_#{fastq}.fa",
- '-output',
- 'seqs.sorted.fasta',
- ]
- Bio::Command.call_command_open3(command) do |stdin, stdout, stderr|
- err = stderr.read
- raise err if err != ''
- end
- puts `ls`
-
- # cluster the sequences with uclust
- command = [
- 'usearch',
- '-cluster',
- 'seqs.sorted.fa',
- '-output',
- '/tmp/ta.uc'
- ]
- Bio::Command.call_command_open3(command) do |stdin, stdout, stderr|
- err = stderr.read
- raise err if err != ''
- end
- puts `ls`
- end
-end
-
-
-
-
-

0 comments on commit b34b384

Please sign in to comment.