Skip to content
Browse files

Added Compressing and Indexing

Updated documentation and changed Index class to TFile
  • Loading branch information...
1 parent cd165d9 commit ab30617c460040354bd6d6e71de03b7af21fe959 @throwern committed May 4, 2012
Showing with 289 additions and 103 deletions.
  1. +33 −13 README.rdoc
  2. +1 −1 VERSION
  3. +3 −6 bio-tabix.gemspec
  4. +2 −1 ext/tabix/Rakefile
  5. +4 −4 ext/tabix/mkrf_conf.rb
  6. +1 −1 lib/bio-tabix.rb
  7. +73 −22 lib/bio/tabix/binding.rb
  8. +0 −54 lib/bio/tabix/index.rb
  9. +14 −1 lib/bio/tabix/library.rb
  10. +158 −0 lib/bio/tabix/t_file.rb
View
46 README.rdoc
@@ -5,26 +5,46 @@ http://samtools.sourceforge.net/
Tabix provides utilities for indexing and subsequently querying regions of interest from large tab delimited files.
-Files are indexed on three columns: [Group, pos1, pos2] and must be position sorted
+Files are indexed on three columns: [group, pos1, pos2] and must be position sorted
-== Usage
+This gem was modeled on the bio-samtools gem: https://github.com/helios/bioruby-samtools
-Open the file, an index will be created if it does not exist. Use :force => true to overwrite an existing index.
+== Installation
+gem install bio-tabix
- tabix_file = Bio::Tabix::Index.open(my_txt_file, {:s => group_col, :b => pos1_col, :e => pos2_col})
+== Usage
+Compress the file::
+ The following command will run the bgzip utility.
+ You must supply input and output filenames. If you don't compress your
+ data first, it will be compressed when the file is opened, and '.bgzf' will be appended to the filename.
+ Bio::Tabix::TFile.compress(in_file,compressed_file)
+Index the file::
+ Build an index by supplying the required columns.
+ If you don't create an index, the open method will build one, using the default options unless others are supplied.
+ The opts parameter takes a hash of index build options:
+ - :s => sequence/group column [1]
+ - :b => beginning range column [2]
+ - :e => ending range column. Can equal :b. [3]
+ - :meta_char => comment character [#]
+ - :line_skip => number of initial lines to ignore [0]
+ tabix_file = Bio::Tabix::TFile.build_index(compressed_file, {:s => group_col, :b => pos1_col, :e => pos2_col})
-Create a proc or lambda. This will be called with the value of each fileline
- my_proc = lambda do |line|
- # convert text to array and print column 7
- puts line.split("\t")[6]
- end
-Process a region
- tabix_file.process_region(group_name, pos1, pos2, my_proc)
+Open the file::
+ Create a new TFile instance and open it.
+ tabix_file = Bio::Tabix::TFile.open(compressed_file)
-== Installation
-'gem install bio-samtools'
+Create a proc or lambda::
+ This will be called with each matching line of the file (plus a pointer to its length)
+ my_func = lambda do |line|
+ # convert text to array and print column 7 (index 6)
+ puts line.split("\t")[6]
+ end
+
+Process a region::
+ Choose a group and range for your function and process it
+ tabix_file.process_region('FirstGroup', 1, 500, my_func)
== Dependencies:
-FFI (http://github.com/ffi/ffi)
View
2 VERSION
@@ -1 +1 @@
-0.1.0
+0.1.2
View
9 bio-tabix.gemspec
@@ -5,14 +5,13 @@
Gem::Specification.new do |s|
s.name = %q{bio-tabix}
- s.version = "0.1.0"
+ s.version = "0.1.2"
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
s.authors = ["throwern"]
- s.date = %q{2012-04-13}
+ s.date = %q{2012-05-04}
s.description = %q{Tabix file indexing routines from the samtools package http://samtools.sourceforge.net/}
s.email = %q{throwern@msu.edu}
- s.executables = ["bgzip", "tabix"]
s.extensions = ["ext/tabix/mkrf_conf.rb"]
s.extra_rdoc_files = [
"LICENSE.txt",
@@ -26,16 +25,14 @@ Gem::Specification.new do |s|
"README.rdoc",
"Rakefile",
"VERSION",
- "bin/bgzip",
- "bin/tabix",
"bio-tabix.gemspec",
"ext/tabix/Rakefile",
"ext/tabix/mkrf_conf.rb",
"lib/bio-tabix.rb",
"lib/bio/tabix/Version",
"lib/bio/tabix/binding.rb",
- "lib/bio/tabix/index.rb",
"lib/bio/tabix/library.rb",
+ "lib/bio/tabix/t_file.rb",
"test/helper.rb",
"test/test_bio-tabix.rb"
]
View
3 ext/tabix/Rakefile
@@ -1 +1,2 @@
-#placeholder
+placeholder
+
View
8 ext/tabix/mkrf_conf.rb
@@ -46,10 +46,10 @@
cp("libtabix.1.dylib","#{path_external}")
else raise NotImplementedError, "Tabix not supported on your platform"
end #case
- cp("tabix", "#{path}/../../bin/")
- chmod 0755, "#{path}/../../bin/tabix"
- cp("bgzip", "#{path}/../../bin/")
- chmod 0755, "#{path}/../../bin/bgzip"
+ cp("tabix", "#{path}/../../lib/bio/tabix")
+ chmod 0755, "#{path}/../../lib/bio/tabix/tabix"
+ cp("bgzip", "#{path}/../../lib/bio/tabix")
+ chmod 0755, "#{path}/../../lib/bio/tabix/bgzip"
end #cd
end
View
2 lib/bio-tabix.rb
@@ -1,2 +1,2 @@
require 'ffi'
-require 'bio/tabix/index'
+require 'bio/tabix/t_file'
View
95 lib/bio/tabix/binding.rb
@@ -1,49 +1,100 @@
-require 'bio/tabix/library'
+# == binding.rb
+# This file contains the ffi binding declarations for the tabix api
+# See https://github.com/ffi/ffi and http://samtools.sourceforge.net/tabix.shtml for details
+#
+# == Contact
+#
+# Author:: Nicholas A. Thrower
+# Copyright:: Copyright (c) 2012 Nicholas A Thrower
+# License:: See LICENSE.txt for more details
+#
+
+# -
module Bio
+ # -
module Tabix
+ # Ruby binding for the tabix file indexing routines within the samtools package http://samtools.sourceforge.net/
module Binding
+ require 'bio/tabix/library'
extend FFI::Library
ffi_lib Bio::Tabix::Library.filename
# CLASSES
+
+ # Custom string storage
+ # member of the IterT class
class KString < FFI::Struct
layout(
:l,:size_t,
:m,:size_t,
- :s,:string)
+ :s,:string
+ )
end
+ # File pointers to text and index data
+ # created by ti_open
+ # used by ti_read, ti_query, ti_close
class TabixT < FFI::Struct
layout(
- :fp,:pointer,
- :idx,:pointer,
- :fn,:string,
- :fnidx,:string
+ :fp, :pointer,
+ :idx, :pointer,
+ :fn, :string,
+ :fnidx, :string
)
end
+ # Iterator for monitoring the query progress
+ # created by the ti_query method
+ # used by ti_read
class IterT < FFI::Struct
layout(
:from_first,:int,
- :tid,:int,
- :beg,:int,
- :end,:int,
- :n_off,:int,
- :i,:int,
- :finished,:int,
- :curr_off,:uint64,
- :str,KString,
- :idx,:pointer,
- :off,:pointer)
+ :tid, :int,
+ :beg, :int,
+ :end, :int,
+ :n_off, :int,
+ :i, :int,
+ :finished, :int,
+ :curr_off, :uint64,
+ :str, KString,
+ :idx, :pointer,
+ :off, :pointer
+ )
end
-
- # FUNCTIONS
- attach_function :ti_open, [:string, :string], :pointer # filename, idxname (or 0) : TabixT*
- attach_function :ti_read, [:pointer, :pointer, :pointer], :string # TabixT*, ti_iter_t, len : string
- attach_function :ti_query, [:pointer,:string,:int,:int], IterT # TabixT*, name, beg, end : IterT
+ # Index configuration
+ # used by ti_index_build2
+ class ConfT < FFI::Struct
+ layout(
+ :preset, :int32,
+ :sc, :int32,
+ :bc, :int32,
+ :ec, :int32,
+ :meta_char, :int32,
+ :line_skip, :int32
+ )
+ # convenience method to access attributes
+ def get_hash
+ {
+ :preset => self[:preset],
+ :sc => self[:sc],
+ :bc => self[:bc],
+ :ec => self[:ec],
+ :meta_char => self[:meta_char],
+ :line_skip => self[:line_skip]
+ }
+ end
+ end
+ # FUNCTIONS # PARAMETER(S) : RETURN
+ attach_function :ti_open, [:string, :string], :pointer # filename, idxname (or 0) : TabixT*
+ attach_function :ti_read, [:pointer, :pointer, :pointer], :string # TabixT*, ti_iter_t, len : string
+ attach_function :ti_query, [:pointer,:string,:int,:int], IterT # TabixT*, name, beg, end : IterT
attach_function :ti_close, [:pointer], :void # TabixT*
attach_function :ti_iter_destroy, [IterT], :void # ti_iter_t
-
+ attach_function :ti_index_build2, [:string,:pointer,:string], :int # filename, ti_conf_t, idxname (or 0) : 0/-1
+ attach_function :bgzf_is_bgzf, [:string], :int # filename, : 1/0
+ attach_function :ti_seqname, [:pointer,:pointer],:pointer # ti_index_t*, int*(count) : char**
+ attach_function :ti_index_load,[:string],:pointer # filename(no idx suffix) : ti_index_t*
+ attach_function :ti_get_conf,[:pointer],:pointer # ti_index_t* : ti_conf_t*
end
end
end
View
54 lib/bio/tabix/index.rb
@@ -1,54 +0,0 @@
-require 'bio/tabix/binding'
-
-module Bio
- module Tabix
- class Index
- include Bio::Tabix::Binding
- attr_accessor :file,:index,:t_file,:t_file_p
-
- def self.build(f,opts={})
- end
-
- def self.open(*args)
- self.new(*args).open
- end
-
- def initialize(f,opts={})
- @file = f
- @index = opts[:i]||file+".tbi"
- return self
- end
-
- def open
- if(@t_file)
- self.close
- end
- raise "FileNotFound #{file}" unless(File.exist?(file)) or file =~ /http:\/\/|ftp:\/\//
- raise "FileNotFound #{index} -- use -i to supply custom index" unless(File.exist?(index)) or index =~ /http:\/\/|ftp:\/\//
- @t_file_p = ti_open(file,index)
- raise "FileAcessError #{file}" if @t_file_p.null?
- @t_file = TabixT.new(@t_file_p)
- return self
- end
-
- def close
- if(@t_file_p)
- begin
- ti_close(@t_file_p)
- rescue
- puts "Error closing file"
- end
- end
- end
-
- def process_region(group,pos1,pos2,user_proc)
- iter = IterT.new(ti_query(t_file_p,group,pos1,pos2))
- len = FFI::MemoryPointer.new(:int)
- while( (s = ti_read(t_file_p, iter, len)) )
- user_proc.call(s,len)
- end
- ti_iter_destroy(iter)
- end
- end
- end
-end
View
15 lib/bio/tabix/library.rb
@@ -1,6 +1,20 @@
+# == library.rb
+# This file contains the Library Class for retrieving platform specific library names
+#
+# == Contact
+#
+# Author:: Nicholas A. Thrower
+# Copyright:: Copyright (c) 2012 Nicholas A Thrower
+# License:: See LICENSE.txt for more details
+#
+
+# -
module Bio
+ # -
module Tabix
+ # Cross-platform library naming
class Library
+ # return the platform specific library name
def self.filename
lib_os = case RUBY_PLATFORM
when /linux/
@@ -19,7 +33,6 @@ def self.filename
File.join(File.expand_path(File.dirname(__FILE__)),"libtabix.#{lib_os}")
end
- #module_function :filename
end
end
end
View
158 lib/bio/tabix/t_file.rb
@@ -0,0 +1,158 @@
+# == t_file.rb
+# This file contains the TFile class used to interact with the tabix api
+#
+# == Contact
+#
+# Author:: Nicholas A. Thrower
+# Copyright:: Copyright (c) 2012 Nicholas A Thrower
+# License:: See LICENSE.txt for more details
+#
+
+# -
+module Bio
+ # -
+ module Tabix
+ # The TFile class manages compressing, indexing, opening and parsing tab delimited files.
+ # The file must be position sorted prior to indexing.
+ class TFile
+ require 'bio/tabix/binding'
+ include Bio::Tabix::Binding
+ # ascii or compressed file name
+ attr_accessor :file
+ # index name
+ attr_accessor :index
+ # TabixT created from open index
+ attr_accessor :t_file
+ # pointer to TabixT
+ attr_accessor :t_file_p
+ # index build options
+ attr_accessor :options
+ # compresses the input file fi into the output file fo using bgzip
+ def self.compress(fi, fo)
+ `#{File.join(File.expand_path(File.dirname(__FILE__)),'bgzip')} -c #{fi} > #{fo}`
+ end
+ # Builds an index from the supplied filename and options
+ # - :s => sequence/group column [1]
+ # - :b => beginning range column [2]
+ # - :e => ending range column. Can equal :b. [3]
+ # - :meta_char => comment character [#]
+ # - :line_skip => number of initial lines to ignore [0]
+ def self.build_index(f, opts={})
+ conf = ConfT.new
+ conf[:preset]=0
+ conf[:sc]=opts[:s] || 1
+ conf[:bc]=opts[:b] || 2
+ conf[:ec]=opts[:e] || 3
+ conf[:meta_char]=('#'||opts[:c][0]).ord
+ conf[:line_skip]=(0||opts[:S]).to_i
+ unless(Bio::Tabix::Binding.bgzf_is_bgzf(f)==1)
+ puts "Compressing..."
+ self.class.compress(f,f+".bgzf")
+ f=f+".bgzf"
+ end
+ puts "Indexing with #{conf.get_hash}..."
+ Bio::Tabix::Binding.ti_index_build2(f,conf,f+".tbi")
+ end
+ # convenience method to create a new TFile instance and open it.
+ def self.open(*args)
+ self.new(*args).open
+ end
+ # Returns a new TFile. No work is done until open is called: if the file is not
+ # compressed, open creates a compressed copy with compress[compress], and if the index
+ # is not present, open creates a new index with build_index[build_index].
+ def initialize(f, opts={})
+ @file = f
+ @options = opts
+ @index = file+".tbi"
+ return self
+ end
+ # opens the file checking for compression and corresponding index.
+ def open
+ # check existing
+ if(@t_file)
+ puts "Already open, closing and re-opening"
+ self.close
+ end
+ # check datafile
+ if file =~ /http:\/\/|ftp:\/\//
+ puts "Expecting remote file: #{file}"
+ else
+ raise "FileNotFound #{file}" unless(File.exist?(file))
+ unless(bgzf_is_bgzf(file)==1)
+ unless(bgzf_is_bgzf(file+".bgzf")==1)
+ puts "Input does not look like a bgzip compressed file. Attempting compression..."
+ self.class.compress(file,file+".bgzf")
+ end
+ @file = file+".bgzf"
+ end
+ end
+ # check index
+ if index =~ /http:\/\/|ftp:\/\//
+ puts "Expecting remote index: #{index}"
+ elsif !File.exist?(index)
+ puts "Index #{index} not found. Building..."
+ self.class.build_index(file,options)
+ end
+ # open
+ @t_file_p = ti_open(file,index)
+ raise "FileAcessError #{file}" if @t_file_p.null?
+ @t_file = TabixT.new(@t_file_p)
+ return self
+ end
+ # closes the TabixT file
+ def close
+ if(@t_file_p)
+ begin
+ ti_close(@t_file_p)
+ @t_file_p = nil
+ rescue
+ puts "Error closing file"
+ end
+ end
+ end
+ # returns an array of the group names found in the index
+ def groups
+ load_index
+ g_num = FFI::MemoryPointer.new(:int)
+ g_ptr = ti_seqname(t_file[:idx],g_num)
+ return [] if g_ptr.null? || g_num.null?
+ g_ptr.get_array_of_string(0, g_num.read_int).compact
+ end
+ # returns the header (skipped lines + comments)
+ def header
+ load_index
+ conf = ConfT.new(ti_get_conf(t_file[:idx]))
+ iter = IterT.new(ti_query(t_file_p,nil,0,1))
+ len = FFI::MemoryPointer.new(:int)
+ str = ""
+ while( (s = ti_read(t_file_p, iter, len)) )
+ break if(s[0].ord != conf[:meta_char])
+ str << s
+ str << "\n"
+ end
+ ti_iter_destroy(iter)
+ @header = str
+ end
+ # Iterates over the supplied region calling user_proc on each item
+ # a region is defined by a group name and range(pos1 - pos2)
+ # all overlapping intervals within the group will be processed in order
+ def process_region(group, pos1, pos2, user_proc)
+ iter = IterT.new(ti_query(t_file_p,group,pos1,pos2))
+ len = FFI::MemoryPointer.new(:int)
+ while( (s = ti_read(t_file_p, iter, len)) )
+ user_proc.call(s,len)
+ end
+ ti_iter_destroy(iter)
+ end
+
+ private
+ def load_index
+ if t_file[:idx].null?
+ t_file[:idx] = ti_index_load(t_file[:fn])
+ end
+ raise "Index Load Error" if t_file[:idx].null?
+ end
+
+ end#TFile class
+ end#Tabix module
+end#Bio module

0 comments on commit ab30617

Please sign in to comment.
Something went wrong with that request. Please try again.