Permalink
Browse files

initial import of code from the netherworld, no worky yet

  • Loading branch information...
1 parent 7738c15 commit 217e22b3a5b2d7e17bdb010c6a8fc9285a7a804e @wwood committed Mar 24, 2011
Showing with 514 additions and 0 deletions.
  1. +138 −0 bin/tm_hmm_wrapper.rb
  2. +1 −0 lib/bio-tm_hmm.rb
  3. +175 −0 lib/bio/transmembrane.rb
  4. +75 −0 test/test_tm_hmm_wrapper.rb
  5. +125 −0 test/test_transmembrane.rb
View
@@ -0,0 +1,138 @@
+#!/usr/bin/env ruby
+
+$:.unshift File.join(File.dirname(__FILE__),'..','vendor/plugins/ben_bioinformatics/lib')
+
+require 'rubygems'
+require 'transmembrane'
+include Transmembrane
+require 'tempfile'
+gem 'rio'
+require 'rio'
+
+class TmHmmWrapper
+
+ # Given an amino acid sequence, return a TransmembraneProtein
+ # made up of the predicted transmembrane domains
+ def calculate(sequence)
+ rio(:tempdir) do |d|
+ FileUtils.cd(d.to_s) do
+ Tempfile.open('tmhmmin') { |tempfilein|
+ # Write a fasta to the tempfile
+ tempfilein.puts '>wrapperSeq'
+ tempfilein.puts "#{sequence}"
+ tempfilein.close #required. Maybe because it doesn't flush otherwise?
+
+ Tempfile.open('signalpout') {|out|
+ result = system("tmhmm -short #{tempfilein.path} >#{out.path}")
+
+ if !result
+ raise Exception, "Running TMHMM program failed. See $? for details."
+ end
+
+
+ line = rio(out.path).readline
+ return TmHmmResult.create_from_short_line(line)
+ }
+ }
+ end
+ end
+ end
+end
+
+
+class TmHmmResult
+ attr_reader :domains
+
+ # initialise with the output line of a
+ # eg.
+ #PFF0290w len=293 ExpAA=145.77 First60=20.51 PredHel=7 Topology=o39-61i101-120o140-162i169-186o196-218i230-252o262-284i
+ def self.create_from_short_line(line)
+ protein = OrientedTransmembraneDomainProtein.new
+
+ splits = line.strip.split("\t")
+ if splits.length != 6
+ raise Exception, "Incorrectly parsed short line from TMHMM: #{line}"
+ end
+
+ substrate = splits[5]
+ if substrate.gsub!(/^Topology\=[io]/,'').nil?
+ raise Exception, "Badly parsed Topology hit: #{substrate}"
+ end
+
+ matches = substrate.match('^(\d+?)\-')
+ if !matches
+ return protein #no transmembrane domains predicted
+ end
+
+ # eat the string from the beginning adding the transmembrane domains
+ prev = matches[1]
+ substrate.gsub!(/^(\d+?)-/,'')
+ # match all the middle bits
+ reg = /^(\d+?)([io])(\d+?)\-/
+ while matches =substrate.match(reg)
+ tmd = OrientedTransmembraneDomain.new
+ tmd.start = prev.to_i
+ tmd.stop = matches[1].to_i
+ tmd.orientation = parse_orientation_from_last_location(matches[2])
+ protein.push tmd
+
+ prev = matches[3]
+ substrate.gsub!(reg, '')
+ end
+ #match the last bit
+ if !(matches = substrate.match('(\d+?)([io])$'))
+ raise Exception, "Failed to parse the last bit of: #{substrate}"
+ end
+ tmd = OrientedTransmembraneDomain.new
+ tmd.start = prev.to_i
+ tmd.stop = matches[1].to_i
+ tmd.orientation = parse_orientation_from_last_location(matches[2])
+ protein.push tmd
+
+ return protein
+ end
+
+ def self.parse_orientation_from_last_location(last_location)
+ case last_location
+ when 'i'
+ return OrientedTransmembraneDomain::OUTSIDE_IN
+ when 'o'
+ return OrientedTransmembraneDomain::INSIDE_OUT
+ else
+ raise Exception, "Badly parsed topology hit due to orientation character: #{substrate}"
+ end
+ end
+end
+
+
+# If being run directly instead of being require'd,
+# output one transmembrane per line, and
+# indicate that a particular protein has no transmembrane domain
+if $0 == __FILE__
+ require 'bio'
+
+ runner = TmHmmWrapper.new
+
+ Bio::FlatFile.auto(ARGF).each do |seq|
+ result = runner.calculate(seq.seq)
+ name = seq.definition
+
+ if result.has_domain?
+ # At least one TMD found. Output each on a separate line
+ result.transmembrane_domains.each do |tmd|
+ puts [
+ name,
+ result.transmembrane_type,
+ tmd.start,
+ tmd.stop,
+ tmd.orientation
+ ].join("\t")
+ end
+ else
+ puts [
+ name,
+ 'No Transmembrane Domain Found'
+ ].join("\t")
+ end
+ end
+end
View
@@ -0,0 +1 @@
+require 'lib/bio/transmembrane'
View
@@ -0,0 +1,175 @@
+# a simple class to represent a TMD
+
+require 'array_pair'
+
+module Bio
+ module Transmembrane
+ class TransmembraneProtein
+ attr_accessor :transmembrane_domains, :name
+ include Enumerable #so each, each_with_index, etc. work
+
+ def initialize
+ # default no domains to empty array not nil
+ @transmembrane_domains = []
+ end
+
+ def push(transmembrane_domain)
+ @transmembrane_domains.push transmembrane_domain
+ end
+
+ def average_length
+ @transmembrane_domains.inject(0){|sum,cur| sum+cur.length}.to_f/@transmembrane_domains.length.to_f
+ end
+
+ def minimum_length
+ @transmembrane_domains.min.length
+ end
+
+ def maximum_length
+ @transmembrane_domains.max.length
+ end
+
+ def has_domain?
+ !@transmembrane_domains.empty?
+ end
+
+ def multiple_transmembrane_domains?
+ @transmembrane_domains.length > 1
+ end
+
+ def overlaps(another_transmembrane_protein)
+ @transmembrane_domains.pairs(another_transmembrane_protein.transmembrane_domains).collect {|t1,t2|
+ t1.intersection(t2) == () ? nil : [t1,t2]
+ }.reject {|a| a.nil?}
+ end
+
+ # return the pair of transmembrane domains that overlaps the best (ie for the longest period)
+ def best_overlap(another_transmembrane_protein)
+ max = @transmembrane_domains.pairs(another_transmembrane_protein.transmembrane_domains).collect {|t1,t2|
+ [t1.overlap_length(t2), [t1,t2]]
+ }.max {|a,b| a[0] <=> b[0]}
+ max[0] == 0 ? nil : max[1]
+ end
+
+ def each
+ @transmembrane_domains.each{|t| yield t}
+ end
+ end
+
+ class OrientedTransmembraneDomainProtein<TransmembraneProtein
+ def transmembrane_type_1?
+ @transmembrane_domains and @transmembrane_domains.length == 1 and @transmembrane_domains[0].orientation == OrientedTransmembraneDomain::OUTSIDE_IN
+ end
+
+ def transmembrane_type_2?
+ @transmembrane_domains and @transmembrane_domains.length == 1 and @transmembrane_domains[0].orientation == OrientedTransmembraneDomain::INSIDE_OUT
+ end
+
+ def transmembrane_type
+ if transmembrane_type_1?
+ return 'I'
+ elsif transmembrane_type_2?
+ return 'II'
+ else
+ return 'Unknown'
+ end
+ end
+ end
+
+ class TransmembraneDomainDefinition
+ attr_accessor :start, :stop
+
+ # A new TMD. The length is stop-start+1, so start and stop are
+ # 'inclusive'
+ def initialize(start=nil, stop=nil)
+ @start = start
+ @stop = stop
+ end
+
+ def length
+ @stop-@start+1
+ end
+
+ def <=>(other)
+ length <=> other.length
+ end
+
+ def ==(other)
+ start == other.start and
+ stop == other.stop
+ end
+
+ def sequence(protein_sequence_string, nterm_offset=0, cterm_offset=0)
+ one = start+nterm_offset-1
+ one = 0 if one < 0
+ two = stop+cterm_offset-1
+ two = 0 if two < 0
+
+ protein_sequence_string[(one)..(two)]
+ end
+
+ # Return the number of amino acids that overlap with another
+ # transmembrane domain, or 0 if none are found
+ def overlap_length(another_transmembrane_domain_defintion)
+ intersection(another_transmembrane_domain_defintion).to_a.length
+ end
+
+ # Return a range representing the overlap of this transmembrane domain
+ # with another
+ #
+ # Code inspired by http://billsiggelkow.com/2008/8/29/ruby-range-intersection
+ def intersection(another_transmembrane_domain_defintion)
+ res = (@start..@stop).to_a & (another_transmembrane_domain_defintion.start..another_transmembrane_domain_defintion.stop).to_a
+ res.empty? ? nil : (res.first..res.last)
+ end
+ alias_method(:overlap, :intersection)
+ end
+
+ class ConfidencedTransmembraneDomain<TransmembraneDomainDefinition
+ attr_accessor :confidence
+
+ def <=>(other)
+ return start<=>other.start if start<=>other.start
+ return stop<=>other.start if stop<=>other.stop
+ return confidence <=> other.confidence
+ end
+
+ def ==(other)
+ start == other.start and
+ stop == other.stop and
+ confidence == other.confidence
+ end
+ end
+
+ class OrientedTransmembraneDomain<TransmembraneDomainDefinition
+ # The orientation can either be inside out (like a type II transmembrane domain protein)
+ INSIDE_OUT = 'inside_out'
+ # Or outside in, like a type I transmembrane domain protein)
+ OUTSIDE_IN = 'outside_in'
+ # or the whole protein is TMD, so orientation is unknown
+ UNKNOWN = 'unknown'
+
+ attr_accessor :orientation
+
+ def initialize(start=nil, stop=nil, orientation=nil)
+ @start = start.to_i unless start.nil?
+ @stop = stop.to_i unless stop.nil?
+ @orientation = orientation unless orientation.nil?
+ end
+ end
+
+ # A class to represent a protein with a signal peptide and a transmembrane
+ # domain
+ class SignalPeptideTransmembraneDomainProtein<OrientedTransmembraneDomainProtein
+ attr_accessor :signal_peptide
+
+ def signal?
+ !@signal_peptide.nil?
+ end
+ end
+
+ class SignalPeptide
+ attr_accessor :start, :stop
+ end
+ end
+end
@@ -0,0 +1,75 @@
+#
+# To change this template, choose Tools | Templates
+# and open the template in the editor.
+
+
+$:.unshift File.join(File.dirname(__FILE__),'..','lib')
+
+require 'test/unit'
+require 'tm_hmm_wrapper'
+require 'rubygems'
+require 'bio'
+require 'transmembrane'
+
+class TmHmmWrapperTest < Test::Unit::TestCase
+ include Transmembrane
+
+ def test_parser
+ result = TmHmmResult.create_from_short_line('PFA0635c len=555 ExpAA=0.00 First60=0.00 PredHel=0 Topology=o')
+ assert result
+ assert_equal false, result.has_domain?
+
+ # test a single TMD
+ result = TmHmmResult.create_from_short_line('PFA0685c len=324 ExpAA=20.36 First60=0.00 PredHel=1 Topology=o281-303i')
+ assert result
+ assert_equal 1, result.transmembrane_domains.length
+ assert_equal 281, result.transmembrane_domains[0].start
+ assert_equal 303, result.transmembrane_domains[0].stop
+ assert_equal OrientedTransmembraneDomain::OUTSIDE_IN,
+ result.transmembrane_domains[0].orientation
+ assert result.transmembrane_type_1?
+ assert_equal false, result.transmembrane_type_2?
+
+ # test 2 TMD
+ result = TmHmmResult.create_from_short_line('PFA0680c len=209 ExpAA=43.03 First60=0.02 PredHel=2 Topology=i137-159o164-183i')
+ assert result
+ assert_equal 2, result.transmembrane_domains.length
+ assert_equal 137, result.transmembrane_domains[0].start
+ assert_equal 159, result.transmembrane_domains[0].stop
+ assert_equal OrientedTransmembraneDomain::INSIDE_OUT,
+ result.transmembrane_domains[0].orientation
+ assert_equal 164, result.transmembrane_domains[1].start
+ assert_equal 183, result.transmembrane_domains[1].stop
+ assert_equal OrientedTransmembraneDomain::OUTSIDE_IN,
+ result.transmembrane_domains[1].orientation
+ assert_equal false, result.transmembrane_type_1?
+ assert_equal false, result.transmembrane_type_2?
+
+ # test 3 TMD
+ result = TmHmmResult.create_from_short_line('PFA0705c len=282 ExpAA=90.97 First60=22.20 PredHel=4 Topology=i22-44o185-207i212-234o259-281i')
+ assert result
+ assert_equal 4, result.transmembrane_domains.length
+ assert_equal 22, result.transmembrane_domains[0].start
+ assert_equal 44, result.transmembrane_domains[0].stop
+ assert_equal OrientedTransmembraneDomain::INSIDE_OUT,
+ result.transmembrane_domains[0].orientation
+ assert_equal 185, result.transmembrane_domains[1].start
+ assert_equal 207, result.transmembrane_domains[1].stop
+ assert_equal OrientedTransmembraneDomain::OUTSIDE_IN,
+ result.transmembrane_domains[1].orientation
+ assert_equal 259, result.transmembrane_domains[3].start
+ assert_equal 281, result.transmembrane_domains[3].stop
+ assert_equal OrientedTransmembraneDomain::OUTSIDE_IN,
+ result.transmembrane_domains[3].orientation
+ assert_equal false, result.transmembrane_type_1?
+ assert_equal false, result.transmembrane_type_2?
+ end
+
+ def test_wrapper
+ prog = TmHmmWrapper.new
+ seq = Bio::FlatFile.auto('testFiles/falciparum1.fa').next_entry
+ tmp = prog.calculate(seq.seq)
+ assert tmp
+ assert_equal false, tmp.has_domain?
+ end
+end
Oops, something went wrong.

0 comments on commit 217e22b

Please sign in to comment.