Permalink
Browse files

first tested code that parses ipcress output

  • Loading branch information...
1 parent 846ac3a commit 6513b8cc59acca6a925292dcdda09732a06081bd Ben J Woodcroft committed May 28, 2012
View
@@ -2,6 +2,7 @@ source "http://rubygems.org"
# Add dependencies required to use your gem here.
# Example:
# gem "activesupport", ">= 2.3.5"
+gem 'bio', '>=1.4.2'
# Add dependencies to develop your gem here.
# Include everything needed to run rake, tests, features, etc.
View
@@ -1,12 +1,3 @@
-# Please require your code below, respecting the naming conventions in the
-# bioruby directory tree.
-#
-# For example, say you have a plugin named bio-plugin, the only uncommented
-# line in this file would be
-#
-# require 'bio/bio-plugin/plugin'
-#
-# In this file only require other files. Avoid other source code.
-
-require 'bio-ipcress/ipcress.rb'
+require 'bio'
+require 'bio/appl/ipcress.rb'
@@ -1,3 +0,0 @@
-module BioIpcress
-
-end
View
@@ -0,0 +1,117 @@
+module Bio
+ class Ipcress
+ # A full Ipcress result looks something like this. Parse it into an array of Ipcress::Result objects
+ #
+ # ** Message: Loaded [1] experiments
+ #
+ # Ipcress result
+ # --------------
+ # Experiment: AE12_pmid21856836_16S
+ # Primers: A B
+ # Target: gi|335929284|gb|JN048683.1|:filter(unmasked) Methanocella conradii HZ254 16S ribosomal RNA gene, partial sequence
+ # Matches: 19/20 14/15
+ # Product: 502 bp (range 2-10000)
+ # Result type: forward
+ #
+ # ...AAACTTAAAGGAATTGGCGG......................... # forward
+ # ||||| ||| |||||| |||-->
+ # 5'-AAACTYAAAKGAATTGRCGG-3' 3'-CRTGTGTGGCGGGCA-5' # primers
+ # <--| |||||||||||||
+ # ..............................CGTGTGTGGCGGGCA... # revcomp
+ # --
+ # ipcress: gi|335929284|gb|JN048683.1|:filter(unmasked) AE12_pmid21856836_16S 502 A 826 1 B 1313 1 forward
+ # -- completed ipcress analysis
+ def self.parse(ipcress_output_string)
+ results = Results.new
+
+ ipcress_output_string.split(/\nIpcress result\n--------------\n/m).each_with_index do |result_chunk, index|
+ next if index == 0 # ignore the first chunk since that isn't a result
+
+ lines = result_chunk.split("\n").collect{|l| l.strip}
+
+ result = Result.new
+ i=0
+ result.experiment_name = lines[i].match(/^Experiment: (.+)$/)[1]; i+=1
+ result.primers = lines[i].match(/^Primers: (.+)$/)[1]; i+=1
+ result.target = lines[i].match(/^Target: (.+)$/)[1]; i+=1
+ result.matches = lines[i].match(/^Matches: (.+)$/)[1]; i+=1
+ result.product = lines[i].match(/^Product: (.+)$/)[1]; i+=1
+ result.result_type = lines[i].match(/^Result type: (.+)$/)[1]
+
+ i+= 2
+ result.forward_matching_sequence = lines[i].match(/^\.\.\.(\w+)\.+ \# forward$/)[1]
+ i+= 2
+ matching = lines[i].match(/^5'\-(\w+)-3' 3'\-(\w+)-5' \# primers$/)
+ result.forward_primer_sequence = matching[1]
+ result.reverse_primer_sequence = matching[2]
+ i+= 2
+ result.reverse_matching_sequence = lines[i].match(/^\.+(\w+)\.\.\. \# revcomp$/)[1]
+
+ i+= 2
+ matching = lines[i].match(/^ipcress: (\S+) (\S+) (\d+) [AB] (\d+) (\d+) [AB] (\d+) (\d+) (\S+)$/)
+ result.length = matching[3].to_i
+ result.start = matching[4].to_i
+ result.forward_mismatches = matching[5].to_i
+ result.reverse_mismatches = matching[7].to_i
+
+ results.push result
+ end
+ return results
+ end
+
+ # A collection of Ipcress Result objects for a given run
+ class Results < Array
+ end
+
+ # Ipcress single result (single primer pair match)
+ #
+ # Attributes of this class should be largely obvious by inspecting the Ipcress output
+ class Result
+ attr_accessor :experiment_name
+
+ attr_accessor :primers, :target, :matches, :product, :result_type
+
+ # A String representing the matching part of the sequence
+ attr_accessor :forward_matching_sequence
+
+ # A String representing the matching primer
+ attr_accessor :forward_primer_sequence
+
+ # A String representing the matching primer
+ attr_accessor :reverse_primer_sequence
+
+ # A String representing the matching part of the sequence
+ attr_accessor :reverse_matching_sequence
+
+ attr_accessor :length, :forward_mismatches, :start, :reverse_mismatches
+
+ # When there are wobbles in the primers, Ipcress always reports at
+ # least 1 mismatch (1 for all matching wobbles plus regular mismatches).
+ #
+ # This method recalculates the mismatches by re-aligning the primers
+ # against the sequences that they hit in this Result.
+ #
+ # Returns an array of 2 values corresponding to the number of mismatches
+ # in the forward and revcomp primers, respectively.
+ #
+ # Assumes that there is only wobbles in the primers, not the sequence
+ # (Does ipcress itself assume this?)
+ def recalculate_mismatches_from_alignments
+ calculate_mismatches = lambda do |seq, primer|
+ mismatches = 0
+ (0..(seq.length-1)).each do |position|
+ regex = Bio::Sequence::NA.new(primer[position].downcase).to_re
+ seqp = seq[position].downcase
+ mismatches += 1 unless regex.match(seqp)
+ end
+ mismatches
+ end
+
+ return [
+ calculate_mismatches.call(@forward_matching_sequence, @forward_primer_sequence),
+ calculate_mismatches.call(@reverse_matching_sequence, @reverse_primer_sequence)
+ ]
+ end
+ end
+ end
+end
@@ -0,0 +1,22 @@
+>gi|335929284|gb|JN048683.1| Methanocella conradii HZ254 16S ribosomal RNA gene, partial sequence
+GGTCACTGCTATCGGTGTCCGATTAAGCCATGCGAGTCGTGAGGGGTGAGCCCTCGGCGTACTGCTCAGT
+AACACGTGGACAATCTGCCCAAAAGTCCGGGATAACCCCGGGAAACTGGGGATAATACCGGATAGGCCAC
+CAATTGCTGGAATGGCCTGGTGGTTGAAACGAGAGGCTTTTGGATGGGTCTGCGGCGGATTAGGTTGACG
+CCGGTGTAACGTACCGGCGTGCCTGTAATCCGTACGGGTTGTGGGAGCAAGAGCCCGGAGATGGATTCTG
+AGACACGAATCCAGGCCCTACGGGGCGCAGCAGGCGCGAAAACTCTACAATGCAGGCAATCTGCGATAGG
+GGGACATCGAGTGGCATCTTCTTAAGGTGCCTGTCCAACCGTCTAAAAAACGGTTGTTAGCAAGGGCCGG
+GTAAGACCGGTGCCAGCCGCCGCGGTAATACCGGCGGCTCGAGTGGTGGCCGATATTATTGAGTCTAAAG
+GGTCCGTAGCCGGCTTTGCAAGTCCCCTGGGAAATCCAGCGGCTTAACCGTTGGGCGCCCATGGGATACT
+ACATTGCTTGGGACTGGGAGAGGCGAGAGGTACTCGAGGGGTAGGGGTGAAATCCTGTAATCCTTCGGGG
+ACCACCGGTGGCGAAGGCGTCTCGCCAGAACAGGTCCGACGGTGAGGGACGAAAGCTAGGGGCACGAACC
+GGATTAGATACCCGGGTAGTCCTAGCCGTAAACGATGCCCGCTAGGTGTCACGATAATCGTGAATTATCG
+TGGTGCCGTAGGGAAGCCGCGAAGCGGGCCACTTGGGAAGTACGACCGCAAGGTTGAAACTTAAAGGAAT
+TGGCGGGGGAGCACCACAACGGGTGGAGCCTGCGGTTTAATTGGATTCAACGCCGGGAAGCTTACCGGGA
+TCGACAGTTGAATGAAGGCCAGGCCGAAGACCTTGCCGGACTAGCTGAGAGGAGGTGCATGGCCGTCGTC
+AGTTCGTACCGTGAGGCGTCCTGTTAAGTCAGGCAACGAGCGAGACCCATGTCCACTGTTGCTAACGCGT
+CCGCGAGGACGGCGAGTACACTGTGGAGACTGCTGGCGCTAAGTCAGAGGAAGGGTTGGTCGACGGTAGG
+TCAGTATGCCCCGAATATCCCGGGCTACACGCGGGCTACAATGGACAGGACAATGGGTAACAACACCGAG
+AGGTGAAGTTAATCTCTTAAACCTGTCCCTAGTTCGGATTGAGGGCTGAAACCCGCCCTCATGAAGATGG
+AATCCGTAGTAATCGCATTTCAAAACAGTGCGGTGAATACGTCCCTGCTCCTTGCACACACCGCCCGTCA
+AACCACCCGAGCGGGGCGTGAATAAGGACTCTTTTCCTTGAAGAGCTCGAATTCACGTTCTGTAAGGGGG
+GTTAAGTCGTAACAAGGTAGCC
@@ -0,0 +1,43 @@
+>gi|335929284|gb|JN048683.1| Methanocella conradii HZ254 16S ribosomal RNA gene, partial sequence
+GGTCACTGCTATCGGTGTCCGATTAAGCCATGCGAGTCGTGAGGGGTGAGCCCTCGGCGTACTGCTCAGT
+AACACGTGGACAATCTGCCCAAAAGTCCGGGATAACCCCGGGAAACTGGGGATAATACCGGATAGGCCAC
+CAATTGCTGGAATGGCCTGGTGGTTGAAACGAGAGGCTTTTGGATGGGTCTGCGGCGGATTAGGTTGACG
+CCGGTGTAACGTACCGGCGTGCCTGTAATCCGTACGGGTTGTGGGAGCAAGAGCCCGGAGATGGATTCTG
+AGACACGAATCCAGGCCCTACGGGGCGCAGCAGGCGCGAAAACTCTACAATGCAGGCAATCTGCGATAGG
+GGGACATCGAGTGGCATCTTCTTAAGGTGCCTGTCCAACCGTCTAAAAAACGGTTGTTAGCAAGGGCCGG
+GTAAGACCGGTGCCAGCCGCCGCGGTAATACCGGCGGCTCGAGTGGTGGCCGATATTATTGAGTCTAAAG
+GGTCCGTAGCCGGCTTTGCAAGTCCCCTGGGAAATCCAGCGGCTTAACCGTTGGGCGCCCATGGGATACT
+ACATTGCTTGGGACTGGGAGAGGCGAGAGGTACTCGAGGGGTAGGGGTGAAATCCTGTAATCCTTCGGGG
+ACCACCGGTGGCGAAGGCGTCTCGCCAGAACAGGTCCGACGGTGAGGGACGAAAGCTAGGGGCACGAACC
+GGATTAGATACCCGGGTAGTCCTAGCCGTAAACGATGCCCGCTAGGTGTCACGATAATCGTGAATTATCG
+TGGTGCCGTAGGGAAGCCGCGAAGCGGGCCACTTGGGAAGTACGACCGCAAGGTTGAAACTTAAAGGAAT
+TGGCGGGGGAGCACCACAACGGGTGGAGCCTGCGGTTTAATTGGATTCAACGCCGGGAAGCTTACCGGGA
+TCGACAGTTGAATGAAGGCCAGGCCGAAGACCTTGCCGGACTAGCTGAGAGGAGGTGCATGGCCGTCGTC
+AGTTCGTACCGTGAGGCGTCCTGTTAAGTCAGGCAACGAGCGAGACCCATGTCCACTGTTGCTAACGCGT
+CCGCGAGGACGGCGAGTACACTGTGGAGACTGCTGGCGCTAAGTCAGAGGAAGGGTTGGTCGACGGTAGG
+TCAGTATGCCCCGAATATCCCGGGCTACACGCGGGCTACAATGGACAGGACAATGGGTAACAACACCGAG
+AGGTGAAGTTAATCTCTTAAACCTGTCCCTAGTTCGGATTGAGGGCTGAAACCCGCCCTCATGAAGATGG
+AATCCGTAGTAATCGCATTTCAAAACAGTGCGGTGAATACGTCCCTGCTCCTTGCACACACCGCCCGTCA
+AACCACCCGAGCGGGGCGTGAATAAGGACTCTTTTCCTTGAAGAGCTCGAATTCACGTTCTGTAAGGGGG
+GTTAAGTCGTAACAAGGTAGCC
+GGTCACTGCTATCGGTGTCCGATTAAGCCATGCGAGTCGTGAGGGGTGAGCCCTCGGCGTACTGCTCAGT
+AACACGTGGACAATCTGCCCAAAAGTCCGGGATAACCCCGGGAAACTGGGGATAATACCGGATAGGCCAC
+CAATTGCTGGAATGGCCTGGTGGTTGAAACGAGAGGCTTTTGGATGGGTCTGCGGCGGATTAGGTTGACG
+CCGGTGTAACGTACCGGCGTGCCTGTAATCCGTACGGGTTGTGGGAGCAAGAGCCCGGAGATGGATTCTG
+AGACACGAATCCAGGCCCTACGGGGCGCAGCAGGCGCGAAAACTCTACAATGCAGGCAATCTGCGATAGG
+GGGACATCGAGTGGCATCTTCTTAAGGTGCCTGTCCAACCGTCTAAAAAACGGTTGTTAGCAAGGGCCGG
+GTAAGACCGGTGCCAGCCGCCGCGGTAATACCGGCGGCTCGAGTGGTGGCCGATATTATTGAGTCTAAAG
+GGTCCGTAGCCGGCTTTGCAAGTCCCCTGGGAAATCCAGCGGCTTAACCGTTGGGCGCCCATGGGATACT
+ACATTGCTTGGGACTGGGAGAGGCGAGAGGTACTCGAGGGGTAGGGGTGAAATCCTGTAATCCTTCGGGG
+ACCACCGGTGGCGAAGGCGTCTCGCCAGAACAGGTCCGACGGTGAGGGACGAAAGCTAGGGGCACGAACC
+GGATTAGATACCCGGGTAGTCCTAGCCGTAAACGATGCCCGCTAGGTGTCACGATAATCGTGAATTATCG
+TGGTGCCGTAGGGAAGCCGCGAAGCGGGCCACTTGGGAAGTACGACCGCAAGGTTGAAACTTAAAGGAAT
+TGGCGGGGGAGCACCACAACGGGTGGAGCCTGCGGTTTAATTGGATTCAACGCCGGGAAGCTTACCGGGA
+TCGACAGTTGAATGAAGGCCAGGCCGAAGACCTTGCCGGACTAGCTGAGAGGAGGTGCATGGCCGTCGTC
+AGTTCGTACCGTGAGGCGTCCTGTTAAGTCAGGCAACGAGCGAGACCCATGTCCACTGTTGCTAACGCGT
+CCGCGAGGACGGCGAGTACACTGTGGAGACTGCTGGCGCTAAGTCAGAGGAAGGGTTGGTCGACGGTAGG
+TCAGTATGCCCCGAATATCCCGGGCTACACGCGGGCTACAATGGACAGGACAATGGGTAACAACACCGAG
+AGGTGAAGTTAATCTCTTAAACCTGTCCCTAGTTCGGATTGAGGGCTGAAACCCGCCCTCATGAAGATGG
+AATCCGTAGTAATCGCATTTCAAAACAGTGCGGTGAATACGTCCCTGCTCCTTGCACACACCGCCCGTCA
+AACCACCCGAGCGGGGCGTGAATAAGGACTCTTTTCCTTGAAGAGCTCGAATTCACGTTCTGTAAGGGGG
+GTTAAGTCGTAACAAGGTAGCC
@@ -0,0 +1 @@
+AE12_pmid21856836_16S AAACTYAAAKGAATTGRCGG ACGGGCGGTGTGTRC 2 10000
@@ -0,0 +1 @@
+AE12_pmid21856836_16S AAACTYAAAAKGAATTGRCGG ACGGGCGGTGTGTRC 2 10000
@@ -0,0 +1,2 @@
+AE12_pmid21856836_16S_1 AAACTYAAAKGAATTGRCGG ACGGGCGGTGTGTRC 2 10000
+AE12_pmid21856836_16S_mismatch AATCTYAAAKGAATTGRCGG ACGGGCGGTGTGTRC 2 10000
View
@@ -1,7 +1,125 @@
require 'helper'
+require 'stringio'
class TestBioIpcress < Test::Unit::TestCase
- should "probably rename this file and start testing for real" do
- flunk "hey buddy, you should probably rename this file and start testing for real"
+ should "test 1 result" do
+ ipcress = "
+Ipcress result
+--------------
+ Experiment: AE12_pmid21856836_16S
+ Primers: A B
+ Target: gi|335929284|gb|JN048683.1|:filter(unmasked) Methanocella conradii HZ254 16S ribosomal RNA gene, partial sequence
+ Matches: 19/20 14/15
+ Product: 502 bp (range 2-10000)
+Result type: forward
+
+...AAACTTAAAGGAATTGGCGG......................... # forward
+ ||||| ||| |||||| |||-->
+5'-AAACTYAAAKGAATTGRCGG-3' 3'-CRTGTGTGGCGGGCA-5' # primers
+ <--| |||||||||||||
+..............................CGTGTGTGGCGGGCA... # revcomp
+--
+ipcress: gi|335929284|gb|JN048683.1|:filter(unmasked) AE12_pmid21856836_16S 502 A 826 1 B 1313 1 forward
+-- completed ipcress analysis"
+ results = Bio::Ipcress.parse(ipcress)
+ assert_equal 1, results.length
+ res = results[0]
+ assert_equal 'AE12_pmid21856836_16S', res.experiment_name
+ assert_equal 'A B', res.primers
+ assert_equal 'gi|335929284|gb|JN048683.1|:filter(unmasked) Methanocella conradii HZ254 16S ribosomal RNA gene, partial sequence',
+ res.target
+ assert_equal '19/20 14/15', res.matches
+ assert_equal '502 bp (range 2-10000)', res.product
+ assert_equal 'forward', res.result_type
+ assert_equal 'AAACTTAAAGGAATTGGCGG', res.forward_matching_sequence
+ assert_equal 'AAACTYAAAKGAATTGRCGG', res.forward_primer_sequence
+ assert_equal 'CRTGTGTGGCGGGCA', res.reverse_primer_sequence
+ assert_equal 'CGTGTGTGGCGGGCA', res.reverse_matching_sequence
+ assert_equal 502, res.length
+ assert_equal 826, res.start
+ assert_equal 1, res.forward_mismatches
+ assert_equal 1, res.reverse_mismatches
+ end
+
+ should "test multi-experiment results" do
+ ipcress = "
+Ipcress result
+--------------
+ Experiment: AE12_pmid21856836_16S
+ Primers: B B
+ Target: gi|335929284|gb|JN048683.1|:filter(unmasked) Methanocella conradii HZ254 16S ribosomal RNA gene, partial sequence
+ Matches: 10/15 14/15
+ Product: 1085 bp (range 2-10000)
+Result type: single_B
+
+...ACGGGTTGTGGGAGC......................... # forward
+ ||||| ||| | |-->
+5'-ACGGGCGGTGTGTRC-3' 3'-CRTGTGTGGCGGGCA-5' # primers
+ <--| |||||||||||||
+.........................CGTGTGTGGCGGGCA... # revcomp
+--
+ipcress: gi|335929284|gb|JN048683.1|:filter(unmasked) AE12_pmid21856836_16S 1085 B 243 5 B 1313 1 single_B
+
+Ipcress result
+--------------
+ Experiment: AE12_pmid21856836_16S
+ Primers: A B
+ Target: gi|335929284|gb|JN048683.1|:filter(unmasked) Methanocella conradii HZ254 16S ribosomal RNA gene, partial sequence
+ Matches: 16/21 14/15
+ Product: 503 bp (range 2-10000)
+Result type: forward
+
+...GAAACTTAAAGGAATTGGCGG......................... # forward
+ || ||| |||||| |||-->
+5'-AAACTYAAAAKGAATTGRCGG-3' 3'-CRTGTGTGGCGGGCA-5' # primers
+ <--| |||||||||||||
+...............................CGTGTGTGGCGGGCA... # revcomp
+--
+ipcress: gi|335929284|gb|JN048683.1|:filter(unmasked) AE12_pmid21856836_16S 503 A 825 5 B 1313 1 forward
+
+Ipcress result
+--------------
+ Experiment: AE12_pmid21856836_16S
+ Primers: B B
+ Target: gi|335929284|gb|JN048683.1|:filter(unmasked) Methanocella conradii HZ254 16S ribosomal RNA gene, partial sequence
+ Matches: 10/15 14/15
+ Product: 470 bp (range 2-10000)
+Result type: single_B
+
+...ACGGGTGGAGCCTGC......................... # forward
+ ||||| || | | |-->
+5'-ACGGGCGGTGTGTRC-3' 3'-CRTGTGTGGCGGGCA-5' # primers
+ <--| |||||||||||||
+.........................CGTGTGTGGCGGGCA... # revcomp
+--
+ipcress: gi|335929284|gb|JN048683.1|:filter(unmasked) AE12_pmid21856836_16S 470 B 858 5 B 1313 1 single_B
+-- completed ipcress analysis
+"
+ results = Bio::Ipcress.parse(ipcress)
+ assert_equal 3, results.length
+ assert_equal 5, results[0].forward_mismatches
+ end
+
+ should "test recalculation of matches" do
+ res = Bio::Ipcress::Result.new
+
+ #...AAACTTAAAGGAATTGGCGG......................... # forward
+ # ||||| ||| |||||| |||-->
+ #5'-AAACTYAAAKGAATTGRCGG-3' 3'-CRTGTGTGGCGGGCA-5' # primers
+ # <--| |||||||||||||
+ #..............................CGTGTGTGGCGGGCA... # revcomp
+ res.forward_matching_sequence = 'AAACTTAAAGGAATTGGCGG'
+ res.forward_primer_sequence = 'AAACTYAAAKGAATTGRCGG'
+ res.reverse_matching_sequence = 'CGTGTGTGGCGGGCA'
+ res.reverse_primer_sequence = 'CRTGTGTGGCGGGCA'
+ assert_equal [0,0], res.recalculate_mismatches_from_alignments
+
+ res.reverse_primer_sequence = 'CRTGTGTGGCGGGCT'
+ assert_equal [0,1], res.recalculate_mismatches_from_alignments
+
+
+ res.forward_primer_sequence = 'AAACTRAAAKGAATTGRCGG'
+ res.reverse_primer_sequence = 'CRTGTGTGGCGGGCA'
+ assert_equal [1,0], res.recalculate_mismatches_from_alignments, "mismatching wobble R"
end
end

0 comments on commit 6513b8c

Please sign in to comment.