Commit
new more modular generator/parser code
wstrinz committed Oct 10, 2013
1 parent 72388f9 commit 0e3bc99
Showing 16 changed files with 382 additions and 222 deletions.
16 changes: 10 additions & 6 deletions lib/publisci.rb
@@ -30,17 +30,21 @@ def load_folder(folder)
end

load_folder('publisci/mixins')

load File.dirname(__FILE__) + '/publisci/dataset/interactive.rb'
load File.dirname(__FILE__) + '/publisci/query/query_helper.rb'
load File.dirname(__FILE__) + '/publisci/parser.rb'
load File.dirname(__FILE__) + '/publisci/post_processor.rb'
load File.dirname(__FILE__) + '/publisci/analyzer.rb'
load File.dirname(__FILE__) + '/publisci/store.rb'
load File.dirname(__FILE__) + '/publisci/dataset/interactive.rb'

load File.dirname(__FILE__) + '/publisci/dataset/data_cube.rb'
load File.dirname(__FILE__) + '/publisci/dataset/dataset_for.rb'
load File.dirname(__FILE__) + '/publisci/dataset/configuration.rb'
load File.dirname(__FILE__) + '/publisci/dataset/dataset.rb'

load_folder('publisci/parsers')
load_folder('publisci/generators')

load File.dirname(__FILE__) + '/publisci/query/query_helper.rb'
load File.dirname(__FILE__) + '/publisci/post_processor.rb'
load File.dirname(__FILE__) + '/publisci/analyzer.rb'
load File.dirname(__FILE__) + '/publisci/store.rb'
load File.dirname(__FILE__) + '/publisci/datacube_model.rb'
load File.dirname(__FILE__) + '/publisci/output.rb'
load File.dirname(__FILE__) + '/publisci/metadata/prov/element.rb'
4 changes: 2 additions & 2 deletions lib/publisci/dataset/ORM/data_cube_orm.rb
@@ -5,13 +5,13 @@ class DataCube
extend PubliSci::Dataset::DataCube
extend PubliSci::Analyzer
extend PubliSci::Query
extend PubliSci::Parser
extend PubliSci::RDFParser

include PubliSci::Dataset::DataCube
include PubliSci::Analyzer
include PubliSci::Metadata::Generator
include PubliSci::Query
include PubliSci::Parser
include PubliSci::RDFParser

attr_accessor :labels
attr_accessor :dimensions
2 changes: 1 addition & 1 deletion lib/publisci/dataset/data_cube.rb
@@ -8,7 +8,7 @@ def unindent
module PubliSci
class Dataset
module DataCube
include PubliSci::Parser
include PubliSci::RDFParser
def defaults
{
type: :dataframe,
22 changes: 22 additions & 0 deletions lib/publisci/generators/base.rb
@@ -0,0 +1,22 @@
module PubliSci
module Generators
module Base
include PubliSci::Dataset::DataCube

def write(*args)
raise "Should be overriden"
end
alias_method :generate_n3, :write

def write_to(out, string)
out.write string
end

def close_output(out)
if out.is_a? File
out.close
end
end
end
end
end
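
For context, a minimal sketch of how a concrete generator might build on Generators::Base. This is not part of the commit; the Example class and its stub output format are hypothetical, and it assumes the publisci library has already been required:

require 'publisci'

module PubliSci
  module Generators
    class Example
      extend Base

      class << self
        # Override the Base stub: serialize one record and hand it to write_to.
        def write(record, out, label, options={})
          write_to(out, "# observation #{label}: #{record.inspect}\n")
        end
      end
    end
  end
end

out = File.open('example.ttl', 'w')
PubliSci::Generators::Example.write(%w{a b c}, out, 'obs1')
PubliSci::Generators::Example.close_output(out)  # closed here because out is a File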
172 changes: 172 additions & 0 deletions lib/publisci/generators/maf.rb
@@ -0,0 +1,172 @@
module PubliSci
module Generators
class MAF
extend Base

COLUMN_NAMES = %w{ Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_File Sequencer Tumor_Sample_UUID Matched_Norm_Sample_UUID patient_id sample_id}

COMPONENT_RANGES = { "Tumor_Sample_Barcode" => "xsd:string", "Start_Position" => "xsd:int", "Center" => "xsd:string", "NCBI_Build" => "xsd:int", "Chromosome" => "xsd:int" }

TCGA_CODES =
{
"Variant_Classification" => %w{Frame_Shift_Del Frame_Shift_Ins In_Frame_Del In_Frame_Ins Missense_Mutation Nonsense_Mutation Silent Splice_Site Translation_Start_Site Nonstop_Mutation 3'UTR 3'Flank 5'UTR 5'Flank IGR1 Intron RNA Targeted_Region},
"Variant_Type" => %w{SNP DNP TNP ONP INS DEL Consolidated},
"dbSNP_Val_Status" => %w{by1000genomes by2Hit2Allele byCluster byFrequency byHapMap byOtherPop bySubmitter alternate_allele},
"Verification_Status" => %w{Verified, Unknown},
"Validation_Status" => %w{Untested Inconclusive Valid Invalid},
"Mutation_Status" => %w{None Germline Somatic LOH Post-transcriptional modification Unknown},
"Sequence_Source" => %w{WGS WGA WXS RNA-Seq miRNA-Seq Bisulfite-Seq VALIDATION Other ncRNA-Seq WCS CLONE POOLCLONE AMPLICON CLONEEND FINISHING ChIP-Seq MNase-Seq DNase-Hypersensitivity EST FL-cDNA CTS MRE-Seq MeDIP-Seq MBD-Seq Tn-Seq FAIRE-seq SELEX RIP-Seq ChIA-PET},
"Sequencer" => ["Illumina GAIIx", "Illumina HiSeq", "SOLID", "454", "ABI 3730xl", "Ion Torrent PGM", "Ion Torrent Proton", "PacBio RS", "Illumina MiSeq", "Illumina HiSeq 2500", "454 GS FLX Titanium", "AB SOLiD 4 System" ]
}

BARCODE_INDEX = COLUMN_NAMES.index('Tumor_Sample_Barcode')

class << self
def write(record, out, label, options={})

options = process_options(options)

options[:no_labels] ||= true
options[:lookup_hugo] ||= false
options[:complex_objects] ||= false
options[:ranges] ||= COMPONENT_RANGES

write_to(out, process_line(record, label, options))
end

def write_structure(input, output, options)
write_to(output, structure(options))
end

def process_options(options)
options[:dimensions] = dimensions = %w{Variant_Classification Variant_Type dbSNP_Val_Status Verification_Status Validation_Status Mutation_Status Sequence_Source Sequencer}
options[:codes] = codes = dimensions
options[:measures] = (COLUMN_NAMES - dimensions - codes)
options[:dataset_name] ||= "MAF_#{Time.now.nsec.to_s(32)}"

options
end

def process_line(entry,label,options)
entry = (entry.fill(nil,entry.length...COLUMN_NAMES.length-2) + parse_barcode(entry[BARCODE_INDEX])).flatten

entry[0] = "http://identifiers.org/hgnc.symbol/#{entry[0]}" if entry[0]

# A 0 in the entrez-id column appears to mean null
col=1
entry[col] = nil if entry[col] == '0'
entry[col] = "http://identifiers.org/ncbigene/#{entry[col]}" if entry[col]

# Only link non-novel dbSNP entries
col = COLUMN_NAMES.index('dbSNP_RS')
if entry[col] && entry[col][0..1] == "rs"
entry[col] = "http://identifiers.org/dbsnp/#{entry[col].gsub('rs','')}"
end

# optionally create typed objects using sio nodes
if options[:complex_objects]
entry = sio_values(entry)
end

data = {}
COLUMN_NAMES.each_with_index{|col,i|
data[col] = [entry[i]]
}

observations(options[:measures],options[:dimensions],options[:codes],data,[label],options[:dataset_name],options).first
end

def sio_values(entry)
entry[0] = sio_value('http://edamontology.org/data_1791',entry[0]) if entry[0]

# Link entrez genes
col=1
entry[col] = sio_value("http://identifiers.org/ncbigene",entry[col]) if entry[col]

col = COLUMN_NAMES.index('dbSNP_RS')
entry[col] = sio_value("http://identifiers.org/dbsnp", entry[col])

# test SIO attributes for chromosome
col = COLUMN_NAMES.index('Chromosome')
entry[col] = sio_value("http://purl.org/obo/owl/SO#SO_0000340",entry[col])

# More SIO attributes for alleles
%w{Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2}.each{|name|
col = COLUMN_NAMES.index(name)
entry[col] = sio_value("http://purl.org/obo/owl/SO#SO_0001023",entry[col])
}

col = COLUMN_NAMES.index("Strand")
entry[col] = sio_attribute("http://edamontology.org/data_0853",entry[col])

col = COLUMN_NAMES.index("Center")
entry[col] = sio_attribute("foaf:homepage",entry[col])

# Use FALDO positions for Start_Position and End_Position
col = COLUMN_NAMES.index("Start_Position")
entry[col] = sio_attribute("http://biohackathon.org/resource/faldo#begin", entry[col],"http://biohackathon.org/resource/faldo#Position")

col = COLUMN_NAMES.index("End_Position")
entry[col] = sio_attribute("http://biohackathon.org/resource/faldo#end", entry[col],"http://biohackathon.org/resource/faldo#Position")

entry
end

def structure(options={})

options = process_options(options)

str = prefixes(options[:dataset_name],options)
str << data_structure_definition(options[:measures],options[:dimensions],options[:codes],options[:dataset_name],options)
str << dataset(options[:dataset_name],options)
component_specifications(options[:measures], options[:dimensions], options[:codes], options[:dataset_name], options).map{ |c| str << c }
measure_properties(options[:measures],options[:dataset_name],options).map{|m| str << m}
dimension_properties(options[:dimensions],options[:codes], options[:dataset_name],options).map{|d| str << d}
code_lists(options[:codes],TCGA_CODES,options[:dataset_name],options).map{|c| str << c}
concept_codes(options[:codes],TCGA_CODES,options[:dataset_name],options).map{|c| str << c}

str
end

def post_process(file)
reg = %r{http://identifiers.org/hgnc.symbol/(\w+)}
hugo_cache ||= {}
PubliSci::PostProcessor.process(file,file,reg){|g|
hugo_cache[g] ||= official_symbol(g)
'http://identifiers.org/hgnc.symbol/' + hugo_cache[g]
}
end

def column_replace(entry,column,prefix,value=nil)
if value
entry[COLUMN_NAMES.index(column)] = prefix + value
else
entry[COLUMN_NAMES.index(column)] += prefix
end
end

def official_symbol(hugo_symbol)
qry = <<-EOF
SELECT distinct ?official where {
{?hgnc <http://bio2rdf.org/hgnc_vocabulary:approved_symbol> "#{hugo_symbol}"}
UNION
{?hgnc <http://bio2rdf.org/hgnc_vocabulary:synonym> "#{hugo_symbol}"}
?hgnc <http://bio2rdf.org/hgnc_vocabulary:approved_symbol> ?official
}
EOF

sparql = SPARQL::Client.new("http://cu.hgnc.bio2rdf.org/sparql")
sparql.query(qry).map(&:official).first.to_s
end

def parse_barcode(code)
#TCGA-E9-A22B-01A-11D-A159-09
[code[5..11], code[13..-1]]
end
end
end
end
end
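
A hedged usage sketch for the new generator (not part of the commit; the dataset name and output path are illustrative): structure builds the prefixes, data structure definition, dataset declaration, component specifications, and TCGA code lists as one Turtle string, which can then be written out directly.

require 'publisci'

# Generate only the Data Cube structure for a MAF dataset.
turtle = PubliSci::Generators::MAF.structure(dataset_name: 'maf_example')
File.write('maf_structure.ttl', turtle)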
2 changes: 1 addition & 1 deletion lib/publisci/metadata/generator.rb
@@ -7,7 +7,7 @@ def unindent
module PubliSci
class Metadata
module Generator
include PubliSci::Parser
include PubliSci::RDFParser

def defaults
{
2 changes: 1 addition & 1 deletion lib/publisci/parser.rb
@@ -1,5 +1,5 @@
module PubliSci
module Parser
module RDFParser

def is_uri?(obj)
RDF::Resource(obj).valid?
29 changes: 29 additions & 0 deletions lib/publisci/parsers/base.rb
@@ -0,0 +1,29 @@
module PubliSci
module Parsers
module Base
include Enumerable
# attr_accessor :dataset_name, :measures, :dimensions, :codes

def valid?(rec)
true
end

def enum_method
:each
end

def process_record(rec)
rec
end

def each(input)
input.send(enum_method) do |rec|
yield process_record(rec) if valid? rec
end
end
alias_method :each_rec, :each
alias_method :each_record, :each

end
end
end
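
An illustrative parser built on Parsers::Base (not part of the commit; the TSV class is hypothetical). valid?, enum_method, and process_record are the override points; Base#each then streams whatever the input's enumeration method yields.

require 'publisci'

module PubliSci
  module Parsers
    class TSV
      extend Base

      def self.valid?(line)
        line[0] != '#'           # skip comment lines
      end

      def self.enum_method
        :each_line               # iterate the input line by line
      end

      def self.process_record(line)
        line.chomp.split("\t")   # split each record into fields
      end
    end
  end
end

PubliSci::Parsers::TSV.each("a\tb\n# skipped\nc\td\n") { |fields| p fields }
# prints ["a", "b"] and then ["c", "d"]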
20 changes: 20 additions & 0 deletions lib/publisci/parsers/maf.rb
@@ -0,0 +1,20 @@
module PubliSci
module Parsers
class MAF
extend Base
COLUMN_NAMES = %w{ Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_File Sequencer Tumor_Sample_UUID Matched_Norm_Sample_UUID patient_id sample_id}

def self.valid?(line)
not (line[0] == "#" || line[0..3] == "Hugo")
end

def self.enum_method
:each_line
end

def self.process_record(rec)
::CSV.parse(rec, {col_sep: "\t"}).flatten[0..(COLUMN_NAMES.length-3)]
end
end
end
end
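
Taken together, the new parser and generator could be wired into a small streaming pipeline. A sketch only (not part of the commit): the file names and dataset name are illustrative, and it assumes a standard TCGA MAF file whose Tumor_Sample_Barcode column is populated.

require 'publisci'
require 'csv'

File.open('variants.maf') do |maf|
  File.open('variants.ttl', 'w') do |out|
    # Emit the dataset structure once, then one observation per MAF record.
    PubliSci::Generators::MAF.write_structure(maf, out, {dataset_name: 'tcga_maf'})
    index = 0
    PubliSci::Parsers::MAF.each(maf) do |record|
      PubliSci::Generators::MAF.write(record, out, "obs#{index += 1}", dataset_name: 'tcga_maf')
    end
  end
end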
2 changes: 1 addition & 1 deletion lib/publisci/readers/base.rb
@@ -2,7 +2,7 @@ module PubliSci
module Readers
class Base
include PubliSci::Query
include PubliSci::Parser
include PubliSci::RDFParser
include PubliSci::Analyzer
include PubliSci::Interactive
include PubliSci::Dataset::DataCube