Commit
new more modular generator/parser code
wstrinz committed Oct 10, 2013
1 parent 72388f9 commit 0e3bc99
Showing 16 changed files with 382 additions and 222 deletions.
16 changes: 10 additions & 6 deletions lib/publisci.rb
@@ -30,17 +30,21 @@ def load_folder(folder)
end

load_folder('publisci/mixins')

load File.dirname(__FILE__) + '/publisci/dataset/interactive.rb'
load File.dirname(__FILE__) + '/publisci/query/query_helper.rb'
load File.dirname(__FILE__) + '/publisci/parser.rb'
load File.dirname(__FILE__) + '/publisci/post_processor.rb'
load File.dirname(__FILE__) + '/publisci/analyzer.rb'
load File.dirname(__FILE__) + '/publisci/store.rb'
load File.dirname(__FILE__) + '/publisci/dataset/interactive.rb'

load File.dirname(__FILE__) + '/publisci/dataset/data_cube.rb'
load File.dirname(__FILE__) + '/publisci/dataset/dataset_for.rb'
load File.dirname(__FILE__) + '/publisci/dataset/configuration.rb'
load File.dirname(__FILE__) + '/publisci/dataset/dataset.rb'

load_folder('publisci/parsers')
load_folder('publisci/generators')

load File.dirname(__FILE__) + '/publisci/query/query_helper.rb'
load File.dirname(__FILE__) + '/publisci/post_processor.rb'
load File.dirname(__FILE__) + '/publisci/analyzer.rb'
load File.dirname(__FILE__) + '/publisci/store.rb'
load File.dirname(__FILE__) + '/publisci/datacube_model.rb'
load File.dirname(__FILE__) + '/publisci/output.rb'
load File.dirname(__FILE__) + '/publisci/metadata/prov/element.rb'
4 changes: 2 additions & 2 deletions lib/publisci/dataset/ORM/data_cube_orm.rb
@@ -5,13 +5,13 @@ class DataCube
extend PubliSci::Dataset::DataCube
extend PubliSci::Analyzer
extend PubliSci::Query
extend PubliSci::Parser
extend PubliSci::RDFParser

include PubliSci::Dataset::DataCube
include PubliSci::Analyzer
include PubliSci::Metadata::Generator
include PubliSci::Query
include PubliSci::Parser
include PubliSci::RDFParser

attr_accessor :labels
attr_accessor :dimensions
2 changes: 1 addition & 1 deletion lib/publisci/dataset/data_cube.rb
@@ -8,7 +8,7 @@ def unindent
module PubliSci
class Dataset
module DataCube
include PubliSci::Parser
include PubliSci::RDFParser
def defaults
{
type: :dataframe,
22 changes: 22 additions & 0 deletions lib/publisci/generators/base.rb
@@ -0,0 +1,22 @@
module PubliSci
module Generators
module Base
include PubliSci::Dataset::DataCube

def write(*args)
raise "Should be overriden"
end
alias_method :generate_n3, :write

def write_to(out, string)
out.write string
end

def close_output(out)
if out.is_a? File
out.close
end
end
end
end
end
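
For context, a minimal sketch of how a concrete generator might build on Generators::Base. This is not part of the commit; the Example class and its stub output format are hypothetical, and it assumes the publisci library has already been required:

require 'publisci'

module PubliSci
  module Generators
    class Example
      extend Base

      class << self
        # Override the Base stub: serialize one record and hand it to write_to.
        def write(record, out, label, options={})
          write_to(out, "# observation #{label}: #{record.inspect}\n")
        end
      end
    end
  end
end

out = File.open('example.ttl', 'w')
PubliSci::Generators::Example.write(%w{a b c}, out, 'obs1')
PubliSci::Generators::Example.close_output(out)  # closed here because out is a File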
172 changes: 172 additions & 0 deletions lib/publisci/generators/maf.rb
@@ -0,0 +1,172 @@
module PubliSci
module Generators
class MAF
extend Base

COLUMN_NAMES = %w{ Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_File Sequencer Tumor_Sample_UUID Matched_Norm_Sample_UUID patient_id sample_id}

COMPONENT_RANGES = { "Tumor_Sample_Barcode" => "xsd:string", "Start_Position" => "xsd:int", "Center" => "xsd:string", "NCBI_Build" => "xsd:int", "Chromosome" => "xsd:int" }

TCGA_CODES =
{
"Variant_Classification" => %w{Frame_Shift_Del Frame_Shift_Ins In_Frame_Del In_Frame_Ins Missense_Mutation Nonsense_Mutation Silent Splice_Site Translation_Start_Site Nonstop_Mutation 3'UTR 3'Flank 5'UTR 5'Flank IGR1 Intron RNA Targeted_Region},
"Variant_Type" => %w{SNP DNP TNP ONP INS DEL Consolidated},
"dbSNP_Val_Status" => %w{by1000genomes by2Hit2Allele byCluster byFrequency byHapMap byOtherPop bySubmitter alternate_allele},
"Verification_Status" => %w{Verified, Unknown},
"Validation_Status" => %w{Untested Inconclusive Valid Invalid},
"Mutation_Status" => %w{None Germline Somatic LOH Post-transcriptional modification Unknown},
"Sequence_Source" => %w{WGS WGA WXS RNA-Seq miRNA-Seq Bisulfite-Seq VALIDATION Other ncRNA-Seq WCS CLONE POOLCLONE AMPLICON CLONEEND FINISHING ChIP-Seq MNase-Seq DNase-Hypersensitivity EST FL-cDNA CTS MRE-Seq MeDIP-Seq MBD-Seq Tn-Seq FAIRE-seq SELEX RIP-Seq ChIA-PET},
"Sequencer" => ["Illumina GAIIx", "Illumina HiSeq", "SOLID", "454", "ABI 3730xl", "Ion Torrent PGM", "Ion Torrent Proton", "PacBio RS", "Illumina MiSeq", "Illumina HiSeq 2500", "454 GS FLX Titanium", "AB SOLiD 4 System" ]
}

BARCODE_INDEX = COLUMN_NAMES.index('Tumor_Sample_Barcode')

class << self
def write(record, out, label, options={})

options = process_options(options)

options[:no_labels] ||= true
options[:lookup_hugo] ||= false
options[:complex_objects] ||= false
options[:ranges] ||= COMPONENT_RANGES

write_to(out, process_line(record, label, options))
end

def write_structure(input, output, options)
write_to(output, structure(options))
end

def process_options(options)
options[:dimensions] = dimensions = %w{Variant_Classification Variant_Type dbSNP_Val_Status Verification_Status Validation_Status Mutation_Status Sequence_Source Sequencer}
options[:codes] = codes = dimensions
options[:measures] = (COLUMN_NAMES - dimensions - codes)
options[:dataset_name] ||= "MAF_#{Time.now.nsec.to_s(32)}"

options
end

def process_line(entry,label,options)
entry = (entry.fill(nil,entry.length...COLUMN_NAMES.length-2) + parse_barcode(entry[BARCODE_INDEX])).flatten

entry[0] = "http://identifiers.org/hgnc.symbol/#{entry[0]}" if entry[0]

# A 0 in the entrez-id column appears to mean null
col=1
entry[col] = nil if entry[col] == '0'
entry[col] = "http://identifiers.org/ncbigene/#{entry[col]}" if entry[col]

# Only link non-novel dbSNP entries
col = COLUMN_NAMES.index('dbSNP_RS')
if entry[col] && entry[col][0..1] == "rs"
entry[col] = "http://identifiers.org/dbsnp/#{entry[col].gsub('rs','')}"
end

# optionally create typed objects using sio nodes
if options[:complex_objects]
entry = sio_values(entry)
end

data = {}
COLUMN_NAMES.each_with_index{|col,i|
data[col] = [entry[i]]
}

observations(options[:measures],options[:dimensions],options[:codes],data,[label],options[:dataset_name],options).first
end

def sio_values(entry)
entry[0] = sio_value('http://edamontology.org/data_1791',entry[0]) if entry[0]

# Link entrez genes
col=1
entry[col] = sio_value("http://identifiers.org/ncbigene",entry[col]) if entry[col]

col = COLUMN_NAMES.index('dbSNP_RS')
entry[col] = sio_value("http://identifiers.org/dbsnp", entry[col])

# test SIO attributes for chromosome
col = COLUMN_NAMES.index('Chromosome')
entry[col] = sio_value("http://purl.org/obo/owl/SO#SO_0000340",entry[col])

# More SIO attributes for alleles
%w{Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2}.each{|name|
col = COLUMN_NAMES.index(name)
entry[col] = sio_value("http://purl.org/obo/owl/SO#SO_0001023",entry[col])
}

col = COLUMN_NAMES.index("Strand")
entry[col] = sio_attribute("http://edamontology.org/data_0853",entry[col])

col = COLUMN_NAMES.index("Center")
entry[col] = sio_attribute("foaf:homepage",entry[col])

# Use FALDO positions for Start_Position and End_Position
col = COLUMN_NAMES.index("Start_Position")
entry[col] = sio_attribute("http://biohackathon.org/resource/faldo#begin", entry[col],"http://biohackathon.org/resource/faldo#Position")

col = COLUMN_NAMES.index("End_Position")
entry[col] = sio_attribute("http://biohackathon.org/resource/faldo#end", entry[col],"http://biohackathon.org/resource/faldo#Position")

entry
end

def structure(options={})

options = process_options(options)

str = prefixes(options[:dataset_name],options)
str << data_structure_definition(options[:measures],options[:dimensions],options[:codes],options[:dataset_name],options)
str << dataset(options[:dataset_name],options)
component_specifications(options[:measures], options[:dimensions], options[:codes], options[:dataset_name], options).map{ |c| str << c }
measure_properties(options[:measures],options[:dataset_name],options).map{|m| str << m}
dimension_properties(options[:dimensions],options[:codes], options[:dataset_name],options).map{|d| str << d}
code_lists(options[:codes],TCGA_CODES,options[:dataset_name],options).map{|c| str << c}
concept_codes(options[:codes],TCGA_CODES,options[:dataset_name],options).map{|c| str << c}

str
end

def post_process(file)
reg = %r{http://identifiers.org/hgnc.symbol/(\w+)}
hugo_cache ||= {}
PubliSci::PostProcessor.process(file,file,reg){|g|
hugo_cache[g] ||= official_symbol(g)
'http://identifiers.org/hgnc.symbol/' + hugo_cache[g]
}
end

def column_replace(entry,column,prefix,value=nil)
if value
entry[COLUMN_NAMES.index(column)] = prefix + value
else
entry[COLUMN_NAMES.index(column)] += prefix
end
end

def official_symbol(hugo_symbol)
qry = <<-EOF
SELECT distinct ?official where {
{?hgnc <http://bio2rdf.org/hgnc_vocabulary:approved_symbol> "#{hugo_symbol}"}
UNION
{?hgnc <http://bio2rdf.org/hgnc_vocabulary:synonym> "#{hugo_symbol}"}
?hgnc <http://bio2rdf.org/hgnc_vocabulary:approved_symbol> ?official
}
EOF

sparql = SPARQL::Client.new("http://cu.hgnc.bio2rdf.org/sparql")
sparql.query(qry).map(&:official).first.to_s
end

def parse_barcode(code)
#TCGA-E9-A22B-01A-11D-A159-09
[code[5..11], code[13..-1]]
end
end
end
end
end
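
A hedged usage sketch for the new generator (not part of the commit; the dataset name and output path are illustrative): structure builds the prefixes, data structure definition, dataset declaration, component specifications, and TCGA code lists as one Turtle string, which can then be written out directly.

require 'publisci'

# Generate only the Data Cube structure for a MAF dataset.
turtle = PubliSci::Generators::MAF.structure(dataset_name: 'maf_example')
File.write('maf_structure.ttl', turtle)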
2 changes: 1 addition & 1 deletion lib/publisci/metadata/generator.rb
@@ -7,7 +7,7 @@ def unindent
module PubliSci
class Metadata
module Generator
include PubliSci::Parser
include PubliSci::RDFParser

def defaults
{
2 changes: 1 addition & 1 deletion lib/publisci/parser.rb
@@ -1,5 +1,5 @@
module PubliSci
module Parser
module RDFParser

def is_uri?(obj)
RDF::Resource(obj).valid?
29 changes: 29 additions & 0 deletions lib/publisci/parsers/base.rb
@@ -0,0 +1,29 @@
module PubliSci
module Parsers
module Base
include Enumerable
# attr_accessor :dataset_name, :measures, :dimensions, :codes

def valid?(rec)
true
end

def enum_method
:each
end

def process_record(rec)
rec
end

def each(input)
input.send(enum_method) do |rec|
yield process_record(rec) if valid? rec
end
end
alias_method :each_rec, :each
alias_method :each_record, :each

end
end
end
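
An illustrative parser built on Parsers::Base (not part of the commit; the TSV class is hypothetical). valid?, enum_method, and process_record are the override points; Base#each then streams whatever the input's enumeration method yields.

require 'publisci'

module PubliSci
  module Parsers
    class TSV
      extend Base

      def self.valid?(line)
        line[0] != '#'           # skip comment lines
      end

      def self.enum_method
        :each_line               # iterate the input line by line
      end

      def self.process_record(line)
        line.chomp.split("\t")   # split each record into fields
      end
    end
  end
end

PubliSci::Parsers::TSV.each("a\tb\n# skipped\nc\td\n") { |fields| p fields }
# prints ["a", "b"] and then ["c", "d"]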
20 changes: 20 additions & 0 deletions lib/publisci/parsers/maf.rb
@@ -0,0 +1,20 @@
module PubliSci
module Parsers
class MAF
extend Base
COLUMN_NAMES = %w{ Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_File Sequencer Tumor_Sample_UUID Matched_Norm_Sample_UUID patient_id sample_id}

def self.valid?(line)
not (line[0] == "#" || line[0..3] == "Hugo")
end

def self.enum_method
:each_line
end

def self.process_record(rec)
::CSV.parse(rec, {col_sep: "\t"}).flatten[0..(COLUMN_NAMES.length-3)]
end
end
end
end
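
Taken together, the new parser and generator could be wired into a small streaming pipeline. A sketch only (not part of the commit): the file names and dataset name are illustrative, and it assumes a standard TCGA MAF file whose Tumor_Sample_Barcode column is populated.

require 'publisci'
require 'csv'

File.open('variants.maf') do |maf|
  File.open('variants.ttl', 'w') do |out|
    # Emit the dataset structure once, then one observation per MAF record.
    PubliSci::Generators::MAF.write_structure(maf, out, {dataset_name: 'tcga_maf'})
    index = 0
    PubliSci::Parsers::MAF.each(maf) do |record|
      PubliSci::Generators::MAF.write(record, out, "obs#{index += 1}", dataset_name: 'tcga_maf')
    end
  end
end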
2 changes: 1 addition & 1 deletion lib/publisci/readers/base.rb
@@ -2,7 +2,7 @@ module PubliSci
module Readers
class Base
include PubliSci::Query
include PubliSci::Parser
include PubliSci::RDFParser
include PubliSci::Analyzer
include PubliSci::Interactive
include PubliSci::Dataset::DataCube