Commit
Merge pull request #734 from tadast/tt/lazy-report
Refactor BLAST::Report to lazily generate reports
yannickwurm committed Mar 21, 2024
2 parents 4e9721a + 389ae3a commit 0f9acaa
Showing 10 changed files with 17,047 additions and 87 deletions.
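
The gist of the refactor: the old Report parsed the BLAST XML/TSV output eagerly in a generate step and stuffed the results into instance variables; the new Report exposes each attribute as a reader that memoizes its value with ||=, so the expensive parse runs at most once and only if something actually asks for it. A minimal, self-contained sketch of that pattern follows; the class, method, and data names are illustrative only, not SequenceServer's actual API:

    # Sketch of lazy memoization via ||= (illustrative names, not SequenceServer's API).
    class LazyReport
      def initialize(raw)
        @raw = raw                 # cheap: nothing is parsed yet
      end

      # Each reader computes its value on first access and caches it.
      def stats
        @stats ||= parsed.fetch(:stats)
      end

      def queries
        @queries ||= parsed.fetch(:queries)
      end

      private

      # The expensive step runs at most once, and only if a reader is called.
      def parsed
        @parsed ||= begin
          puts 'parsing...'        # visible side effect to show when work happens
          { stats: { hits: 2 }, queries: %w[q1 q2] }
        end
      end
    end

    report = LazyReport.new('raw BLAST output')   # no parsing here
    report.queries                                # parses now, prints "parsing..."
    report.stats                                  # reuses the cached parse
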
160 changes: 74 additions & 86 deletions lib/sequenceserver/blast/report.rb
@@ -24,20 +24,11 @@ module BLAST
class Report < Report
def initialize(job)
super do
@queries = []
@querydb = job.databases
end
end

# Attributes parsed out from BLAST output.
attr_reader :program, :program_version, :stats, :queries

# Attributes parsed from job metadata and BLAST output.
attr_reader :querydb, :dbtype, :params

def to_json(*_args)
generate

%i[querydb program program_version params stats
queries].inject({}) do |h, k|
h[k] = send(k)
@@ -53,83 +44,89 @@ def to_json(*_args)
def xml_file_size
return File.size(job.imported_xml_file) if job.imported_xml_file

generate

xml_formatter.size
end

# Generate report.
def generate
return self if @_generated

job.raise!
xml_ir = nil
tsv_ir = nil
if job.imported_xml_file
xml_ir = parse_xml File.read(job.imported_xml_file)
tsv_ir = Hash.new do |h1, k1|
h1[k1] = Hash.new do |h2, k2|
h2[k2] = ['', '', []]
end
end
else
xml_ir = parse_xml(xml_formatter.read_file)
tsv_ir = parse_tsv(tsv_formatter.read_file)
def done?
return true if job.imported_xml_file

File.exist?(xml_formatter.filepath) && File.exist?(tsv_formatter.filepath)
end

def program
@program ||= xml_ir[0]
end

def program_version
@program_version ||= xml_ir[1]
end

def querydb
@querydb ||= xml_ir[3].split.map do |path|
{ title: File.basename(path) }
end
extract_program_info xml_ir
extract_db_info xml_ir
extract_params xml_ir
extract_stats xml_ir
extract_queries xml_ir, tsv_ir
end

def dbtype
@dbtype ||= querydb&.first&.type || dbtype_from_program
end

@_generated = true
def params
@params ||= extract_params
end

self
def stats
@stats ||= extract_stats
end

def done?
return true if job.imported_xml_file
def queries
@queries ||= xml_ir[8].map do |n|
query = Query.new(self, n[0], n[2], n[3], [])
query.hits = query_hits(n[4], tsv_ir[query.id], query)

File.exist?(xml_formatter.filepath) && File.exist?(tsv_formatter.filepath)
query
end
end

private

def xml_formatter
@xml_formatter ||= Formatter.run(job, 'xml')
def xml_ir
@xml_ir ||=
if job.imported_xml_file
parse_xml File.read(job.imported_xml_file)
else
job.raise!
parse_xml(xml_formatter.read_file)
end
end

def tsv_formatter
@tsv_formatter ||= Formatter.run(job, 'custom_tsv')
def tsv_ir
@tsv_ir ||=
if job.imported_xml_file
Hash.new do |h1, k1|
h1[k1] = Hash.new do |h2, k2|
h2[k2] = ['', '', []]
end
end
else
job.raise!
parse_tsv(tsv_formatter.read_file)
end
end

# Make program name and program name + version available via `program`
# and `program_version` attributes.
def extract_program_info(ir)
@program = ir[0]
@program_version = ir[1]
def xml_formatter
@xml_formatter ||= Formatter.run(job, 'xml')
end

# Get database information (title and type) from job yaml or from XML.
# Sets `querydb` and `dbtype` attributes.
def extract_db_info(ir)
if @querydb.empty?
@querydb = ir[3].split.map do |path|
{ title: File.basename(path) }
end
@dbtype = dbtype_from_program
else
@dbtype = @querydb.first.type
end
def tsv_formatter
@tsv_formatter ||= Formatter.run(job, 'custom_tsv')
end

# Make search params available via `params` attribute.
#
# Search params tweak the results. Like evalue cutoff or penalty to open
# a gap. BLAST+ doesn't list all input params in the XML output. Only
# matrix, evalue, gapopen, gapextend, and filters are available from XML
# output.
def extract_params(ir)
def extract_params
# Parse/get params from the job first.
job_params = parse_advanced(job.advanced)
# Old jobs from beta releases may not have the advanced key but they
@@ -138,21 +135,19 @@ def extract_params(ir)

# Parse params from BLAST XML.
@params = Hash[
*ir[7].first.map { |k, v| [k.gsub('Parameters_', ''), v] }.flatten
*xml_ir[7].first.map { |k, v| [k.gsub('Parameters_', ''), v] }.flatten
]
@params['evalue'] = @params.delete('expect')

# Merge into job_params.
@params = job_params.merge(@params)
end

# Make search stats available via `stats` attribute.
#
# Search stats are computed metrics. Like total number of sequences or
# effective search space.
def extract_stats(ir)
stats = ir[8].first[5][0]
@stats = {
def extract_stats
stats = xml_ir[8].first[5][0]
{
nsequences: stats[0],
ncharacters: stats[1],
hsp_length: stats[2],
@@ -163,20 +158,11 @@ def extract_stats(ir)
}
end

# Create query objects for the given report from the given ir.
def extract_queries(xml_ir, tsv_ir)
xml_ir[8].each do |n|
query = Query.new(self, n[0], n[2], n[3], [])
extract_hits(n[4], tsv_ir[query.id], query)
queries << query
end
end

# Create Hit objects for the given query from the given ir.
def extract_hits(xml_ir, tsv_ir, query)
return if xml_ir == ["\n"] # => No hits.
def query_hits(xml_ir, tsv_ir, query)
return [] if xml_ir == ["\n"] # => No hits.

xml_ir.each do |n|
xml_ir.map do |n|
# If hit comes from a non -parse_seqids database, then id (n[1]) is a
# BLAST assigned internal id of the format 'gnl|BL_ORD_ID|serial'. We
# assign the id to accession (because we use accession for sequence
@@ -190,19 +176,21 @@ def extract_hits(xml_ir, tsv_ir, query)
n[1] = defline.shift
n[2] = defline.join(' ')
end

hit = Hit.new(query, n[0], n[1], n[3], n[2], n[4],
tsv_ir[n[1]][0], tsv_ir[n[1]][1], [])
extract_hsps(n[5], tsv_ir[n[1]][2], hit)
query.hits << hit

hit.hsps = hsps(n[5], tsv_ir[n[1]][2], hit)

hit
end
end

# Create HSP objects for the given hit from the given ir.
def extract_hsps(xml_ir, tsv_ir, hit)
xml_ir.each_with_index do |n, i|
def hsps(xml_ir, tsv_ir, hit)
xml_ir.map.with_index do |n, i|
n.insert(14, tsv_ir[i])
hsp = HSP.new(hit, *n)
hit.hsps << hsp

HSP.new(hit, *n)
end
end

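One detail worth calling out in the new tsv_ir above: when a report is built from an imported XML file there is no TSV output to parse, so the method substitutes a hash whose nested default blocks hand back a blank placeholder ['', '', []] for any query id / hit id lookup. A quick stand-alone demonstration of that Ruby idiom (the key names here are made up):

    # Nested Hash default blocks: any lookup yields a blank placeholder
    # instead of nil, and the generated entries are stored back into the hash.
    tsv_ir = Hash.new do |h1, query_id|
      h1[query_id] = Hash.new do |h2, hit_id|
        h2[hit_id] = ['', '', []]
      end
    end

    p tsv_ir['Query_1']['Hit_7']   # => ["", "", []]  (created on demand)
    p tsv_ir.keys                  # => ["Query_1"]
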
2 changes: 1 addition & 1 deletion lib/sequenceserver/report.rb
@@ -8,7 +8,7 @@ module SequenceServer
class Report
class << self
def generate(job)
BLAST::Report.new(job).generate
BLAST::Report.new(job)
end
end

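With this caller change, Report.generate(job) just wraps the job; nothing is parsed until the first attribute read (for instance when to_json is built for the frontend), and the memoized readers mean repeated reads reuse the same parse. A hypothetical usage sketch (obtaining job is outside this diff):

    report = SequenceServer::Report.generate(job)  # cheap: no BLAST output parsed yet
    json   = report.to_json                        # attributes are parsed on first access
    report.stats                                   # cached; no second parse
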
38 changes: 38 additions & 0 deletions spec/blast/report_spec.rb
@@ -0,0 +1,38 @@
require 'spec_helper'
require 'sequenceserver/report'
require 'sequenceserver/blast/report'

module SequenceServer
RSpec.describe BLAST::Job do
before do
FileUtils.mkdir_p(DOTDIR)
FileUtils.rm_r(File.join(DOTDIR, job_id)) if File.exist?(File.join(DOTDIR, job_id))
FileUtils.cp_r(File.join(__dir__, '..', 'fixtures', job_id), DOTDIR)

# For all files in DOTDIR/job_id, replace $PATH_PREFIX with the root dir of the project.
job_dir = File.join(DOTDIR, job_id)
root_dir = File.expand_path(File.join(__dir__, '..', '..', '..'))
Dir[File.join(job_dir, '**', '*')].each do |f|
File.write(f, File.read(f).gsub('$PATH_PREFIX', root_dir)) if File.file?(f)
end

SequenceServer.init
end

let(:job_id) { '38334a72-e8e7-4732-872b-24d3f8723563' }
let(:job) { SequenceServer::Job.fetch(job_id) }
let(:report) { BLAST::Report.new(job) }
let(:keys_to_ignore) { %i[querydb submitted_at imported_xml seqserv_version cloud_sharing_enabled] }

describe "#to_json" do
it "returns a JSON representation of the job" do
actual_report = JSON.parse(report.to_json).reject { |k, _| keys_to_ignore.include?(k) }
expected_report = JSON.parse(File.read(File.join(job.dir, 'expected_outputs/frontend.json'))).reject { |k, _| keys_to_ignore.include?(k) }

actual_report.each do |k, v|
expect(v).to eq(expected_report[k])
end
end
end
end
end
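
A note on the comparison style in this spec: checking the parsed report key by key (queries, stats, params, and so on) makes a failure point at the section that differs instead of diffing the whole multi-thousand-line fixture at once. A generic illustration with made-up data:

    # Illustration only: per-key comparison localizes the mismatch.
    expected = { 'stats' => { 'hits' => 2 }, 'queries' => %w[q1 q2] }
    actual   = { 'stats' => { 'hits' => 3 }, 'queries' => %w[q1 q2] }

    actual.each do |k, v|
      # expect(v).to eq(expected[k])   # the RSpec form used above
      puts "#{k}: #{v == expected[k] ? 'match' : 'MISMATCH'}"
    end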

Large diffs are not rendered by default.


0 comments on commit 0f9acaa
