From ff98581411dab145a664c1523c75c03b893ce027 Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Mon, 13 Jun 2022 14:36:21 +0200 Subject: [PATCH] Implementing new in-house background for svs (#32) Closes: #32 Related-Issue: #32 Projected-Results-Impact: require-revalidation --- HISTORY.rst | 1 + config/settings/base.py | 3 + docs_manual/admin_upgrade.rst | 32 + importer/models.py | 1 - svs/bg_db.py | 687 ++++++++++++++++++ svs/forms.py | 76 +- .../commands/svs_bg_sv_set_build.py | 25 + .../commands/svs_bg_sv_set_cleanup.py | 29 + svs/management/commands/svs_bg_sv_set_dump.py | 35 + svs/management/commands/svs_bg_sv_set_list.py | 28 + svs/management/commands/svs_sv_fill_nulls.py | 26 +- svs/management/commands/svs_svs_dump.py | 67 ++ ...roundsvsetjob_cleanupbackgroundsvsetjob.py | 186 +++++ .../0018_extend_structuralvariant.py | 14 +- ...groundsv_svs_backgro_bg_sv_s_72b34f_idx.py | 20 + svs/models.py | 133 +++- svs/plugins.py | 8 +- svs/queries.py | 83 ++- svs/tasks.py | 45 +- svs/templates/svs/build_bg_job_detail.html | 99 +++ svs/templates/svs/cleanup_bg_job_detail.html | 99 +++ svs/templates/svs/filter_form/frequency.html | 47 +- svs/templates/svs/filter_form/misc.html | 10 +- svs/templates/svs/filter_result/header.html | 2 +- svs/templates/svs/filter_result/row_sv.html | 10 +- svs/templates/svs/filter_result/table.html | 5 +- svs/tests/factories.py | 65 +- svs/tests/test_bg_db.py | 562 ++++++++++++++ svs/tests/test_models.py | 18 + svs/tests/test_queries.py | 168 ++--- svs/urls.py | 11 + svs/views.py | 37 +- variants/models.py | 2 + 33 files changed, 2358 insertions(+), 276 deletions(-) create mode 100644 svs/bg_db.py create mode 100644 svs/management/commands/svs_bg_sv_set_build.py create mode 100644 svs/management/commands/svs_bg_sv_set_cleanup.py create mode 100644 svs/management/commands/svs_bg_sv_set_dump.py create mode 100644 svs/management/commands/svs_bg_sv_set_list.py create mode 100644 svs/management/commands/svs_svs_dump.py create mode 100644 svs/migrations/0017_backgroundsv_backgroundsvset_buildbackgroundsvsetjob_cleanupbackgroundsvsetjob.py create mode 100644 svs/migrations/0019_backgroundsv_svs_backgro_bg_sv_s_72b34f_idx.py create mode 100644 svs/templates/svs/build_bg_job_detail.html create mode 100644 svs/templates/svs/cleanup_bg_job_detail.html create mode 100644 svs/tests/test_bg_db.py diff --git a/HISTORY.rst b/HISTORY.rst index 390e92c8a..6cc46ee43 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -80,6 +80,7 @@ Full Change List - Fixed bug where Exac and thousand genomes settings were not shown in frequency tab for GRCh37 (#597). - Form template reports error if genomebuild variable is not set (#607). - Making ``keyvalue`` more robust to failure (#613). +- Implement new in-house background database for structural variants (#32). ------ v1.2.0 diff --git a/config/settings/base.py b/config/settings/base.py index 734d14efa..76a2c6e60 100644 --- a/config/settings/base.py +++ b/config/settings/base.py @@ -508,6 +508,9 @@ def set_logging(level=None): # Number of cases to perform in one query for joint queries. QUERY_MAX_UNION = env.int("VARFISH_QUERY_MAX_UNION", 20) +# Timeout (in hours) for VarFish cleaning up background SV sets in "building" state. +SV_CLEANUP_BUILDING_SV_SETS = env.int("VARFISH_SV_CLEANUP_BUILDING_SV_SETS", 48) + # Varfish: Exomiser # ------------------------------------------------------------------------------ diff --git a/docs_manual/admin_upgrade.rst b/docs_manual/admin_upgrade.rst index 18a23c883..4fc932ec4 100644 --- a/docs_manual/admin_upgrade.rst +++ b/docs_manual/admin_upgrade.rst @@ -26,6 +26,38 @@ You can then apply the patch to your database with the following command. You can find out more details, give feedback, and ask for help `in this Github discussion `__. +---------------- +v1.2.* to v2.*.* +---------------- + +**Structural Variants.** +In case that the support for structural variants has been used, it is **strongly recommended** to re-annotate the structural variants with an updated version of ``varfish-annotator`` (v0.24 or above). +You will need to use ``varfish-cli`` in a recent version (v0.3.4 or above) for being able to import the data into VarFish. +Otherwise, see below for background and explanation on how to fill empty values in your database after upgrading the software an database to bollonaster. + +In VarFish anthenea, the support for structural variants has been experimental only. +To prepare for improved support of structural variants, the database structure has been changed. +Prior, the number of carriers in the background had to be annotated outside of VarFish (in your pipeline). +This has changed now and VarFish can build a background database (and will do so weekly on sundays). + +If you do not want to wait for next sunday to get your background database, you can force building a new background structural variant data set with: + +:: + + $ docker exec -it varfish-docker-compose_varfish-web_1 python /usr/src/app/manage.py \ + svs_bg_sv_set_build + +New fields have been added for the proper representation of break-ends in VarFish. +They will only be properly filled after re-import of re-annotated data as described above. +You can fill the fields with reasonable values (that will work well for all cases except for breakends/BNDs where the connectivity of 3' and 5' ends cannot be properly detected without re-annotation) by using the following. + +:: + + $ docker exec -it varfish-docker-compose_varfish-web_1 python /usr/src/app/manage.py \ + svs_sv_fill_nulls + +This is not strictly necessary and it is recommended to re-annotate and re-import. + ------------------ v0.23.0 to v1.2.1 ------------------ diff --git a/importer/models.py b/importer/models.py index 385e52588..e8d598d93 100644 --- a/importer/models.py +++ b/importer/models.py @@ -4,7 +4,6 @@ import tempfile import uuid as uuid_object -import aldjemy from bgjobs.models import BackgroundJob, LOG_LEVEL_ERROR, JobModelMessageMixin from django.contrib import auth from django.db import models, transaction diff --git a/svs/bg_db.py b/svs/bg_db.py new file mode 100644 index 000000000..d2efb210b --- /dev/null +++ b/svs/bg_db.py @@ -0,0 +1,687 @@ +"""Code that supports building the structural variant background database.""" +import gc +import math +import os +import tempfile + +import attrs +from contextlib import contextmanager +import enum +import json +import logging +import pathlib +import random +import re +import statistics +import sys +import typing + +import binning +import cattr +from django.conf import settings +from django.db import transaction +from django.db.models import Q +from django.utils import timezone +from intervaltree import Interval, IntervalTree +from projectroles.plugins import get_backend_api +import psutil +from sqlalchemy import delete + +from svs.models import ( + SV_SUB_TYPE_CHOICES as _SV_SUB_TYPE_CHOICES, + SV_SUB_TYPE_BND as _SV_SUB_TYPE_BND, + SV_SUB_TYPE_INS as _SV_SUB_TYPE_INS, + StructuralVariant, + BackgroundSvSet, + BuildBackgroundSvSetJob, + BackgroundSv, + CleanupBackgroundSvSetJob, +) +from varfish import __version__ as varfish_version + + +#: Logger to use in this module. +from variants.helpers import get_engine, get_meta +from variants.models import CHROMOSOME_NAMES, CHROMOSOME_STR_TO_CHROMOSOME_INT + +LOGGER = logging.getLogger(__name__) + +#: The SV types to be used in ``SvRecord``. +SV_TYPES = [t[0] for t in _SV_SUB_TYPE_CHOICES] + + +@attrs.define +class ClusterAlgoParams: + """Parameters for the clustering algorithm""" + + #: Seed to use for random numbers. + seed: int = 42 + #: Maximal size of a cluster before subsampling. + cluster_max_size: int = 500 + #: Maximal size of a cluster after subsampling. + cluster_size_sample_to: int = 100 + #: Minimal jaccard overlap. + min_jaccard_overlap: float = 0.7 + #: Slack to allow around breakend breakends (and INSertions). + bnd_slack: int = 50 + + +def _file_name_safe(s: str) -> str: + """Make the given string file name safe. + + This is done by making all not explicitely allowed character underscores. + """ + return re.sub(r"[^a-zA-Z]", "_", s) + + +class PairedEndOrientation(enum.Enum): + """Enumeration type for SV connectivity (important for deciding for merge compatibility).""" + + #: Indicate 3' to 5' connection. + THREE_TO_FIVE = "3to5" + #: Indicate 5' to 3' connection. + FIVE_TO_THREE = "5to3" + #: Indicate 3' to 3' connection. + THREE_TO_THREE = "3to3" + #: Indicate 5' to 5' connection. + FIVE_TO_FIVE = "5to5" + + +@attrs.define(frozen=True) +class GenotypeCounts: + """Represents genotype counts for an SV record.""" + + #: Number of source records (families) this record was built from + src_count: int = 0 + #: Number of carriers + carriers: int = 0 + #: Number of heterozygous carriers + carriers_het: int = 0 + #: Number of homozygous carriers + carriers_hom: int = 0 + #: Number of hemizygous carriers + carriers_hemi: int = 0 + + def plus(self, other: "GenotypeCounts") -> "GenotypeCounts": + """Add ``self`` and ``other``.""" + return GenotypeCounts( + src_count=self.src_count + other.src_count, + carriers=self.carriers + other.carriers, + carriers_het=self.carriers_het + other.carriers_het, + carriers_hom=self.carriers_hom + other.carriers_hom, + carriers_hemi=self.carriers_hemi + other.carriers_hemi, + ) + + +@attrs.define(frozen=True) +class SvRecord: + """Represents a structural variant record as used in the SV clustering algorithm. + + The clustering algorithm currently ignores the uncertainty specification of variants (IOW: ci_*). + """ + + #: Genome build version + release: str + #: The structural variant type + sv_type: str + #: The chromosome of the left chromosomal position + chrom: str + #: 1-based start position + pos: int + #: The chromosome of the right chromosomal position + chrom2: str + #: 1-based end position of the variant + end: int + #: Paired-end connectivity type + orientation: typing.Optional[PairedEndOrientation] = None + #: Genotype counds + counts: GenotypeCounts = attrs.field(factory=GenotypeCounts) + + def does_overlap(self, other: "SvRecord", *, bnd_slack: typing.Optional[int] = None) -> bool: + """Returns whether the two records overlap.""" + if self.release != other.release: # pragma: nocover + raise ValueError(f"Incompatible release values: {self.release} vs {other.release}") + if self.sv_type != other.sv_type: # pragma: nocover + raise ValueError(f"Incompatible sv_type values: {self.sv_type} vs {other.sv_type}") + if (bnd_slack is None) == (self.is_bnd() or self.is_ins()): + raise ValueError(f"Should specify bnd_slack if and only if SV is a breakend (or INS)") + if self.is_bnd(): # break-end, potentially non-linear SV + return ( + self.chrom == other.chrom + and abs(self.pos - other.pos) <= bnd_slack + and self.chrom2 == other.chrom2 + and abs(self.end - other.end) <= bnd_slack + ) + elif self.is_ins(): # insertion / "point SV" + return self.chrom == other.chrom and abs(self.pos - other.pos) <= bnd_slack + else: # linear SV + return self.chrom == other.chrom and self.pos <= other.end and self.end >= other.pos + + def jaccard_index(self, other: "SvRecord") -> typing.Optional[float]: + """Return jaccard index of overlap between ``self`` and ``other``. + + Raises an ``ValueError` if ``release`` or ``sv_type`` are not the same. + """ + if self.is_bnd() or other.is_bnd() or self.is_ins() or other.is_ins(): + raise ValueError(f"Cannot compute Jaccard overlap for break-ends and INS!") + if self.does_overlap(other): + len_union = max(self.end, other.end) + 1 - min(self.pos, other.pos) + len_intersect = min(self.end, other.end) + 1 - max(self.pos, other.pos) + return len_intersect / len_union + else: + return 0.0 + + def is_compatible(self, other: "SvRecord", bnd_slack: int) -> bool: + """Determine whether the two records ``self`` and ``other`` are compatible to be merged in principle. + + That is, they must be of the same SV type and overlap. + """ + if self.sv_type != other.sv_type: + return False + if self.is_bnd(): # => other.is_bnd() is True + return ( + self.does_overlap(other, bnd_slack=bnd_slack) + and self.orientation == other.orientation + ) + elif self.is_ins(): # => other.is_ins() is True + return self.does_overlap(other, bnd_slack=bnd_slack) + else: + return self.does_overlap(other) + + def is_bnd(self) -> bool: + """Returns whether the record is a breakend.""" + return self.sv_type == _SV_SUB_TYPE_BND + + def is_ins(self) -> bool: + """Returns whether the record is an insertion.""" + return self.sv_type.startswith(_SV_SUB_TYPE_INS) + + def build_interval( + self, *, data: typing.Optional[typing.Any] = None, bnd_slack: int = 0 + ) -> Interval: + if self.is_bnd() or self.is_ins(): + return Interval(self.pos - bnd_slack, self.end + bnd_slack, data) + else: + return Interval(self.pos, self.end, data) + + def sort_key(self): + return (self.chrom, self.pos) + + +@attrs.define +class SvCluster: + """Define one SV cluster by a median representation and backing records""" + + #: The clustering algorithm parameters. + params: ClusterAlgoParams + #: The random number generator to use + rng: random.Random = attrs.field(repr=False) + #: The mean representing ``SvRecord`` + mean: typing.Optional[SvRecord] = None + #: The list of records backing the cluster + records: typing.List[SvRecord] = attrs.field(default=attrs.Factory(list)) + #: The overall genotype counts + counts: GenotypeCounts = attrs.field(factory=GenotypeCounts) + + def augment(self, record: SvRecord) -> None: + """Augment the given cluster + + Raises ``ValueError`` if record incompatible with mean (if exists) by release, + sv_type, chrom, or chrom2. + """ + if self.mean: + for key in ("release", "sv_type", "chrom", "chrom2"): + if getattr(record, key) != getattr(self.mean, key): # pragma: nocover + raise ValueError(f"Incompatible record ({record}) vs mean ({self.mean}") + if not self.mean.is_compatible(record, bnd_slack=self.params.bnd_slack): + raise ValueError(f"Incompatible record ({record}) vs mean ({self.mean}") + # Perform augmentation and sub-sample records when necessary. Update mean in any case. + self._augment(record) + if len(self.records) > self.params.cluster_max_size: + self._sub_sample() + self.mean = self._compute_mean() + self.counts = self.counts.plus(record.counts) + + def _augment(self, record: SvRecord) -> None: + """Augment cluster with the given ``record``""" + self.records.append(record) + + def _sub_sample(self) -> None: + """Sub sample the records of this cluster and re-compute mean""" + self.records = self.rng.choices(self.records, k=self.params.cluster_size_sample_to) + self.mean = self._compute_mean() + + def _compute_mean(self) -> SvRecord: + """Compute the mean of this cluster's record""" + pos = math.floor(statistics.mean([r.pos for r in self.records])) + end = math.ceil(statistics.mean([r.end for r in self.records])) + end = max(end, pos + 1) + return attrs.evolve(self.records[0], pos=pos, end=end,) + + def sort_key(self): + if self.mean is None: + return ("", -1000) + else: + return self.mean.sort_key() + + def normalized(self): + """Normalized output (e.g., for tests)""" + return attrs.evolve(self, records=list(sorted(self.records, key=SvRecord.sort_key))) + + +class ClusterSvAlgorithm: + """This class encapsulates the state of the SV clustering algorithm using ``attrs`` based data structures. + + Data is processed per ``(variant_type, chromosome)`` in memory and stored. + + Protocol is: + + .. code-block:: python + + params = ClusterAlgoParams() + algo = ClusterSvAlgorithm(params) + for chrom in ["chr1", "chr2"]: + db_records = DATABASE_QUERY() + with algo.on_chrom(chrom): # accept pushes in block + for db_record in db_records: + sv_record: SvRecord = BUILD_SV_RECORD(db_record) + algo.push(sv_record) + clusters = algo.cluster() + # ... + + """ + + def __init__(self, params: ClusterAlgoParams): + self.params = params + #: Which chromosome the algorithm is on, if any. + self.current_chrom: typing.Optional[str] = None + #: Clusters on the current chromosome, if any + self.clusters: typing.List[SvRecord] = None + #: Chromosomes that the algorithm has seen so far. + self.seen_chroms: typing.Set[str] = set() + #: Temporary directory to write to. + self.tmp_dir: typing.Optional[pathlib.Path] = None + #: Temporary storage file per SV type. + self.tmp_files: typing.Dict[str, typing.IO[str]] = {} + #: The random number generator to use + self.rng: random.Random = random.Random(self.params.seed) + + @contextmanager + def on_chrom(self, chrom: str): + """Start clustering for a new chromosome used as a context manager. + + On exiting the context manager, all temporary files will be closed. + """ + LOGGER.error("Starting collection of SV records for chromosome %s", chrom) + if chrom in self.seen_chroms: # pragma: nocover + raise RuntimeError(f"Seen chromosome {chrom} already!") + else: + self.seen_chroms.add(chrom) + self.current_chrom = chrom + self.clusters = None + with tempfile.TemporaryDirectory() as tmp_dir: + LOGGER.error("Opening temporary files for chrom %s", chrom) + self.tmp_dir = pathlib.Path(tmp_dir) + for sv_type in SV_TYPES: + self.tmp_files[sv_type] = (self.tmp_dir / _file_name_safe(sv_type)).open("wt+") + yield + LOGGER.error("Closing temporary files again for chrom %s", chrom) + for tmp_file in self.tmp_files.values(): + tmp_file.close() + self.tmp_dir = None + self.tmp_files = {} + LOGGER.error("Done with collecting SV records for chromosome %s", chrom) + + def push(self, record: SvRecord) -> None: + """Push the ``record`` on the current chromosome to the appropriate file in ``self.tmp_dir``.""" + if not self.tmp_files: # pragma: nocover + raise RuntimeError("Invalid state, for push(), no temporary files open!") + LOGGER.debug("Writing record %s", record) + print(json.dumps(cattr.unstructure(record)), file=self.tmp_files[record.sv_type]) + + def cluster(self) -> typing.List[SvCluster]: + """Execute the clustering for the current chromosome and return clusters. + + The resulting clusters will be sorted by start position (empty clusters first) + """ + if not self.tmp_files: # pragma: nocover + raise RuntimeError("Invalid state, for cluster(), no temporary files open!") + if self.clusters is None: + LOGGER.error("Starting clustering on chromosome %s", self.current_chrom) + self.clusters = [] + for sv_type, tmp_file in self.tmp_files.items(): + process = psutil.Process(os.getpid()) + rss_mb = process.memory_info().rss // 1024 // 1024 + LOGGER.info( + f"... clustering SV type {sv_type} with RSS {rss_mb} MB", file=sys.stderr + ) + print( + f"... clustering SV type {sv_type} with RSS {rss_mb} MB", file=sys.stderr + ) # XXX + tmp_file.flush() + tmp_file.seek(0) + self.clusters += self._cluster_impl(sv_type, tmp_file) + self.clusters.sort(key=SvCluster.sort_key) + LOGGER.error("Done with clustering on chromosome %s", self.current_chrom) + return self.clusters + + def _cluster_impl(self, sv_type: str, tmp_file: typing.IO[str]) -> typing.List[SvCluster]: + """Implementation of the clustering step for a given SV type and with a given temporary file""" + #: Load records from disk and shuffle + sv_records = [] + for line in tmp_file: + LOGGER.debug("Read line %s", repr(line)) + sv_record = cattr.structure(json.loads(line), SvRecord) + if sv_record.sv_type != sv_type: # pragma: nocover + raise ValueError( + f"Unexpected SV type. Is: {sv_record.sv_type}, expected {sv_type}." + ) + sv_records.append(sv_record) + self.rng.shuffle(sv_records) + + # Maintain an interval tree of clusters (interval is cluster mean). Go over the SV records in random + # order (implied after shuffling above) and assign to best fitting compatible cluster. + tree = IntervalTree() + clusters: typing.List[SvCluster] = [] + for sv_record in sv_records: + # Find all overlapping clusters from the interval tree + sv_interval = sv_record.build_interval(bnd_slack=self.params.bnd_slack) + ovl_intervals: typing.Set[Interval] = tree.overlap(sv_interval.begin, sv_interval.end) + ovl_indices: typing.List[int] = [interval.data for interval in ovl_intervals] + best_index: typing.Optional[int] = None + best_jaccard: typing.Optional[float] = None + + # Identify the best overlapping cluster, if any + for curr_index in ovl_indices: + curr_mean = clusters[curr_index].mean + if sv_record.is_bnd() or sv_record.is_ins(): + if sv_record.is_compatible(curr_mean, bnd_slack=self.params.bnd_slack): + best_index = curr_index + else: + curr_jaccard = curr_mean.jaccard_index(sv_record) + if best_index is None or curr_jaccard > best_jaccard: + best_index = curr_index + best_jaccard = curr_jaccard + + # Create new cluster or update existing one + if best_index is None: + LOGGER.debug("Found no cluster for SV record %s (create new one)", sv_record) + # print("Found no cluster for SV record %s (create new one)" % sv_record) + # Create new cluster and add it to the tree with initial mean. + best_index = len(clusters) + sv_cluster = SvCluster(params=self.params, rng=self.rng) + sv_cluster.augment(sv_record) + clusters.append(sv_cluster) + tree.add(sv_cluster.mean.build_interval(data=best_index)) + else: + LOGGER.debug( + "Found cluster %s for SV record %s (will update)", + clusters[best_index], + sv_record, + ) + # print( + # "Found cluster %s for SV record %s (will update)" + # % (clusters[best_index], sv_record), + # ) + # Remove cluster from tree temporarily, update cluster, add back to tree with new mean. + sv_cluster = clusters[best_index] + tree.remove(sv_cluster.mean.build_interval(data=best_index)) + sv_cluster.augment(sv_record) + tree.add(sv_cluster.mean.build_interval(data=best_index)) + + return clusters + + +def _fixup_sv_type(sv_type: str) -> str: + return sv_type.replace("_", ":") + + +def _fill_null_counts(record, sex=None): + """Helper to fill NULL ``num_*`` fields in ``record`` based on genotype and optionally a mapping of + sample name to PED sex. + """ + sex = sex or {} + record.num_hom_alt = 0 + record.num_hom_ref = 0 + record.num_het = 0 + record.num_hemi_alt = 0 + record.num_hemi_ref = 0 + for k, v in record.genotype.items(): + gt = v["gt"] + k_sex = sex.get(record.case_id, {}).get(k, 0) + if gt == "1": + record.num_hemi_alt += 1 + elif gt == "0": + record.num_hemi_ref += 1 + elif gt in ("0/1", "1/0", "0|1", "1|0"): + record.num_het += 1 + elif gt in ("0/0", "0|0"): + if "x" in record.chromosome.lower() and k_sex == 1: + record.num_hemi_ref += 1 + else: + record.num_hom_ref += 1 + elif gt in ("1|1", "1|1"): + if "x" in record.chromosome.lower() and k_sex == 1: + record.num_hemi_alt += 1 + else: + record.num_hom_alt += 1 + + +def sv_model_to_attrs(model_record: StructuralVariant) -> SvRecord: + """Conversion from ``StructuralVariant`` to ``SvRecord`` for use in clustering.""" + _fill_null_counts(model_record) + counts = GenotypeCounts( + src_count=1, + carriers=(model_record.num_het or 0) + + (model_record.num_hom_alt or 0) + + (model_record.num_hemi_alt or 0), + carriers_het=model_record.num_het or 0, + carriers_hom=model_record.num_hom_alt or 0, + carriers_hemi=model_record.num_hemi_alt or 0, + ) + # We write out chrom2=chrom1 etc. to work around NULL values left-over from old SVs + return SvRecord( + release=model_record.release, + sv_type=_fixup_sv_type(model_record.sv_type), + chrom=model_record.chromosome, + pos=model_record.start, + chrom2=model_record.chromosome2 or model_record.chromosome, + end=model_record.end, + orientation=model_record.pe_orientation, + counts=counts, + ) + + +def sv_cluster_to_model_args(sv_cluster: SvCluster) -> typing.Dict[str, typing.Any]: + """Conversion from ``SvCluster`` to args for creation of ``BackgroundSv``.""" + chrom_nochr = ( + sv_cluster.mean.chrom + if not sv_cluster.mean.chrom.startswith("chrm") + else sv_cluster.mean.chrom[3:] + ) + chrom2_nochr = ( + sv_cluster.mean.chrom2 + if not sv_cluster.mean.chrom.startswith("chrm") + else sv_cluster.mean.chrom2[3:] + ) + mean = sv_cluster.mean + if mean.chrom == mean.chrom2: + bin = binning.assign_bin(mean.pos, mean.end) + else: + bin = binning.assign_bin(mean.pos, mean.pos + 1) + return { + "release": mean.release, + "chromosome": mean.chrom, + "chromosome_no": CHROMOSOME_STR_TO_CHROMOSOME_INT.get(chrom_nochr, 0), + "start": mean.pos, + "chromosome2": mean.chrom2, + "chromosome_no2": CHROMOSOME_STR_TO_CHROMOSOME_INT.get(chrom2_nochr, 0), + "end": mean.end, + "pe_orientation": mean.orientation.value if mean.orientation else None, + "bin": bin, + "sv_type": sv_cluster.mean.sv_type, + **cattr.unstructure(sv_cluster.counts), + } + + +def _build_bg_sv_set_impl( + job: BuildBackgroundSvSetJob, + *, + log_to_stderr: bool = False, + chromosomes: typing.Optional[typing.List[str]] = None, +) -> BackgroundSvSet: + genomebuild = job.genomebuild + chrom_pat = "%s" if genomebuild == "GRCh37" else "chr%s" + + if log_to_stderr: + + def log(msg: str): + job.add_log_entry(msg) + if log_to_stderr: + print(msg, file=sys.stderr) + + else: + log = job.add_log_entry + + log("Creating new bg_db_set in state 'initial'") + bg_sv_set = BackgroundSvSet.objects.create( + genomebuild=job.genomebuild, varfish_version=varfish_version, state="building" + ) + + log("Starting actual clustering") + params = ClusterAlgoParams() + algo = ClusterSvAlgorithm(params) + record_count = 0 + + for chrom_name in chromosomes or CHROMOSOME_NAMES: + chrom = chrom_pat % chrom_name + log("Starting with chromosome %s for genome build %s" % (chrom, genomebuild)) + chunk_size = 10_000 + with algo.on_chrom(chrom): # accept pushes in block + for num, db_record in enumerate( + StructuralVariant.objects.filter(release=genomebuild, chromosome=chrom).iterator( + chunk_size=chunk_size + ) + ): + sv_record = sv_model_to_attrs(db_record) + algo.push(sv_record) + record_count += 1 + if num % 10_000 == 0: + if log_to_stderr: + process = psutil.Process(os.getpid()) + rss_mb = process.memory_info().rss // 1024 // 1024 + print(f"... at record {num} with RSS {rss_mb} MB", file=sys.stderr) + gc.collect() + clusters = algo.cluster() + log("Built %d clusters from %d records" % (len(clusters), record_count)) + + log("Constructing background SV set records...") + for cluster in clusters: + BackgroundSv.objects.create(bg_sv_set=bg_sv_set, **sv_cluster_to_model_args(cluster)) + + log("... done constructing background SV set records.") + log("Done with chromosome %s for genome build %s" % (chrom, genomebuild)) + + with transaction.atomic(): + bg_sv_set.refresh_from_db() + bg_sv_set.state = "active" + bg_sv_set.save() + return bg_sv_set + + +def build_bg_sv_set( + job: BuildBackgroundSvSetJob, + *, + log_to_stderr: bool = False, + chromosomes: typing.Optional[typing.List[str]] = None, +) -> BackgroundSvSet: + """Construct a new ``BackgroundSvSet`` """ + job.mark_start() + timeline = get_backend_api("timeline_backend") + if timeline: + tl_event = timeline.add_event( + project=None, + app_name="svs", + user=None, + event_name="svs_build_bg_sv_set", + description="build background sv set", + status_type="INIT", + ) + try: + job.add_log_entry("Starting creation of background SV set...") + result = _build_bg_sv_set_impl(job, log_to_stderr=log_to_stderr, chromosomes=chromosomes) + job.add_log_entry("... done creating background SV set.") + except Exception as e: + job.mark_error(e) + if timeline: + tl_event.set_status("FAILED", "failed to build background sv set") + raise + else: + job.mark_success() + if timeline: + tl_event.set_status("OK", "building background sv set complete") + return result + + +def _cleanup_bg_sv_sets( + job: CleanupBackgroundSvSetJob, *, timeout_hours: typing.Optional[int] = None +) -> None: + meta = get_meta() + sa_table_set = meta.tables[BackgroundSvSet._meta.db_table] + query_set = delete(sa_table_set) + + # Keep latest two active + active_sets = BackgroundSvSet.objects.filter(state="active").order_by("-date_created") + keep_ids = [] + if active_sets.count(): + keep_ids += [s.id for s in active_sets[:2]] + # Keep building ones that are younger than ``build_timeout_hours`` + if timeout_hours >= 0: + hours_ago = timezone.now() - timezone.timedelta(hours=timeout_hours) + young_sets = BackgroundSvSet.objects.filter( + (~Q(state="active")) & Q(date_created__lt=hours_ago) + ) + keep_ids += [s.id for s in young_sets] + + sa_table_sv = meta.tables[BackgroundSv._meta.db_table] + query_sv = delete(sa_table_sv) + if keep_ids: + query_sv = query_sv.where(sa_table_sv.c.bg_sv_set_id.not_in(keep_ids)) + get_engine().execute(query_sv) + + if keep_ids: + query_set = query_set.where(sa_table_set.c.id.not_in(keep_ids)) + else: + query_set = query_set.where(True) + get_engine().execute(query_set) + + +def cleanup_bg_sv_sets( + job: CleanupBackgroundSvSetJob, *, timeout_hours: typing.Optional[int] = None +) -> None: + """Cleanup old background SV sets""" + timeout_hours = timeout_hours or settings.SV_CLEANUP_BUILDING_SV_SETS + job.mark_start() + timeline = get_backend_api("timeline_backend") + if timeline: + tl_event = timeline.add_event( + project=None, + app_name="svs", + user=None, + event_name="svs_cleanup_bg_sv_sets", + description="cleanup background sv set", + status_type="INIT", + ) + try: + job.add_log_entry("Starting cleanup of background SVs...") + _cleanup_bg_sv_sets(job, timeout_hours=timeout_hours) + job.add_log_entry("... done cleaning up background SVs.") + except Exception as e: + job.mark_error(e) + if timeline: + tl_event.set_status("FAILED", "failed to clean up background SVs") + raise + else: + job.mark_success() + if timeline: + tl_event.set_status("OK", "cleaning up background SVs complete") diff --git a/svs/forms.py b/svs/forms.py index 02ddd8c10..462b47309 100644 --- a/svs/forms.py +++ b/svs/forms.py @@ -100,62 +100,7 @@ "negative": "negative", } -SV_DATABASES = ("DGV", "DGV GS", "ExAC", "gnomAD", "dbVar", "G1K") - - -class SvAnalysisCollectiveFrequencyMixin: - """Mixin for the frequency fields of the structural variant filtration form.""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.fields["collective_enabled"] = forms.BooleanField( - label="", required=False, initial=True - ) - self.fields["cohort_background_carriers_min"] = forms.IntegerField( - label="", - min_value=0, - required=False, - widget=forms.TextInput( - attrs={"placeholder": "Minimal background carriers in analysis collective"} - ), - ) - self.fields["cohort_background_carriers_max"] = forms.IntegerField( - label="", - initial=5, - min_value=0, - required=False, - widget=forms.TextInput( - attrs={"placeholder": "Maximal background carriers in analysis collective"} - ), - ) - self.fields["cohort_affected_carriers_min"] = forms.IntegerField( - label="", - initial=1, - min_value=0, - required=False, - widget=forms.TextInput(attrs={"placeholder": "Minimal affected carriers in pedigree"}), - ) - self.fields["cohort_affected_carriers_max"] = forms.IntegerField( - label="", - min_value=0, - required=False, - widget=forms.TextInput(attrs={"placeholder": "Maximal affected carriers in pedigree"}), - ) - self.fields["cohort_unaffected_carriers_min"] = forms.IntegerField( - label="", - min_value=0, - required=False, - widget=forms.TextInput( - attrs={"placeholder": "Minimal unaffected carriers in pedigree"} - ), - ) - self.fields["cohort_unaffected_carriers_max"] = forms.IntegerField( - label="", - initial=None, - min_value=0, - required=False, - widget=forms.TextInput(attrs={"placeholder": "Maximal affected carriers in pedigree"}), - ) +SV_DATABASES = ("DGV", "DGV GS", "ExAC", "gnomAD", "dbVar", "G1K", "inhouse") class SvDatabaseFrequencyMixin: @@ -180,7 +125,7 @@ def __init__(self, *args, **kwargs): ) self.fields["%s_max_%s" % (key, entity)] = forms.IntegerField( label="", - initial=None, + initial=20 if key == "inhouse" else None, min_value=0, required=False, widget=forms.TextInput(attrs={"placeholder": "Maximal %s in %s" % (entity, name)}), @@ -606,9 +551,23 @@ def __init__(self, *args, **kwargs): ) +class MiscFilterFormMixin: + """Form mixin for misc fields (such as maximal number of output records)""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.fields["result_rows_limit"] = forms.IntegerField( + label="Result row limit", + required=True, + initial=200, + help_text=("Maximal number of rows displayed."), + widget=forms.TextInput(attrs={"class": "numberInteger"}), + ) + + class FilterForm( SvDatabaseFrequencyMixin, - SvAnalysisCollectiveFrequencyMixin, SvVariantEffectFilterFormMixin, SvGenotypeFilterFormMixin, SvQualityFilterFormMixin, @@ -616,6 +575,7 @@ class FilterForm( GenomicRegionFilterFormMixin, SvIntervalsFilterFormMixin, RegulatoryFilterFormMixin, + MiscFilterFormMixin, forms.Form, ): """This form is used for filtering structural variants of a single case.""" diff --git a/svs/management/commands/svs_bg_sv_set_build.py b/svs/management/commands/svs_bg_sv_set_build.py new file mode 100644 index 000000000..834b6f2d6 --- /dev/null +++ b/svs/management/commands/svs_bg_sv_set_build.py @@ -0,0 +1,25 @@ +"""Django management command for building a new ``BackgroundSv`` record""" + +from django.core.management import BaseCommand + +from svs import tasks + + +class Command(BaseCommand): + """Create a new background sv set""" + + #: Help message displayed on the command line. + help = "Create a new background sv set" + + def add_arguments(self, parser): + """Add the command's argument to the ``parser``.""" + parser.add_argument( + "--chromosome", help="Chromosome to dump for", required=False, default=None + ) + + def handle(self, *args, **options): + """The actual implementation is in ``_handle()``, splitting to get commit times.""" + extra_args = {} + if options["chromosome"] is not None: + extra_args["chromosomes"] = [options["chromosome"]] + tasks.build_bg_sv_set_task(log_to_stderr=True, **extra_args) diff --git a/svs/management/commands/svs_bg_sv_set_cleanup.py b/svs/management/commands/svs_bg_sv_set_cleanup.py new file mode 100644 index 000000000..1edd44e80 --- /dev/null +++ b/svs/management/commands/svs_bg_sv_set_cleanup.py @@ -0,0 +1,29 @@ +"""Django management for cleaning existing ``BackgroundSvSet`` records.""" + +from django.conf import settings +from django.core.management import BaseCommand + +from svs import tasks + + +class Command(BaseCommand): + """Create a cleaning up background sv set records""" + + #: Help message displayed on the command line. + help = "Cleanup background sv sets (keep 2 latest active)" + + def add_arguments(self, parser): + """Add the command's argument to the ``parser``.""" + parser.add_argument( + "--timeout", + help="Timeout (in hours) to allow in 'building' state, -1 to disable", + type=int, + required=False, + default=settings.SV_CLEANUP_BUILDING_SV_SETS, + ) + + def handle(self, *args, **options): + """The actual implementation is in ``_handle()``, splitting to get commit times.""" + self.stderr.write("Removing background svs") + tasks.cleanup_bg_sv_set_task(timeout_hours=options["timeout"]) + self.stderr.write(self.style.SUCCESS(f"All done, have a nice day!")) diff --git a/svs/management/commands/svs_bg_sv_set_dump.py b/svs/management/commands/svs_bg_sv_set_dump.py new file mode 100644 index 000000000..b79399cde --- /dev/null +++ b/svs/management/commands/svs_bg_sv_set_dump.py @@ -0,0 +1,35 @@ +"""Django management command for dumping a ``BackgroundSvSet`` as BED file.""" + +from django.core.management import BaseCommand +from sqlalchemy import select + +from svs.models import BackgroundSvSet, BackgroundSv +from variants.helpers import get_meta, get_engine + + +class Command(BaseCommand): + """Dump a single ``BackgroundSvSet`` as BED file. + """ + + #: Help message displayed on the command line. + help = "List existing background SV sets" + + def add_arguments(self, parser): + """Add the command's argument to the ``parser``.""" + parser.add_argument("bg_sv_set_uuid", help="UUID of bg sv set to dump", default="GRCh37") + + def handle(self, *args, **options): + """The actual implementation is in ``_handle()``, splitting to get commit times.""" + self.stderr.write("Obtaining SV set") + bg_sv_set = BackgroundSvSet.objects.get(sodar_uuid=options["bg_sv_set_uuid"]) + self.stderr.write("Dumping SV set to BED file") + meta = get_meta() + sa_table = meta.tables[BackgroundSv._meta.db_table] + query = ( + select(sa_table.c.chromosome, sa_table.c.start, sa_table.c.end, sa_table.c.sv_type) + .select_from(sa_table) + .where(sa_table.c.bg_sv_set_id == bg_sv_set.id) + ) + for row in get_engine().execute(query): + print("\t".join(map(str, [row.chromosome, row.start - 1, row.end, row.sv_type]))) + self.stderr.write(self.style.SUCCESS(f"All done, have a nice day!")) diff --git a/svs/management/commands/svs_bg_sv_set_list.py b/svs/management/commands/svs_bg_sv_set_list.py new file mode 100644 index 000000000..41e3c8c60 --- /dev/null +++ b/svs/management/commands/svs_bg_sv_set_list.py @@ -0,0 +1,28 @@ +"""Django management command for listing ``BackgroundSvSet`` records.""" + +from django.core.management import BaseCommand +from prettytable import PrettyTable + + +from svs.models import BackgroundSvSet + + +class Command(BaseCommand): + """List all existing ``BackgroundSvSet`` records + """ + + #: Help message displayed on the command line. + help = "List existing background SV sets" + + def add_arguments(self, parser): + """Add the command's argument to the ``parser``.""" + + def handle(self, *args, **options): + """The actual implementation is in ``_handle()``, splitting to get commit times.""" + table = PrettyTable() + table.field_names = ["id", "UUID", "created", "state"] + for bg_sv_set in BackgroundSvSet.objects.order_by("date_created"): + table.add_row( + [bg_sv_set.pk, bg_sv_set.sodar_uuid, bg_sv_set.date_created, bg_sv_set.state] + ) + print(table) diff --git a/svs/management/commands/svs_sv_fill_nulls.py b/svs/management/commands/svs_sv_fill_nulls.py index ce79f83fe..65de2fd3a 100644 --- a/svs/management/commands/svs_sv_fill_nulls.py +++ b/svs/management/commands/svs_sv_fill_nulls.py @@ -9,6 +9,7 @@ from tqdm import tqdm from svs.models import StructuralVariant +from svs import bg_db from variants.models import Case @@ -40,29 +41,6 @@ def handle(self, *args, **options): record.chromosome_no2 = record.chromosome_no record.bin2 = record.bin - record.num_hom_alt = 0 - record.num_hom_ref = 0 - record.num_het = 0 - record.num_hemi_alt = 0 - record.num_hemi_ref = 0 - for k, v in record.genotype.items(): - gt = v["gt"] - k_sex = sex.get(record.case_id, {}).get(k, 0) - if gt == "1": - record.num_hemi_alt += 1 - elif gt == "0": - record.num_hemi_ref += 1 - elif gt in ("0/1", "1/0", "0|1", "1|0"): - record.num_het += 1 - elif gt in ("0/0", "0|0"): - if "x" in record.chromosome.lower() and k_sex == 1: - record.num_hemi_ref += 1 - else: - record.num_hom_ref += 1 - elif gt in ("1|1", "1|1"): - if "x" in record.chromosome.lower() and k_sex == 1: - record.num_hemi_alt += 1 - else: - record.num_hom_alt += 1 + bg_db._fill_null_counts(record, sex) record.save() self.stderr.write("... all done, have a nice day!") diff --git a/svs/management/commands/svs_svs_dump.py b/svs/management/commands/svs_svs_dump.py new file mode 100644 index 000000000..7890401e7 --- /dev/null +++ b/svs/management/commands/svs_svs_dump.py @@ -0,0 +1,67 @@ +"""Django management command for dumping StructuralVariant records as BED files""" + +import sys + +from django.core.management import BaseCommand + +from svs.models import StructuralVariant + + +class Command(BaseCommand): + """Dump all SVs as a BED file. + + Note that the resulting file will not be sorted by (chrom, begin) so the file needs to be reprocessed. + """ + + #: Help message displayed on the command line. + help = "Dump all SVS as a BED file" + + def add_arguments(self, parser): + """Add the command's argument to the ``parser``.""" + parser.add_argument( + "--release", help="The genome build to dump for", required=False, default="GRCh37" + ) + parser.add_argument( + "--chromosome", help="Chromosome to dump for", required=False, default=None + ) + parser.add_argument( + "--output-file", + help="File to write to, leave blank to write to stdout", + required=False, + default=None, + ) + + def handle(self, *args, **options): + """The actual implementation is in ``_handle()``, splitting to get commit times.""" + if options.get("output_file"): + self.stderr.write(f"Writing to {options[output_file]}") + with open(options["output_file"], "wt") as outputf: + record_count = self._handle(options, outputf) + else: + self.stderr.write("Writing to stdout") + record_count = self._handle(options, sys.stdout) + self.stderr.write(self.style.SUCCESS(f"All done, wrote {record_count} records")) + + def _handle(self, options, outputf): + record_count = 0 + filter_args = {"release": options["release"]} + if options["chromosome"] is not None: + filter_args["chromosome"] = options["chromosome"] + for record_count, sv_record in enumerate( + StructuralVariant.objects.filter(**filter_args).iterator() + ): + print( + "\t".join( + map( + str, + [ + sv_record.chromosome, + sv_record.start - 1, + sv_record.end, + f"{sv_record.sv_type}--{sv_record.case_id}", + ], + ) + ), + file=outputf, + ) + return record_count diff --git a/svs/migrations/0017_backgroundsv_backgroundsvset_buildbackgroundsvsetjob_cleanupbackgroundsvsetjob.py b/svs/migrations/0017_backgroundsv_backgroundsvset_buildbackgroundsvsetjob_cleanupbackgroundsvsetjob.py new file mode 100644 index 000000000..303057276 --- /dev/null +++ b/svs/migrations/0017_backgroundsv_backgroundsvset_buildbackgroundsvsetjob_cleanupbackgroundsvsetjob.py @@ -0,0 +1,186 @@ +# Generated by Django 3.2.12 on 2022-06-27 13:40 + +import bgjobs.models +from django.db import migrations, models +import django.db.models.deletion +import uuid + + +class Migration(migrations.Migration): + + dependencies = [ + ("bgjobs", "0006_auto_20200526_1657"), + ("svs", "0016_structuralvariantset_release"), + ] + + operations = [ + migrations.CreateModel( + name="BackgroundSvSet", + fields=[ + ( + "id", + models.AutoField( + auto_created=True, primary_key=True, serialize=False, verbose_name="ID" + ), + ), + ( + "sodar_uuid", + models.UUIDField( + default=uuid.uuid4, help_text="Record SODAR UUID", unique=True + ), + ), + ( + "date_created", + models.DateTimeField(auto_now_add=True, help_text="DateTime of creation"), + ), + ( + "date_modified", + models.DateTimeField(auto_now=True, help_text="DateTime of last modification"), + ), + ("genomebuild", models.CharField(default="GRCh37", max_length=32)), + ( + "varfish_version", + models.CharField( + blank=True, + help_text="Version of varfish server used to build the set", + max_length=32, + null=True, + ), + ), + ( + "state", + models.CharField( + choices=[ + ("initial", "initial"), + ("building", "build in progress"), + ("ready", "ready but not active"), + ("active", "active"), + ("inactive", "inactive"), + ("deleting", "deletion in progress"), + ], + default="initial", + max_length=32, + ), + ), + ], + ), + migrations.CreateModel( + name="CleanupBackgroundSvSetJob", + fields=[ + ( + "id", + models.AutoField( + auto_created=True, primary_key=True, serialize=False, verbose_name="ID" + ), + ), + ( + "date_created", + models.DateTimeField(auto_now_add=True, help_text="DateTime of creation"), + ), + ( + "sodar_uuid", + models.UUIDField(default=uuid.uuid4, help_text="Case SODAR UUID", unique=True), + ), + ( + "bg_job", + models.ForeignKey( + help_text="Background job for state etc.", + on_delete=django.db.models.deletion.CASCADE, + related_name="svs_cleanupbackgroundsvsetjob_related", + to="bgjobs.backgroundjob", + ), + ), + ], + options={"ordering": ("-date_created",), "abstract": False,}, + bases=(bgjobs.models.JobModelMessageMixin, models.Model), + ), + migrations.CreateModel( + name="BuildBackgroundSvSetJob", + fields=[ + ( + "id", + models.AutoField( + auto_created=True, primary_key=True, serialize=False, verbose_name="ID" + ), + ), + ( + "date_created", + models.DateTimeField(auto_now_add=True, help_text="DateTime of creation"), + ), + ( + "sodar_uuid", + models.UUIDField(default=uuid.uuid4, help_text="Case SODAR UUID", unique=True), + ), + ("genomebuild", models.CharField(default="GRCh37", max_length=32)), + ( + "bg_job", + models.ForeignKey( + help_text="Background job for state etc.", + on_delete=django.db.models.deletion.CASCADE, + related_name="svs_buildbackgroundsvsetjob_related", + to="bgjobs.backgroundjob", + ), + ), + ], + options={"ordering": ("-date_created",), "abstract": False,}, + bases=(bgjobs.models.JobModelMessageMixin, models.Model), + ), + migrations.CreateModel( + name="BackgroundSv", + fields=[ + ( + "id", + models.AutoField( + auto_created=True, primary_key=True, serialize=False, verbose_name="ID" + ), + ), + ("release", models.CharField(max_length=32)), + ("chromosome", models.CharField(max_length=32)), + ("chromosome_no", models.IntegerField()), + ("start", models.IntegerField()), + ("chromosome2", models.CharField(max_length=32)), + ("chromosome_no2", models.IntegerField()), + ("end", models.IntegerField()), + ( + "pe_orientation", + models.CharField( + blank=True, + choices=[ + ("3to3", "3to3"), + ("3to5", "3to5"), + ("5to3", "5to3"), + ("5to5", "5to5"), + ], + max_length=4, + null=True, + ), + ), + ( + "sv_type", + models.CharField( + choices=[ + ("DEL", "deletion"), + ("DUP", "duplication"), + ("INS", "insertion"), + ("INV", "inversion"), + ("BND", "breakend"), + ("CNV", "copy number variation"), + ], + max_length=32, + ), + ), + ("bin", models.IntegerField()), + ("src_count", models.IntegerField()), + ("carriers", models.IntegerField()), + ("carriers_het", models.IntegerField()), + ("carriers_hom", models.IntegerField()), + ("carriers_hemi", models.IntegerField()), + ( + "bg_sv_set", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, to="svs.backgroundsvset" + ), + ), + ], + ), + ] diff --git a/svs/migrations/0018_extend_structuralvariant.py b/svs/migrations/0018_extend_structuralvariant.py index 1d5abf4a0..2141629ea 100644 --- a/svs/migrations/0018_extend_structuralvariant.py +++ b/svs/migrations/0018_extend_structuralvariant.py @@ -92,7 +92,19 @@ ADD num_het integer NULL, ADD num_hemi_alt integer NULL, ADD num_hemi_ref integer NULL; - """ + """, + r""" + ALTER TABLE svs_structuralvariant + DROP COLUMN chromosome2, + DROP COLUMN chromosome_no2, + DROP COLUMN bin2, + DROP COLUMN pe_orientation, + DROP COLUMN num_hom_alt, + DROP COLUMN num_hom_ref, + DROP COLUMN num_het, + DROP COLUMN num_hemi_alt, + DROP COLUMN num_hemi_ref; + """, ), ] diff --git a/svs/migrations/0019_backgroundsv_svs_backgro_bg_sv_s_72b34f_idx.py b/svs/migrations/0019_backgroundsv_svs_backgro_bg_sv_s_72b34f_idx.py new file mode 100644 index 000000000..a6ab44bde --- /dev/null +++ b/svs/migrations/0019_backgroundsv_svs_backgro_bg_sv_s_72b34f_idx.py @@ -0,0 +1,20 @@ +# Generated by Django 3.2.12 on 2022-07-01 09:40 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ("svs", "0018_extend_structuralvariant"), + ] + + operations = [ + migrations.AddIndex( + model_name="backgroundsv", + index=models.Index( + fields=["bg_sv_set_id", "release", "chromosome", "bin"], + name="svs_backgro_bg_sv_s_72b34f_idx", + ), + ), + ] diff --git a/svs/models.py b/svs/models.py index 1bf84b85a..deafbb29a 100644 --- a/svs/models.py +++ b/svs/models.py @@ -19,7 +19,7 @@ from projectroles.plugins import get_backend_api from sqlalchemy import and_ -from variants.models import Case, VARIANT_RATING_CHOICES, VariantImporterBase +from variants.models import Case, VARIANT_RATING_CHOICES, VariantImporterBase, SiteBgJobBase from variants.helpers import get_meta #: Django user model. @@ -616,3 +616,134 @@ def run_import_structural_variants_bg_job(pk): % (import_job.case_name, elapsed.total_seconds()), status_type="OK", ) + + +class BackgroundSvSetManager(models.Manager): + def latest_active_if_any(self): # -> typing.Optional[BackgroundSvSet] + qs = self.filter(state="active").order_by("-date_created") + if qs.count(): + return qs[0] + else: + return None + + +class BackgroundSvSet(models.Model): + """A set of ``BackgroundSv`` records.""" + + objects = BackgroundSvSetManager() + + #: Record UUID. + sodar_uuid = models.UUIDField( + default=uuid_object.uuid4, unique=True, help_text="Record SODAR UUID" + ) + + #: DateTime of creation + date_created = models.DateTimeField(auto_now_add=True, help_text="DateTime of creation") + #: DateTime of last modification + date_modified = models.DateTimeField(auto_now=True, help_text="DateTime of last modification") + + #: Release of genomebuild + genomebuild = models.CharField(max_length=32, default="GRCh37") + #: VarFish version used for building the set. + varfish_version = models.CharField( + max_length=32, + blank=True, + null=True, + help_text="Version of varfish server used to build the set", + ) + #: Status of the set (goes linearly through the states). + state = models.CharField( + max_length=32, + choices=( + ("initial", "initial"), + ("building", "build in progress"), + ("ready", "ready but not active"), + ("active", "active"), + ("inactive", "inactive"), + ("deleting", "deletion in progress"), + ), + default="initial", + ) + + +class BackgroundSv(models.Model): + """A structural variant background record.""" + + #: The ``BackgroundSvSet`` that this record belongs to. + bg_sv_set = models.ForeignKey(to=BackgroundSvSet, on_delete=models.CASCADE) + + #: Genome build + release = models.CharField(max_length=32) + #: Variant coordinates - chromosome + chromosome = models.CharField(max_length=32) + #: Chromosome as number + chromosome_no = models.IntegerField() + #: Variant coordinates - start position + start = models.IntegerField() + #: Variant coordinates - chromosome of end position + chromosome2 = models.CharField(max_length=32) + #: Chromosome as number (of end position) + chromosome_no2 = models.IntegerField() + #: Variant coordinates - end position + end = models.IntegerField() + #: Paried-end orientation + pe_orientation = models.CharField( + max_length=4, + choices=(("3to3", "3to3"), ("3to5", "3to5"), ("5to3", "5to3"), ("5to5", "5to5")), + blank=True, + null=True, + ) + + #: The type of the structural variant. + sv_type = models.CharField(max_length=32, choices=SV_TYPE_CHOICES) + #: The bin for indexing. + bin = models.IntegerField() + + #: Number of source records + src_count = models.IntegerField() + #: Number of carriers + carriers = models.IntegerField() + #: Number of het. carriers + carriers_het = models.IntegerField() + #: Number of hom. carriers + carriers_hom = models.IntegerField() + #: Number of hemizygous carriers + carriers_hemi = models.IntegerField() + + class Meta: + indexes = (models.Index(fields=["bg_sv_set_id", "release", "chromosome", "bin"]),) + + +class BuildBackgroundSvSetJob(SiteBgJobBase): + """Background job for building a SV background set.""" + + #: Task description for logging. + task_desc = 'Refreshing background SV set (aka "in-house database")' + + #: String identifying model in BackgroundJob. + spec_name = "svs.build_bg_sv_set" + + #: The release to build the background sv set for. + genomebuild = models.CharField(max_length=32, default="GRCh37") + + def get_human_readable_type(self): + return "Build background SV set" + + def get_absolute_url(self): + return reverse("svs:build-bg-sv-set-job-detail", kwargs={"job": self.sodar_uuid},) + + +class CleanupBackgroundSvSetJob(SiteBgJobBase): + """Background job cleaning up building background SV sets.""" + + #: Task description for logging. + task_desc = "Cleaning up building background SV sets" + + #: String identifying model in BackgroundJob. + spec_name = "svs.cleanup_bg_sv_sets" + + def get_human_readable_type(self): + return "Cleanup building background SV set" + + def get_absolute_url(self): + return reverse("svs:cleanup-bg-sv-set-job-detail", kwargs={"job": self.sodar_uuid},) diff --git a/svs/plugins.py b/svs/plugins.py index 35cdf589a..9836e89a2 100644 --- a/svs/plugins.py +++ b/svs/plugins.py @@ -6,6 +6,8 @@ StructuralVariantComment, StructuralVariantFlags, ImportStructuralVariantBgJob, + BuildBackgroundSvSetJob, + CleanupBackgroundSvSetJob, ) from .urls import urlpatterns @@ -65,7 +67,11 @@ class BackgroundJobsPlugin(BackgroundJobsPluginPoint): title = "SVs Background Jobs" #: Return name-to-class mapping for background job class specializations. - job_specs = {ImportStructuralVariantBgJob.spec_name: ImportStructuralVariantBgJob} + job_specs = { + ImportStructuralVariantBgJob.spec_name: ImportStructuralVariantBgJob, + BuildBackgroundSvSetJob.spec_name: BuildBackgroundSvSetJob, + CleanupBackgroundSvSetJob.spec_name: CleanupBackgroundSvSetJob, + } def get_extra_data_link(self, _extra_data, _name): """Return a link for timeline label starting with 'extra-'""" diff --git a/svs/queries.py b/svs/queries.py index 3c4884c07..95e5bb2e8 100644 --- a/svs/queries.py +++ b/svs/queries.py @@ -1,6 +1,7 @@ """Building queries for structural variants based on the ``QueryParts`` infrastructure from ``variants``.""" import enum +import logging import operator import sys from itertools import chain @@ -21,6 +22,8 @@ StructuralVariantComment, StructuralVariantFlags, StructuralVariantSet, + BackgroundSv, + BackgroundSvSet, ) from genomicfeatures.models import ( EnsemblRegulatoryFeature, @@ -40,6 +43,9 @@ ExtendQueryPartsCaseJoinAndFilter as _ExtendQueryPartsCaseJoinAndFilter, ) +#: Logger to use in this module. +LOGGER = logging.getLogger(__name__) + def overlaps(lhs, rhs, min_overlap=None, rhs_start=None, rhs_end=None, padding=None): """Returns term of ``lhs`` overlapping with ``rhs`` based on the start/end fields. @@ -117,7 +123,10 @@ def structural_variant_query(_self, kwargs): ], selectable=StructuralVariant.sa.table.outerjoin( StructuralVariantGeneAnnotation.sa.table, - StructuralVariantGeneAnnotation.sa.sv_uuid == StructuralVariant.sa.sv_uuid, + and_( + StructuralVariantGeneAnnotation.sa.sv_uuid == StructuralVariant.sa.sv_uuid, + StructuralVariantGeneAnnotation.sa.case_id == StructuralVariant.sa.case_id, + ), ), conditions=[], ) @@ -316,6 +325,7 @@ class ExtendQueryPartsPublicDatabaseFrequencyJoinAndFilter(ExtendQueryPartsBase) ("exac", ExacCnv, func.count()), ("dbvar", DbVarSv, func.sum(DbVarSv.sa.num_carriers)), ("gnomad", GnomAdSv, func.sum(GnomAdSv.sa.n_het + GnomAdSv.sa.n_homalt)), + ("inhouse", BackgroundSv, func.sum(BackgroundSv.sa.carriers)), ) def __init__(self, *args, **kwargs): @@ -333,22 +343,29 @@ def _build_subqueries(self): model_start = model.sa.start model_end = model.sa.end min_overlap = float(self.kwargs.get("%s_min_overlap" % token, "0.75")) + where_terms = [ + # TODO: type mapping -- interesting/necessary? + # StructuralVariant.sa.sv_type == model.sa.sv_type, + overlaps( + StructuralVariant, + model, + min_overlap=min_overlap, + rhs_start=model_start, + rhs_end=model_end, + ), + ] + if token == "inhouse": + latest_set = BackgroundSvSet.objects.latest_active_if_any() + if not latest_set: + LOGGER.warning("Could not find a latest BackgroundSvSet to use") + continue # don't have a set + else: + LOGGER.info("Using latest BackgroundSvSet: %s" % latest_set.sodar_uuid) + where_terms.append(model.sa.bg_sv_set_id == latest_set.pk) result[token] = ( select([observed_events.label("observed_events")]) .select_from(model.sa) - .where( - and_( - # TODO: type mapping -- interesting/necessary? - # StructuralVariant.sa.sv_type == model.sa.sv_type, - overlaps( - StructuralVariant, - model, - min_overlap=min_overlap, - rhs_start=model_start, - rhs_end=model_end, - ) - ) - ) + .where(and_(*where_terms)) .alias("subquery_%s_inner" % token) ).lateral("subquery_%s_outer" % token) return result @@ -357,8 +374,9 @@ def _build_fields(self, subqueries): # NB: subqueries is given as a parameter here to highlight the dependency between the two helper functions fields = {} for token, _, observed_events in self.TOKEN_MODEL_FIELD: - field = func.coalesce(subqueries[token].c.observed_events, 0) - fields["%s_overlap_count" % token] = field.label("%s_overlap_count" % token) + if token in subqueries: # e.g., "inhouse" might not exist yet + field = func.coalesce(subqueries[token].c.observed_events, 0) + fields["%s_overlap_count" % token] = field.label("%s_overlap_count" % token) return fields def extend_fields(self, _query_parts): @@ -482,29 +500,6 @@ def extend_selectable(self, query_parts): return query_parts.selectable.outerjoin(self.subquery, true()) -class ExtendQueryPartsInHouseDatabaseFilter(ExtendQueryPartsBase): - """Extend ``QueryParts`` for filtering for in-house database occurence.""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def extend_conditions(self, _query_parts): - result = [] - for token in ("affected", "unaffected", "background"): - for minmax in ("min", "max"): - if ( - self.kwargs["collective_enabled"] - and self.kwargs["cohort_%s_carriers_%s" % (token, minmax)] is not None - ): - field = StructuralVariant.sa.info["%sCarriers" % token].astext.cast(Integer) - thresh = self.kwargs["cohort_%s_carriers_%s" % (token, minmax)] - term = field >= thresh if minmax == "min" else field <= thresh - result.append( - or_(StructuralVariant.sa.info["%sCarriers" % token].is_(None), term) - ) - return result - - class ExtendQueryPartsVariantEffectFilter(ExtendQueryPartsBase): """Extend ``QueryParts`` for filtering for variant effect, whether a transcript overlap is required, and whether coding and/or non-coding transcripts are accepted. @@ -948,7 +943,6 @@ def extend_selectable(self, query_parts): ExtendQueryPartsPublicDatabaseFrequencyJoinAndFilter, ExtendQueryPartsFlagsJoinAndFilter, ExtendQueryPartsCommentsJoinAndFilter, - ExtendQueryPartsInHouseDatabaseFilter, ExtendQueryPartsVariantEffectFilter, ExtendQueryPartsGenesJoinAndFilter, ExtendQueryPartsTadBoundaryDistanceJoin, @@ -973,7 +967,13 @@ def __init__(self, case, engine, query_id=None): self.query_id = query_id def run(self, kwargs): - order_by = [column("chromosome"), column("start"), column("end"), column("sv_type")] + order_by = [ + column("chromosome"), + column("start"), + column("end"), + column("sv_type"), + column("sv_uuid"), + ] stmt = ( self.builder(self.case_or_cases, self.query_id).run(kwargs).to_stmt(order_by=order_by) ) @@ -986,7 +986,6 @@ def run(self, kwargs): ), file=sys.stderr, ) - return self.engine.execute(stmt) diff --git a/svs/tasks.py b/svs/tasks.py index ddb55643c..7a103cde1 100644 --- a/svs/tasks.py +++ b/svs/tasks.py @@ -1,7 +1,17 @@ +from django.conf import settings +from django.contrib.auth import get_user_model +from django.db import transaction + from config.celery import app from celery.schedules import crontab -from . import models +from bgjobs.models import BackgroundJob +from svs import models, bg_db + +#: The User model to use. +from svs.models import BuildBackgroundSvSetJob, CleanupBackgroundSvSetJob + +User = get_user_model() @app.task(bind=True) @@ -16,6 +26,36 @@ def run_import_structural_variants_bg_job(_self, import_variants_bg_job_pk): return models.run_import_structural_variants_bg_job(pk=import_variants_bg_job_pk) +@app.task(bind=True) +def build_bg_sv_set_task(_self, *, log_to_stderr=False, chromosomes=None): + """Rebuild the background SV set""" + with transaction.atomic(): + generic_bg_job = BackgroundJob.objects.create( + name="build background sv set", + project=None, + job_type="svs.build_bg_sv_set", + user=User.objects.get(username=settings.PROJECTROLES_ADMIN_OWNER), + ) + spec_bg_job = BuildBackgroundSvSetJob.objects.create(bg_job=generic_bg_job) + + bg_db.build_bg_sv_set(spec_bg_job, log_to_stderr=log_to_stderr, chromosomes=chromosomes) + + +@app.task(bind=True) +def cleanup_bg_sv_set_task(_self, *, timeout_hours=None): + """Cleanup old background SV sets""" + with transaction.atomic(): + generic_bg_job = BackgroundJob.objects.create( + name="cleanup background sv sets", + project=None, + job_type="svs.cleanup_bg_sv_sets", + user=User.objects.get(username=settings.PROJECTROLES_ADMIN_OWNER), + ) + spec_bg_job = CleanupBackgroundSvSetJob.objects.create(bg_job=generic_bg_job) + + bg_db.cleanup_bg_sv_sets(spec_bg_job, timeout_hours=timeout_hours) + + @app.on_after_finalize.connect def setup_periodic_tasks(sender, **_kwargs): """Register periodic tasks""" @@ -23,3 +63,6 @@ def setup_periodic_tasks(sender, **_kwargs): sender.add_periodic_task( schedule=crontab(minute=11), sig=clear_inactive_structural_variant_sets.s() ) + # Regularly create new bg sv set and clean up old ones + sender.add_periodic_task(schedule=crontab(day_of_week="sunday", sig=build_bg_sv_set_task.s())) + sender.add_periodic_task(schedule=crontab(day_of_week="sunday", sig=cleanup_bg_sv_set_task.s())) diff --git a/svs/templates/svs/build_bg_job_detail.html b/svs/templates/svs/build_bg_job_detail.html new file mode 100644 index 000000000..2e992587e --- /dev/null +++ b/svs/templates/svs/build_bg_job_detail.html @@ -0,0 +1,99 @@ +{% extends 'projectroles/project_base.html' %} +{% load projectroles_common_tags %} +{% load rules %} +{% load crispy_forms_tags %} +{% load dict %} +{% load humanize %} +{% load static %} +{% load json %} + +{% block title %} + Build Background SV Set Job: {{ object.bg_job.name }} +{% endblock title %} + +{% block navi_sub_project_extend %} + + + +{% endblock %} + +{% block projectroles %} +
+ {# Project menu dropdown, only visible if browser width < X and sidebar is hidden #} + {% include 'projectroles/_project_menu_btn.html' %} + +

+ Background Import Job + {{ object.bg_job.name }} +

+ {% include 'variants/_job_view_buttons.html' with project=project folder=folder %} +
+ +
+ +
+
+

+ Details +

+
+
+
    +
  • +
    Created
    +
    {{ object.bg_job.date_created }}
    +
  • +
  • +
    Updated
    +
    {{ object.bg_job.date_modified }}
    +
  • +
  • +
    Creator
    +
    {{ object.bg_job.user }}
    +
  • +
  • +
    Title
    +
    {{ object.bg_job.name }}
    +
  • +
  • +
    Description
    +
    {{ object.bg_job.description|default:"-" }}
    +
  • +
  • +
    Status
    +
    {{ object.bg_job.status }}
    +
  • +
+
+
+ +
+
+

+ Log Message +

+
+
+
    + {% if object.bg_job.log_entries.all %} + {% for entry in object.bg_job.log_entries.all %} +
  • +
    {{ entry.date_created }}
    +
    {{ entry.level }}
    +
    {{ entry.message }}
    +
  • + {% endfor %} + {% else %} +
  • + No log entries yet. +
  • + {% endif %} +
+
+
+ +
+ +{% endblock %} diff --git a/svs/templates/svs/cleanup_bg_job_detail.html b/svs/templates/svs/cleanup_bg_job_detail.html new file mode 100644 index 000000000..b37b55c5b --- /dev/null +++ b/svs/templates/svs/cleanup_bg_job_detail.html @@ -0,0 +1,99 @@ +{% extends 'projectroles/project_base.html' %} +{% load projectroles_common_tags %} +{% load rules %} +{% load crispy_forms_tags %} +{% load dict %} +{% load humanize %} +{% load static %} +{% load json %} + +{% block title %} + Cleanup Background SV Set Job: {{ object.bg_job.name }} +{% endblock title %} + +{% block navi_sub_project_extend %} + + + +{% endblock %} + +{% block projectroles %} +
+ {# Project menu dropdown, only visible if browser width < X and sidebar is hidden #} + {% include 'projectroles/_project_menu_btn.html' %} + +

+ Background Import Job + {{ object.bg_job.name }} +

+ {% include 'variants/_job_view_buttons.html' with project=project folder=folder %} +
+ +
+ +
+
+

+ Details +

+
+
+
    +
  • +
    Created
    +
    {{ object.bg_job.date_created }}
    +
  • +
  • +
    Updated
    +
    {{ object.bg_job.date_modified }}
    +
  • +
  • +
    Creator
    +
    {{ object.bg_job.user }}
    +
  • +
  • +
    Title
    +
    {{ object.bg_job.name }}
    +
  • +
  • +
    Description
    +
    {{ object.bg_job.description|default:"-" }}
    +
  • +
  • +
    Status
    +
    {{ object.bg_job.status }}
    +
  • +
+
+
+ +
+
+

+ Log Message +

+
+
+
    + {% if object.bg_job.log_entries.all %} + {% for entry in object.bg_job.log_entries.all %} +
  • +
    {{ entry.date_created }}
    +
    {{ entry.level }}
    +
    {{ entry.message }}
    +
  • + {% endfor %} + {% else %} +
  • + No log entries yet. +
  • + {% endif %} +
+
+
+ +
+ +{% endblock %} diff --git a/svs/templates/svs/filter_form/frequency.html b/svs/templates/svs/filter_form/frequency.html index e30e1c004..a2edaf30a 100644 --- a/svs/templates/svs/filter_form/frequency.html +++ b/svs/templates/svs/filter_form/frequency.html @@ -59,50 +59,11 @@
{{ form.gnomad_min_overlap|as_crispy_field }} {{ form.gnomad_max_carriers|as_crispy_field }} - - - - - -
-
-
- Frequencies in Analysis Collective -
- -

- In the following, you can configure structural variant filter setting based on counts within the collective of samples that this case was analyzed within. - Leave fields empty to exclude them from the query. -

- - - - - - - - - - - - - - - - - - - - - - - - - - + + + +
- enable / donors - min carriersmax carriers
{{ form.collective_enabled|as_crispy_field }}collective background{{ form.cohort_background_carriers_min|as_crispy_field }}{{ form.cohort_background_carriers_max|as_crispy_field }}
pedigree affected{{ form.cohort_affected_carriers_min|as_crispy_field }}{{ form.cohort_affected_carriers_max|as_crispy_field }}
pedigree unaffected{{ form.cohort_unaffected_carriers_min|as_crispy_field }}{{ form.cohort_unaffected_carriers_max|as_crispy_field }}{{ form.inhouse_enabled|as_crispy_field }}in-house background{{ form.inhouse_min_overlap|as_crispy_field }}{{ form.inhouse_max_carriers|as_crispy_field }}
diff --git a/svs/templates/svs/filter_form/misc.html b/svs/templates/svs/filter_form/misc.html index aacb22f99..22f9ac646 100644 --- a/svs/templates/svs/filter_form/misc.html +++ b/svs/templates/svs/filter_form/misc.html @@ -2,7 +2,13 @@ {% load dict %}
-
- misc +
+
+ Miscellaneous Settings +
+
+ +
+ {{ form.result_rows_limit|as_crispy_field }}
diff --git a/svs/templates/svs/filter_result/header.html b/svs/templates/svs/filter_result/header.html index c815c4166..dec451ba0 100644 --- a/svs/templates/svs/filter_result/header.html +++ b/svs/templates/svs/filter_result/header.html @@ -12,7 +12,7 @@ length {# position #} Gene(s) - Collective Count + in-house 1KG dbVar ExAC diff --git a/svs/templates/svs/filter_result/row_sv.html b/svs/templates/svs/filter_result/row_sv.html index 835018a76..b0216a011 100644 --- a/svs/templates/svs/filter_result/row_sv.html +++ b/svs/templates/svs/filter_result/row_sv.html @@ -78,14 +78,8 @@ {% endif %} - - {% spaceless %} - {{ first_entry.info.affectedCarriers }} - / - {{ first_entry.info.unaffectedCarriers }} - / - {{ first_entry.info.backgroundCarriers }} - {% endspaceless %} + + {{ first_entry.inhouse_overlap_count|default:0|intcomma }} {{ first_entry.g1k_overlap_count|default:0|intcomma }} diff --git a/svs/templates/svs/filter_result/table.html b/svs/templates/svs/filter_result/table.html index 9ba57d719..48d362dd2 100644 --- a/svs/templates/svs/filter_result/table.html +++ b/svs/templates/svs/filter_result/table.html @@ -19,8 +19,9 @@

- {{ rows_by_sv|length|intcomma }} SV calls - {% if object %}(case has a total of {{ object.num_svs|intcomma }}{% endif %} SV calls) + Displaying {{ rows_by_sv|length|intcomma }} SV calls. + {% if num_results >= results_limit %}(truncated, total in result: {{ num_results }}){% endif %} + {% if object %}Case has a total of {{ object.num_svs|intcomma }} SV calls.{% endif %}
Using diff --git a/svs/tests/factories.py b/svs/tests/factories.py index 51c7d1c5a..8a7651446 100644 --- a/svs/tests/factories.py +++ b/svs/tests/factories.py @@ -17,6 +17,8 @@ StructuralVariantFlags, StructuralVariantComment, StructuralVariantSet, + BackgroundSv, + BackgroundSvSet, ) import typing import attr @@ -52,11 +54,11 @@ class Params: release = "GRCh37" chromosome = factory.Iterator(list(map(str, range(1, 23))) + ["X", "Y"]) chromosome_no = factory.Iterator(list(range(1, 25))) - chromosome2 = factory.Iterator(list(map(str, range(1, 23))) + ["X", "Y"]) - chromosome_no2 = factory.Iterator(list(range(1, 25))) + chromosome2 = factory.LazyAttribute(lambda obj: obj.chromosome) + chromosome_no2 = factory.LazyAttribute(lambda obj: obj.chromosome_no) start = factory.Sequence(lambda n: (n + 1) * 100) end = factory.Sequence(lambda n: (n + 1) * 100 + 100) - pe_orientation = None + pe_orientation = "3to5" start_ci_left = -100 start_ci_right = 100 @@ -83,11 +85,25 @@ class Params: } ) - num_hom_alt = 0 - num_hom_ref = 0 - num_het = 0 - num_hemi_alt = 0 - num_hemi_ref = 0 + @factory.lazy_attribute + def num_hom_alt(self): + return len([k for k, v in self.genotype.items() if v["gt"] in ("1/1", "1|1")]) + + @factory.lazy_attribute + def num_hom_ref(self): + return len([k for k, v in self.genotype.items() if v["gt"] in ("0/0", "0|0")]) + + @factory.lazy_attribute + def num_het(self): + return len([k for k, v in self.genotype.items() if v["gt"] in ("0/1", "0|1", "1/0", "1|0")]) + + @factory.lazy_attribute + def num_hemi_ref(self): + return len([k for k, v in self.genotype.items() if v["gt"] == "0"]) + + @factory.lazy_attribute + def num_hemi_alt(self): + return len([k for k, v in self.genotype.items() if v["gt"] == "1"]) @factory.lazy_attribute def bin(self): @@ -273,3 +289,36 @@ class FormDataFactory: regulatory_vista_any_validation: bool = False regulatory_vista_positive: bool = False regulatory_vista_negative: bool = False + + +class BackgroundSvSetFactory(factory.django.DjangoModelFactory): + class Meta: + model = BackgroundSvSet + + genomebuild = "GRCh37" + varfish_version = "1.2.0" + state = "active" + + +class BackgroundSvFactory(factory.django.DjangoModelFactory): + class Meta: + model = BackgroundSv + + bg_sv_set = factory.SubFactory(BackgroundSvSetFactory,) + + release = "GRCh37" + chromosome = factory.Iterator(list(map(str, range(1, 23))) + ["X", "Y"]) + chromosome_no = factory.Iterator(list(range(1, 25))) + start = factory.Sequence(lambda n: (n + 1) * 100) + chromosome2 = factory.Iterator(list(map(str, range(1, 23))) + ["X", "Y"]) + chromosome_no2 = factory.Iterator(list(range(1, 25))) + end = factory.Sequence(lambda n: (n + 1) * 100 + 100) + pe_orientation = "3to5" + sv_type = "DEL" + bin = factory.LazyAttribute(lambda obj: binning.assign_bin(obj.start, obj.end)) + + src_count = 1 + carriers = 1 + carriers_het = 1 + carriers_hom = 0 + carriers_hemi = 0 diff --git a/svs/tests/test_bg_db.py b/svs/tests/test_bg_db.py new file mode 100644 index 000000000..57a292b34 --- /dev/null +++ b/svs/tests/test_bg_db.py @@ -0,0 +1,562 @@ +import random +import typing + +import attrs +from bgjobs.models import BackgroundJob +from test_plus.test import TestCase + +from svs import bg_db +from django.contrib.auth import get_user_model + +#: The User model to use. +from svs.models import BuildBackgroundSvSetJob, BackgroundSvSet, BackgroundSv + +User = get_user_model() + +from svs.tests.factories import StructuralVariantFactory + + +class TestSvTypes(TestCase): + def testSmokeTest(self): + self.assertIn("DEL", bg_db.SV_TYPES) + + +class TestClusterAlgoParams(TestCase): + def testConstructionDefaultValues(self): + params = bg_db.ClusterAlgoParams() + self.assertEqual( + str(params), + "ClusterAlgoParams(seed=42, cluster_max_size=500, cluster_size_sample_to=100, " + "min_jaccard_overlap=0.7, bnd_slack=50)", + ) + + +class TestFileNamSafe(TestCase): + def testSimple(self): + self.assertEqual(bg_db._file_name_safe("This is a string /"), "This_is_a_string__") + + +class TestPairedEndOrientation(TestCase): + def testValues(self): + self.assertEqual(bg_db.PairedEndOrientation.FIVE_TO_THREE.value, "5to3") + self.assertEqual(bg_db.PairedEndOrientation.THREE_TO_FIVE.value, "3to5") + self.assertEqual(bg_db.PairedEndOrientation.THREE_TO_THREE.value, "3to3") + self.assertEqual(bg_db.PairedEndOrientation.FIVE_TO_FIVE.value, "5to5") + + +class TestSvRecord(TestCase): + def setUp(self): + self.maxDiff = None + + def testConstruction(self): + record = bg_db.SvRecord( + release="GRCh37", + sv_type="DEL", + chrom="7", + pos=123, + chrom2="7", + end=456, + orientation=bg_db.PairedEndOrientation.THREE_TO_FIVE, + counts=bg_db.GenotypeCounts(src_count=1, carriers=1, carriers_het=1), + ) + self.assertEqual( + str(record), + "SvRecord(release='GRCh37', sv_type='DEL', chrom='7', pos=123, chrom2='7', " + "end=456, orientation=, " + "counts=GenotypeCounts(src_count=1, carriers=1, carriers_het=1, carriers_hom=0, carriers_hemi=0))", + ) + + def testJaccardIndex(self): + record = bg_db.SvRecord( + release="GRCh37", + sv_type="DEL", + chrom="7", + pos=1001, + chrom2="7", + end=2000, + orientation=bg_db.PairedEndOrientation.THREE_TO_FIVE, + ) + with self.assertRaises(ValueError): + record.jaccard_index(attrs.evolve(record, release="GRCh38")) + with self.assertRaises(ValueError): + record.jaccard_index(attrs.evolve(record, sv_type="DUP")) + self.assertAlmostEqual( + record.jaccard_index(attrs.evolve(record, chrom="8", chrom2="8")), 0.0 + ) + self.assertAlmostEqual(record.jaccard_index(record), 1.0, delta=1e-7) + self.assertAlmostEqual( + record.jaccard_index(attrs.evolve(record, pos=501, end=1500)), 0.333, delta=0.001 + ) + + def testIsCompatibleDel(self): + record = bg_db.SvRecord( + release="GRCh37", + sv_type="DEL", + chrom="7", + pos=1001, + chrom2="7", + end=2000, + orientation=bg_db.PairedEndOrientation.THREE_TO_FIVE, + ) + record_good = attrs.evolve(record, pos=2000, end=2001) + record_bad = attrs.evolve(record, pos=2001, end=2001) + self.assertTrue(record.is_compatible(record_good, bnd_slack=50)) + self.assertFalse(record.is_compatible(record_bad, bnd_slack=50)) + + def testIsCompatibleBnd(self): + record = bg_db.SvRecord( + release="GRCh37", + sv_type="BND", + chrom="7", + pos=1000, + chrom2="9", + end=2000, + orientation=bg_db.PairedEndOrientation.THREE_TO_FIVE, + ) + record_good_1 = attrs.evolve(record, pos=1000, end=2050) + record_good_2 = attrs.evolve(record, pos=1050, end=2050) + record_bad_1 = attrs.evolve(record, orientation=bg_db.PairedEndOrientation.THREE_TO_THREE) + record_bad_2 = attrs.evolve(record, pos=1052) + record_bad_3 = attrs.evolve(record, end=2052) + self.assertTrue(record.is_compatible(record_good_1, bnd_slack=50)) + self.assertTrue(record.is_compatible(record_good_2, bnd_slack=50)) + self.assertFalse(record.is_compatible(record_bad_1, bnd_slack=50)) + self.assertFalse(record.is_compatible(record_bad_2, bnd_slack=50)) + self.assertFalse(record.is_compatible(record_bad_3, bnd_slack=50)) + + +class TestSvClusterWithDel(TestCase): + def setUp(self): + self.params = bg_db.ClusterAlgoParams(cluster_max_size=5, cluster_size_sample_to=3) + self.rng = random.Random(self.params.seed) + self.maxDiff = None + + def testConstruction(self): + cluster = bg_db.SvCluster(params=self.params, rng=self.rng) + self.maxDiff = None + self.assertEqual( + str(cluster), + "SvCluster(params=ClusterAlgoParams(seed=42, cluster_max_size=5, cluster_size_sample_to=3, " + "min_jaccard_overlap=0.7, bnd_slack=50), mean=None, records=[], " + "counts=GenotypeCounts(src_count=0, carriers=0, carriers_het=0, carriers_hom=0, carriers_hemi=0))", + ) + + def testAugmentOnce(self): + cluster = bg_db.SvCluster(params=self.params, rng=self.rng) + record = bg_db.SvRecord( + release="GRCh37", + sv_type="DEL", + chrom="7", + pos=1001, + chrom2="7", + end=2000, + orientation=bg_db.PairedEndOrientation.THREE_TO_FIVE, + counts=bg_db.GenotypeCounts(src_count=1, carriers=1, carriers_het=1), + ) + cluster.augment(record) + self.assertEqual( + str(cluster.mean), + "SvRecord(release='GRCh37', sv_type='DEL', chrom='7', pos=1001, " + "chrom2='7', end=2000, orientation=, " + "counts=GenotypeCounts(src_count=1, carriers=1, carriers_het=1, carriers_hom=0, carriers_hemi=0))", + ) + self.assertEqual(len(cluster.records), 1) + + def testAugmentTwice(self): + cluster = bg_db.SvCluster(params=self.params, rng=self.rng) + record = bg_db.SvRecord( + release="GRCh37", + sv_type="DEL", + chrom="7", + pos=1001, + chrom2="7", + end=2000, + orientation=bg_db.PairedEndOrientation.THREE_TO_FIVE, + counts=bg_db.GenotypeCounts(src_count=1, carriers=1, carriers_het=1), + ) + cluster.augment(record) + cluster.augment(attrs.evolve(record, pos=1501, end=2500)) + self.assertEqual( + str(cluster.mean), + "SvRecord(release='GRCh37', sv_type='DEL', chrom='7', pos=1251, " + "chrom2='7', end=2250, orientation=, " + "counts=GenotypeCounts(src_count=1, carriers=1, carriers_het=1, carriers_hom=0, carriers_hemi=0))", + ) + self.assertEqual( + cluster.counts, bg_db.GenotypeCounts(src_count=2, carriers=2, carriers_het=2) + ) + self.assertEqual(len(cluster.records), 2) + + +class TestSvClusterWithBnd(TestCase): + def setUp(self): + self.params = bg_db.ClusterAlgoParams(cluster_max_size=5, cluster_size_sample_to=3) + self.rng = random.Random(self.params.seed) + self.maxDiff = None + + def testAugmentTwiceCompatible(self): + cluster = bg_db.SvCluster(params=self.params, rng=self.rng) + record = bg_db.SvRecord( + release="GRCh37", + sv_type="BND", + chrom="7", + pos=1001, + chrom2="9", + end=2000, + orientation=bg_db.PairedEndOrientation.THREE_TO_FIVE, + counts=bg_db.GenotypeCounts(src_count=1, carriers=1, carriers_het=1), + ) + cluster.augment(record) + cluster.augment(attrs.evolve(record, pos=record.pos + 50, end=record.end + 50)) + self.assertEqual( + str(cluster.mean), + "SvRecord(release='GRCh37', sv_type='BND', chrom='7', pos=1026, " + "chrom2='9', end=2025, orientation=, " + "counts=GenotypeCounts(src_count=1, carriers=1, carriers_het=1, carriers_hom=0, carriers_hemi=0))", + ) + self.assertEqual(len(cluster.records), 2) + self.assertEqual( + str(cluster.counts), + "GenotypeCounts(src_count=2, carriers=2, carriers_het=2, carriers_hom=0, carriers_hemi=0)", + ) + + def testAugmentTwinceIncompatible(self): + cluster = bg_db.SvCluster(params=self.params, rng=self.rng) + record = bg_db.SvRecord( + release="GRCh37", + sv_type="BND", + chrom="7", + pos=1001, + chrom2="9", + end=2000, + orientation=bg_db.PairedEndOrientation.THREE_TO_THREE, + ) + cluster.augment(record) + with self.assertRaises(ValueError): + cluster.augment(attrs.evolve(record, pos=record.pos + 51, end=record.end + 51)) + self.assertEqual(len(cluster.records), 1) + + +class TestClusterSvAlgorithm(TestCase): + def setUp(self): + self.maxDiff = None + + def testConstruction(self): + _algo = bg_db.ClusterSvAlgorithm(bg_db.ClusterAlgoParams()) + + def _run_clustering( + self, by_chrom: typing.Dict[str, typing.List[bg_db.SvRecord]] + ) -> typing.Dict[str, typing.List[bg_db.SvCluster]]: + result = {} + algo = bg_db.ClusterSvAlgorithm(bg_db.ClusterAlgoParams()) + for chrom, chrom_records in by_chrom.items(): + with algo.on_chrom(chrom): + for record in chrom_records: + algo.push(record) + result[chrom] = list(map(bg_db.SvCluster.normalized, algo.cluster())) + return algo, result + + def testWithNoVariant(self): + records = {} + _algo, clusters = self._run_clustering(records) + self.assertEqual(clusters, {}) + + def testWithSingleChromSingleVar(self): + record_1 = bg_db.SvRecord( + release="GRCh37", + sv_type="DEL", + chrom="chr1", + pos=1000, + chrom2="chr1", + end=2000, + orientation=bg_db.PairedEndOrientation.THREE_TO_FIVE, + ) + records = { + "chr1": [record_1], + } + algo, clusters = self._run_clustering(records) + expected = { + "chr1": [ + bg_db.SvCluster( + params=algo.params, rng=algo.rng, mean=record_1, records=[record_1], + ) + ] + } + self.assertEqual(clusters, expected) + + def testWithSingleChromMultipleVariants(self): + record_1 = bg_db.SvRecord( + release="GRCh37", + sv_type="DEL", + chrom="chr1", + pos=1000, + chrom2="chr1", + end=2000, + orientation=bg_db.PairedEndOrientation.THREE_TO_FIVE, + ) + record_2 = bg_db.SvRecord( + release="GRCh37", + sv_type="DEL", + chrom="chr1", + pos=1500, + chrom2="chr1", + end=2500, + orientation=bg_db.PairedEndOrientation.THREE_TO_FIVE, + ) + mean_1 = bg_db.SvRecord( + release="GRCh37", + sv_type="DEL", + chrom="chr1", + pos=1250, + chrom2="chr1", + end=2250, + orientation=bg_db.PairedEndOrientation.THREE_TO_FIVE, + ) + records = {"chr1": [record_1, record_2,]} + algo, clusters = self._run_clustering(records) + expected = { + "chr1": [ + bg_db.SvCluster( + params=algo.params, rng=algo.rng, mean=mean_1, records=[record_1, record_2] + ) + ], + } + self.assertEqual(clusters, expected) + + def testWithTwoChromsSingleVar(self): + record_1 = bg_db.SvRecord( + release="GRCh37", + sv_type="DEL", + chrom="chr1", + pos=1000, + chrom2="chr1", + end=2000, + orientation=bg_db.PairedEndOrientation.THREE_TO_FIVE, + ) + record_2 = bg_db.SvRecord( + release="GRCh37", + sv_type="DEL", + chrom="chr2", + pos=1000, + chrom2="chr2", + end=2000, + orientation=bg_db.PairedEndOrientation.THREE_TO_FIVE, + ) + records = { + "chr1": [record_1,], + "chr2": [record_2,], + } + algo, clusters = self._run_clustering(records) + expected = { + "chr1": [ + bg_db.SvCluster(params=algo.params, rng=algo.rng, mean=record_1, records=[record_1]) + ], + "chr2": [ + bg_db.SvCluster(params=algo.params, rng=algo.rng, mean=record_2, records=[record_2]) + ], + } + self.assertEqual(clusters, expected) + + def testWithTwoChromsMultipleVariants(self): + record_1 = bg_db.SvRecord( + release="GRCh37", + sv_type="DEL", + chrom="chr1", + pos=1000, + chrom2="chr1", + end=2000, + orientation=bg_db.PairedEndOrientation.THREE_TO_FIVE, + ) + record_2 = bg_db.SvRecord( + release="GRCh37", + sv_type="DEL", + chrom="chr1", + pos=1500, + chrom2="chr1", + end=2500, + orientation=bg_db.PairedEndOrientation.THREE_TO_FIVE, + ) + record_3 = bg_db.SvRecord( + release="GRCh37", + sv_type="DEL", + chrom="chr2", + pos=1000, + chrom2="chr2", + end=2000, + orientation=bg_db.PairedEndOrientation.THREE_TO_FIVE, + ) + record_4 = bg_db.SvRecord( + release="GRCh37", + sv_type="DEL", + chrom="chr2", + pos=1500, + chrom2="chr2", + end=2500, + orientation=bg_db.PairedEndOrientation.THREE_TO_FIVE, + ) + mean_1 = bg_db.SvRecord( + release="GRCh37", + sv_type="DEL", + chrom="chr1", + pos=1250, + chrom2="chr1", + end=2250, + orientation=bg_db.PairedEndOrientation.THREE_TO_FIVE, + ) + mean_2 = bg_db.SvRecord( + release="GRCh37", + sv_type="DEL", + chrom="chr2", + pos=1250, + chrom2="chr2", + end=2250, + orientation=bg_db.PairedEndOrientation.THREE_TO_FIVE, + ) + records = { + "chr1": [record_1, record_2,], + "chr2": [record_3, record_4,], + } + algo, clusters = self._run_clustering(records) + expected = { + "chr1": [ + bg_db.SvCluster( + params=algo.params, rng=algo.rng, mean=mean_1, records=[record_1, record_2] + ) + ], + "chr2": [ + bg_db.SvCluster( + params=algo.params, rng=algo.rng, mean=mean_2, records=[record_3, record_4] + ) + ], + } + self.assertEqual(clusters, expected) + + def testWithBreakend(self): + record_1 = bg_db.SvRecord( + release="GRCh37", + sv_type="BND", + chrom="chr1", + pos=1000, + chrom2="chr2", + end=2000, + orientation=bg_db.PairedEndOrientation.THREE_TO_FIVE, + ) + record_2 = attrs.evolve( + record_1, pos=1001, orientation=bg_db.PairedEndOrientation.THREE_TO_THREE, + ) + record_3 = attrs.evolve(record_1, end=record_1.end + 1) + mean_1 = attrs.evolve(record_1, end=record_1.end + 1) + mean_2 = record_2 + records = {"chr1": [record_1, record_2, record_3]} + algo, clusters = self._run_clustering(records) + expected = { + "chr1": [ + bg_db.SvCluster( + params=algo.params, rng=algo.rng, mean=mean_1, records=[record_1, record_3] + ), + bg_db.SvCluster(params=algo.params, rng=algo.rng, mean=mean_2, records=[record_2]), + ], + } + self.assertEqual(clusters, expected) + + +class TestModelToAttrs(TestCase): + def testWithDel(self): + sv_model = StructuralVariantFactory(chromosome="3", chromosome_no=3, start=1000, end=2000,) + sv_record = bg_db.sv_model_to_attrs(sv_model) + self.maxDiff = None + expected = bg_db.SvRecord( + release="GRCh37", + sv_type="DEL", + chrom="3", + pos=1000, + chrom2="3", + end=2000, + orientation="3to5", + counts=bg_db.GenotypeCounts( + src_count=1, carriers=1, carriers_het=1, carriers_hom=0, carriers_hemi=0 + ), + ) + self.assertEqual(expected, sv_record) + + +class TestSvClusterToModelArgs(TestCase): + def setUp(self): + self.params = bg_db.ClusterAlgoParams(cluster_max_size=5, cluster_size_sample_to=3) + self.rng = random.Random(self.params.seed) + self.maxDiff = None + + def testWithDel(self): + cluster = bg_db.SvCluster(params=self.params, rng=self.rng) + record = bg_db.SvRecord( + release="GRCh37", + sv_type="DEL", + chrom="7", + pos=1001, + chrom2="7", + end=2000, + orientation=bg_db.PairedEndOrientation.THREE_TO_FIVE, + ) + cluster.augment(record) + expected = { + "bin": 585, + "carriers": 0, + "carriers_hemi": 0, + "carriers_het": 0, + "carriers_hom": 0, + "chromosome": "7", + "chromosome2": "7", + "chromosome_no": 7, + "chromosome_no2": 7, + "end": 2000, + "pe_orientation": "3to5", + "release": "GRCh37", + "src_count": 0, + "start": 1001, + "sv_type": "DEL", + } + self.assertEqual(expected, bg_db.sv_cluster_to_model_args(cluster)) + + +class TestBuildBgSvSet(TestCase): + def setUp(self): + super().setUp() + self.root_user = User.objects.create(username="root", is_superuser=True) + self.bg_job = BackgroundJob.objects.create( + name="job name", + project=None, + job_type="variants.export_file_bg_job", + user=self.root_user, + ) + self.build_sv_set_bg_job = BuildBackgroundSvSetJob.objects.create(bg_job=self.bg_job,) + + def testWithNoRecords(self): + self.assertEqual(BackgroundSvSet.objects.count(), 0) + self.assertEqual(BackgroundSv.objects.count(), 0) + bg_db.build_bg_sv_set(self.build_sv_set_bg_job) + self.assertEqual(BackgroundSvSet.objects.count(), 1) + self.assertEqual(BackgroundSv.objects.count(), 0) + + def testWithTwoChromsSingleRecords(self): + _vars = [ + StructuralVariantFactory(chromosome="1", chromosome_no=1, start=10_000, end=20_000), + StructuralVariantFactory(chromosome="2", chromosome_no=2, start=10_000, end=20_000), + ] + self.assertEqual(BackgroundSvSet.objects.count(), 0) + self.assertEqual(BackgroundSv.objects.count(), 0) + bg_db.build_bg_sv_set(self.build_sv_set_bg_job) + self.assertEqual(BackgroundSvSet.objects.count(), 1) + self.assertEqual(BackgroundSv.objects.count(), 2) + + def testWithTwoChromsTwoRecordsEach(self): + _vars = [ + StructuralVariantFactory(chromosome="1", chromosome_no=1, start=10_000, end=20_000), + StructuralVariantFactory(chromosome="2", chromosome_no=2, start=10_000, end=20_000), + StructuralVariantFactory(chromosome="1", chromosome_no=1, start=10_001, end=20_001), + StructuralVariantFactory(chromosome="2", chromosome_no=2, start=10_001, end=20_001), + ] + self.assertEqual(BackgroundSvSet.objects.count(), 0) + self.assertEqual(BackgroundSv.objects.count(), 0) + bg_db.build_bg_sv_set(self.build_sv_set_bg_job) + self.assertEqual(BackgroundSvSet.objects.count(), 1) + self.assertEqual(BackgroundSv.objects.count(), 2) diff --git a/svs/tests/test_models.py b/svs/tests/test_models.py index 6fc83bb59..c28cbb12a 100644 --- a/svs/tests/test_models.py +++ b/svs/tests/test_models.py @@ -11,6 +11,8 @@ StructuralVariantFlags, StructuralVariantSet, cleanup_variant_sets, + BackgroundSv, + BackgroundSvSet, ) from .factories import ( StructuralVariantFactory, @@ -18,6 +20,8 @@ StructuralVariantFlagsFactory, StructuralVariantCommentFactory, StructuralVariantSetFactory, + BackgroundSvFactory, + BackgroundSvSetFactory, ) @@ -66,3 +70,17 @@ def test(self): self.assertEqual(StructuralVariantSet.objects.count(), 0) self.assertEqual(StructuralVariant.objects.count(), 0) self.assertEqual(StructuralVariantGeneAnnotation.objects.count(), 0) + + +class TestBackgroundSv(TestCase): + def testConstruction(self): + _variant = BackgroundSvFactory() + self.assertEqual(BackgroundSvSet.objects.count(), 1) + self.assertEqual(BackgroundSv.objects.count(), 1) + + +class TestBackgroundSvSet(TestCase): + def testConstruction(self): + _variant_set = BackgroundSvSetFactory() + self.assertEqual(BackgroundSv.objects.count(), 0) + self.assertEqual(BackgroundSvSet.objects.count(), 1) diff --git a/svs/tests/test_queries.py b/svs/tests/test_queries.py index e62db2dec..0176671ee 100644 --- a/svs/tests/test_queries.py +++ b/svs/tests/test_queries.py @@ -15,6 +15,7 @@ StructuralVariantFactory, StructuralVariantGeneAnnotationFactory, StructuralVariantSetFactory, + BackgroundSvFactory, ) from .helpers import QueryTestBase from ..models import SV_TYPE_CHOICES, SV_SUB_TYPE_CHOICES, StructuralVariant @@ -418,8 +419,8 @@ def testQueryVariantSubTypeNoMatch(self): self.run_query(SingleCaseFilterQuery, {"sv_type": [], "sv_sub_type": sv_sub_types}, 0) -class SvCohortFrequencyFilterQueryTest(QueryTestBase): - """Test for filtration with the frequencies within the analysis cohort/collective.""" +class InHouseDatabaseFrequencyFilterQueryTest(QueryTestBase): + """Test for filtration with in-house background SV database frequencies.""" def setUp(self): super().setUp() @@ -427,120 +428,97 @@ def setUp(self): case__structure="trio", case__inheritance="denovo" ) self.case = self.variant_set.case - self.svs = [StructuralVariantFactory(variant_set=self.variant_set)] - self.genotype = self.svs[0].genotype + self.sv = StructuralVariantFactory(variant_set=self.variant_set) + BackgroundSvFactory( + release=self.sv.release, + chromosome=self.sv.chromosome, + start=int(self.sv.start + (self.sv.end - self.sv.start) * 0.20), + end=int(self.sv.end + (self.sv.end - self.sv.start) * 0.20), + ) - def testPassMinAffectedCarriers(self): - count = self.svs[0].info["affectedCarriers"] + def testPassesFrequencyFilterBelowThreshold(self): result = self.run_query( SingleCaseFilterQuery, - {"collective_enabled": True, "cohort_affected_carriers_min": count}, + {"inhouse_enabled": True, "inhouse_min_overlap": 0.50, "inhouse_max_carriers": 1}, 1, ) - result = list(result) - self.assertUUIDEquals(self.svs[0].sv_uuid, result[0]["sv_uuid"]) - - def testFailMinAffectedCarriers(self): - count = self.svs[0].info["affectedCarriers"] - self.run_query( - SingleCaseFilterQuery, - {"collective_enabled": True, "cohort_affected_carriers_min": count + 1}, - 0, - ) + self.assertUUIDEquals(result[0].sv_uuid, self.sv.sv_uuid) - def testPassMaxAffectedCarriers(self): - count = self.svs[0].info["affectedCarriers"] + def testPassesFrequencyFilterNoOverlap(self): result = self.run_query( SingleCaseFilterQuery, - {"collective_enabled": True, "cohort_affected_carriers_max": count}, + {"inhouse_enabled": True, "inhouse_min_overlap": 0.99, "inhouse_max_carriers": 0}, 1, ) - result = list(result) - self.assertUUIDEquals(self.svs[0].sv_uuid, result[0]["sv_uuid"]) + self.assertUUIDEquals(result[0].sv_uuid, self.sv.sv_uuid) - def testFailMaxAffectedCarriers(self): - count = self.svs[0].info["affectedCarriers"] + def testFailsFrequencyFilter(self): self.run_query( SingleCaseFilterQuery, - {"collective_enabled": True, "cohort_affected_carriers_max": count - 1}, + {"inhouse_enabled": True, "inhouse_min_overlap": 0.50, "inhouse_max_carriers": 0}, 0, ) - def testPassMinUnaffectedCarriers(self): - count = self.svs[0].info["unaffectedCarriers"] + def testPassesFrequencyFilterIfDisabled(self): result = self.run_query( SingleCaseFilterQuery, - {"collective_enabled": True, "cohort_unaffected_carriers_min": count}, + {"inhouse_enabled": False, "inhouse_min_overlap": 0.50, "inhouse_max_carriers": 0}, 1, ) - result = list(result) - self.assertUUIDEquals(self.svs[0].sv_uuid, result[0]["sv_uuid"]) + self.assertUUIDEquals(result[0].sv_uuid, self.sv.sv_uuid) - def testFailMinUnaffectedCarriers(self): - count = self.svs[0].info["unaffectedCarriers"] - self.run_query( - SingleCaseFilterQuery, - {"collective_enabled": True, "cohort_unaffected_carriers_min": count + 1}, - 0, + +class SvGnomadDatabaseFrequencyFilterQueryTest(QueryTestBase): + """Test for filtration with gnomAD database frequencies.""" + + def setUp(self): + super().setUp() + self.variant_set = StructuralVariantSetFactory( + case__structure="trio", case__inheritance="denovo" + ) + self.case = self.variant_set.case + self.sv = StructuralVariantFactory(variant_set=self.variant_set) + GnomAdSvFactory( + release=self.sv.release, + chromosome=self.sv.chromosome, + start=self.sv.start + (self.sv.end - self.sv.start) * 0.20, + end=self.sv.end + (self.sv.end - self.sv.start) * 0.20, ) - def testPassMaxUnaffectedCarriers(self): - count = self.svs[0].info["unaffectedCarriers"] + def testPassesFrequencyFilterBelowThreshold(self): result = self.run_query( SingleCaseFilterQuery, - {"collective_enabled": True, "cohort_unaffected_carriers_max": count}, + {"gnomad_enabled": True, "gnomad_min_overlap": 0.50, "gnomad_max_carriers": 1}, 1, ) - result = list(result) - self.assertUUIDEquals(self.svs[0].sv_uuid, result[0]["sv_uuid"]) - - def testFailMaxUnaffectedCarriers(self): - count = self.svs[0].info["unaffectedCarriers"] - self.run_query( - SingleCaseFilterQuery, - {"collective_enabled": True, "cohort_unaffected_carriers_max": count - 1}, - 0, - ) + self.assertUUIDEquals(result[0].sv_uuid, self.sv.sv_uuid) - def testPassMinBackgroundCarriers(self): - count = self.svs[0].info["backgroundCarriers"] + def testPassesFrequencyFilterNoOverlap(self): result = self.run_query( SingleCaseFilterQuery, - {"collective_enabled": True, "cohort_background_carriers_min": count}, + {"gnomad_enabled": True, "gnomad_min_overlap": 0.99, "gnomad_max_carriers": 0}, 1, ) - result = list(result) - self.assertUUIDEquals(self.svs[0].sv_uuid, result[0]["sv_uuid"]) + self.assertUUIDEquals(result[0].sv_uuid, self.sv.sv_uuid) - def testFailMinBackgroundCarriers(self): - count = self.svs[0].info["backgroundCarriers"] + def testFailsFrequencyFilter(self): self.run_query( SingleCaseFilterQuery, - {"collective_enabled": True, "cohort_background_carriers_min": count + 1}, + {"gnomad_enabled": True, "gnomad_min_overlap": 0.50, "gnomad_max_carriers": 0}, 0, ) - def testPassMaxBackgroundCarriers(self): - count = self.svs[0].info["backgroundCarriers"] + def testPassesFrequencyFilterIfDisabled(self): result = self.run_query( SingleCaseFilterQuery, - {"collective_enabled": True, "cohort_background_carriers_max": count}, + {"gnomad_enabled": False, "gnomad_min_overlap": 0.50, "gnomad_max_carriers": 0}, 1, ) - result = list(result) - self.assertUUIDEquals(self.svs[0].sv_uuid, result[0]["sv_uuid"]) - - def testFailMaxBackgroundCarriers(self): - count = self.svs[0].info["backgroundCarriers"] - self.run_query( - SingleCaseFilterQuery, - {"collective_enabled": True, "cohort_background_carriers_max": count - 1}, - 0, - ) + self.assertUUIDEquals(result[0].sv_uuid, self.sv.sv_uuid) -class SvDatabaseFrequencyFilterQueryTest(QueryTestBase): - """Test for filtration with database frequencies.""" +class SvInhouseDatabaseFrequencyFilterQueryTest(QueryTestBase): + """Test for filtration with in-house database frequencies.""" def setUp(self): super().setUp() @@ -549,17 +527,17 @@ def setUp(self): ) self.case = self.variant_set.case self.sv = StructuralVariantFactory(variant_set=self.variant_set) - GnomAdSvFactory( + self.bg_sv = BackgroundSvFactory( release=self.sv.release, chromosome=self.sv.chromosome, - start=self.sv.start + (self.sv.end - self.sv.start) * 0.20, - end=self.sv.end + (self.sv.end - self.sv.start) * 0.20, + start=int(self.sv.start + (self.sv.end - self.sv.start) * 0.20), + end=int(self.sv.end + (self.sv.end - self.sv.start) * 0.20), ) def testPassesFrequencyFilterBelowThreshold(self): result = self.run_query( SingleCaseFilterQuery, - {"gnomad_enabled": True, "gnomad_min_overlap": 0.50, "gnomad_max_carriers": 1}, + {"inhouse_enabled": True, "inhouse_min_overlap": 0.50, "inhouse_max_carriers": 1}, 1, ) self.assertUUIDEquals(result[0].sv_uuid, self.sv.sv_uuid) @@ -567,7 +545,7 @@ def testPassesFrequencyFilterBelowThreshold(self): def testPassesFrequencyFilterNoOverlap(self): result = self.run_query( SingleCaseFilterQuery, - {"gnomad_enabled": True, "gnomad_min_overlap": 0.99, "gnomad_max_carriers": 0}, + {"inhouse_enabled": True, "inhouse_min_overlap": 0.99, "inhouse_max_carriers": 0}, 1, ) self.assertUUIDEquals(result[0].sv_uuid, self.sv.sv_uuid) @@ -575,21 +553,21 @@ def testPassesFrequencyFilterNoOverlap(self): def testFailsFrequencyFilter(self): self.run_query( SingleCaseFilterQuery, - {"gnomad_enabled": True, "gnomad_min_overlap": 0.50, "gnomad_max_carriers": 0}, + {"inhouse_enabled": True, "inhouse_min_overlap": 0.50, "inhouse_max_carriers": 0}, 0, ) def testPassesFrequencyFilterIfDisabled(self): result = self.run_query( SingleCaseFilterQuery, - {"gnomad_enabled": False, "gnomad_min_overlap": 0.50, "gnomad_max_carriers": 0}, + {"inhouse_enabled": False, "inhouse_min_overlap": 0.50, "inhouse_max_carriers": 0}, 1, ) self.assertUUIDEquals(result[0].sv_uuid, self.sv.sv_uuid) -class SvDatabaseFrequencyAnnotationTest(QueryTestBase): - """Test for annotation with database entries.""" +class SvGnomadDatabaseFrequencyAnnotationTest(QueryTestBase): + """Test for annotation with gnomAD database entries.""" # TODO: tests for other databases as well? @@ -618,6 +596,34 @@ def testGnomadAnnotationWithoutOverlap(self): self.assertEquals(result[0].gnomad_overlap_count, 0) +class SvInhouseDatabaseFrequencyAnnotationTest(QueryTestBase): + """Test for annotation with in-house database entries.""" + + def setUp(self): + super().setUp() + self.variant_set = StructuralVariantSetFactory( + case__structure="trio", case__inheritance="denovo" + ) + self.case = self.variant_set.case + self.sv = StructuralVariantFactory(variant_set=self.variant_set) + self.bg_sv = BackgroundSvFactory( + release=self.sv.release, + chromosome=self.sv.chromosome, + start=int(self.sv.start + (self.sv.end - self.sv.start) * 0.20), + end=int(self.sv.end + (self.sv.end - self.sv.start) * 0.20), + ) + + def testInHouseAnnotationWithOverlap(self): + result = self.run_query(SingleCaseFilterQuery, {"inhouse_min_overlap": 0.50}, 1) + self.assertUUIDEquals(result[0].sv_uuid, self.sv.sv_uuid) + self.assertEquals(result[0].inhouse_overlap_count, 1) + + def testInHouseAnnotationWithoutOverlap(self): + result = self.run_query(SingleCaseFilterQuery, {"inhouse_min_overlap": 0.99}, 1) + self.assertUUIDEquals(result[0].sv_uuid, self.sv.sv_uuid) + self.assertEquals(result[0].inhouse_overlap_count, 0) + + class RegionFilterQueryTest(QueryTestBase): def setUp(self): super().setUp() diff --git a/svs/urls.py b/svs/urls.py index 722eb8fa1..6d072225a 100644 --- a/svs/urls.py +++ b/svs/urls.py @@ -44,4 +44,15 @@ view=views.ImportStructuralVariantsJobDetailView.as_view(), name="import-job-detail", ), + # Views related to background SV jobs. + url( + regex=r"^build-bg-sv/(?P[0-9a-f-]+)/$", + view=views.BuildBackgroundSvSetJobDetailView.as_view(), + name="build-bg-sv-set-job-detail", + ), + url( + regex=r"^cleanup-bg-sv/(?P[0-9a-f-]+)/$", + view=views.CleanupBackgroundSvSetJobDetailView.as_view(), + name="cleanup-bg-sv-set-job-detail", + ), ] diff --git a/svs/views.py b/svs/views.py index 1768277f5..d9035fb6c 100644 --- a/svs/views.py +++ b/svs/views.py @@ -37,6 +37,8 @@ StructuralVariantGeneAnnotation, ImportStructuralVariantBgJob, StructuralVariantSet, + BuildBackgroundSvSetJob, + CleanupBackgroundSvSetJob, ) from .queries import SingleCaseFilterQuery, best_matching_flags from variants.models import Case @@ -90,6 +92,7 @@ def form_valid(self, form): query = SingleCaseFilterQuery(self.get_case_object(), get_engine()) args = dict(form.cleaned_data) # TODO: variant types + print("XXX\n\n", args, "\n\nXXX") with contextlib.closing(query.run(args)) as results: context_data = self._fetch_context_data(form, results) context_data["elapsed_seconds"] = timezone.now() - before @@ -100,6 +103,7 @@ def _fetch_context_data(self, form, results): rows_by_sv = {} seen = set() gene_id_to_symbol = {} + num_results = results.rowcount for entry in results.fetchall(): key = (entry.sv_uuid, entry.gene_id) if key in seen: @@ -138,7 +142,10 @@ def _fetch_context_data(self, form, results): ) context_data = self.get_context_data() - context_data["rows_by_sv"] = rows_by_sv + context_data["num_results"] = num_results + results_limit = form.cleaned_data.get("result_rows_limit", 0) + context_data["results_limit"] = results_limit + context_data["rows_by_sv"] = dict(list(rows_by_sv.items())[:results_limit]) context_data["database"] = form.cleaned_data["database_select"] context_data["card_colspan"] = 18 + len(self.get_case_object().pedigree) @@ -523,7 +530,7 @@ class ImportStructuralVariantsJobDetailView( ProjectContextMixin, DetailView, ): - """Display status and further details of the import case background job. + """Display status and further details of import case background jobs. """ permission_required = "variants.view_data" @@ -743,3 +750,29 @@ def _augment_context_data(self, context_data, form_data, results): context_data["card_colspan"] = 18 + len(self.get_case_object().pedigree) return context_data + + +class BuildBackgroundSvSetJobDetailView( + LoginRequiredMixin, LoggedInPermissionMixin, ProjectContextMixin, DetailView, +): + """Display status and further details of build sv set background jobs. + """ + + permission_required = "variants.view_data" + template_name = "svs/build_bg_job_detail.html" + model = BuildBackgroundSvSetJob + slug_url_kwarg = "job" + slug_field = "sodar_uuid" + + +class CleanupBackgroundSvSetJobDetailView( + LoginRequiredMixin, LoggedInPermissionMixin, ProjectContextMixin, DetailView, +): + """Display status and further details of cleanup sv set background jobs. + """ + + permission_required = "variants.view_data" + template_name = "svs/cleanup_bg_job_detail.html" + model = CleanupBackgroundSvSetJob + slug_url_kwarg = "job" + slug_field = "sodar_uuid" diff --git a/variants/models.py b/variants/models.py index 4e917aee3..f2697829a 100644 --- a/variants/models.py +++ b/variants/models.py @@ -84,6 +84,8 @@ CHROMOSOME_STR_TO_CHROMOSOME_INT = { b: a for a, b in enumerate(list(map(str, range(1, 23))) + ["X", "Y", "MT"], 1) } +#: List of chromosome names without "chr" prefix. +CHROMOSOME_NAMES = list(CHROMOSOME_STR_TO_CHROMOSOME_INT.keys()) def load_molecular_impact(kwargs):