Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: allow mehari to write out VarFish compatible TSV (#34) #35

Merged
merged 12 commits into from
Apr 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
flamegraph.svg
perf.*

/target
#/Cargo.lock

Expand Down
3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ clap-verbosity-flag = "2.0.0"
csv = "1.2.0"
flatbuffers = "23.1.21"
flate2 = "1.0.25"
hgvs = "0.5.0"
hgvs = "0.6.0"
lazy_static = "1.4.0"
log = "0.4.17"
noodles = { version = "0.34.0", features = ["vcf", "bcf", "csi", "fasta", "bgzf", "tabix"] }
Expand All @@ -50,6 +50,7 @@ linked-hash-map = "0.5.6"
enumset = { version = "1.0.12", features = ["serde"] }
bincode = "1.3.3"
bgzip = "0.3.1"
rustc-hash = "1.1.0"

[build-dependencies]
flatc-rust = "0.2.0"
Expand Down
18 changes: 17 additions & 1 deletion docs/db_build.md
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ $ mehari db create txs \
You will have to build the transcript database for each genome release that you want and manually specify the release to `--genome-release`.
For GRCh38, simply use `--genome-release grch38`.

## Building ClinVar Database
# Building ClinVar Database

This assumes that you have converted a recent ClinVar XML file to TSV using [clinvar-tsv](https://github.com/bihealth/clinvar-tsv).

Expand All @@ -169,3 +169,19 @@ $ mehari db create seqvar-clinvar \
```

You can specify an optional `--genome-release grch37` argument that will be used to check the ClinVar database to be compatible with your data.

# Getting HGNC Cross-Link TSV File

You need to provide a TSV file that maps HGNC IDs to gene symbols, RefSeq, and Ensembl gene IDs.
You will need the [jq](https://stedolan.github.io/jq/) command line tool to generate this file.
You can use the provided `misc/genes-xlink-hgnc.jq` script to generate the file.

```
$ wget --no-check-certificate \
-O /tmp/hgnc_complete_set.json \
http://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json
$ jq \
--raw-output \
--from-file misc/genes-xlink-hgnc.jq \
> ~/Data/mehari/db/hgnc.tsv
```
17 changes: 17 additions & 0 deletions misc/genes-xlink-hgnc.jq
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Create header
["hgnc_id", "ensembl_gene_id", "entrez_id", "gene_symbol"],
# Then, process the file
(
# For each entry in .response.docs
.response.docs[] |
# Generate array of ensembl_gene_id, entrez_id, and symbol, using
# the empty string for the default
[
.hgnc_id // "",
.ensembl_gene_id // "",
.entrez_id // "",
.symbol // ""
]
)
# Convert everything to TSV
| @tsv
113 changes: 107 additions & 6 deletions src/annotate/seqvars/ann.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,7 @@ pub enum Consequence {
FeatureElongation,
FeatureTruncation,
GeneVariant,
IntergenicVegion,
IntragenicVariant,
IntergenicVariant,
IntronVariant,
#[display("mature_miRNA_variant")]
MatureMirnaVariant,
Expand All @@ -80,7 +79,7 @@ pub enum Consequence {
#[display("NMD_transcript_variant")]
NmdTranscriptVariant,
NonCodingTranscriptExonVariant,
NonCodingTranscriptVariant,
NonCodingTranscriptIntronVariant,
RegulatoryRegionAmplification,
RegulatoryRegionVariant,
#[display("TF_binding_site_variant")]
Expand Down Expand Up @@ -130,14 +129,13 @@ impl From<Consequence> for PutativeImpact {
| Consequence::FeatureElongation
| Consequence::FeatureTruncation
| Consequence::GeneVariant
| Consequence::IntergenicVegion
| Consequence::IntragenicVariant
| Consequence::IntergenicVariant
| Consequence::IntronVariant
| Consequence::MatureMirnaVariant
| Consequence::Mirna
| Consequence::NmdTranscriptVariant
| Consequence::NonCodingTranscriptExonVariant
| Consequence::NonCodingTranscriptVariant
| Consequence::NonCodingTranscriptIntronVariant
| Consequence::RegulatoryRegionAmplification
| Consequence::RegulatoryRegionVariant
| Consequence::TfBindingSiteVariant
Expand Down Expand Up @@ -405,6 +403,98 @@ pub struct AnnField {
pub messages: Option<Vec<Message>>,
}

impl FromStr for AnnField {
type Err = anyhow::Error;

fn from_str(s: &str) -> Result<Self, Self::Err> {
let mut fields = s.split('|');
let allele = fields.next().unwrap().parse()?;
let consequences = fields
.next()
.unwrap()
.split('&')
.map(|s| s.parse())
.collect::<Result<Vec<_>, _>>()?;
let putative_impact = fields.next().unwrap().parse()?;
let gene_symbol = fields.next().unwrap().to_string();
let gene_id = fields.next().unwrap().to_string();
let feature_type = fields.next().unwrap().parse()?;
let feature_id = fields.next().unwrap().to_string();
let feature_biotype = fields.next().unwrap().parse()?;
let rank = fields.next().unwrap();
let rank = if rank.is_empty() {
None
} else {
Some(rank.parse()?)
};
let hgvs_t = fields.next().unwrap();
let hgvs_t = if hgvs_t.is_empty() {
None
} else {
Some(hgvs_t.to_string())
};
let hgvs_p = fields.next().unwrap();
let hgvs_p = if hgvs_p.is_empty() {
None
} else {
Some(hgvs_p.to_string())
};
let tx_pos = fields.next().unwrap();
let tx_pos = if tx_pos.is_empty() {
None
} else {
Some(tx_pos.parse()?)
};
let cds_pos = fields.next().unwrap();
let cds_pos = if cds_pos.is_empty() {
None
} else {
Some(cds_pos.parse()?)
};
let protein_pos = fields.next().unwrap();
let protein_pos = if protein_pos.is_empty() {
None
} else {
Some(protein_pos.parse()?)
};
let distance = fields.next().unwrap();
let distance = if distance.is_empty() {
None
} else {
Some(distance.parse()?)
};
let messages = fields.next().unwrap();
let messages = if messages.is_empty() {
None
} else {
Some(
messages
.split('&')
.map(|s| s.parse())
.collect::<Result<Vec<_>, _>>()?,
)
};
Ok(AnnField {
allele,
consequences,
putative_impact,
gene_symbol,
gene_id,
feature_type,
feature_id,
feature_biotype,
rank,
hgvs_t,
hgvs_p,
tx_pos,
cds_pos,
protein_pos,
distance,
messages,
})
}
}

impl std::fmt::Display for AnnField {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}", self.allele)?;
Expand Down Expand Up @@ -866,4 +956,15 @@ mod test {
|HGVS.p|1|1/2|1|1|ERROR_CHROMOSOME_NOT_FOUND"
);
}

#[test]
fn ann_field_from_str() -> Result<(), anyhow::Error> {
let value = "A|missense_variant|MODERATE|GENE|HGNC:gene_id|transcript|feature_id|\
Coding|1/2|HGVS.c|HGVS.p|1|1/2|1|1|ERROR_CHROMOSOME_NOT_FOUND";

let field = AnnField::from_str(value)?;
assert_eq!(format!("{}", &field), value);

Ok(())
}
}
63 changes: 35 additions & 28 deletions src/annotate/seqvars/csq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,27 +77,27 @@ impl ConsequencePredictor {

pub fn predict(&self, var: &VcfVariant) -> Result<Option<Vec<AnnField>>, anyhow::Error> {
// Normalize variant by stripping common prefix and suffix.
let var = self.normalize_variant(var);
let norm_var = self.normalize_variant(var);

// Obtain accession from chromosome name.
let chrom_acc = self.chrom_to_acc.get(&var.chromosome);
let chrom_acc = self.chrom_to_acc.get(&norm_var.chromosome);
let chrom_acc = if let Some(chrom_acc) = chrom_acc {
chrom_acc
} else {
tracing::debug!(
"Could not determine chromosome accession for {:?}; giving up on annotation",
&var
&norm_var
);
return Ok(None);
};

// Get all affected transcripts.
let (var_start, var_end) = if var.reference.is_empty() {
(var.position - 1, var.position - 1)
let (var_start, var_end) = if norm_var.reference.is_empty() {
(norm_var.position - 1, norm_var.position - 1)
} else {
(
var.position - 1,
var.position + var.reference.len() as i32 - 1,
norm_var.position - 1,
norm_var.position + norm_var.reference.len() as i32 - 1,
)
};
let qry_start = var_start - PADDING;
Expand All @@ -112,7 +112,9 @@ impl ConsequencePredictor {
// Skip `None` results.
Ok(Some(
txs.into_iter()
.map(|tx| self.build_ann_field(&var, tx, chrom_acc.clone(), var_start, var_end))
.map(|tx| {
self.build_ann_field(&var, &norm_var, tx, chrom_acc.clone(), var_start, var_end)
})
.collect::<Result<Vec<_>, _>>()?
.into_iter()
.flatten()
Expand All @@ -122,6 +124,7 @@ impl ConsequencePredictor {

fn build_ann_field(
&self,
orig_var: &VcfVariant,
var: &VcfVariant,
tx_record: TxForRegionRecord,
chrom_acc: String,
Expand Down Expand Up @@ -291,12 +294,23 @@ impl ConsequencePredictor {
let min_start = min_start.expect("must have seen exon");
let max_end = max_end.expect("must have seen exon");

let feature_biotype = match tx.biotype {
TranscriptBiotype::Coding => FeatureBiotype::Coding,
TranscriptBiotype::NonCoding => FeatureBiotype::Noncoding,
};

let is_upstream = var_end <= min_start;
let is_downstream = var_start >= max_end;
if is_exonic {
consequences.push(Consequence::ExonVariant);
if !feature_biotype.is_coding() {
consequences.push(Consequence::NonCodingTranscriptExonVariant);
}
} else if is_intronic {
consequences.push(Consequence::IntronVariant);
if !feature_biotype.is_coding() {
consequences.push(Consequence::NonCodingTranscriptIntronVariant);
} else {
consequences.push(Consequence::IntronVariant);
}
} else if is_upstream {
let val = -(min_start - var_end);
if val.abs() <= 5_000 {
Expand All @@ -317,11 +331,6 @@ impl ConsequencePredictor {
distance = Some(val);
}

let feature_biotype = match tx.biotype {
TranscriptBiotype::Coding => FeatureBiotype::Coding,
TranscriptBiotype::NonCoding => FeatureBiotype::Noncoding,
};

let var_g = HgvsVariant::GenomeVariant {
accession: Accession { value: chrom_acc },
gene_symbol: None,
Expand Down Expand Up @@ -415,7 +424,12 @@ impl ConsequencePredictor {
}
let var_p = var_p.unwrap();

let hgvs_p = Some(format!("{}", &var_p));
let hgvs_p = format!("{}", &var_p);
let mut hgvs_p = hgvs_p.split(':').nth(1).unwrap().to_owned();
for (aa3, aa1) in hgvs::sequences::AA3_TO_AA1_VEC.iter() {
hgvs_p = hgvs_p.replace(aa3, aa1);
}
let hgvs_p = Some(hgvs_p);
let cds_pos = match &var_c {
HgvsVariant::CdsVariant { loc_edit, .. } => Some(Pos {
ord: loc_edit.loc.inner().start.base,
Expand Down Expand Up @@ -465,13 +479,9 @@ impl ConsequencePredictor {
if start_cds_from == CdsFrom::Start {
if start_base < 0 {
consequences.push(Consequence::FivePrimeUtrVariant);
} else {
consequences.push(Consequence::CodingSequenceVariant);
}
} else if end_cds_from == CdsFrom::End {
consequences.push(Consequence::ThreePrimeUtrVariant);
} else {
consequences.push(Consequence::CodingSequenceVariant);
}

// The range is "conservative" (regarding deletions and insertions) if
Expand Down Expand Up @@ -573,6 +583,7 @@ impl ConsequencePredictor {
FeatureBiotype::Noncoding => (var_n, None, None, None, None),
};
let hgvs_t = format!("{}", &var_t);
let hgvs_t = hgvs_t.split(':').nth(1).unwrap().to_owned();

(
Some(rank),
Expand All @@ -597,7 +608,7 @@ impl ConsequencePredictor {
// Build and return ANN field from the information derived above.
Ok(Some(AnnField {
allele: Allele::Alt {
alternative: var.alternative.clone(),
alternative: orig_var.alternative.clone(),
},
consequences,
putative_impact,
Expand Down Expand Up @@ -688,11 +699,7 @@ mod test {
allele: Allele::Alt {
alternative: String::from("C")
},
consequences: vec![
Consequence::MissenseVariant,
Consequence::CodingSequenceVariant,
Consequence::ExonVariant
],
consequences: vec![Consequence::MissenseVariant,],
putative_impact: PutativeImpact::Moderate,
gene_symbol: String::from("BRCA1"),
gene_id: String::from("HGNC:1100"),
Expand All @@ -702,8 +709,8 @@ mod test {
feature_id: String::from("NM_007294.4"),
feature_biotype: FeatureBiotype::Coding,
rank: Some(Rank { ord: 23, total: 23 }),
hgvs_t: Some(String::from("NM_007294.4:c.5586C>G")),
hgvs_p: Some(String::from("NP_009225.1:p.His1862Gln")),
hgvs_t: Some(String::from("c.5586C>G")),
hgvs_p: Some(String::from("p.H1862Q")),
tx_pos: Some(Pos {
ord: 5699,
total: Some(7088)
Expand Down
Loading