From 5080c19814cc230ea6ec634905bce4676037e4df Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Tue, 5 Mar 2024 12:16:49 +0100 Subject: [PATCH 1/4] feat: enable using ENSEMBL chrMT transcript (#381) --- src/db/create/mod.rs | 124 +++++++++-- ...te__test__run_smoke_mitochondrial.snap.new | 194 ++++++++++++++++++ src/db/subset/mod.rs | 3 +- ...cdot-0.2.23.ensembl.chrMT.grch37.gff3.json | 3 + .../mitochondrial/latest/aliases.sqlite3 | 3 + .../2024/0305/1050/1709635806.4821885.fa.bgz | 3 + .../0305/1050/1709635806.4821885.fa.bgz.fai | 3 + .../0305/1050/1709635806.4821885.fa.bgz.gzi | 3 + .../mitochondrial/latest/sequences/db.sqlite3 | 3 + 9 files changed, 317 insertions(+), 22 deletions(-) create mode 100644 src/db/create/snapshots/mehari__db__create__test__run_smoke_mitochondrial.snap.new create mode 100644 tests/data/db/create/mitochondrial/cdot-0.2.23.ensembl.chrMT.grch37.gff3.json create mode 100644 tests/data/db/create/mitochondrial/latest/aliases.sqlite3 create mode 100644 tests/data/db/create/mitochondrial/latest/sequences/2024/0305/1050/1709635806.4821885.fa.bgz create mode 100644 tests/data/db/create/mitochondrial/latest/sequences/2024/0305/1050/1709635806.4821885.fa.bgz.fai create mode 100644 tests/data/db/create/mitochondrial/latest/sequences/2024/0305/1050/1709635806.4821885.fa.bgz.gzi create mode 100644 tests/data/db/create/mitochondrial/latest/sequences/db.sqlite3 diff --git a/src/db/create/mod.rs b/src/db/create/mod.rs index 3ac0597f..c4237a40 100644 --- a/src/db/create/mod.rs +++ b/src/db/create/mod.rs @@ -23,6 +23,9 @@ lazy_static::lazy_static! { .unwrap(); } +/// Mitochondrial accessions. +const MITOCHONDRIAL_ACCESSIONS: &[&str] = &["NC_012920.1"]; + /// Command line arguments for `db create txs` sub command. 
#[derive(Parser, Debug)] #[command(about = "Construct mehari transcripts and sequence database", long_about = None)] @@ -98,7 +101,7 @@ fn load_and_extract( .collect::>(), ); } - tracing::info!("{:?}", txid_to_label); + tracing::trace!("labels = {:?}", txid_to_label); tracing::info!( "...done loading label TSV file ({} entries)", @@ -134,13 +137,18 @@ fn load_and_extract( start.elapsed() ); - // Count number of MANE Select and MANE Plus Clinical transcripts. + // Count number of MANE Select and MANE Plus Clinical transcripts, collect + // chrMT gene names. + let mut genes_chrmt = indexmap::IndexSet::new(); let mut n_mane_select = 0; let mut n_mane_plus_clinical = 0; for tx in c_txs.values() { let mut is_mane_select = false; let mut is_mane_plus_clinical = false; for gb in tx.genome_builds.values() { + if MITOCHONDRIAL_ACCESSIONS.contains(&gb.contig.as_str()) { + genes_chrmt.insert(tx.gene_version.clone()); + } if let Some(tag) = &gb.tag { if tag.contains(&models::Tag::ManeSelect) { is_mane_select = true; @@ -167,24 +175,29 @@ fn load_and_extract( n_mane_select, n_mane_plus_clinical ); + tracing::debug!("chrMT genes: {:?}", genes_chrmt); let start = Instant::now(); writeln!(report_file, "total_genes\t{}", c_genes.len())?; - c_genes - .values() - .filter(|gene| { - gene.hgnc.is_some() - && !gene.hgnc.as_ref().unwrap().is_empty() - && gene.map_location.is_some() - && !gene.map_location.as_ref().unwrap().is_empty() - && gene.hgnc.is_some() - && !gene.hgnc.as_ref().unwrap().is_empty() - }) - .for_each(|gene| { + for (gene_id, gene) in c_genes.iter() { + if gene.hgnc.is_none() || gene.hgnc.as_ref().unwrap().is_empty() { + writeln!(report_file, "skip because of missing HGNC id\t{}", gene_id)?; + tracing::debug!("skip because of missing HGNC id: {}", gene_id); + } else if !genes_chrmt.contains(gene_id) + && (gene.map_location.is_none() || gene.map_location.as_ref().unwrap().is_empty()) + { + writeln!( + report_file, + "skip because not chrMT and missing 
map_location\t{:?}", + gene + )?; + tracing::debug!("skip because of missing map_location\t{:?}", gene); + } else { let hgnc_id = format!("HGNC:{}", gene.hgnc.as_ref().unwrap()); transcript_ids_for_gene.entry(hgnc_id.clone()).or_default(); genes.insert(hgnc_id, gene.clone()); - }); + } + } writeln!( report_file, "genes with gene_symbol, map_location, hgnc\t{}", @@ -227,10 +240,28 @@ fn load_and_extract( ..tx.clone() }) .filter(|tx| { - tx.hgnc.is_some() - && !tx.hgnc.as_ref().unwrap().is_empty() - && genes.contains_key(&format!("HGNC:{}", tx.hgnc.as_ref().unwrap())) - && !tx.genome_builds.is_empty() + if tx.hgnc.is_none() || tx.hgnc.as_ref().unwrap().is_empty() { + writeln!(report_file, "skip because of missing HGNC id\t{:?}", tx.id) + .expect("problem writing report file"); + tracing::debug!("skip because of missing HGNC id:{:?}", tx.id); + false + } else if !genes.contains_key(&format!("HGNC:{}", tx.hgnc.as_ref().unwrap())) { + writeln!(report_file, "skip because gene not selected\t{:?}", tx.id) + .expect("problem writing report file"); + tracing::debug!("skip because gene not selected:{:?}", tx.id); + false + } else if tx.genome_builds.is_empty() { + writeln!( + report_file, + "skip because of empty genome builds\t{:?}", + tx.id + ) + .expect("problem writing report file"); + tracing::debug!("skip because of empty genome builds:{:?}", tx.id); + false + } else { + true + } }) .for_each(|tx| { let hgnc_id = &format!("HGNC:{}", tx.hgnc.as_ref().unwrap()); @@ -304,8 +335,8 @@ fn build_protobuf( pb.set_style(PROGRESS_STYLE.clone()); for (tx_id, tx) in &transcripts { pb.inc(1); - let namespace = if tx_id.starts_with("ENST") { - Some(String::from("ENSEMBL")) + let namespace: Option = if tx_id.starts_with("ENST") { + Some(String::from("Ensembl")) } else { Some(String::from("NCBI")) }; @@ -741,6 +772,10 @@ fn filter_transcripts( "skipped transcript {} because we have a later version already", &full_ac )?; + tracing::debug!( + "skipping transcript {} because we have a 
later version already", + &full_ac + ); continue; // skip, already have later version } else if ac.starts_with("NR_") && seen_nm { writeln!( @@ -748,6 +783,10 @@ fn filter_transcripts( "skipped transcript {} because we have a NM transcript", &full_ac )?; + tracing::debug!( + "skipping transcript {} because we have a NM transcript", + &full_ac + ); continue; // skip NR transcript as we have NM one } else if ac.starts_with('X') { writeln!( @@ -755,9 +794,17 @@ fn filter_transcripts( "skipped transcript {} because it is an XR/XM transcript", &full_ac )?; + tracing::debug!( + "skipping transcript {} because it is an XR/XM transcript", + &full_ac + ); continue; // skip XR/XM transcript } else { - // Check transcript's CDS length for being multiple of 3 and skip unless it is. + // Check transcript's CDS length for being multiple of 3 and skip unless + // it is. + // + // Note that the chrMT transcripts have been fixed earlier already to + // accommodate for how they are fixed by poly-A tailing. 
let tx = transcripts .get(&full_ac) .expect("must exist; accession taken from map earlier"); @@ -1050,4 +1097,39 @@ pub mod test { Ok(()) } + + #[tracing_test::traced_test] + #[test] + fn run_smoke_mitochondrial() -> Result<(), anyhow::Error> { + let tmp_dir = TempDir::default(); + + let common_args = CommonArgs { + verbose: Verbosity::new(5, 0), + }; + let args = Args { + path_out: tmp_dir.join("out.bin.zst"), + path_cdot_json: vec![PathBuf::from( + "tests/data/db/create/mitochondrial/cdot-0.2.23.ensembl.chrMT.grch37.gff3.json", + )], + path_mane_txs_tsv: None, + path_seqrepo_instance: PathBuf::from("tests/data/db/create/mitochondrial/latest"), + genome_release: GenomeRelease::Grch37, + max_txs: None, + gene_symbols: None, + }; + + run(&common_args, &args)?; + + let mut buf: Vec = Vec::new(); + dump::run_with_write( + &Default::default(), + &dump::Args { + path_db: tmp_dir.join("out.bin.zst"), + }, + &mut buf, + )?; + insta::assert_snapshot!(String::from_utf8(buf)?); + + Ok(()) + } } diff --git a/src/db/create/snapshots/mehari__db__create__test__run_smoke_mitochondrial.snap.new b/src/db/create/snapshots/mehari__db__create__test__run_smoke_mitochondrial.snap.new new file mode 100644 index 00000000..67b2becb --- /dev/null +++ b/src/db/create/snapshots/mehari__db__create__test__run_smoke_mitochondrial.snap.new @@ -0,0 +1,194 @@ +--- +source: src/db/create/mod.rs +assertion_line: 1135 +expression: "String::from_utf8(buf)?" 
+--- +txDb: + transcripts: + - id: ENST00000361899.2 + geneSymbol: MT-ATP6 + geneId: HGNC:7414 + biotype: TRANSCRIPT_BIOTYPE_CODING + tags: + - TRANSCRIPT_TAG_BASIC + startCodon: 0 + stopCodon: 681 + genomeAlignments: + - genomeBuild: GENOME_BUILD_GRCH37 + contig: NC_012920.1 + cdsStart: 8526 + cdsEnd: 9207 + strand: STRAND_PLUS + exons: + - altStartI: 8526 + altEndI: 9207 + altCdsStartI: 1 + altCdsEndI: 681 + cigar: 681M + - id: ENST00000361851.1 + geneSymbol: MT-ATP8 + geneId: HGNC:7415 + biotype: TRANSCRIPT_BIOTYPE_CODING + tags: + - TRANSCRIPT_TAG_BASIC + startCodon: 0 + stopCodon: 207 + genomeAlignments: + - genomeBuild: GENOME_BUILD_GRCH37 + contig: NC_012920.1 + cdsStart: 8365 + cdsEnd: 8572 + strand: STRAND_PLUS + exons: + - altStartI: 8365 + altEndI: 8572 + altCdsStartI: 1 + altCdsEndI: 207 + cigar: 207M + - id: ENST00000361739.1 + geneSymbol: MT-CO2 + geneId: HGNC:7421 + biotype: TRANSCRIPT_BIOTYPE_CODING + tags: + - TRANSCRIPT_TAG_BASIC + startCodon: 0 + stopCodon: 684 + genomeAlignments: + - genomeBuild: GENOME_BUILD_GRCH37 + contig: NC_012920.1 + cdsStart: 7585 + cdsEnd: 8269 + strand: STRAND_PLUS + exons: + - altStartI: 7585 + altEndI: 8269 + altCdsStartI: 1 + altCdsEndI: 684 + cigar: 684M + - id: ENST00000361335.1 + geneSymbol: MT-ND4L + geneId: HGNC:7460 + biotype: TRANSCRIPT_BIOTYPE_CODING + tags: + - TRANSCRIPT_TAG_BASIC + startCodon: 0 + stopCodon: 297 + genomeAlignments: + - genomeBuild: GENOME_BUILD_GRCH37 + contig: NC_012920.1 + cdsStart: 10469 + cdsEnd: 10766 + strand: STRAND_PLUS + exons: + - altStartI: 10469 + altEndI: 10766 + altCdsStartI: 1 + altCdsEndI: 297 + cigar: 297M + - id: ENST00000361567.2 + geneSymbol: MT-ND5 + geneId: HGNC:7461 + biotype: TRANSCRIPT_BIOTYPE_CODING + tags: + - TRANSCRIPT_TAG_BASIC + startCodon: 0 + stopCodon: 1812 + genomeAlignments: + - genomeBuild: GENOME_BUILD_GRCH37 + contig: NC_012920.1 + cdsStart: 12336 + cdsEnd: 14148 + strand: STRAND_PLUS + exons: + - altStartI: 12336 + altEndI: 14148 + altCdsStartI: 1 + 
altCdsEndI: 1812 + cigar: 1812M + - id: ENST00000389680.2 + geneSymbol: MT-RNR1 + geneId: HGNC:7470 + biotype: TRANSCRIPT_BIOTYPE_NON_CODING + tags: + - TRANSCRIPT_TAG_BASIC + genomeAlignments: + - genomeBuild: GENOME_BUILD_GRCH37 + contig: NC_012920.1 + strand: STRAND_PLUS + exons: + - altStartI: 647 + altEndI: 1601 + altCdsStartI: 1 + altCdsEndI: 954 + cigar: 954M + - id: ENST00000387347.2 + geneSymbol: MT-RNR2 + geneId: HGNC:7471 + biotype: TRANSCRIPT_BIOTYPE_NON_CODING + tags: + - TRANSCRIPT_TAG_BASIC + genomeAlignments: + - genomeBuild: GENOME_BUILD_GRCH37 + contig: NC_012920.1 + strand: STRAND_PLUS + exons: + - altStartI: 1670 + altEndI: 3229 + altCdsStartI: 1 + altCdsEndI: 1559 + cigar: 1559M + geneToTx: + - geneId: HGNC:7462 + txIds: + - ENST00000361681.2 + - geneId: HGNC:7421 + txIds: + - ENST00000361739.1 + - geneId: HGNC:7461 + txIds: + - ENST00000361567.2 + - geneId: HGNC:7419 + txIds: + - ENST00000361624.2 + - geneId: HGNC:7414 + txIds: + - ENST00000361899.2 + - geneId: HGNC:7471 + txIds: + - ENST00000387347.2 + - geneId: HGNC:7470 + txIds: + - ENST00000389680.2 + - geneId: HGNC:7460 + txIds: + - ENST00000361335.1 + - geneId: HGNC:7415 + txIds: + - ENST00000361851.1 +seqDb: + aliases: + - ENST00000361335.1 + - ENST00000361567.2 + - ENST00000361739.1 + - ENST00000361851.1 + - ENST00000361899.2 + - ENST00000387347.2 + - ENST00000389680.2 + aliasesIdx: + - 0 + - 1 + - 2 + - 3 + - 4 + - 5 + - 6 + seqs: + - ATGCCCCTCATTTACATAAATATTATACTAGCATTTACCATCTCACTTCTAGGAATACTAGTATATCGCTCACACCTCATATCCTCCCTACTATGCCTAGAAGGAATAATACTATCGCTGTTCATTATAGCTACTCTCATAACCCTCAACACCCACTCCCTCTTAGCCAATATTGTGCCTATTGCCATACTAGTCTTTGCCGCCTGCGAAGCAGCGGTGGGCCTAGCCCTACTAGTCTCAATCTCCAACACATATGGCCTAGACTACGTACATAACCTAAACCTACTCCAATGCTAA + - 
ATAACCATGCACACTACTATAACCACCCTAACCCTGACTTCCCTAATTCCCCCCATCCTTACCACCCTCGTTAACCCTAACAAAAAAAACTCATACCCCCATTATGTAAAATCCATTGTCGCATCCACCTTTATTATCAGTCTCTTCCCCACAACAATATTCATGTGCCTAGACCAAGAAGTTATTATCTCGAACTGACACTGAGCCACAACCCAAACAACCCAGCTCTCCCTAAGCTTCAAACTAGACTACTTCTCCATAATATTCATCCCTGTAGCATTGTTCGTTACATGGTCCATCATAGAATTCTCACTGTGATATATAAACTCAGACCCAAACATTAATCAGTTCTTCAAATATCTACTCATCTTCCTAATTACCATACTAATCTTAGTTACCGCTAACAACCTATTCCAACTGTTCATCGGCTGAGAGGGCGTAGGAATTATATCCTTCTTGCTCATCAGTTGATGATACGCCCGAGCAGATGCCAACACAGCAGCCATTCAAGCAATCCTATACAACCGTATCGGCGATATCGGTTTCATCCTCGCCTTAGCATGATTTATCCTACACTCCAACTCATGAGACCCACAACAAATAGCCCTTCTAAACGCTAATCCAAGCCTCACCCCACTACTAGGCCTCCTCCTAGCAGCAGCAGGCAAATCAGCCCAATTAGGTCTCCACCCCTGACTCCCCTCAGCCATAGAAGGCCCCACCCCAGTCTCAGCCCTACTCCACTCAAGCACTATAGTTGTAGCAGGAATCTTCTTACTCATCCGCTTCCACCCCCTAGCAGAAAATAGCCCACTAATCCAAACTCTAACACTATGCTTAGGCGCTATCACCACTCTGTTCGCAGCAGTCTGCGCCCTTACACAAAATGACATCAAAAAAATCGTAGCCTTCTCCACTTCAAGTCAACTAGGACTCATAATAGTTACAATCGGCATCAACCAACCACACCTAGCATTCCTGCACATCTGTACCCACGCCTTCTTCAAAGCCATACTATTTATGTGCTCCGGGTCCATCATCCACAACCTTAACAATGAACAAGATATTCGAAAAATAGGAGGACTACTCAAAACCATACCTCTCACTTCAACCTCCCTCACCATTGGCAGCCTAGCATTAGCAGGAATACCTTTCCTCACAGGTTTCTACTCCAAAGACCACATCATCGAAACCGCAAACATATCATACACAAACGCCTGAGCCCTATCTATTACTCTCATCGCTACCTCCCTGACAAGCGCCTATAGCACTCGAATAATTCTTCTCACCCTAACAGGTCAACCTCGCTTCCCCACCCTTACTAACATTAACGAAAATAACCCCACCCTACTAAACCCCATTAAACGCCTGGCAGCCGGAAGCCTATTCGCAGGATTTCTCATTACTAACAACATTTCCCCCGCATCCCCCTTCCAAACAACAATCCCCCTCTACCTAAAACTCACAGCCCTCGCTGTCACTTTCCTAGGACTTCTAACAGCCCTAGACCTCAACTACCTAACCAACAAACTTAAAATAAAATCCCCACTATGCACATTTTATTTCTCCAACATACTCGGATTCTACCCTAGCATCACACACCGCACAATCCCCTATCTAGGCCTTCTTACGAGCCAAAACCTGCCCCTACTCCTCCTAGACCTAACCTGACTAGAAAAGCTATTACCTAAAACAATTTCACAGCACCAAATCTCCACCTCCATCATCACCTCAACCCAAAAAGGCATAATTAAACTTTACTTCCTCTCTTTCTTCTTCCCACTCATCCTAACCCTACTCCTAATCACATAA + - 
ATGGCACATGCAGCGCAAGTAGGTCTACAAGACGCTACTTCCCCTATCATAGAAGAGCTTATCACCTTTCATGATCACGCCCTCATAATCATTTTCCTTATCTGCTTCCTAGTCCTGTATGCCCTTTTCCTAACACTCACAACAAAACTAACTAATACTAACATCTCAGACGCTCAGGAAATAGAAACCGTCTGAACTATCCTGCCCGCCATCATCCTAGTCCTCATCGCCCTCCCATCCCTACGCATCCTTTACATAACAGACGAGGTCAACGATCCCTCCCTTACCATCAAATCAATTGGCCACCAATGGTACTGAACCTACGAGTACACCGACTACGGCGGACTAATCTTCAACTCCTACATACTTCCCCCATTATTCCTAGAACCAGGCGACCTGCGACTCCTTGACGTTGACAATCGAGTAGTACTCCCGATTGAAGCCCCCATTCGTATAATAATTACATCACAAGACGTCTTGCACTCATGAGCTGTCCCCACATTAGGCTTAAAAACAGATGCAATTCCCGGACGTCTAAACCAAACCACTTTCACCGCTACACGACCGGGGGTATACTACGGTCAATGCTCTGAAATCTGTGGAGCAAACCACAGTTTCATGCCCATCGTCCTAGAATTAATTCCCCTAAAAATCTTTGAAATAGGGCCCGTATTTACCCTATAG + - ATGCCCCAACTAAATACTACCGTATGGCCCACCATAATTACCCCCATACTCCTTACACTATTCCTCATCACCCAACTAAAAATATTAAACACAAACTACCACCTACCTCCCTCACCAAAGCCCATAAAAATAAAAAATTATAACAAACCCTGAGAACCAAAATGAACGAAAATCTGTTCGCTTCATTCATTGCCCCCACAATCCTAG + - ATGAACGAAAATCTGTTCGCTTCATTCATTGCCCCCACAATCCTAGGCCTACCCGCCGCAGTACTGATCATTCTATTTCCCCCTCTATTGATCCCCACCTCCAAATATCTCATCAACAACCGACTAATCACCACCCAACAATGACTAATCAAACTAACCTCAAAACAAATGATAACCATACACAACACTAAAGGACGAACCTGATCTCTTATACTAGTATCCTTAATCATTTTTATTGCCACAACTAACCTCCTCGGACTCCTGCCTCACTCATTTACACCAACCACCCAACTATCTATAAACCTAGCCATGGCCATCCCCTTATGAGCGGGCACAGTGATTATAGGCTTTCGCTCTAAGATTAAAAATGCCCTAGCCCACTTCTTACCACAAGGCACACCTACACCCCTTATCCCCATACTAGTTATTATCGAAACCATCAGCCTACTCATTCAACCAATAGCCCTGGCCGTACGCCTAACCGCTAACATTACTGCAGGCCACCTACTCATGCACCTAATTGGAAGCGCCACCCTAGCAATATCAACCATTAACCTTCCCTCTACACTTATCATCTTCACAATTCTAATTCTACTGACTATCCTAGAAATCGCTGTCGCCTTAATCCAAGCCTACGTTTTCACACTTCTAGTAAGCCTCTACCTGCACGACAACACATAA + - 
GCTAAACCTAGCCCCAAACCCACTCCACCTTACTACCAGACAACCTTAGCCAAACCATTTACCCAAATAAAGTATAGGCGATAGAAATTGAAACCTGGCGCAATAGATATAGTACCGCAAGGGAAAGATGAAAAATTATAACCAAGCATAATATAGCAAGGACTAACCCCTATACCTTCTGCATAATGAATTAACTAGAAATAACTTTGCAAGGAGAGCCAAAGCTAAGACCCCCGAAACCAGACGAGCTACCTAAGAACAGCTAAAAGAGCACACCCGTCTATGTAGCAAAATAGTGGGAAGATTTATAGGTAGAGGCGACAAACCTACCGAGCCTGGTGATAGCTGGTTGTCCAAGATAGAATCTTAGTTCAACTTTAAATTTGCCCACAGAACCCTCTAAATCCCCTTGTAAATTTAACTGTTAGTCCAAAGAGGAACAGCTCTTTGGACACTAGGAAAAAACCTTGTAGAGAGAGTAAAAAATTTAACACCCATAGTAGGCCTAAAAGCAGCCACCAATTAAGAAAGCGTTCAAGCTCAACACCCACTACCTAAAAAATCCCAAACATATAACTGAACTCCTCACACCCAATTGGACCAATCTATCACCCTATAGAAGAACTAATGTTAGTATAAGTAACATGAAAACATTCTCCTCCGCATAAGCCTGCGTCAGATTAAAACACTGAACTGACAATTAACAGCCCAATATCTACAATCAACCAACAAGTCATTATTACCCTCACTGTCAACCCAACACAGGCATGCTCATAAGGAAAGGTTAAAAAAAGTAAAAGGAACTCGGCAAATCTTACCCCGCCTGTTTACCAAAAACATCACCTCTAGCATCACCAGTATTAGAGGCACCGCCTGCCCAGTGACACATGTTTAACGGCCGCGGTACCCTAACCGTGCAAAGGTAGCATAATCACTTGTTCCTTAAATAGGGACCTGTATGAATGGCTCCACGAGGGTTCAGCTGTCTCTTACTTTTAACCAGTGAAATTGACCTGCCCGTGAAGAGGCGGGCATAACACAGCAAGACGAGAAGACCCTATGGAGCTTTAATTTATTAATGCAAACAGTACCTAACAAACCCACAGGTCCTAAACTACCAAACCTGCATTAAAAATTTCGGTTGGGGCGACCTCGGAGCAGAACCCAACCTCCGAGCAGTACATGCTAAGACTTCACCAGTCAAAGCGAACTACTATACTCAATTGATCCAATAACTTGACCAACGGAACAAGTTACCCTAGGGATAACAGCGCAATCCTATTCTAGAGTCCATATCAACAATAGGGTTTACGACCTCGATGTTGGATCAGGACATCCCGATGGTGCAGCCGCTATTAAAGGTTCGTTTGTTCAACGATTAAAGTCCTACGTGATCTGAGTTCAGACCGGAGTAATCCAGGTCGGTTTCTATCTACNTTCAAATTCCTCCCTGTACGAAAGGACAAGAGAAATAAGGCCTACTTCACAAAGCGCCTTCCCCCGTAAATGATATCATCTCAACTTAGTATTATACCCACACCCACCCAAGAACAGGGTTT + - 
AATAGGTTTGGTCCTAGCCTTTCTATTAGCTCTTAGTAAGATTACACATGCAAGCATCCCCGTTCCAGTGAGTTCACCCTCTAAATCACCACGATCAAAAGGAACAAGCATCAAGCACGCAGCAATGCAGCTCAAAACGCTTAGCCTAGCCACACCCCCACGGGAAACAGCAGTGATTAACCTTTAGCAATAAACGAAAGTTTAACTAAGCTATACTAACCCCAGGGTTGGTCAATTTCGTGCCAGCCACCGCGGTCACACGATTAACCCAAGTCAATAGAAGCCGGCGTAAAGAGTGTTTTAGATCACCCCCTCCCCAATAAAGCTAAAACTCACCTGAGTTGTAAAAAACTCCAGTTGACACAAAATAGACTACGAAAGTGGCTTTAACATATCTGAACACACAATAGCTAAGACCCAAACTGGGATTAGATACCCCACTATGCTTAGCCCTAAACCTCAACAGTTAAATCAACAAAACTGCTCGCCAGAACACTACGAGCCACAGCTTAAAACTCAAAGGACCTGGCGGTGCTTCATATCCCTCTAGAGGAGCCTGTTCTGTAATCGATAAACCCCGATCAACCTCACCACCTCTTGCTCAGCCTATATACCGCCATCTTCAGCAAACCCTGATGAAGGCTACAAAGTAAGCGCAAGTACCCACGTAAAGACGTTAGGTCAAGGTGTAGCCCATGAGGTGGCAAGAAATGGGCTACATTTTCTACCCCAGAAAACTACGATAGCCCTTATGAAACTTAAGGGTCGAAGGTGGATTTAGCAGTAAACTAAGAGTAGAGTGCTTAGTTGAACAGGGCCCTGAAGCGCGTACACACCGCCCGTCACCCTCCTCAAGTATACTTCAAAGGACATTTAACTAAAACCCCTACGCATTTATATAGAGGAGACAAGTCGTAACATGGTAAGTGTACTGGAAAGTGCACTTGGACGAAC +version: 0.0.0 +genomeRelease: GRCh37 diff --git a/src/db/subset/mod.rs b/src/db/subset/mod.rs index fc27d4ee..525d392e 100644 --- a/src/db/subset/mod.rs +++ b/src/db/subset/mod.rs @@ -171,6 +171,7 @@ pub fn run(_common: &crate::common::Args, args: &Args) -> Result<(), anyhow::Err mod tests { use temp_testdir::TempDir; + #[tracing_test::traced_test] #[test] fn test_subset_tx_db() -> Result<(), anyhow::Error> { let temp = TempDir::default(); @@ -192,7 +193,7 @@ mod tests { &mut buf, )?; - insta::assert_display_snapshot!(String::from_utf8(buf)?); + insta::assert_snapshot!(String::from_utf8(buf)?); Ok(()) } diff --git a/tests/data/db/create/mitochondrial/cdot-0.2.23.ensembl.chrMT.grch37.gff3.json b/tests/data/db/create/mitochondrial/cdot-0.2.23.ensembl.chrMT.grch37.gff3.json new file mode 100644 index 00000000..5fe72925 --- /dev/null +++ b/tests/data/db/create/mitochondrial/cdot-0.2.23.ensembl.chrMT.grch37.gff3.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:8c0fd5b3e4f86163860db7e0db86e4825f39f8cb9ea417532f7b80c30de8ac72 +size 41936 diff --git a/tests/data/db/create/mitochondrial/latest/aliases.sqlite3 b/tests/data/db/create/mitochondrial/latest/aliases.sqlite3 new file mode 100644 index 00000000..da085da7 --- /dev/null +++ b/tests/data/db/create/mitochondrial/latest/aliases.sqlite3 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fbba64ad4490ee8c89ccfc5c63df2a55b973bfe019f66e41f4ed4758d3718f4 +size 114688 diff --git a/tests/data/db/create/mitochondrial/latest/sequences/2024/0305/1050/1709635806.4821885.fa.bgz b/tests/data/db/create/mitochondrial/latest/sequences/2024/0305/1050/1709635806.4821885.fa.bgz new file mode 100644 index 00000000..294ca92c --- /dev/null +++ b/tests/data/db/create/mitochondrial/latest/sequences/2024/0305/1050/1709635806.4821885.fa.bgz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77ca87f55093274e29f9efc44d37408c0d3fd249c6d7495bb40ed1b43cf1fc48 +size 6267 diff --git a/tests/data/db/create/mitochondrial/latest/sequences/2024/0305/1050/1709635806.4821885.fa.bgz.fai b/tests/data/db/create/mitochondrial/latest/sequences/2024/0305/1050/1709635806.4821885.fa.bgz.fai new file mode 100644 index 00000000..68387a59 --- /dev/null +++ b/tests/data/db/create/mitochondrial/latest/sequences/2024/0305/1050/1709635806.4821885.fa.bgz.fai @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c989fe353e4cfade25ebe6fb54ed1b1be6869c787c4ff7435f2a7b1338dd84bd +size 1813 diff --git a/tests/data/db/create/mitochondrial/latest/sequences/2024/0305/1050/1709635806.4821885.fa.bgz.gzi b/tests/data/db/create/mitochondrial/latest/sequences/2024/0305/1050/1709635806.4821885.fa.bgz.gzi new file mode 100644 index 00000000..a2b71017 --- /dev/null +++ b/tests/data/db/create/mitochondrial/latest/sequences/2024/0305/1050/1709635806.4821885.fa.bgz.gzi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:af5570f5a1810b7af78caf4bc70a660f0df51e42baf91d4de5b2328de0e83dfc +size 8 diff --git a/tests/data/db/create/mitochondrial/latest/sequences/db.sqlite3 b/tests/data/db/create/mitochondrial/latest/sequences/db.sqlite3 new file mode 100644 index 00000000..be591a4a --- /dev/null +++ b/tests/data/db/create/mitochondrial/latest/sequences/db.sqlite3 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfcbcfd6ea670e9e353f33f40b9898cb33dc6b42d759b154546625e06a4d3b2a +size 61440 From 05c2484c04fc7d612292f82fbe3ea3c2c7b55176 Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Tue, 5 Mar 2024 12:35:33 +0100 Subject: [PATCH 2/4] wip --- src/db/create/mod.rs | 2 +- ...w => mehari__db__create__test__run_smoke_mitochondrial.snap} | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) rename src/db/create/snapshots/{mehari__db__create__test__run_smoke_mitochondrial.snap.new => mehari__db__create__test__run_smoke_mitochondrial.snap} (99%) diff --git a/src/db/create/mod.rs b/src/db/create/mod.rs index c4237a40..a4626d9a 100644 --- a/src/db/create/mod.rs +++ b/src/db/create/mod.rs @@ -300,7 +300,7 @@ fn load_and_extract( /// Perform protobuf file construction. /// -/// This can be done by simply converting the models from HGVS to the prost generated data structures. +/// This can be done by simply converting the models from ``hgvs-rs`` to the prost generated data structures. 
fn build_protobuf( path_out: &Path, seqrepo: SeqRepo, diff --git a/src/db/create/snapshots/mehari__db__create__test__run_smoke_mitochondrial.snap.new b/src/db/create/snapshots/mehari__db__create__test__run_smoke_mitochondrial.snap similarity index 99% rename from src/db/create/snapshots/mehari__db__create__test__run_smoke_mitochondrial.snap.new rename to src/db/create/snapshots/mehari__db__create__test__run_smoke_mitochondrial.snap index 67b2becb..f14ca344 100644 --- a/src/db/create/snapshots/mehari__db__create__test__run_smoke_mitochondrial.snap.new +++ b/src/db/create/snapshots/mehari__db__create__test__run_smoke_mitochondrial.snap @@ -1,6 +1,5 @@ --- source: src/db/create/mod.rs -assertion_line: 1135 expression: "String::from_utf8(buf)?" --- txDb: From cb7b3c9053023aba9389477e0bd29faaff56abeb Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Tue, 5 Mar 2024 15:47:47 +0100 Subject: [PATCH 3/4] wip --- src/db/create/mod.rs | 64 ++++++- ...create__test__run_smoke_mitochondrial.snap | 170 +++++++++++++++++- 2 files changed, 219 insertions(+), 15 deletions(-) diff --git a/src/db/create/mod.rs b/src/db/create/mod.rs index a4626d9a..bf162665 100644 --- a/src/db/create/mod.rs +++ b/src/db/create/mod.rs @@ -77,6 +77,7 @@ fn load_and_extract( genome_release: GenomeRelease, cdot_version: &mut String, report_file: &mut File, + mt_tx_ids: &mut indexmap::IndexSet, ) -> Result<(), anyhow::Error> { writeln!(report_file, "genome_release\t{:?}", genome_release)?; let txid_to_label = if let Some(label_tsv_path) = label_tsv_path { @@ -148,6 +149,7 @@ fn load_and_extract( for gb in tx.genome_builds.values() { if MITOCHONDRIAL_ACCESSIONS.contains(&gb.contig.as_str()) { genes_chrmt.insert(tx.gene_version.clone()); + mt_tx_ids.insert(tx.id.clone()); } if let Some(tag) = &gb.tag { if tag.contains(&models::Tag::ManeSelect) { @@ -269,7 +271,9 @@ fn load_and_extract( .get_mut(hgnc_id) .unwrap_or_else(|| panic!("tx {:?} for unknown gene {:?}", tx.id, hgnc_id)) .push(tx.id.clone()); + // 
build output transcripts let mut tx_out = tx.clone(); + // transfer MANE-related labels from TSV file if let Some(txid_to_tags) = txid_to_label.as_ref() { let tx_id_no_version = tx.id.split('.').next().unwrap(); if let Some(tags) = txid_to_tags.get(tx_id_no_version) { @@ -282,6 +286,30 @@ fn load_and_extract( }); } } + // fix coding mitochondrial transcripts that have a CDS that is not a multiple of 3 + if let Some(cds_start) = tx_out.start_codon { + let cds_end = tx_out + .stop_codon + .expect("must be some if start_codon is some"); + let cds_len = cds_end - cds_start; + if cds_len % 3 != 0 { + assert_eq!( + tx.genome_builds.len(), + 1, + "only one genome build expected at this point" + ); + let gb = tx_out.genome_builds.iter_mut().next().unwrap().1; + assert_eq!(gb.exons.len(), 1, "only single-exon genes assumed on chrMT"); + if MITOCHONDRIAL_ACCESSIONS.contains(&gb.contig.as_ref()) { + let delta = 3 - cds_len % 3; + tx_out.stop_codon = Some(cds_end + delta); + let exon = gb.exons.iter_mut().next().unwrap(); + exon.alt_cds_end_i += delta; + exon.cigar.push_str(&format!("{}I", delta)); + } + } + } + // finally, insert into transcripts transcripts.insert(tx.id.clone(), tx_out); }); writeln!( @@ -304,6 +332,7 @@ fn load_and_extract( fn build_protobuf( path_out: &Path, seqrepo: SeqRepo, + mt_tx_ids: indexmap::IndexSet, tx_data: TranscriptData, is_silent: bool, genome_release: GenomeRelease, @@ -345,7 +374,15 @@ fn build_protobuf( namespace, }); let seq = if let Ok(seq) = res_seq { - seq + // Append poly-A for chrMT transcripts (which are from ENSEMBL). + // This also potentially fixes the stop codon. 
+ if mt_tx_ids.contains(tx_id) { + let mut seq = seq.into_bytes(); + seq.extend_from_slice(b"A".repeat(300).as_slice()); + String::from_utf8(seq).expect("must be valid UTF-8") + } else { + seq + } } else { tracing::debug!("Skipping transcript {} because of missing sequence", tx_id); writeln!( @@ -897,13 +934,17 @@ fn open_seqrepo(args: &Args) -> Result { } /// Load the cdot JSON files. -fn load_cdot_files(args: &Args, report_file: &mut File) -> Result { +fn load_cdot_files( + args: &Args, + report_file: &mut File, +) -> Result<(indexmap::IndexSet, TranscriptData), anyhow::Error> { tracing::info!("Loading cdot JSON files ..."); let start = Instant::now(); let mut genes = indexmap::IndexMap::new(); let mut transcripts = indexmap::IndexMap::new(); let mut transcript_ids_for_gene = indexmap::IndexMap::new(); let mut cdot_version = String::new(); + let mut mt_tx_ids = indexmap::IndexSet::new(); for json_path in &args.path_cdot_json { load_and_extract( json_path, @@ -914,6 +955,7 @@ fn load_cdot_files(args: &Args, report_file: &mut File) -> Result Result Result<(), anyhow::Erro // Open seqrepo, let seqrepo = open_seqrepo(args)?; // then load cdot files, - let tx_data = load_cdot_files(args, &mut report_file)?; + let (mt_tx_ids, tx_data) = load_cdot_files(args, &mut report_file)?; // then remove redundant onces, and let tx_data = filter_transcripts(tx_data, args.max_txs, &args.gene_symbols, &mut report_file)?; // finally build protobuf file. 
build_protobuf( &args.path_out, seqrepo, + mt_tx_ids, tx_data, common.verbose.is_silent(), args.genome_release, @@ -990,6 +1036,7 @@ pub mod test { let mut transcript_ids_for_gene = indexmap::IndexMap::new(); let mut cdot_version = String::new(); let path_tsv = Path::new("tests/data/db/create/txs/txs_main.tsv"); + let mut mt_tx_ids = indexmap::IndexSet::new(); load_and_extract( Path::new("tests/data/db/create/txs/cdot-0.2.22.refseq.grch37_grch38.brca1_opa1.json"), &Some(path_tsv), @@ -999,6 +1046,7 @@ pub mod test { GenomeRelease::Grch37, &mut cdot_version, &mut report_file, + &mut mt_tx_ids, )?; let tx_data = TranscriptData { diff --git a/src/db/create/snapshots/mehari__db__create__test__run_smoke_mitochondrial.snap b/src/db/create/snapshots/mehari__db__create__test__run_smoke_mitochondrial.snap index f14ca344..f7482d48 100644 --- a/src/db/create/snapshots/mehari__db__create__test__run_smoke_mitochondrial.snap +++ b/src/db/create/snapshots/mehari__db__create__test__run_smoke_mitochondrial.snap @@ -64,6 +64,126 @@ txDb: altCdsStartI: 1 altCdsEndI: 684 cigar: 684M + - id: ENST00000362079.2 + geneSymbol: MT-CO3 + geneId: HGNC:7422 + biotype: TRANSCRIPT_BIOTYPE_CODING + tags: + - TRANSCRIPT_TAG_BASIC + startCodon: 0 + stopCodon: 786 + genomeAlignments: + - genomeBuild: GENOME_BUILD_GRCH37 + contig: NC_012920.1 + cdsStart: 9206 + cdsEnd: 9990 + strand: STRAND_PLUS + exons: + - altStartI: 9206 + altEndI: 9990 + altCdsStartI: 1 + altCdsEndI: 786 + cigar: 784M2I + - id: ENST00000361789.2 + geneSymbol: MT-CYB + geneId: HGNC:7427 + biotype: TRANSCRIPT_BIOTYPE_CODING + tags: + - TRANSCRIPT_TAG_BASIC + startCodon: 0 + stopCodon: 1143 + genomeAlignments: + - genomeBuild: GENOME_BUILD_GRCH37 + contig: NC_012920.1 + cdsStart: 14746 + cdsEnd: 15887 + strand: STRAND_PLUS + exons: + - altStartI: 14746 + altEndI: 15887 + altCdsStartI: 1 + altCdsEndI: 1143 + cigar: 1141M2I + - id: ENST00000361390.2 + geneSymbol: MT-ND1 + geneId: HGNC:7455 + biotype: TRANSCRIPT_BIOTYPE_CODING + tags: 
+ - TRANSCRIPT_TAG_BASIC + startCodon: 0 + stopCodon: 957 + genomeAlignments: + - genomeBuild: GENOME_BUILD_GRCH37 + contig: NC_012920.1 + cdsStart: 3306 + cdsEnd: 4262 + strand: STRAND_PLUS + exons: + - altStartI: 3306 + altEndI: 4262 + altCdsStartI: 1 + altCdsEndI: 957 + cigar: 956M1I + - id: ENST00000361453.3 + geneSymbol: MT-ND2 + geneId: HGNC:7456 + biotype: TRANSCRIPT_BIOTYPE_CODING + tags: + - TRANSCRIPT_TAG_BASIC + startCodon: 0 + stopCodon: 1044 + genomeAlignments: + - genomeBuild: GENOME_BUILD_GRCH37 + contig: NC_012920.1 + cdsStart: 4469 + cdsEnd: 5511 + strand: STRAND_PLUS + exons: + - altStartI: 4469 + altEndI: 5511 + altCdsStartI: 1 + altCdsEndI: 1044 + cigar: 1042M2I + - id: ENST00000361227.2 + geneSymbol: MT-ND3 + geneId: HGNC:7458 + biotype: TRANSCRIPT_BIOTYPE_CODING + tags: + - TRANSCRIPT_TAG_BASIC + startCodon: 0 + stopCodon: 348 + genomeAlignments: + - genomeBuild: GENOME_BUILD_GRCH37 + contig: NC_012920.1 + cdsStart: 10058 + cdsEnd: 10404 + strand: STRAND_PLUS + exons: + - altStartI: 10058 + altEndI: 10404 + altCdsStartI: 1 + altCdsEndI: 348 + cigar: 346M2I + - id: ENST00000361381.2 + geneSymbol: MT-ND4 + geneId: HGNC:7459 + biotype: TRANSCRIPT_BIOTYPE_CODING + tags: + - TRANSCRIPT_TAG_BASIC + startCodon: 0 + stopCodon: 1380 + genomeAlignments: + - genomeBuild: GENOME_BUILD_GRCH37 + contig: NC_012920.1 + cdsStart: 10759 + cdsEnd: 12137 + strand: STRAND_PLUS + exons: + - altStartI: 10759 + altEndI: 12137 + altCdsStartI: 1 + altCdsEndI: 1380 + cigar: 1378M2I - id: ENST00000361335.1 geneSymbol: MT-ND4L geneId: HGNC:7460 @@ -143,15 +263,33 @@ txDb: - geneId: HGNC:7421 txIds: - ENST00000361739.1 + - geneId: HGNC:7427 + txIds: + - ENST00000361789.2 + - geneId: HGNC:7456 + txIds: + - ENST00000361453.3 - geneId: HGNC:7461 txIds: - ENST00000361567.2 - geneId: HGNC:7419 txIds: - ENST00000361624.2 + - geneId: HGNC:7458 + txIds: + - ENST00000361227.2 + - geneId: HGNC:7459 + txIds: + - ENST00000361381.2 + - geneId: HGNC:7455 + txIds: + - ENST00000361390.2 - 
geneId: HGNC:7414 txIds: - ENST00000361899.2 + - geneId: HGNC:7422 + txIds: + - ENST00000362079.2 - geneId: HGNC:7471 txIds: - ENST00000387347.2 @@ -166,11 +304,17 @@ txDb: - ENST00000361851.1 seqDb: aliases: + - ENST00000361227.2 - ENST00000361335.1 + - ENST00000361381.2 + - ENST00000361390.2 + - ENST00000361453.3 - ENST00000361567.2 - ENST00000361739.1 + - ENST00000361789.2 - ENST00000361851.1 - ENST00000361899.2 + - ENST00000362079.2 - ENST00000387347.2 - ENST00000389680.2 aliasesIdx: @@ -181,13 +325,25 @@ seqDb: - 4 - 5 - 6 + - 7 + - 8 + - 9 + - 10 + - 11 + - 12 seqs: - - ATGCCCCTCATTTACATAAATATTATACTAGCATTTACCATCTCACTTCTAGGAATACTAGTATATCGCTCACACCTCATATCCTCCCTACTATGCCTAGAAGGAATAATACTATCGCTGTTCATTATAGCTACTCTCATAACCCTCAACACCCACTCCCTCTTAGCCAATATTGTGCCTATTGCCATACTAGTCTTTGCCGCCTGCGAAGCAGCGGTGGGCCTAGCCCTACTAGTCTCAATCTCCAACACATATGGCCTAGACTACGTACATAACCTAAACCTACTCCAATGCTAA - - ATAACCATGCACACTACTATAACCACCCTAACCCTGACTTCCCTAATTCCCCCCATCCTTACCACCCTCGTTAACCCTAACAAAAAAAACTCATACCCCCATTATGTAAAATCCATTGTCGCATCCACCTTTATTATCAGTCTCTTCCCCACAACAATATTCATGTGCCTAGACCAAGAAGTTATTATCTCGAACTGACACTGAGCCACAACCCAAACAACCCAGCTCTCCCTAAGCTTCAAACTAGACTACTTCTCCATAATATTCATCCCTGTAGCATTGTTCGTTACATGGTCCATCATAGAATTCTCACTGTGATATATAAACTCAGACCCAAACATTAATCAGTTCTTCAAATATCTACTCATCTTCCTAATTACCATACTAATCTTAGTTACCGCTAACAACCTATTCCAACTGTTCATCGGCTGAGAGGGCGTAGGAATTATATCCTTCTTGCTCATCAGTTGATGATACGCCCGAGCAGATGCCAACACAGCAGCCATTCAAGCAATCCTATACAACCGTATCGGCGATATCGGTTTCATCCTCGCCTTAGCATGATTTATCCTACACTCCAACTCATGAGACCCACAACAAATAGCCCTTCTAAACGCTAATCCAAGCCTCACCCCACTACTAGGCCTCCTCCTAGCAGCAGCAGGCAAATCAGCCCAATTAGGTCTCCACCCCTGACTCCCCTCAGCCATAGAAGGCCCCACCCCAGTCTCAGCCCTACTCCACTCAAGCACTATAGTTGTAGCAGGAATCTTCTTACTCATCCGCTTCCACCCCCTAGCAGAAAATAGCCCACTAATCCAAACTCTAACACTATGCTTAGGCGCTATCACCACTCTGTTCGCAGCAGTCTGCGCCCTTACACAAAATGACATCAAAAAAATCGTAGCCTTCTCCACTTCAAGTCAACTAGGACTCATAATAGTTACAATCGGCATCAACCAACCACACCTAGCATTCCTGCACATCTGTACCCACGCCTTCTTCAAAGCCATACTATTTATGTGCTCCGGGTCCATCATCCACAACCTTAACAATGAACAAGATATTCGAAAAATAGGAGGACTACTCAAAACCATACCTCTCACTTCAACCTC
CCTCACCATTGGCAGCCTAGCATTAGCAGGAATACCTTTCCTCACAGGTTTCTACTCCAAAGACCACATCATCGAAACCGCAAACATATCATACACAAACGCCTGAGCCCTATCTATTACTCTCATCGCTACCTCCCTGACAAGCGCCTATAGCACTCGAATAATTCTTCTCACCCTAACAGGTCAACCTCGCTTCCCCACCCTTACTAACATTAACGAAAATAACCCCACCCTACTAAACCCCATTAAACGCCTGGCAGCCGGAAGCCTATTCGCAGGATTTCTCATTACTAACAACATTTCCCCCGCATCCCCCTTCCAAACAACAATCCCCCTCTACCTAAAACTCACAGCCCTCGCTGTCACTTTCCTAGGACTTCTAACAGCCCTAGACCTCAACTACCTAACCAACAAACTTAAAATAAAATCCCCACTATGCACATTTTATTTCTCCAACATACTCGGATTCTACCCTAGCATCACACACCGCACAATCCCCTATCTAGGCCTTCTTACGAGCCAAAACCTGCCCCTACTCCTCCTAGACCTAACCTGACTAGAAAAGCTATTACCTAAAACAATTTCACAGCACCAAATCTCCACCTCCATCATCACCTCAACCCAAAAAGGCATAATTAAACTTTACTTCCTCTCTTTCTTCTTCCCACTCATCCTAACCCTACTCCTAATCACATAA - - ATGGCACATGCAGCGCAAGTAGGTCTACAAGACGCTACTTCCCCTATCATAGAAGAGCTTATCACCTTTCATGATCACGCCCTCATAATCATTTTCCTTATCTGCTTCCTAGTCCTGTATGCCCTTTTCCTAACACTCACAACAAAACTAACTAATACTAACATCTCAGACGCTCAGGAAATAGAAACCGTCTGAACTATCCTGCCCGCCATCATCCTAGTCCTCATCGCCCTCCCATCCCTACGCATCCTTTACATAACAGACGAGGTCAACGATCCCTCCCTTACCATCAAATCAATTGGCCACCAATGGTACTGAACCTACGAGTACACCGACTACGGCGGACTAATCTTCAACTCCTACATACTTCCCCCATTATTCCTAGAACCAGGCGACCTGCGACTCCTTGACGTTGACAATCGAGTAGTACTCCCGATTGAAGCCCCCATTCGTATAATAATTACATCACAAGACGTCTTGCACTCATGAGCTGTCCCCACATTAGGCTTAAAAACAGATGCAATTCCCGGACGTCTAAACCAAACCACTTTCACCGCTACACGACCGGGGGTATACTACGGTCAATGCTCTGAAATCTGTGGAGCAAACCACAGTTTCATGCCCATCGTCCTAGAATTAATTCCCCTAAAAATCTTTGAAATAGGGCCCGTATTTACCCTATAG - - ATGCCCCAACTAAATACTACCGTATGGCCCACCATAATTACCCCCATACTCCTTACACTATTCCTCATCACCCAACTAAAAATATTAAACACAAACTACCACCTACCTCCCTCACCAAAGCCCATAAAAATAAAAAATTATAACAAACCCTGAGAACCAAAATGAACGAAAATCTGTTCGCTTCATTCATTGCCCCCACAATCCTAG - - 
ATGAACGAAAATCTGTTCGCTTCATTCATTGCCCCCACAATCCTAGGCCTACCCGCCGCAGTACTGATCATTCTATTTCCCCCTCTATTGATCCCCACCTCCAAATATCTCATCAACAACCGACTAATCACCACCCAACAATGACTAATCAAACTAACCTCAAAACAAATGATAACCATACACAACACTAAAGGACGAACCTGATCTCTTATACTAGTATCCTTAATCATTTTTATTGCCACAACTAACCTCCTCGGACTCCTGCCTCACTCATTTACACCAACCACCCAACTATCTATAAACCTAGCCATGGCCATCCCCTTATGAGCGGGCACAGTGATTATAGGCTTTCGCTCTAAGATTAAAAATGCCCTAGCCCACTTCTTACCACAAGGCACACCTACACCCCTTATCCCCATACTAGTTATTATCGAAACCATCAGCCTACTCATTCAACCAATAGCCCTGGCCGTACGCCTAACCGCTAACATTACTGCAGGCCACCTACTCATGCACCTAATTGGAAGCGCCACCCTAGCAATATCAACCATTAACCTTCCCTCTACACTTATCATCTTCACAATTCTAATTCTACTGACTATCCTAGAAATCGCTGTCGCCTTAATCCAAGCCTACGTTTTCACACTTCTAGTAAGCCTCTACCTGCACGACAACACATAA - - GCTAAACCTAGCCCCAAACCCACTCCACCTTACTACCAGACAACCTTAGCCAAACCATTTACCCAAATAAAGTATAGGCGATAGAAATTGAAACCTGGCGCAATAGATATAGTACCGCAAGGGAAAGATGAAAAATTATAACCAAGCATAATATAGCAAGGACTAACCCCTATACCTTCTGCATAATGAATTAACTAGAAATAACTTTGCAAGGAGAGCCAAAGCTAAGACCCCCGAAACCAGACGAGCTACCTAAGAACAGCTAAAAGAGCACACCCGTCTATGTAGCAAAATAGTGGGAAGATTTATAGGTAGAGGCGACAAACCTACCGAGCCTGGTGATAGCTGGTTGTCCAAGATAGAATCTTAGTTCAACTTTAAATTTGCCCACAGAACCCTCTAAATCCCCTTGTAAATTTAACTGTTAGTCCAAAGAGGAACAGCTCTTTGGACACTAGGAAAAAACCTTGTAGAGAGAGTAAAAAATTTAACACCCATAGTAGGCCTAAAAGCAGCCACCAATTAAGAAAGCGTTCAAGCTCAACACCCACTACCTAAAAAATCCCAAACATATAACTGAACTCCTCACACCCAATTGGACCAATCTATCACCCTATAGAAGAACTAATGTTAGTATAAGTAACATGAAAACATTCTCCTCCGCATAAGCCTGCGTCAGATTAAAACACTGAACTGACAATTAACAGCCCAATATCTACAATCAACCAACAAGTCATTATTACCCTCACTGTCAACCCAACACAGGCATGCTCATAAGGAAAGGTTAAAAAAAGTAAAAGGAACTCGGCAAATCTTACCCCGCCTGTTTACCAAAAACATCACCTCTAGCATCACCAGTATTAGAGGCACCGCCTGCCCAGTGACACATGTTTAACGGCCGCGGTACCCTAACCGTGCAAAGGTAGCATAATCACTTGTTCCTTAAATAGGGACCTGTATGAATGGCTCCACGAGGGTTCAGCTGTCTCTTACTTTTAACCAGTGAAATTGACCTGCCCGTGAAGAGGCGGGCATAACACAGCAAGACGAGAAGACCCTATGGAGCTTTAATTTATTAATGCAAACAGTACCTAACAAACCCACAGGTCCTAAACTACCAAACCTGCATTAAAAATTTCGGTTGGGGCGACCTCGGAGCAGAACCCAACCTCCGAGCAGTACATGCTAAGACTTCACCAGTCAAAGCGAACTACTATACTCAATTGATCCAATAACTTGACCAACGGAACAAGTTACCCTAGGGATAACAGCGCAATCCTATTCTAGAGTCCATATCAACAATAGGGTTTACGA
CCTCGATGTTGGATCAGGACATCCCGATGGTGCAGCCGCTATTAAAGGTTCGTTTGTTCAACGATTAAAGTCCTACGTGATCTGAGTTCAGACCGGAGTAATCCAGGTCGGTTTCTATCTACNTTCAAATTCCTCCCTGTACGAAAGGACAAGAGAAATAAGGCCTACTTCACAAAGCGCCTTCCCCCGTAAATGATATCATCTCAACTTAGTATTATACCCACACCCACCCAAGAACAGGGTTT - - AATAGGTTTGGTCCTAGCCTTTCTATTAGCTCTTAGTAAGATTACACATGCAAGCATCCCCGTTCCAGTGAGTTCACCCTCTAAATCACCACGATCAAAAGGAACAAGCATCAAGCACGCAGCAATGCAGCTCAAAACGCTTAGCCTAGCCACACCCCCACGGGAAACAGCAGTGATTAACCTTTAGCAATAAACGAAAGTTTAACTAAGCTATACTAACCCCAGGGTTGGTCAATTTCGTGCCAGCCACCGCGGTCACACGATTAACCCAAGTCAATAGAAGCCGGCGTAAAGAGTGTTTTAGATCACCCCCTCCCCAATAAAGCTAAAACTCACCTGAGTTGTAAAAAACTCCAGTTGACACAAAATAGACTACGAAAGTGGCTTTAACATATCTGAACACACAATAGCTAAGACCCAAACTGGGATTAGATACCCCACTATGCTTAGCCCTAAACCTCAACAGTTAAATCAACAAAACTGCTCGCCAGAACACTACGAGCCACAGCTTAAAACTCAAAGGACCTGGCGGTGCTTCATATCCCTCTAGAGGAGCCTGTTCTGTAATCGATAAACCCCGATCAACCTCACCACCTCTTGCTCAGCCTATATACCGCCATCTTCAGCAAACCCTGATGAAGGCTACAAAGTAAGCGCAAGTACCCACGTAAAGACGTTAGGTCAAGGTGTAGCCCATGAGGTGGCAAGAAATGGGCTACATTTTCTACCCCAGAAAACTACGATAGCCCTTATGAAACTTAAGGGTCGAAGGTGGATTTAGCAGTAAACTAAGAGTAGAGTGCTTAGTTGAACAGGGCCCTGAAGCGCGTACACACCGCCCGTCACCCTCCTCAAGTATACTTCAAAGGACATTTAACTAAAACCCCTACGCATTTATATAGAGGAGACAAGTCGTAACATGGTAAGTGTACTGGAAAGTGCACTTGGACGAAC + - ATAAACTTCGCCTTAATTTTAATAATCAACACCCTCCTAGCCTTACTACTAATAATTATTACATTTTGACTACCACAACTCAACGGCTACATAGAAAAATCCACCCCTTACGAGTGCGGCTTCGACCCTATATCCCCCGCCCGCGTCCCTTTCTCCATAAAATTCTTCTTAGTAGCTATTACCTTCTTATTATTTGATCTAGAAATTGCCCTCCTTTTACCCCTACCATGAGCCCTACAAACAACTAACCTGCCACTAATAGTTATGTCATCCCTCTTATTAATCATCATCCTAGCCCTAAGTCTGGCCTATGAGTGACTACAAAAAGGATTAGACTGAACCGAATAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA + - 
ATGCCCCTCATTTACATAAATATTATACTAGCATTTACCATCTCACTTCTAGGAATACTAGTATATCGCTCACACCTCATATCCTCCCTACTATGCCTAGAAGGAATAATACTATCGCTGTTCATTATAGCTACTCTCATAACCCTCAACACCCACTCCCTCTTAGCCAATATTGTGCCTATTGCCATACTAGTCTTTGCCGCCTGCGAAGCAGCGGTGGGCCTAGCCCTACTAGTCTCAATCTCCAACACATATGGCCTAGACTACGTACATAACCTAAACCTACTCCAATGCTAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA + - ATGCTAAAACTAATCGTCCCAACAATTATATTACTACCACTGACATGACTTTCCAAAAAACACATAATTTGAATCAACACAACCACCCACAGCCTAATTATTAGCATCATCCCTCTACTATTTTTTAACCAAATCAACAACAACCTATTTAGCTGTTCCCCAACCTTTTCCTCCGACCCCCTAACAACCCCCCTCCTAATACTAACTACCTGACTCCTACCCCTCACAATCATGGCAAGCCAACGCCACTTATCCAGTGAACCACTATCACGAAAAAAACTCTACCTCTCTATACTAATCTCCCTACAAATCTCCTTAATTATAACATTCACAGCCACAGAACTAATCATATTTTATATCTTCTTCGAAACCACACTTATCCCCACCTTGGCTATCATCACCCGATGAGGCAACCAGCCAGAACGCCTGAACGCAGGCACATACTTCCTATTCTACACCCTAGTAGGCTCCCTTCCCCTACTCATCGCACTAATTTACACTCACAACACCCTAGGCTCACTAAACATTCTACTACTCACTCTCACTGCCCAAGAACTATCAAACTCCTGAGCCAACAACTTAATATGACTAGCTTACACAATAGCTTTTATAGTAAAGATACCTCTTTACGGACTCCACTTATGACTCCCTAAAGCCCATGTCGAAGCCCCCATCGCTGGGTCAATAGTACTTGCCGCAGTACTCTTAAAACTAGGCGGCTATGGTATAATACGCCTCACACTCATTCTCAACCCCCTGACAAAACACATAGCCTACCCCTTCCTTGTACTATCCCTATGAGGCATAATTATAACAAGCTCCATCTGCCTACGACAAACAGACCTAAAATCGCTCATTGCATACTCTTCAATCAGCCACATAGCCCTCGTAGTAACAGCCATTCTCATCCAAACCCCCTGAAGCTTCACCGGCGCAGTCATTCTCATAATCGCCCACGGGCTTACATCCTCATTACTATTCTGCCTAGCAAACTCAAACTACGAACGCACTCACAGTCGCATCATAATCCTCTCTCAAGGACTTCAAACTCTACTCCCACTAATAGCTTTTTGATGACTTCTAGCAAGCCTCGCTAACCTCGCCTTACCCCCCACTATTAACCTACTGGGAGAACTCTCTGTGCTAGTAACCACGTTCTCCTGATCAAATATCACTCTCCTACTTACAGGACTCAACATACTAGTCACAGCCCTATACTCCCTCTACATATTTACCACAACACAATGGGGCTCACTCACCCACCACATTAACAACATAAAACCCTCATTCACACGAGAAAACACCCTCATGTTCATACACCTATCCCCCATTCTCCTCCTATCCCTCAACCCCGACATCATTACCGGGTTTTCCTCTTAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA + - ATACCCATGGCCAACCTCCTACTCCTCATTGTACCCATTCTAATCGCAATGGCATTCCTAATGCTTACCGAACGAAAAATTCTAGGCTATATACAACTACGCAAAGGCCCCAACGTTGTAGGCCCCTACGGGCTACTACAACCCTTCGCTGACGCCATAAAACTCTTCACCAAAGAGCCCCTAAAACCCGCCACATCTACCATCACCCTCTACATCACCGCCCCGACCTTAGCTCTCACCATCGCTCTTCTACTATGAACCCCCCTCCCCATACCCAACCCCCTGGTCAACCTCAACCTAGGCCTCCTATTTATTCTAGCCACCTCTAGCCTAGCCGTTTACTCAATCCTCTGATCAGGGTGAGCATCAAACTCAAACTACGCCCTGATCGGCGCACTGCGAGCAGTAGCCCAAACAATCTCATATGAAGTCACCCTAGCCATCATTCTACTATCAACATTACTAATAAGTGGCTCCTTTAACCTCTCCACCCTTATCACAACACAAGAACACCTCTGATTACTCCTGCCATCATGACCCTTGGCCATAATATGATTTATCTCCACACTAGCAGAGACCAACCGAACCCCCTTCGACCTTGCCGAAGGGGAGTCCGAACTAGTCTCAGGCTTCAACATCGAATACGCCGCAGGCCCCTTCGCCCTATTCTTCATAGCCGAATACACAAACATTATTATAATAAACACCCTCACCACTACAATCTTCCTAGGAACAACATATGACGCACTCTCCCCTGAACTCTACACAACATATTTTGTCACCAAGACCCTACTTCTAACCTCCCTGTTCTTATGAATTCGAACAGCATACCCCCGATTCCGCTACGACCAACTCATACACCTCCTATGAAAAAACTTCCTACCACTCACCCTAGCATTACTTATATGATATGTCTCCATACCCATTACAATCTCCAGCATTCCCCCTCAAACCTAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA + - 
ATTAATCCCCTGGCCCAACCCGTCATCTACTCTACCATCTTTGCAGGCACACTCATCACAGCGCTAAGCTCGCACTGATTTTTTACCTGAGTAGGCCTAGAAATAAACATGCTAGCTTTTATTCCAGTTCTAACCAAAAAAATAAACCCTCGTTCCACAGAAGCTGCCATCAAGTATTTCCTCACGCAAGCAACCGCATCCATAATCCTTCTAATAGCTATCCTCTTCAACAATATACTCTCCGGACAATGAACCATAACCAATACTACCAATCAATACTCATCATTAATAATCATAATAGCTATAGCAATAAAACTAGGAATAGCCCCCTTTCACTTCTGAGTCCCAGAGGTTACCCAAGGCACCCCTCTGACATCCGGCCTGCTTCTTCTCACATGACAAAAACTAGCCCCCATCTCAATCATATACCAAATCTCTCCCTCACTAAACGTAAGCCTTCTCCTCACTCTCTCAATCTTATCCATCATAGCAGGCAGTTGAGGTGGATTAAACCAAACCCAGCTACGCAAAATCTTAGCATACTCCTCAATTACCCACATAGGATGAATAATAGCAGTTCTACCGTACAACCCTAACATAACCATTCTTAATTTAACTATTTATATTATCCTAACTACTACCGCATTCCTACTACTCAACTTAAACTCCAGCACCACGACCCTACTACTATCTCGCACCTGAAACAAGCTAACATGACTAACACCCTTAATTCCATCCACCCTCCTCTCCCTAGGAGGCCTGCCCCCGCTAACCGGCTTTTTGCCCAAATGGGCCATTATCGAAGAATTCACAAAAAACAATAGCCTCATCATCCCCACCATCATAGCCACCATCACCCTCCTTAACCTCTACTTCTACCTACGCCTAATCTACTCCACCTCAATCACACTACTCCCCATATCTAACAACGTAAAAATAAAATGACAGTTTGAACATACAAAACCCACCCCATTCCTCCCCACACTCATCGCCCTTACCACGCTACTCCTACCTATCTCCCCTTTTATACTAATAATCTTATAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA + - 
ATAACCATGCACACTACTATAACCACCCTAACCCTGACTTCCCTAATTCCCCCCATCCTTACCACCCTCGTTAACCCTAACAAAAAAAACTCATACCCCCATTATGTAAAATCCATTGTCGCATCCACCTTTATTATCAGTCTCTTCCCCACAACAATATTCATGTGCCTAGACCAAGAAGTTATTATCTCGAACTGACACTGAGCCACAACCCAAACAACCCAGCTCTCCCTAAGCTTCAAACTAGACTACTTCTCCATAATATTCATCCCTGTAGCATTGTTCGTTACATGGTCCATCATAGAATTCTCACTGTGATATATAAACTCAGACCCAAACATTAATCAGTTCTTCAAATATCTACTCATCTTCCTAATTACCATACTAATCTTAGTTACCGCTAACAACCTATTCCAACTGTTCATCGGCTGAGAGGGCGTAGGAATTATATCCTTCTTGCTCATCAGTTGATGATACGCCCGAGCAGATGCCAACACAGCAGCCATTCAAGCAATCCTATACAACCGTATCGGCGATATCGGTTTCATCCTCGCCTTAGCATGATTTATCCTACACTCCAACTCATGAGACCCACAACAAATAGCCCTTCTAAACGCTAATCCAAGCCTCACCCCACTACTAGGCCTCCTCCTAGCAGCAGCAGGCAAATCAGCCCAATTAGGTCTCCACCCCTGACTCCCCTCAGCCATAGAAGGCCCCACCCCAGTCTCAGCCCTACTCCACTCAAGCACTATAGTTGTAGCAGGAATCTTCTTACTCATCCGCTTCCACCCCCTAGCAGAAAATAGCCCACTAATCCAAACTCTAACACTATGCTTAGGCGCTATCACCACTCTGTTCGCAGCAGTCTGCGCCCTTACACAAAATGACATCAAAAAAATCGTAGCCTTCTCCACTTCAAGTCAACTAGGACTCATAATAGTTACAATCGGCATCAACCAACCACACCTAGCATTCCTGCACATCTGTACCCACGCCTTCTTCAAAGCCATACTATTTATGTGCTCCGGGTCCATCATCCACAACCTTAACAATGAACAAGATATTCGAAAAATAGGAGGACTACTCAAAACCATACCTCTCACTTCAACCTCCCTCACCATTGGCAGCCTAGCATTAGCAGGAATACCTTTCCTCACAGGTTTCTACTCCAAAGACCACATCATCGAAACCGCAAACATATCATACACAAACGCCTGAGCCCTATCTATTACTCTCATCGCTACCTCCCTGACAAGCGCCTATAGCACTCGAATAATTCTTCTCACCCTAACAGGTCAACCTCGCTTCCCCACCCTTACTAACATTAACGAAAATAACCCCACCCTACTAAACCCCATTAAACGCCTGGCAGCCGGAAGCCTATTCGCAGGATTTCTCATTACTAACAACATTTCCCCCGCATCCCCCTTCCAAACAACAATCCCCCTCTACCTAAAACTCACAGCCCTCGCTGTCACTTTCCTAGGACTTCTAACAGCCCTAGACCTCAACTACCTAACCAACAAACTTAAAATAAAATCCCCACTATGCACATTTTATTTCTCCAACATACTCGGATTCTACCCTAGCATCACACACCGCACAATCCCCTATCTAGGCCTTCTTACGAGCCAAAACCTGCCCCTACTCCTCCTAGACCTAACCTGACTAGAAAAGCTATTACCTAAAACAATTTCACAGCACCAAATCTCCACCTCCATCATCACCTCAACCCAAAAAGGCATAATTAAACTTTACTTCCTCTCTTTCTTCTTCCCACTCATCCTAACCCTACTCCTAATCACATAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA + - ATGGCACATGCAGCGCAAGTAGGTCTACAAGACGCTACTTCCCCTATCATAGAAGAGCTTATCACCTTTCATGATCACGCCCTCATAATCATTTTCCTTATCTGCTTCCTAGTCCTGTATGCCCTTTTCCTAACACTCACAACAAAACTAACTAATACTAACATCTCAGACGCTCAGGAAATAGAAACCGTCTGAACTATCCTGCCCGCCATCATCCTAGTCCTCATCGCCCTCCCATCCCTACGCATCCTTTACATAACAGACGAGGTCAACGATCCCTCCCTTACCATCAAATCAATTGGCCACCAATGGTACTGAACCTACGAGTACACCGACTACGGCGGACTAATCTTCAACTCCTACATACTTCCCCCATTATTCCTAGAACCAGGCGACCTGCGACTCCTTGACGTTGACAATCGAGTAGTACTCCCGATTGAAGCCCCCATTCGTATAATAATTACATCACAAGACGTCTTGCACTCATGAGCTGTCCCCACATTAGGCTTAAAAACAGATGCAATTCCCGGACGTCTAAACCAAACCACTTTCACCGCTACACGACCGGGGGTATACTACGGTCAATGCTCTGAAATCTGTGGAGCAAACCACAGTTTCATGCCCATCGTCCTAGAATTAATTCCCCTAAAAATCTTTGAAATAGGGCCCGTATTTACCCTATAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA + - 
ATGACCCCAATACGCAAAACTAACCCCCTAATAAAATTAATTAACCACTCATTCATCGACCTCCCCACCCCATCCAACATCTCCGCATGATGAAACTTCGGCTCACTCCTTGGCGCCTGCCTGATCCTCCAAATCACCACAGGACTATTCCTAGCCATGCACTACTCACCAGACGCCTCAACCGCCTTTTCATCAATCGCCCACATCACTCGAGACGTAAATTATGGCTGAATCATCCGCTACCTTCACGCCAATGGCGCCTCAATATTCTTTATCTGCCTCTTCCTACACATCGGGCGAGGCCTATATTACGGATCATTTCTCTACTCAGAAACCTGAAACATCGGCATTATCCTCCTGCTTGCAACTATAGCAACAGCCTTCATAGGCTATGTCCTCCCGTGAGGCCAAATATCATTCTGAGGGGCCACAGTAATTACAAACTTACTATCCGCCATCCCATACATTGGGACAGACCTAGTTCAATGAATCTGAGGAGGCTACTCAGTAGACAGTCCCACCCTCACACGATTCTTTACCTTTCACTTCATCTTGCCCTTCATTATTGCAGCCCTAGCAACACTCCACCTCCTATTCTTGCACGAAACGGGATCAAACAACCCCCTAGGAATCACCTCCCATTCCGATAAAATCACCTTCCACCCTTACTACACAATCAAAGACGCCCTCGGCTTACTTCTCTTCCTTCTCTCCTTAATGACATTAACACTATTCTCACCAGACCTCCTAGGCGACCCAGACAATTATACCCTAGCCAACCCCTTAAACACCCCTCCCCACATCAAGCCCGAATGATATTTCCTATTCGCCTACACAATTCTCCGATCCGTCCCTAACAAACTAGGAGGCGTCCTTGCCCTATTACTATCCATCCTCATCCTAGCAATAATCCCCATCCTCCATATATCCAAACAACAAAGCATAATATTTCGCCCACTAAGCCAATCACTTTATTGACTCCTAGCCGCAGACCTCCTCATTCTAACCTGAATCGGAGGACAACCAGTAAGCTACCCTTTTACCATCATTGGACAAGTAGCATCCGTACTATACTTCACAACAATCCTAATCCTAATACCAACTATCTCCCTAATTGAAAACAAAATACTCAAATGGGCCTAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA + - ATGCCCCAACTAAATACTACCGTATGGCCCACCATAATTACCCCCATACTCCTTACACTATTCCTCATCACCCAACTAAAAATATTAAACACAAACTACCACCTACCTCCCTCACCAAAGCCCATAAAAATAAAAAATTATAACAAACCCTGAGAACCAAAATGAACGAAAATCTGTTCGCTTCATTCATTGCCCCCACAATCCTAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA + - 
ATGAACGAAAATCTGTTCGCTTCATTCATTGCCCCCACAATCCTAGGCCTACCCGCCGCAGTACTGATCATTCTATTTCCCCCTCTATTGATCCCCACCTCCAAATATCTCATCAACAACCGACTAATCACCACCCAACAATGACTAATCAAACTAACCTCAAAACAAATGATAACCATACACAACACTAAAGGACGAACCTGATCTCTTATACTAGTATCCTTAATCATTTTTATTGCCACAACTAACCTCCTCGGACTCCTGCCTCACTCATTTACACCAACCACCCAACTATCTATAAACCTAGCCATGGCCATCCCCTTATGAGCGGGCACAGTGATTATAGGCTTTCGCTCTAAGATTAAAAATGCCCTAGCCCACTTCTTACCACAAGGCACACCTACACCCCTTATCCCCATACTAGTTATTATCGAAACCATCAGCCTACTCATTCAACCAATAGCCCTGGCCGTACGCCTAACCGCTAACATTACTGCAGGCCACCTACTCATGCACCTAATTGGAAGCGCCACCCTAGCAATATCAACCATTAACCTTCCCTCTACACTTATCATCTTCACAATTCTAATTCTACTGACTATCCTAGAAATCGCTGTCGCCTTAATCCAAGCCTACGTTTTCACACTTCTAGTAAGCCTCTACCTGCACGACAACACATAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA + - ATGACCCACCAATCACATGCCTATCATATAGTAAAACCCAGCCCATGACCCCTAACAGGGGCCCTCTCAGCCCTCCTAATGACCTCCGGCCTAGCCATGTGATTTCACTTCCACTCCATAACGCTCCTCATACTAGGCCTACTAACCAACACACTAACCATATACCAATGATGGCGCGATGTAACACGAGAAAGCACATACCAAGGCCACCACACACCACCTGTCCAAAAAGGCCTTCGATACGGGATAATCCTATTTATTACCTCAGAAGTTTTTTTCTTCGCAGGATTTTTCTGAGCCTTTTACCACTCCAGCCTAGCCCCTACCCCCCAATTAGGAGGGCACTGGCCCCCAACAGGCATCACCCCGCTAAATCCCCTAGAAGTCCCACTCCTAAACACATCCGTATTACTCGCATCAGGAGTATCAATCACCTGAGCTCACCATAGTCTAATAGAAAACAACCGAAACCAAATAATTCAAGCACTGCTTATTACAATTTTACTGGGTCTCTATTTTACCCTCCTACAAGCCTCAGAGTACTTCGAGTCTCCCTTCACCATTTCCGACGGCATCTACGGCTCAACATTTTTTGTAGCCACAGGCTTCCACGGACTTCACGTCATTATTGGCTCAACTTTCCTCACTATCTGCTTCATCCGCCAACTAATATTTCACTTTACATCCAAACATCACTTTGGCTTCGAAGCCGCCGCCTGATACTGGCATTTTGTAGATGTGGTTTGACTATTTCTGTATGTCTCCATCTATTGATGAGGGTCTTAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA + - GCTAAACCTAGCCCCAAACCCACTCCACCTTACTACCAGACAACCTTAGCCAAACCATTTACCCAAATAAAGTATAGGCGATAGAAATTGAAACCTGGCGCAATAGATATAGTACCGCAAGGGAAAGATGAAAAATTATAACCAAGCATAATATAGCAAGGACTAACCCCTATACCTTCTGCATAATGAATTAACTAGAAATAACTTTGCAAGGAGAGCCAAAGCTAAGACCCCCGAAACCAGACGAGCTACCTAAGAACAGCTAAAAGAGCACACCCGTCTATGTAGCAAAATAGTGGGAAGATTTATAGGTAGAGGCGACAAACCTACCGAGCCTGGTGATAGCTGGTTGTCCAAGATAGAATCTTAGTTCAACTTTAAATTTGCCCACAGAACCCTCTAAATCCCCTTGTAAATTTAACTGTTAGTCCAAAGAGGAACAGCTCTTTGGACACTAGGAAAAAACCTTGTAGAGAGAGTAAAAAATTTAACACCCATAGTAGGCCTAAAAGCAGCCACCAATTAAGAAAGCGTTCAAGCTCAACACCCACTACCTAAAAAATCCCAAACATATAACTGAACTCCTCACACCCAATTGGACCAATCTATCACCCTATAGAAGAACTAATGTTAGTATAAGTAACATGAAAACATTCTCCTCCGCATAAGCCTGCGTCAGATTAAAACACTGAACTGACAATTAACAGCCCAATATCTACAATCAACCAACAAGTCATTATTACCCTCACTGTCAACCCAACACAGGCATGCTCATAAGGAAAGGTTAAAAAAAGTAAAAGGAACTCGGCAAATCTTACCCCGCCTGTTTACCAAAAACATCACCTCTAGCATCACCAGTATTAGAGGCACCGCCTGCCCAGTGACACATGTTTAACGGCCGCGGTACCCTAACCGTGCAAAGGTAGCATAATCACTTGTTCCTTAAATAGGGACCTGTATGAATGGCTCCACGAGGGTTCAGCTGTCTCTTACTTTTAACCAGTGAAATTGACCTGCCCGTGAAGAGGCGGGCATAACACAGCAAGACGAGAAGACCCTATGGAGCTTTAATTTATTAATGCAAACAGTACCTAACAAACCCACAGGTCCTAAACTACCAAACCTGCATTAAAAATTTCGGTTGGGGCGACCTCGGAGCAGAACCCAACCTCCGAGCAGTACATGCTAAGACTTCACCAGTCAAAGCGAACTACTATACTCAATTGATCCAATAACTTGACCAACGGAACAAGTTACCCTAGGGATAACAGCGCAATCCTATTCTAGAGTCCATATCAACAATAGGGTTTACGACCTCGATGTTGGATCAGGACATCCCGATGGTGCAGCCGCTATTAAAGGTTCGTTTGTTCAACGATTAAAGTCCTACGTGATCTGAGTTCAGACCGGAGTAATCCAGGTCGGTTTCTATCTACNTTCAAATTCCTCCCTGTACGAAAGGACAAGAGAAATAAGGCCTACTTCACAAAGCGCCTTCCCCCGTAAATGATATCATCTCAACTTAGTATTATACCCACACCCACCCAAGAACAGGGTTTAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA + - 
AATAGGTTTGGTCCTAGCCTTTCTATTAGCTCTTAGTAAGATTACACATGCAAGCATCCCCGTTCCAGTGAGTTCACCCTCTAAATCACCACGATCAAAAGGAACAAGCATCAAGCACGCAGCAATGCAGCTCAAAACGCTTAGCCTAGCCACACCCCCACGGGAAACAGCAGTGATTAACCTTTAGCAATAAACGAAAGTTTAACTAAGCTATACTAACCCCAGGGTTGGTCAATTTCGTGCCAGCCACCGCGGTCACACGATTAACCCAAGTCAATAGAAGCCGGCGTAAAGAGTGTTTTAGATCACCCCCTCCCCAATAAAGCTAAAACTCACCTGAGTTGTAAAAAACTCCAGTTGACACAAAATAGACTACGAAAGTGGCTTTAACATATCTGAACACACAATAGCTAAGACCCAAACTGGGATTAGATACCCCACTATGCTTAGCCCTAAACCTCAACAGTTAAATCAACAAAACTGCTCGCCAGAACACTACGAGCCACAGCTTAAAACTCAAAGGACCTGGCGGTGCTTCATATCCCTCTAGAGGAGCCTGTTCTGTAATCGATAAACCCCGATCAACCTCACCACCTCTTGCTCAGCCTATATACCGCCATCTTCAGCAAACCCTGATGAAGGCTACAAAGTAAGCGCAAGTACCCACGTAAAGACGTTAGGTCAAGGTGTAGCCCATGAGGTGGCAAGAAATGGGCTACATTTTCTACCCCAGAAAACTACGATAGCCCTTATGAAACTTAAGGGTCGAAGGTGGATTTAGCAGTAAACTAAGAGTAGAGTGCTTAGTTGAACAGGGCCCTGAAGCGCGTACACACCGCCCGTCACCCTCCTCAAGTATACTTCAAAGGACATTTAACTAAAACCCCTACGCATTTATATAGAGGAGACAAGTCGTAACATGGTAAGTGTACTGGAAAGTGCACTTGGACGAACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA version: 0.0.0 genomeRelease: GRCh37 From 447ac8e7b21d8c43ce3ce8ed24e888b1fd9d814d Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Tue, 5 Mar 2024 16:19:36 +0100 Subject: [PATCH 4/4] wip --- src/annotate/seqvars/provider.rs | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/annotate/seqvars/provider.rs b/src/annotate/seqvars/provider.rs index d03b0903..f77f4eb3 100644 --- a/src/annotate/seqvars/provider.rs +++ b/src/annotate/seqvars/provider.rs @@ -22,6 +22,12 @@ use crate::{ pbs::txs::{GeneToTxId, Strand, Transcript, TranscriptTag, TxSeqDatabase}, }; +/// Mitochondrial accessions. 
+const MITOCHONDRIAL_ACCESSIONS: &[&str] = &[ + "NC_012920.1", // rCRS + "NC_001807.4", // CRS +]; + type IntervalTree = ArrayBackedIntervalTree; pub struct TxIntervalTrees { @@ -583,6 +589,9 @@ impl ProviderInterface for Provider { .collect::>(); tmp.sort(); + let is_mitochondrial = MITOCHONDRIAL_ACCESSIONS + .contains(&tx.genome_alignments.first().unwrap().contig.as_str()); + let lengths = tmp.into_iter().map(|(_, length)| length).collect(); Ok(TxIdentityInfo { tx_ac: tx_ac.to_string(), @@ -592,7 +601,9 @@ impl ProviderInterface for Provider { cds_end_i: tx.stop_codon.unwrap_or_default(), lengths, hgnc, - translation_table: if is_selenoprotein { + translation_table: if is_mitochondrial { + TranslationTable::VertebrateMitochondrial + } else if is_selenoprotein { TranslationTable::Selenocysteine } else { TranslationTable::Standard