In [2]:
#|default_exp cnn_virus.data

In [3]:
#| hide
from __future__ import annotations
from ecutilities.ipython import nb_setup
from ecutilities.core import files_in_tree
from fastcore.test import test_fail
from nbdev import show_doc, nbdev_export
from pprint import pprint

In [4]:
#| hide
nb_setup()

# ON_COLAB, p2dataroot, p2data = setup_nb(_dev=True)

Set autoreload mode


In [5]:
#|export
import os

# TensorFlow's native logging reads TF_CPP_MIN_LOG_LEVEL when the library starts up, so the
# variable is set here, before `tensorflow` (or any package that may import it) is loaded.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # or any {'0', '1', '2'}

import json
import numpy as np
import pandas as pd
import re
import warnings

from ecutilities.core import validate_path
from functools import partial, partialmethod
from metagentools.bio import q_score2prob_error
from metagentools.core import TextFileBaseReader, ProjectFileSystem
from pathlib import Path
from tqdm.notebook import tqdm, trange
from typing import Any, Optional

import tensorflow as tf
from tensorflow.io import serialize_tensor, FixedLenFeature
from tensorflow.keras.utils import Sequence, to_categorical
from tensorflow.data import TextLineDataset, TFRecordDataset

In [6]:
#| export
# Locate the `metagentools` package on disk.
# NOTE(review): this import rebinds the name `__file__` in the current namespace to the path of the
# package's `__init__.py`; later cells rely on that binding.
from metagentools import __file__
CODE_ROOT = Path(__file__).parent          # directory containing the package's `__init__.py`
PACKAGE_ROOT = Path(__file__).parents[1]   # directory one level above `CODE_ROOT`

In [7]:
#|hide
print(f"Tensorflow version: {tf.__version__} - Expected 2.8.2")
print(f"metagentools package location: {__file__}")

Tensorflow version: 2.8.2 - Expected 2.8.2
metagentools package location: /home/vtec/projects/bio/metagentools/metagentools/__init__.py


# data

> Data structure, data preprocessing and transform functions, data reader classes, datasets for CNN Virus data

# Data structure for CNN Virus project

There are many different types of files and datasets for this project. All data are located in directory `data`, under the project root. The following is an overview of the main types of data and in which directory they sit in the tree.

A description of the content of each directory is recorded in `readme.md` or another `*.md` file. 

These `readme.md` files can be conveniently accessed using the `.readme(path)` method on `ProjectFileSystem`.

In [8]:
ProjectFileSystem().readme()

ReadMe file for directory `data`:

### Data structure for `metagentools`
This directory includes all the data required for the project `metagentools`.

```text
data
 |--- CNN_Virus_data 
 |--- ncbi           
 |--- ncov_data      
 |--- saved         
 |--- ....           
     
```
#### Sub-directories
- `CNN_Virus_data`: includes all the data related to the original CNN Virus paper, i.e. training data and validation data in a format that can be used by the CNN Virus code.
- `ncbi`: includes data related to the use of CoV sequences from NCBI: reference sequences, simulated reads, inference datasets, inference results.
- `ncov_data`: includes data related to the use of non Cov sequences from various sources: reference sequences, simulated reads, inference datasets, inference results.
- `saved`: includes model saved parameters and preprocessing datasets.


## Original datasets

In [9]:
pfs = ProjectFileSystem()
pfs.readme(pfs.data/'CNN_Virus_data')

ReadMe file for directory `data/CNN_Virus_data`:

### CNN Virus data

This directory includes data used to train and validate the initial CNN Virus model, as well as a few smaller datasets for experimenting. 


#### File list and description:
##### 50-mer 
50-mer reads and their labels, in *text format* with one line per sample. Each line consists of three components, separated by tabs: the 50-mer read or sequence, the virus species label and the position label:
```text
'TTACNAGCTCCAGTCTAAGATTGTAACTGGCCTTTTTAAAGATTGCTCTA    94    5\n'
``` 
Files:
- `50mer_training`: dataset with 50,903,296 reads for training
- `50mer_validating`: dataset with 1,000,000 reads for validation
- `50mer_ds_100_reads`: small subset of 100 reads from the validating dataset for experiments

##### 150-mer
150-mer reads and their labels in *text format* in a similar format as above:
```text
'TTCTTTCACCACCACAACCAGTCGGCCGTGGAGAGGCGTCGCCGCGTCTCGTTCGTCGAGGCCGATCGACTGCCGCATGAGAGCGGGTGGTATTCTTCCGAAGACGACGGAGACCGGGACGGTGATGAGGAAACTGGAGAGAGCCACAAC    6    0\n'
```
Files:
- `ICTV_150mer_benchmarking`: dataset with 10,0000 read
- `150mer_ds_100_reads`: small subset of 100 reads from `ICTV_150mer_benchmarking`

##### Longer reads
Reads of various length with no labels, in simple *fasta format*. Each read sequence is preceded by a definition line: `> Sequence n`, where `n` is the sequence number.

Files:
- `training_sequences_300bp.fasta`: dataset with 9,000 300-mer reads
- `training_sequences_500bp.fasta`: dataset with 9,000 500-mer reads
- `validation_sequences.fasta`: dataset with 564 reads of mixed lengths ranging from 163-mer to 497-mer

##### Other files:
- `virus_name_mapping`: mapping between virus species and their numerical label
- `weight_of_classes`:  weights for each virus species class in the training dataset



## Data for simulated reads

In [10]:
# | echo: false
pfs.readme(pfs.data / 'ncbi')

ReadMe file for directory `data/ncbi`:

### NCBI Data

This directory includes all data related to the work done with CoV sequences from NCBI. The data is organized in the following subfolders:

- `refsequences`: reference CoV sequences downloaded from NCBI, and related metadata
- `simreads`: all data from simulated reads, using ART Illumina simulator and the reference sequences
- `infer_results`: results from the inference using models with the simulated reads
- `ds`: datasets in proper format for training or inference/prediction using the CNN Virus model


In [11]:
# | echo: false
pfs.readme(pfs.data / 'ncbi/refsequences')

ReadMe file for directory `data/ncbi/refsequences`:

### NCBI reference CoV sequences

This directory includes several CoV sequences in fasta files, retrieved from the NCBI database:
- the main file including all reference sequences: `cov_virus_sequences.fa`
- smaller files with a reduced number of sequences for testing code: `cov_virus_sequences_*-seqs.fa`

#### `cov_virus_sequences.fa`
- includes 3,318 sequences of corona virus with different types of hosts.
- the names of the virus species is listed in the file `cov_virus_sequences.txt`, in the same directory
- the length of each sequence varies between 751 and 33,576 bases


#### `cov_virus_sequences_*-seqs.fa`
Smaller files are also available. Each of which includes a limited number of sequences to test the code on smaller datasets. The files are named `cov_virus_sequences_*-seqs.fa` , where `*` is the number of sequences in the file:
- `cov_virus_sequences_001-seq1.fa` includes 1 sequence
- `cov_virus_sequences_001-seq2.fa` includes 1 other sequence
- `cov_virus_sequences_002-seqs.fa` includes 2 sequences
- `cov_virus_sequences_010-seqs.fa` includes 10 sequences
- `cov_virus_sequences_025-seqs.fa` includes 25 sequences
- `cov_virus_sequences_100-seqs.fa` includes 100 sequences


#### File Format:
Like all fasta files, each sequence is preceded by a *Definition Line* starting with the character `>`. In the case of our NCBI downloaded sequences:

```ascii
>2591237:ncbi:1    [MK211378]    2591237    ncbi    1    [MK211378]    2591237    Coronavirus BtRs-BetaCoV/YN2018D    scient
TATTAGGTTTTCTACCTACCCAGGAAAAGCCAACCAACCTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTAGCTGTCGCTCGGCTGCATGCCTA ...
>11128:ncbi:2    [LC494191]
CATCCCGCTTCACTGATCTCTTGTTAGATCTTTTCATAATCTAAACTTTATAAAAACATCCACTCCCTGTAGTCTATGCCTATGGGCGTAGATTTTTCATAGTGGTGTCT ...
>31631:ncbi:3 [KY967361]    31631    ncbi    3 [KY967361] 31631    Human coronavirus OC43    scientific name
ATCTCTTGTTAGATCTTTTTGTAATCTAAACTTTATAAAAACATCCACTCCCTGTAATCTATGCTTGTGGGCGTAGATTTTTCATAGTGGTGTTTATATTCATTTCTGCT ...
>277944:ncbi:4 [LC654455]    277944    ncbi    4 [LC654455] 277944    Human coronavirus NL63    scientific name
ATTTTCTTATTTAGACTTTGTGTCTACTCTTCTCAACTAAACGAAATTTTTCTAGTGCTGTCATTTGTTATGGCAGTCCTAGTGTAATTGAAATTTCGTCAAGTTTGTAA ...
>11120:ncbi:5 [MN987231]    11120    ncbi    5 [MN987231] 11120    Infectious bronchitis virus    scientific name
TCCTAAGTGTGATATAAATATATATCATACACACTAGCCTTGCGCTAGATTTCTAACTTAACAAAACGGACTTAAATACCTACAGCTGGTCCCTATAGGTGTTCCATTGC ...

....

>2697049:ncbi:3318 [OM062573]    2697049    ncbi    3318 [OM062573] 2697049    Severe acute respiratory syndrome coronavirus 2        scientific name
```

**Metadata** can be parsed from the definition line for further use.

|item |name| example |
|:---:|:--:|:-------|
|sequence id | seqid | 2697049:ncbi:3318|
|accession |accession | OM062573|
|taxonomy id |taxonomyid | 2697049|
|source |source| ncbi| 
|sequence nbr |seqnb | 3318|
|specie name |species| Severe acute respiratory syndrome coronavirus 2|

Example 1:
- Definition Line:
```ascii
    >2591237:ncbi:1 [MK211378]    2591237    ncbi    1 [MK211378] 2591237    Coronavirus BtRs-BetaCoV/YN2018D        scientific name
```
- Parsed metadata:
    - `seqid` = `2591237:ncbi:1`
    - `taxonomyid` = `2591237`
    - `source` = `ncbi`
    - `seqnb` = `1`
    - `accession` = `MK211378`
    - `species` = `Coronavirus BtRs-BetaCoV/YN2018D`

Example 2:
- Definition Line
```ascii
    >11128:ncbi:2 [LC494191]
```
- Parsed metadata:
    - `seqid` = `11128:ncbi:2`
    - `taxonomyid` = `11128`
    - `source` = `ncbi`
    - `seqnb` = `2`
    - `accession` = `LC494191`
    - `species` = `''`
    

In [12]:
# | echo: false
pfs.readme(pfs.data / 'ncbi/simreads')

ReadMe file for directory `data/ncbi/simreads`:

### CoV simulated reads
This directory includes a set of simulated read sequence files generated from NCBI CoV sequences using ART Illumina.

```ascii
this-directory
    |
    |--single_10seq_50bp
    |    |--single_10seq_50bp.fq
    |    |--single_10seq_50bp.alnEnd
    |--single_100seq_50bp
    |    |--single_100seq_50bp.fq
    |    |--single_100seq_50bp.aln
    |--single_100seq_150bp
    |    |--single_100seq_150bp.fq
    |    |--single_100seq_150bp.aln
    |--paired_100seq_50bp
    |    |--paired_100seq_50bp2.aln
    |    |--paired_100seq_50bp1.aln
    |    |--paired_100seq_50bp2.fq
    |    |--paired_100seq_50bp1.fq
    |-- ...
```

Each simread sub-directory is named as `<method>_<nb-seq>_<nb-bp>` where:
- `<method>` is either `single` or `paired` depending on the simulation method
- `<nb-seq>` is the number of reference sequences used for simulation, and refers to the `fa` file used
- `<nb-bp>` is the number of base pairs used to simulate reads


Each sub-directory includes simreads files made using a simulation method and a specific number of reference sequences.
- `xxx.fq` and `xxx.aln` files when method is `single`
- `xxx1.fq`, `xxx2.fq`, `xxx1.aln` and `xxx2.aln` files when method is `paired`.

Example:
- `paired_10seq_50bp` means that the simreads were generated by using the `paired` method to simulate 50-bp reads, and using the `fa` file `cov_virus_sequences_010-seqs.fa`.
- `single_100seq_50bp` means that the simreads were generated by using the `single` method to simulate 50-bp reads, and using the `fa` file `cov_virus_sequences_100-seqs.fa`. Note that this generated 20,660,104 reads !

#### Simread file formats

Simulated reads information is split between two files:
- **FASTQ** (`.fq`) files providing the read sequences and their ASCII quality scores
- **ALN** (`.aln`) files with alignment information

##### FASTQ (`.fq`)
FASTQ files generated by ART Illumina have the following structure (showing 5 reads), with 4 lines for each read:

```ascii
@2591237:ncbi:1-60400
ACAACTCCTATTCGTAGTTGAAGTTGTTGACAAATACTTTGATTGTTACG
+
CCCBCGFGBGGGGGGGBGGGGGGGGG>GGG1G=/GGGGGGGGGGGGGGGG
@2591237:ncbi:1-60399
GATCAATGTGGCATCTACAATACAGACAGCATGAAGCACCACCAAAGGAC
+
BCBCCFGGGGGGGG1CGGGG<GGBGGGGGFGCGGGGGGDGGG/GG1GGGG
@2591237:ncbi:1-60398
ATCTACCAGTGGTAGATGGGTTCTTAATAATGAACATTATAGAGCTCTAC
+
CCCCCGGGEGG1GGF1G/GGEGGGGGGGGGGGGFFGGGGGGGGGGDGGDG
@2591237:ncbi:1-60397
CGTAAAGTAGAGGCTGTATGGTAGCTAGCACAAATGCCAGCACCAATAGG
+
BCCCCGGGFGGGGGGFGGGGFGG1GGGGGGG>GG1GGGGGGGGGGE<GGG
@2591237:ncbi:1-60396
GGTATCGGGTATCTCCTGCATCAATGCAAGGTCTTACAAAGATAAATACT
+
CBCCCGGG@CGGGGGGGGGGGG=GFGGGGDGGGFG1GGGGGGGG@GGGGG
```
The following information can be parsed from each read sequence in the FASTQ file:

- Line 1: `readid`, a unique ID for the read, in the format `@readid`
- Line 2: `readseq`, the sequence of the read
- Line 3: a separator `+`
- Line 4: `read_qscores`, the base quality scores encoded in ASCII 

Example:
```
@2591237:ncbi:1-60400
ACAACTCCTATTCGTAGTTGAAGTTGTTGACAAATACTTTGATTGTTACG
+
CCCBCGFGBGGGGGGGBGGGGGGGGG>GGG1G=/GGGGGGGGGGGGGGGG
```
- `readid` = `2591237:ncbi:1-60400`
- `readseq` = `ACAACTCCTATTCGTAGTTGAAGTTGTTGACAAATACTTTGATTGTTACG`, a 50 bp read
- `read_qscores` = `CCCBCGFGBGGGGGGGBGGGGGGGGG>GGG1G=/GGGGGGGGGGGGGGGG`


#### ALN (`.aln`) 
ALN files generated by ART Illumina consist of :
- a header with the ART Illumina command used for the simulation (`@CM`) and info on each of the reference sequences used for the simulation (`@SQ`). The header always starts with `##ART_Illumina` and ends with `##Header End`.
- the body with 3 lines for each read:
    1. definition line with `readid`, 
        - reference sequence identification number `refseqid`, 
        - the position of the read in the reference sequence `aln_start_pos`
        - the strand the read was taken from `ref_seq_strand`. `+` for coding strand and `-` for template strand
    2. aligned reference sequence, that is the sequence segment in the original reference corresponding to the read
    3. aligned read sequence, that is the simulated read sequence, where each bp corresponds to the reference sequence bp in the same position.

Example of a ALN file generated by ART Illumina (showing 5 reads):

```ascii
##ART_Illumina    read_length    50
@CM    /bin/art_illumina -i /home/vtec/projects/bio/metagentools/data/cov_data/cov_virus_sequences_ten.fa -ss HS25 -l 50 -f 100 -o /home/vtec/projects/bio/metagentools/data/cov_simreads/single_10seq_50bp/single_10seq_50bp -rs 1674660835
@SQ    2591237:ncbi:1 [MK211378]    2591237    ncbi    1 [MK211378] 2591237    Coronavirus BtRs-BetaCoV/YN2018D    30213
@SQ    11128:ncbi:2 [LC494191]    11128    ncbi    2 [LC494191] 11128    Bovine coronavirus    30942
@SQ    31631:ncbi:3 [KY967361]    31631    ncbi    3 [KY967361] 31631    Human coronavirus OC43        30661
@SQ    277944:ncbi:4 [LC654455]    277944    ncbi    4 [LC654455] 277944    Human coronavirus NL63    27516
@SQ    11120:ncbi:5 [MN987231]    11120    ncbi    5 [MN987231] 11120    Infectious bronchitis virus    27617
@SQ    28295:ncbi:6 [KU893866]    28295    ncbi    6 [KU893866] 28295    Porcine epidemic diarrhea virus    28043
@SQ    28295:ncbi:7 [KJ645638]    28295    ncbi    7 [KJ645638] 28295    Porcine epidemic diarrhea virus    27998
@SQ    28295:ncbi:8 [KJ645678]    28295    ncbi    8 [KJ645678] 28295    Porcine epidemic diarrhea virus    27998
@SQ    28295:ncbi:9 [KR873434]    28295    ncbi    9 [KR873434] 28295    Porcine epidemic diarrhea virus    28038
@SQ    1699095:ncbi:10 [KT368904]    1699095    ncbi    10 [KT368904] 1699095    Camel alphacoronavirus    27395
##Header End
>2591237:ncbi:1    2591237:ncbi:1-60400    14770    +
ACAACTCCTATTCGTAGTTGAAGTTGTTGACAAATACTTTGATTGTTACG
ACAACTCCTATTCGTAGTTGAAGTTGTTGACAAATACTTTGATTGTTACG
>2591237:ncbi:1    2591237:ncbi:1-60399    17012    -
GATCAATGTGGCATCTACAATACAGACAGCATGAAGCACCACCAAAGGAC
GATCAATGTGGCATCTACAATACAGACAGCATGAAGCACCACCAAAGGAC
>2591237:ncbi:1    2591237:ncbi:1-60398    9188    +
ATCTACCAGTGGTAGATGGGTTCTTAATAATGAACATTATAGAGCTCTAC
ATCTACCAGTGGTAGATGGGTTCTTAATAATGAACATTATAGAGCTCTAC
.....
```

In [13]:
#| echo: false
pfs.readme(pfs.data / 'ncbi/infer_results')

ReadMe file for directory `data/ncbi/infer_results`:

### CoV Virus Inference Results
This folder includes results from inference using CoV simulated read sequences from `fq` and `aln` files in `cov_simreads`.

#### `cnn_virus`

The directory `cnn_virus` includes results from inference made with the original pretrained model. 

Results are saved into many individual `parquet` files during inference. Then they are merged into a single `parquet` file. 

Each inference experiment receives a unique 8-character ID (UID).

Each inference experiment will therefore generate a set of files like follows, where `xxxxxxxx` is the experiment UID and `nnnn` is the index of a partial result file:

- `results_nnnn_infer_xxxxxxxx.parquet`
- `results_all_infer_xxxxxxxx.parquet`

`results_all_infer_xxxxxxxx.parquet` is the file consolidating all results for one inference experiment into a single file

In [14]:
#| echo: false
pfs.readme(pfs.data / 'ncbi/ds')

ReadMe file for directory `data/ncbi/ds`:

### Inference and Training Datasets

When using simread files (`fq` and `aln`) for inference, an inference dataset in the format required by the CNN Virus model must be built. In addition, metadata can be extracted to make it easier to analyse the results from different perspectives.

This directory includes the generated inference datasets and metadata for each inference experiment.



## Model related data

In [15]:
#| echo: false
pfs.readme(pfs.data / 'saved')

ReadMe file for directory `data/saved`:

### Saved data related to models

This directory includes all data related to models and saved:
- saved model parameters
- saved datasets

For example:
- `cnn_virus_original/pretrained_model.h5` is the saved model parameters for the CNN Virus model
- `cnn_virus_datasets/*.tfrecords` are the preprocessed datasets used for inference or training, saved in TFRecord format for performance



# Parsing sequence files

The following classes make it easier to read and parse files of different formats into their underlying components, in order to generate the training, validation, testing and inference datasets for the model.

Each class inherits from `TextFileBaseReader` and adds:

- One or several text parsing method(s) to parse metadata according to a specific format
- A file parsing method to extract metadata from all elements in the file, returning it as a key:value dictionary and optionally save the metadata as a json file.

## FASTA file

Extension of `TextFileBaseReader` class for fasta sequence files.

Structure of a FASTA sequence file:

In [16]:
#| echo: false
p2fasta = Path('data_dev/cov_virus_sequences_two.fa').resolve()

# Read the file one line at a time and show the first 80 characters of each line
it = TextFileBaseReader(p2fasta, nlines=1)
for raw_line in it:
    print(raw_line.replace('\n', '')[:80])

>2591237:ncbi:1 [MK211378]	2591237	ncbi	1 [MK211378] 2591237	Coronavirus BtRs-Be
TATTAGGTTTTCTACCTACCCAGGAAAAGCCAACCAACCTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAAT
>11128:ncbi:2 [LC494191]
CATCCCGCTTCACTGATCTCTTGTTAGATCTTTTCATAATCTAAACTTTATAAAAACATCCACTCCCTGTAGTCTATGCC


In [17]:
#| export
class FastaFileReader(TextFileBaseReader):
    """Wrap a FASTA file and retrieve its content in raw format and parsed format

    The file is consumed two lines at a time: one definition line followed by one sequence line.
    """
    def __init__(
        self,
        path: str|Path,  # path to the Fasta file
    ):
        super().__init__(path, nlines=1)
        # key, in the dict returned by `__next__`, of the text that parsing rules are applied to
        self.text_to_parse_key = 'definition line'
        # pick the default parsing rule for this file (sets `re_rule_name`, `re_pattern`, `re_keys`)
        self.set_parsing_rules(verbose=False)
        
    def __next__(self)-> dict[str, str]:   # `{'definition line': text in dfn line, 'sequence': full sequence as str}` 
        """Return one definition line and the corresponding sequence"""
        # Two consecutive lines make one record; the sequence is expected to sit on a single line.
        # End-of-file handling is left to `_safe_readline`, inherited from the parent class.
        dfn_line = self._safe_readline().replace('\n', '')   # remove the newline symbol
        sequence = self._safe_readline().replace('\n', '')   # remove the newline symbol
        return {'definition line': dfn_line, 'sequence': sequence}
    
    def print_first_chunks(
        self, 
        nchunks:int=3,  # number of chunks to print out
    ):
        """Print the first `nchunks` chunks of text from the file

        The iterator is reset before and after printing. Nothing is printed when `nchunks` is 0 or negative.
        """
        self.reset_iterator()
        try:
            if nchunks > 0:
                for i, seq_dict in enumerate(self):
                    print(f"\nSequence {i+1}:")
                    print(seq_dict['definition line'])
                    print(f"{seq_dict['sequence'][:80]} ...")
                    # stop right after the last requested chunk, without reading one more record
                    if i >= nchunks-1: break
        finally:
            # leave the reader at the start of the file, even if printing failed
            self.reset_iterator()
            
    def parse_file(
        self,
        add_seq :bool=False,     # When True, add the full sequence to the parsed metadata dictionary
        save_json: bool=False    # When True, save the file metadata as a json file of same stem name
    )-> dict[str, dict]:         # Metadata as Key/Values pairs, one entry per sequence, keyed by `seqid`
        """Read fasta file and return a dictionary with definition line metadata and optionally sequences

        Each definition line is parsed with the instance's current parsing rule (`re_pattern`, `re_keys`).
        The rule must produce a `seqid` item, which is used as key in the returned dictionary; 
        a `KeyError` is raised otherwise.
        When `save_json` is True, the metadata is also written to `<file stem>_metadata.json`, 
        in the same directory as the fasta file.
        """
    
        self.reset_iterator()
        parsed = {}
        for record in self:
            metadata = self._parse_text_fn(record['definition line'], self.re_pattern, self.re_keys)
            if add_seq: metadata['sequence'] = record['sequence']
            try:
                seqid = metadata['seqid']
            except KeyError as err:
                # still a KeyError, but one that tells the user what to fix
                raise KeyError(
                    f"`parse_file` needs a parsing rule that extracts 'seqid'; "
                    f"rule <{self.re_rule_name}> only provides keys {self.re_keys}"
                ) from err
            parsed[seqid] = metadata
                        
        if save_json:
            p2json = self.path.parent / f"{self.path.stem}_metadata.json"
            with open(p2json, 'w', encoding='utf-8') as fp:
                json.dump(parsed, fp, indent=4)
            # report only once the file has been written and closed
            print(f"Metadata for <{self.path.name}> saved as <{p2json.name}> in  \n{p2json.parent.absolute()}\n")

        return parsed

In [18]:
show_doc(FastaFileReader)

---

[source](https://github.com/vtecftwy/metagentools/blob/main/metagentools/cnn_virus/data.py#L39){target="_blank" style="float:right; font-size:smaller"}

### FastaFileReader

>      FastaFileReader (path:str|pathlib.Path)

Wrap a FASTA file and retrieve its content in raw format and parsed format

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| path | str \| Path | path to the Fasta file |

As an iterator, `FastaFileReader` returns a `dict` at each step, as follows:
```python
{
    'definition line': 'string in file as the definition line for the sequence',
    'sequence': 'the full sequence'
}
```

Illustration:

In [19]:
p2fasta = Path('data_dev/cov_virus_sequences_two.fa')
it = FastaFileReader(p2fasta)
iteration_output = next(it)

print(iteration_output['definition line'][:80], '...')
print(iteration_output['sequence'][:80], '...')

>2591237:ncbi:1 [MK211378]	2591237	ncbi	1 [MK211378] 2591237	Coronavirus BtRs-Be ...
TATTAGGTTTTCTACCTACCCAGGAAAAGCCAACCAACCTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAAT ...


In [20]:
print(f"output type :     {type(iteration_output)}")
print(f"keys :            {iteration_output.keys()}")
print(f"definition line : {iteration_output['definition line'][:80]} ...'")
print(f"sequence :       '{iteration_output['sequence'][:100]} ...'")

output type :     <class 'dict'>
keys :            dict_keys(['definition line', 'sequence'])
definition line : >2591237:ncbi:1 [MK211378]	2591237	ncbi	1 [MK211378] 2591237	Coronavirus BtRs-Be ...'
sequence :       'TATTAGGTTTTCTACCTACCCAGGAAAAGCCAACCAACCTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTAGCTGTCGCTCGGC ...'


The `definition line` is a string, with tab separated values.

In [21]:
display(iteration_output['definition line'])

'>2591237:ncbi:1 [MK211378]\t2591237\tncbi\t1 [MK211378] 2591237\tCoronavirus BtRs-BetaCoV/YN2018D\t\tscientific name'

In [22]:
show_doc(FastaFileReader.print_first_chunks)

---

[source](https://github.com/vtecftwy/metagentools/blob/main/metagentools/cnn_virus/data.py#L58){target="_blank" style="float:right; font-size:smaller"}

### FastaFileReader.print_first_chunks

>      FastaFileReader.print_first_chunks (nchunks:int=3)

Print the first `nchunks` chunks of text from the file

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| nchunks | int | 3 | number of chunks to print out |

This is convenient to quickly discover and explore new fasta files in raw text format:

In [23]:
it = FastaFileReader(p2fasta)
it.print_first_chunks(nchunks=2)


Sequence 1:
>2591237:ncbi:1 [MK211378]	2591237	ncbi	1 [MK211378] 2591237	Coronavirus BtRs-BetaCoV/YN2018D		scientific name
TATTAGGTTTTCTACCTACCCAGGAAAAGCCAACCAACCTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAAT ...

Sequence 2:
>11128:ncbi:2 [LC494191]
CATCCCGCTTCACTGATCTCTTGTTAGATCTTTTCATAATCTAAACTTTATAAAAACATCCACTCCCTGTAGTCTATGCC ...


### Parsing metadata

The class also provides methods to parse metadata from the file content.

A regex pattern is used for parsing metadata from the definition lines in the reference sequence fasta file (rule `fasta_cov_ncbi`):

Sequence 1:

- Definition Line:
```ascii
>2591237:ncbi:1 [MK211378]	2591237	ncbi	1 [MK211378] 2591237	Coronavirus BtRs-BetaCoV/YN2018D		scientific name
```
- Metadata:
    - `seqid` = `2591237:ncbi:1`
    - `taxonomyid` = `2591237`
    - `source` = `ncbi`
    - `seqnb` = `1`
    - `accession` = `MK211378`
    - `species` = `Coronavirus BtRs-BetaCoV/YN2018D`

Sequence 2:

- Definition Line
```ascii
    >11128:ncbi:2 [LC494191]
```

- Metadata:
    - `seqid` = `11128:ncbi:2`
    - `taxonomyid` = `11128`
    - `source` = `ncbi`
    - `seqnb` = `2`
    - `accession` = `LC494191`
    - `species` = `''`

`FastaFileReader` offers:
- `parse_text` a method to parse the metadata
- an option to set a default "parsing rule" for one instance with `set_parsing_rules`.
- `parse_file` a method to parse the metadata from all sequences in the file and save it as a json file.

In [24]:
show_doc(FastaFileReader.parse_text)

---

[source](https://github.com/vtecftwy/metagentools/blob/main/metagentools/core.py#LNone){target="_blank" style="float:right; font-size:smaller"}

### TextFileBaseReader.parse_text

>      TextFileBaseReader.parse_text (txt:str, pattern:str=None,
>                                     keys:list[str]=None)

Parse text using regex pattern and key. Return a metadata dictionary

The passed text is parsed using the regex pattern. The method return a dictionary in the format:
    {
        'key_1': 'metadata 1',
        'key_2': 'metadata 2',
        ...
    }

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| txt | str |  | text to parse |
| pattern | str | None | If None, uses standard regex pattern to extract metadata, otherwise, uses passed regex |
| keys | list | None | If None, uses standard regex list of keys, otherwise, uses passed list of keys (str) |
| **Returns** | **dict** |  | **parsed metadata in key/value format** |

Running the parser function with specifically defined `pattern` and `keys`.

In [25]:
it = FastaFileReader(p2fasta)
dfn_line, sequence = next(it).values()
print(dfn_line.replace('\n', ''))

>2591237:ncbi:1 [MK211378]	2591237	ncbi	1 [MK211378] 2591237	Coronavirus BtRs-BetaCoV/YN2018D		scientific name


In [26]:
pattern = r"^>(?P<seqid>(?P<taxonomyid>\d+):(?P<source>ncbi):(?P<seqnb>\d*))[\s\t]*\[(?P<accession>[\w\d]*)\]([\s\t]*(?P=taxonomyid)[\s\t]*(?P=source)[\s\t]*(?P=seqnb)[\s\t]*\[(?P=accession)\][\s\t]*(?P=taxonomyid)[\s\t]*(?P<species>[\w\s\-\_\/]*))?"

keys = 'seqid taxonomyid accession source seqnb species'.split(' ')

In [27]:
it.parse_text(dfn_line, pattern=pattern, keys=keys)

{'accession': 'MK211378',
 'seqid': '2591237:ncbi:1',
 'seqnb': '1',
 'source': 'ncbi',
 'species': 'Coronavirus BtRs-BetaCoV/YN2018D  scientific name',
 'taxonomyid': '2591237'}

When a `FastaFileReader` instance is created, all existing rules in the file `default_parsing_rules.json` are tested on the first definition line of the fasta file and the one rule that parses the most matches will be selected automatically and saved in instance attributes `re_rule_name`, `re_pattern` and `re_keys`. 

`parse_file` extracts metadata from each definition line in the fasta file and returns a dictionary with all metadata.

In [28]:
print(it.re_rule_name)
print(it.re_pattern)
print(it.re_keys)

fasta_cov_ncbi
^>(?P<seqid>(?P<taxonomyid>\d+):(?P<source>ncbi):(?P<seqnb>\d*))[\s\t]*\[(?P<accession>[\w\d]*)\]([\s\t]*(?P=taxonomyid)[\s\t]*(?P=source)[\s\t]*(?P=seqnb)[\s\t]*\[(?P=accession)\][\s\t]*(?P=taxonomyid)[\s\t]*(?P<species>[\w\s\-\_\/]*))?
['seqid', 'taxonomyid', 'source', 'accession', 'seqnb', 'species']


In [29]:
it.parse_text(dfn_line)

{'accession': 'MK211378',
 'seqid': '2591237:ncbi:1',
 'seqnb': '1',
 'source': 'ncbi',
 'species': 'Coronavirus BtRs-BetaCoV/YN2018D  scientific name',
 'taxonomyid': '2591237'}

When another fasta file, which has another definition line structure, is used, another parsing rule is selected.

In [30]:
p2other = Path('data_dev/another_sequence.fa')
assert p2other.is_file()

it2 = FastaFileReader(path=p2other)

dfn_line, sequence = next(it2).values()
print(dfn_line.replace('\n', ''))

>1 dna_rm:primary_assembly primary_assembly:mRhiFer1_v1.p:1:1:124933378:1 REF


In [31]:
print(it2.re_rule_name)
print(it2.re_pattern)
print(it2.re_keys)

fasta_rhinolophus_ferrumequinum
^>\d[\s\t](?P<seq_type>dna_rm):(?P<id_type>[\w\_]*)[\s\w](?P=id_type):(?P<assy>[\w\d\_]*)\.(?P<seq_level>[\w]*):\d*:\d*:(?P<taxonomy>\d*):(?P<id>\d*)[\s	]REF$
['seq_type', 'id_type', 'assy', 'seq_level', 'taxonomy', 'id']


In [32]:
pprint(it2.parse_text(dfn_line))

{'assy': 'mRhiFer1_v1',
 'id': '1',
 'id_type': 'primary_assembly',
 'seq_level': 'p',
 'seq_type': 'dna_rm',
 'taxonomy': '124933378'}


This rule selection is performed by the method `set_parsing_rules`. The method can also be called with a specific `pattern` and `keys` to force a parsing rule that is not yet saved in the json file.

In [33]:
show_doc(FastaFileReader.set_parsing_rules)

---

[source](https://github.com/vtecftwy/metagentools/blob/main/metagentools/core.py#LNone){target="_blank" style="float:right; font-size:smaller"}

### TextFileBaseReader.set_parsing_rules

>      TextFileBaseReader.set_parsing_rules (pattern:str|bool=None,
>                                            keys:list[str]=None,
>                                            verbose:bool=False)

Set the standard regex parsing rule for the file.

Rules can be set:

1. manually by passing specific custom values for `pattern` and `keys`
2. automatically, by testing all parsing rules saved in `parsing_rule.json` 

Automatic selection of parsing rules works by testing each rule saved in `parsing_rule.json` on the first 
definition line of the fasta file, and selecting the one rule that generates the most metadata matches.

Rules consists of two parameters:

- The regex pattern including one `group` for each metadata item, e.g `(?P<group_name>regex_code)`
- The list of keys, i.e. the list with the name of each regex groups, used as key in the metadata dictionary

This method updates the three following class attributes: `re_rule_name`, `re_pattern`, `re_keys`

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| pattern | str \| bool | None | regex pattern to apply to parse the text, search in parsing rules json if None |
| keys | list | None | list of keys/group for regex, search in parsing rules json if None |
| verbose | bool | False | when True, provides information on each rule |
| **Returns** | **None** |  |  |

In [34]:
it = FastaFileReader(p2fasta)
dfn_line, sequence = next(it).values()
# `__next__` already strips the trailing newline, so the definition line is printed in full
print(f"definition line: '{dfn_line}'")

definition line: '>2591237:ncbi:1 [MK211378]	2591237	ncbi	1 [MK211378] 2591237	Coronavirus BtRs-BetaCoV/YN2018D		scientific nam'


Automatic rule selection works by testing each saved rule on the `definition line` of the first sequence in the fasta file.

In [35]:
print(f"key for text to parse: {it.text_to_parse_key}\n")
it.reset_iterator()
print('Text to parse for testing (extracted from first iteration):')
print(next(it)[it.text_to_parse_key])
print()
it.set_parsing_rules(verbose=True)

key for text to parse: definition line

Text to parse for testing (extracted from first iteration):
>2591237:ncbi:1 [MK211378]	2591237	ncbi	1 [MK211378] 2591237	Coronavirus BtRs-BetaCoV/YN2018D		scientific name

--------------------------------------------------------------------------------
Rule <fasta_cov_ncbi> generated 6 matches
--------------------------------------------------------------------------------
^>(?P<seqid>(?P<taxonomyid>\d+):(?P<source>ncbi):(?P<seqnb>\d*))[\s\t]*\[(?P<accession>[\w\d]*)\]([\s\t]*(?P=taxonomyid)[\s\t]*(?P=source)[\s\t]*(?P=seqnb)[\s\t]*\[(?P=accession)\][\s\t]*(?P=taxonomyid)[\s\t]*(?P<species>[\w\s\-\_\/]*))?
['seqid', 'taxonomyid', 'source', 'accession', 'seqnb', 'species']
--------------------------------------------------------------------------------
Rule <fasta_rhinolophus_ferrumequinum> generated an error
No match on this line
--------------------------------------------------------------------------------
Rule <fastq_art_illumina> generated a

If no saved rule generates a match, `re_rule_name`, `re_pattern` and `re_keys` remain `None` and a warning is issued, asking the user to set a parsing rule manually.

In [36]:
it2 = FastaFileReader('data_dev/sequences_two_no_matching_rule.fa')

        None of the saved parsing rules were able to extract metadata from the first line in this file.
        You must set a custom rule (pattern + keys) before parsing text, by using:
            `self.set_parsing_rules(custom_pattern, custom_list_of_keys)`
                


In [37]:
it2.re_rule_name is None

True

But we can still set a custom rule manually, by passing a regex pattern and the corresponding list of keys.

In [38]:
# Custom rule: the pattern defines one named group per metadata item ...
pat = r"^>(?P<seqid>(?P<taxonomyid>\d+):(?P<source>ncbi):(?P<seqnb>\d*))\s*(?P<text>[\w\s]*)$"
# ... and `keys` lists the names of these groups, used as keys in the metadata dictionary
keys = "seqid taxonomyid source seqnb text".split()
it2.set_parsing_rules(pattern=pat, keys=keys)

# The three rule attributes now hold the custom rule
print(it2.re_rule_name)
print(it2.re_pattern)
print(it2.re_keys)

Custom Rule
^>(?P<seqid>(?P<taxonomyid>\d+):(?P<source>ncbi):(?P<seqnb>\d*))\s*(?P<text>[\w\s]*)$
['seqid', 'taxonomyid', 'source', 'seqnb', 'text']


In [39]:
it2.reset_iterator()
dfn_line, sequence = next(it2).values()
# Only strip a trailing newline if there is one: slicing with `[:-1]` dropped the last real character of the line
print(f"definition line: '{dfn_line.rstrip(chr(10))}'")
it2.parse_text(dfn_line)

definition line: '>2591237:ncbi:1 this sequence does not match any saved parsing rul'


{'seqid': '2591237:ncbi:1',
 'seqnb': '1',
 'source': 'ncbi',
 'taxonomyid': '2591237',
 'text': 'this sequence does not match any saved parsing rule'}

In [40]:
show_doc(FastaFileReader.parse_file)

---

[source](https://github.com/vtecftwy/metagentools/blob/main/metagentools/cnn_virus/data.py#L71){target="_blank" style="float:right; font-size:smaller"}

### FastaFileReader.parse_file

>      FastaFileReader.parse_file (add_seq:bool=False, save_json:bool=False)

Read fasta file and return a dictionary with definition line metadata and optionally sequences

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| add_seq | bool | False | When True, add the full sequence to the parsed metadata dictionary |
| save_json | bool | False | When True, save the file metadata as a json file of same stem name |
| **Returns** | **dict[str]** |  | **Metadata as Key/Values pairs** |

In [41]:
it = FastaFileReader(p2fasta)
pprint(it.parse_file())

{'11128:ncbi:2': {'accession': 'LC494191',
                  'seqid': '11128:ncbi:2',
                  'seqnb': '2',
                  'source': 'ncbi',
                  'species': None,
                  'taxonomyid': '11128'},
 '2591237:ncbi:1': {'accession': 'MK211378',
                    'seqid': '2591237:ncbi:1',
                    'seqnb': '1',
                    'source': 'ncbi',
                    'species': 'Coronavirus BtRs-BetaCoV/YN2018D  scientific '
                               'name',
                    'taxonomyid': '2591237'}}


In [42]:
it.parse_file(save_json=True);

Metadata for 'cov_virus_sequences_two.fa'> saved as <cov_virus_sequences_two_metadata.json> in  
/home/vtec/projects/bio/metagentools/nbs-dev/data_dev



In [43]:
# Show the saved parsing rules, loaded from the package root (the location used by `AlnFileReader`)
# instead of a path relative to the notebook's current working directory
with open(PACKAGE_ROOT / 'default_parsing_rules.json', 'r') as fp:
    pprint(json.load(fp), width=20)

{'aln_art_illumina': {'keys': 'refseqid '
                              'reftaxonomyid '
                              'refsource '
                              'refseqnb '
                              'readid '
                              'readnb '
                              'aln_start_pos '
                              'refseq_strand',
                      'pattern': '^>(?P<refseqid>(?P<reftaxonomyid>\\d*):(?P<refsource>\\w*):(?P<refseqnb>\\d*))(\\s|\t'
                                 ')*(?P<readid>(?P=reftaxonomyid):(?P=refsource):(?P=refseqnb)-(?P<readnb>\\d*(\\/\\d(-\\d)?)?))(\\s|\t'
                                 ')(?P<aln_start_pos>\\d*)(\\s|\t'
                                 ')(?P<refseq_strand>(-|\\+))$'},
 'aln_art_illumina-refseq': {'keys': 'refseqid '
                                     'reftaxonomyid '
                                     'refsource '
                                     'refseqnb '
                                     'refseq_accession '
  

In [44]:
# NOTE: `p2fasta` is rebound here; from this cell on it points to `cov_virus_sequence_one.fa`
p2fasta = Path('data_dev/cov_virus_sequence_one.fa').resolve()
it = FastaFileReader(p2fasta)
# `save_json=True` also writes the parsed metadata to a json file (see the message printed below)
fasta_meta = it.parse_file(save_json=True)
pprint(fasta_meta)

Metadata for 'cov_virus_sequence_one.fa'> saved as <cov_virus_sequence_one_metadata.json> in  
/home/vtec/projects/bio/metagentools/nbs-dev/data_dev

{'2591237:ncbi:1': {'accession': 'MK211378',
                    'seqid': '2591237:ncbi:1',
                    'seqnb': '1',
                    'source': 'ncbi',
                    'species': 'Coronavirus BtRs-BetaCoV/YN2018D  scientific '
                               'name',
                    'taxonomyid': '2591237'}}


## FASTQ file

Extension of `TextFileBaseReader` class for fastq sequence files.

Structure of a FASTQ sequence file:

In [45]:
#| echo: false
p2fastq = Path('data_dev/single_1seq_150bp/single_1seq_150bp.fq').resolve()

# Preview the raw file line by line: first 12 lines (3 reads x 4 lines), each truncated to 80 characters
it = TextFileBaseReader(p2fastq, nlines=1)
for i, t in enumerate(it):
    txt = t.replace('\n', '')[:80]
    print(f"{txt}")
    if i >= 11: break

@2591237:ncbi:1-20100
TTGCGCCTCCTTCACGATGTCCACACACTTAATGGCAACATTGTCAGTAAGTTTTAAATAACCAGTAAACTGGTTAACTG
+
CC=GGGG8GGGGG=JJJGJJJJJGJJJCJG1JJGJJGGJJJCJGGGGJGJJJGG=GJGGGJG=GGGGG=CGGCCCGGG8G
@2591237:ncbi:1-20099
TAATTTAGGTCCACAAACTGTAGCAGGTGCATTTAGAAGTTCAAATGAAAGCACAACAACCCTAGTAGCCTGATATTCAA
+
CCC1GGGGGGGGGJGJJJJJJJ1J=GJC=JJJJJJJJJGGGJJCGJGJJGJCJJ=JJJ=JG8GJJGJGGCJCCGCGGGGC
@2591237:ncbi:1-20098
CCACAAAGTGCTCCTCAGATGTCTTTGATGACGAAGTGAGGTATCCATTATATGTAGTAACAGCAGCTGGTGATGATACT
+
CCCCGGGGGGGGGGJJJGGJGCJJJCCJJGJGJCJG8GGJCJJJ8GJJJCJJGGGJGGG=GCGJC(CCCGGCCGCCGCCG


In [46]:
#| export
class FastqFileReader(TextFileBaseReader):
    """Iterator going through a fastq file's sequences and return each section + prob error as a dict"""
    def __init__(
        self,
        path:str|Path,   # path to the fastq file
    )-> dict:           # key/value with keys: definition line; sequence; read_qscores; probs error
        """Set TextFileBaseReader attributes and select the parsing rule for the definition line"""
        # NOTE: the `-> dict` annotation documents what each iteration returns, not what `__init__` returns
        # One read is described by 4 lines: definition line, sequence, separator line, quality scores
        self.nlines = 4
        super().__init__(path, nlines=self.nlines)
        self.text_to_parse_key = 'definition line'
        self.set_parsing_rules(verbose=False)

    def __next__(self):
        """Return definition line, sequence, ASCII quality scores and probabilities of error for the next read"""
        lines = [self._safe_readline().replace('\n', '') for _ in range(self.nlines)]

        # lines[2] (the separator line between sequence and quality scores) is not returned
        output = {
            'definition line': lines[0],
            'sequence': lines[1],
            'read_qscores': lines[3],
        }
        # One probability of error per quality score character
        output['probs error'] = np.array([q_score2prob_error(q) for q in output['read_qscores']])

        return output

    def print_first_chunks(
        self, 
        nchunks:int=3,  # number of chunks to print out
    ):
        """Print the first `nchunks` chunks of text from the file

        Iteration starts from the current position of the iterator.
        """
        if nchunks <= 0: return
        for i, seq_dict in enumerate(self.__iter__()):
            print(f"\nSequence {i+1}:")
            print(seq_dict['definition line'])
            print(f"{seq_dict['sequence'][:80]} ...")
            # Stop as soon as exactly `nchunks` reads were printed (the previous check printed one read too many)
            if i + 1 >= nchunks: break

    @staticmethod
    def _json_default(obj):
        """Convert numpy arrays and scalars into builtin types so that `json.dump` can serialize them"""
        if isinstance(obj, np.ndarray): return obj.tolist()
        if isinstance(obj, np.generic): return obj.item()
        raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

    def parse_file(
        self,
        add_readseq :bool=False,    # When True, add the full sequence to the parsed metadata dictionary
        add_qscores:bool=False,     # Add the read ASCII Q Scores to the parsed dictionary when True
        add_probs_error:bool=False, # Add the read probability of error to the parsed dictionary when True
        save_json: bool=False       # When True, save the file metadata as a json file of same stem name
    )-> dict[str]:                  # Metadata as Key/Values pairs
        """Read fastq file, return a dict with definition line metadata and optionally read sequence and q scores, ...

        The returned dictionary has one entry per read, keyed by the read's `readid` metadata.
        When `save_json` is True, the same dictionary is written next to the fastq file, as `<stem>_metadata.json`.
        """

        self.reset_iterator()
        parsed = {}
        for d in self:
            dfn_line = d['definition line']
            seq, q_scores, prob_e = d['sequence'], d['read_qscores'], d['probs error']
            # NOTE(review): `_parse_text_fn` is not defined in this class; presumably provided by the base class - confirm
            metadata = self._parse_text_fn(dfn_line, self.re_pattern, self.re_keys)
            if add_readseq: metadata['readseq'] = seq
            if add_qscores: metadata['read_qscores'] = q_scores
            if add_probs_error: metadata['probs error'] = prob_e
            # The selected parsing rule must provide a `readid` group, used as key for each read
            parsed[metadata['readid']] = metadata

        if save_json:
            p2json = self.path.parent / f"{self.path.stem}_metadata.json"
            with open(p2json, 'w') as fp:
                # `probs error` values are numpy arrays, which json cannot serialize without a converter
                json.dump(parsed, fp, indent=4, default=self._json_default)
            print(f"Metadata for '{self.path.name}'> saved as <{p2json.name}> in  \n{p2json.parent.absolute()}\n")

        return parsed

In [47]:
show_doc(FastqFileReader)

---

[source](https://github.com/vtecftwy/metagentools/blob/main/metagentools/cnn_virus/data.py#L96){target="_blank" style="float:right; font-size:smaller"}

### FastqFileReader

>      FastqFileReader (path:str|pathlib.Path)

Iterator going through a fastq file's sequences and return each section + prob error as a dict

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| path | str \| Path | path to the fastq file |
| **Returns** | **dict** | **key/value with keys: definition line; sequence; q score; prob error** |

In [48]:
it = FastqFileReader(p2fastq)
iteration_output = next(it)

print(type(iteration_output))
print(iteration_output.keys())
print(f"Definition line:  {iteration_output['definition line']}")
print(f"Read sequence:    {iteration_output['sequence']}")
print(f"Q scores (ASCII): {iteration_output['read_qscores']}")
print(f"Prob error:       {','.join([f'{p:.4f}' for p in iteration_output['probs error']])}")

<class 'dict'>
dict_keys(['definition line', 'sequence', 'read_qscores', 'probs error'])
Definition line:  @2591237:ncbi:1-20100
Read sequence:    TTGCGCCTCCTTCACGATGTCCACACACTTAATGGCAACATTGTCAGTAAGTTTTAAATAACCAGTAAACTGGTTAACTGGTTCCTCAGGTGTTGGTTCTGATTCTAGTTCATGCTCCGATAATTCGGTAGCATCACCAAGCCAGTCCTC
Q scores (ASCII): CC=GGGG8GGGGG=JJJGJJJJJGJJJCJG1JJGJJGGJJJCJGGGGJGJJJGG=GJGGGJG=GGGGG=CGGCCCGGG8GGGGGGGCGCGG=G1G=GCGGCCGGGG=CC=8G=GGGCGGG=GGGGCGGCGGGGCCGGCGCCGGCCGGGCG
Prob error:       0.0004,0.0004,0.0016,0.0002,0.0002,0.0002,0.0002,0.0050,0.0002,0.0002,0.0002,0.0002,0.0002,0.0016,0.0001,0.0001,0.0001,0.0002,0.0001,0.0001,0.0001,0.0001,0.0001,0.0002,0.0001,0.0001,0.0001,0.0004,0.0001,0.0002,0.0251,0.0001,0.0001,0.0002,0.0001,0.0001,0.0002,0.0002,0.0001,0.0001,0.0001,0.0004,0.0001,0.0002,0.0002,0.0002,0.0002,0.0001,0.0002,0.0001,0.0001,0.0001,0.0002,0.0002,0.0016,0.0002,0.0001,0.0002,0.0002,0.0002,0.0001,0.0002,0.0016,0.0002,0.0002,0.0002,0.0002,0.0002,0.0016,0.0004,0.0002,0.0002,0.0004,0.00

Five largest probabilities of error:

In [49]:
np.sort(iteration_output['probs error'])[-5:]

array([0.00501187, 0.00501187, 0.00501187, 0.02511886, 0.02511886])

In [50]:
np.argsort(iteration_output['probs error'])[-5:]

array([  7, 110,  78,  93,  30])

In [51]:
dfn_line = iteration_output['definition line']
meta = it.parse_text(dfn_line)
meta

{'readid': '2591237:ncbi:1-20100',
 'readnb': '20100',
 'refseqnb': '1',
 'refsource': 'ncbi',
 'reftaxonomyid': '2591237'}

In [52]:
fastq = FastqFileReader(p2fastq)
next(fastq).keys()

dict_keys(['definition line', 'sequence', 'read_qscores', 'probs error'])

In [53]:
show_doc(FastqFileReader.parse_file)

---

[source](https://github.com/vtecftwy/metagentools/blob/main/metagentools/cnn_virus/data.py#L133){target="_blank" style="float:right; font-size:smaller"}

### FastqFileReader.parse_file

>      FastqFileReader.parse_file (add_readseq:bool=False,
>                                  add_qscores:bool=False,
>                                  add_probs_error:bool=False,
>                                  save_json:bool=False)

Read fastq file, return a dict with definition line metadata and optionally read sequence and q scores, ...

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| add_readseq | bool | False | When True, add the full sequence to the parsed metadata dictionary |
| add_qscores | bool | False | Add the read ASCII Q Scores to the parsed dictionary when True |
| add_probs_error | bool | False | Add the read probability of error to the parsed dictionary when True |
| save_json | bool | False | When True, save the file metadata as a json file of same stem name |
| **Returns** | **dict[str]** |  | **Metadata as Key/Values pairs** |

In [54]:
# Parse the definition line metadata only (no sequence, no quality scores, no probabilities of error)
parsed = fastq.parse_file(add_readseq=False, add_qscores=False, add_probs_error=False)
# Show the first four parsed reads
for i, (k, v) in enumerate(parsed.items()):
    print(k)
    pprint(v)
    if i >=3: break

2591237:ncbi:1-20100
{'readid': '2591237:ncbi:1-20100',
 'readnb': '20100',
 'refseqnb': '1',
 'refsource': 'ncbi',
 'reftaxonomyid': '2591237'}
2591237:ncbi:1-20099
{'readid': '2591237:ncbi:1-20099',
 'readnb': '20099',
 'refseqnb': '1',
 'refsource': 'ncbi',
 'reftaxonomyid': '2591237'}
2591237:ncbi:1-20098
{'readid': '2591237:ncbi:1-20098',
 'readnb': '20098',
 'refseqnb': '1',
 'refsource': 'ncbi',
 'reftaxonomyid': '2591237'}
2591237:ncbi:1-20097
{'readid': '2591237:ncbi:1-20097',
 'readnb': '20097',
 'refseqnb': '1',
 'refsource': 'ncbi',
 'reftaxonomyid': '2591237'}


In [55]:
# Use the `fastq` reader created above rather than the generic name `it`, which other cells rebind
metadata = fastq.parse_file(add_readseq=True)
# One row per read: the parsed dictionary is keyed by read id, hence the transpose
df = pd.DataFrame(metadata).T
df.head(10)

Unnamed: 0,readid,readnb,refseqnb,refsource,reftaxonomyid,readseq
2591237:ncbi:1-20100,2591237:ncbi:1-20100,20100,1,ncbi,2591237,TTGCGCCTCCTTCACGATGTCCACACACTTAATGGCAACATTGTCA...
2591237:ncbi:1-20099,2591237:ncbi:1-20099,20099,1,ncbi,2591237,TAATTTAGGTCCACAAACTGTAGCAGGTGCATTTAGAAGTTCAAAT...
2591237:ncbi:1-20098,2591237:ncbi:1-20098,20098,1,ncbi,2591237,CCACAAAGTGCTCCTCAGATGTCTTTGATGACGAAGTGAGGTATCC...
2591237:ncbi:1-20097,2591237:ncbi:1-20097,20097,1,ncbi,2591237,ATGTAAAAGTGTTACCATCACAAGTGTTCTTGTAGGTACCATAATC...
2591237:ncbi:1-20096,2591237:ncbi:1-20096,20096,1,ncbi,2591237,AGAAGCACCAGCACATATGTCAACAATAGGTGTCTGCACAATGACT...
2591237:ncbi:1-20095,2591237:ncbi:1-20095,20095,1,ncbi,2591237,GACTGGTTTGTAAAAATTGGACCTCGCAAGTCTGCTCGCCTAGTAC...
2591237:ncbi:1-20094,2591237:ncbi:1-20094,20094,1,ncbi,2591237,TGCTTGTGTTTTCCACATAGGCAGCCATAAGATCCTCATGACCTAA...
2591237:ncbi:1-20093,2591237:ncbi:1-20093,20093,1,ncbi,2591237,CTGCTGACATTGTAGTCTTTGATGAAATCTCTATGGCTACCAATTA...
2591237:ncbi:1-20092,2591237:ncbi:1-20092,20092,1,ncbi,2591237,CTCATCAACTGGCACTTTCTTCAAAGCTCTTGAGAGCATCTCTGTA...
2591237:ncbi:1-20091,2591237:ncbi:1-20091,20091,1,ncbi,2591237,TGCTACAGCTCATAGCGAGCTGGCAAAGGGTGTAGCTTTAGATGGT...


In [56]:
fastq.set_parsing_rules(verbose=True)

--------------------------------------------------------------------------------
Rule <fasta_cov_ncbi> generated an error
No match on this line
--------------------------------------------------------------------------------
Rule <fasta_rhinolophus_ferrumequinum> generated an error
No match on this line
--------------------------------------------------------------------------------
Rule <fastq_art_illumina> generated 5 matches
--------------------------------------------------------------------------------
^@(?P<readid>(?P<reftaxonomyid>\d*):(?P<refsource>\w*):(?P<refseqnb>\d*)-(?P<readnb>\d*))$
['readid', 'reftaxonomyid', 'refsource', 'refseqnb', 'readnb']
--------------------------------------------------------------------------------
Rule <aln_art_illumina> generated an error
No match on this line
--------------------------------------------------------------------------------
Rule <aln_art_illumina-refseq> generated an error
No match on this line
----------------------------------

## ALN Alignment Files

Extension of `TextFileBaseReader` class for ALN read/sequence alignment files.

Structure of an ALN sequence file:

In [57]:
#| echo: false
p2aln = Path('data_dev/single_1seq_150bp/single_1seq_150bp.aln').resolve()
assert p2aln.is_file()

# Preview the raw file line by line: first 13 lines (4 header lines + 3 reads x 3 lines), each truncated to 80 characters
it = TextFileBaseReader(p2aln, nlines=1)
for i, t in enumerate(it):
    txt = t.replace('\n', '')[:80]
    print(f"{txt}")
    if i >= 12: break

##ART_Illumina	read_length	150
@CM	/bin/art_illumina -i /home/vtec/projects/bio/metagentools/nbs-dev/data_dev/c
@SQ	2591237:ncbi:1 [MK211378]	2591237	ncbi	1 [MK211378] 2591237	Coronavirus BtRs
##Header End
>2591237:ncbi:1	2591237:ncbi:1-20100	26865	-
TTGCGCCTCCTTCACGATGTCCACACACTTAATGGCAACATTGTCAGTAAGTTTTAAATAACCAGTAAACTGGTTAACTG
TTGCGCCTCCTTCACGATGTCCACACACTTAATGGCAACATTGTCAGTAAGTTTTAAATAACCAGTAAACTGGTTAACTG
>2591237:ncbi:1	2591237:ncbi:1-20099	7219	-
TAATTTAGGTCCACAAACTGTAGCAGGTGCATTTAGAAGTTCAAATGAAAGCACAACAACCCTAGTAGCCTGATATTCAA
TAATTTAGGTCCACAAACTGTAGCAGGTGCATTTAGAAGTTCAAATGAAAGCACAACAACCCTAGTAGCCTGATATTCAA
>2591237:ncbi:1	2591237:ncbi:1-20098	25514	-
CCACAAAGTGCTCCTCAGATGTCTTTGATGACGAAGTGAGGTATCCATTATATGTAGTAACAGCATCTGGTGATGATACT
CCACAAAGTGCTCCTCAGATGTCTTTGATGACGAAGTGAGGTATCCATTATATGTAGTAACAGCAGCTGGTGATGATACT


In [92]:

#| export
class AlnFileReader(TextFileBaseReader):
    """Iterator going through an ALN file"""
    def __init__(
        self,
        path:str|Path,   # path to the aln file
    )-> dict:            # key/value with keys: definition line; ref_seq_aligned; read_seq_aligned
        """Set TextFileBaseReader attributes and specific class attributes"""
        # NOTE: the `-> dict` annotation documents what each iteration returns, not what `__init__` returns
        self.nlines = 1
        super().__init__(path, nlines=self.nlines)
        self.header = self.read_header()
        # After the header, each read is described by 3 lines: definition line, aligned reference, aligned read
        self.nlines = 3
        self.text_to_parse_key = 'definition line'
        self.set_parsing_rules(verbose=False)
        self.set_header_parsing_rules(verbose=False)
        self.ref_sequences = self.parse_header_reference_sequences()

    def __next__(self):
        """Return definition line, aligned reference sequence and aligned read sequence for the next read"""
        lines = [self._safe_readline().replace('\n', '') for _ in range(self.nlines)]

        output = {
            'definition line': lines[0],
            'ref_seq_aligned': lines[1],
            'read_seq_aligned': lines[2],
        }
        return output

    def read_header(self):
        """Read ALN file Header and return each section parsed in a dictionary

        The returned dictionary has two keys:

        - `'command'`: text following `@CM` on the second line of the file
        - `'reference sequences'`: list of all lines between the `@CM` line and the `##Header End` line

        Raises `ValueError` when the first two lines do not start with `##ART_Illumina` and `@CM` respectively.
        The file handle is left open, positioned on the first line after the header.
        """

        header = {}
        if self.fp is not None:
            self.fp.close()
        self.fp = open(self.path, 'r')

        line = self._safe_readline().replace('\n', '')
        if not line.startswith('##ART_Illumina'):
            raise ValueError("Header of this file does not start with ##ART_Illumina")
        line = self._safe_readline().replace('\n', '')
        if not line.startswith('@CM'):
            raise ValueError("First header line should start with @CM")
        # Drop the leading `@CM` tag and the tabs, keep the command text only
        header['command'] = line[3:].replace('\t', '').strip()

        refseqs = []
        # NOTE(review): if `##Header End` is missing, the loop only ends through `_safe_readline` at end of file - confirm its behavior
        while True:
            line = self._safe_readline().replace('\n', '')
            if line.startswith('##Header End'): break
            refseqs.append(line)
        header['reference sequences'] = refseqs

        return header

    def reset_iterator(self):
        """Reset the iterator to point to the first line in the file, by recreating a new file handle.
        
        `AlnFileReader` requires a specific `reset_iterator` method, in order to skip the header every time it is reset
        """
        if self.fp is not None:
            self.fp.close()
        self.fp = open(self.path, 'r')
        # Consume all header lines, up to and including the `##Header End` line
        while True:
            line = self._safe_readline().replace('\n', '')
            if line.startswith('##Header End'): break

    def parse_definition_line_with_position(
        self, 
        dfn_line:str    # definition line string to be parsed
        )-> dict:       # parsed metadata in key/value format + relative position of the read
        """Parse definition line and adds relative position

        The relative position is stored under key `read_pos` and computed as
        `(aln_start_pos * 10) // refseq_length + 1`, where `refseq_length` is taken from the
        header metadata of the reference sequence the read refers to (`self.ref_sequences`).
        """
        read_meta = self.parse_text(dfn_line)
        read_refseqid = read_meta['refseqid']
        read_start_pos = int(read_meta['aln_start_pos'])
        read_refseq_length = int(self.ref_sequences[read_refseqid]['refseq_length'])
        read_meta['read_pos'] = (read_start_pos * 10) // read_refseq_length + 1
        return read_meta

    def parse_file(
        self, 
        add_ref_seq_aligned:bool=False,   # Add the reference sequence aligned to the parsed dictionary when True
        add_read_seq_aligned:bool=False,  # Add the read sequence aligned to the parsed dictionary when True
    )-> dict[str]: 
        # One entry per read, keyed by `readid`. Each value holds the metadata parsed from the read definition line
        # (the keys are the ones defined by the selected parsing rule, which must include `readid`)
        # optionally `ref_seq_aligned`,`read_seq_aligned`
        """Read ALN file, return a dict w/ alignment info for each read and optionally aligned reference sequence & read"""
        self.reset_iterator()
        parsed = {}
        for d in self:
            dfn_line = d['definition line']
            ref_seq_aligned, read_seq_aligned = d['ref_seq_aligned'], d['read_seq_aligned']
            metadata = self.parse_text(dfn_line)
            if add_ref_seq_aligned: metadata['ref_seq_aligned'] = ref_seq_aligned
            if add_read_seq_aligned: metadata['read_seq_aligned'] = read_seq_aligned
            parsed[metadata['readid']] = metadata
        return parsed

    def parse_header_reference_sequences(
        self,
        pattern:str|None=None,     # regex pattern to apply to parse the reference sequence info
        keys:list[str]|None=None,  # list of keys: keys are both regex match group names and corresponding output dict keys 
        )->dict[str]:                  # parsed metadata in key/value format
        """Extract metadata from all header reference sequences

        Each of `pattern` and `keys` falls back to the header parsing rule of this instance
        (`re_header_pattern`, `re_header_keys`) when it is None.
        Returns one entry per reference sequence, keyed by its `refseqid` metadata.
        """
        # Fall back individually: a single missing argument must not silently select the rule used for reads
        if pattern is None: pattern = self.re_header_pattern
        if keys is None: keys = self.re_header_keys
        parsed = {}
        for seq_dfn_line in self.header['reference sequences']:
            metadata = self.parse_text(seq_dfn_line, pattern, keys)
            parsed[metadata['refseqid']] = metadata

        return parsed

    def set_header_parsing_rules(
        self,
        pattern: str|bool=None,   # regex pattern to apply to parse the text, search in parsing rules json if None
        keys: list[str]=None,     # list of keys/group for regex, search in parsing rules json if None
        verbose: bool=False       # when True, provides information on each rule
    )-> None:
        """Set the regex parsing rule for reference sequence in ALN header.
               
        Updates 3 class attributes: `re_header_rule_name`, `re_header_pattern`, `re_header_keys`

        When both `pattern` and `keys` are passed, they are tested on the first reference sequence line
        of the header and set as 'Custom Rule'; a `ValueError` is raised if parsing fails.
        Otherwise, all rules saved in `default_parsing_rules.json` are tested on that line and the rule
        extracting the largest number of metadata items is selected. A warning is issued when no rule matches.
        
        TODO: refactor this and the method in Core: to use a single function for the common part and a parameter for the text_to_parse 
        """

        P2JSON = PACKAGE_ROOT / 'default_parsing_rules.json'

        self.re_header_rule_name = None
        self.re_header_pattern = None
        self.re_header_keys = None

        # get the first reference sequence definition line in header
        text_to_parse = self.header['reference sequences'][0]
        divider_line = f"{'-'*80}"

        if pattern is not None and keys is not None:  # When specific pattern and keys are passed
            # The custom rule is only accepted if it can parse the first reference sequence line
            try:
                self.parse_text(text_to_parse, pattern, keys)
            except Exception as err:
                raise ValueError(f"The pattern generates the following error:\n{err}") from err
            self.re_header_rule_name = 'Custom Rule'
            self.re_header_pattern = pattern
            self.re_header_keys = keys
            if verbose:
                print(divider_line)
                print("Custom rule was set for header in this instance.")

        else:  # automatic rule selection among rules saved in json file
            # Load all existing rules from json file
            with open(P2JSON, 'r') as fp:
                parsing_rules = json.load(fp)

            # test all existing rules and keep the one with highest number of matches
            max_nbr_matches = 0
            for k, v in parsing_rules.items():
                re_header_pattern = v['pattern']
                re_header_keys = v['keys'].split(' ')
                try:
                    metadata_dict = self.parse_text(text_to_parse, re_header_pattern, re_header_keys)
                except Exception as err:
                    # A rule that cannot parse the line is skipped (reported only in verbose mode)
                    if verbose:
                        print(divider_line)
                        print(f"Rule <{k}> generated an error")
                        print(err)
                    continue

                nbr_matches = len(metadata_dict)
                if verbose:
                    print(divider_line)
                    print(f"Rule <{k}> generated {nbr_matches:,d} matches")
                    print(divider_line)
                    print(re_header_pattern)
                    print(re_header_keys)

                if nbr_matches > max_nbr_matches:
                    # Record the new best score, so that a later rule with fewer matches cannot replace this one
                    max_nbr_matches = nbr_matches
                    self.re_header_pattern = re_header_pattern
                    self.re_header_keys = re_header_keys
                    self.re_header_rule_name = k

            if self.re_header_rule_name is None:
                msg = """
        None of the saved parsing rules were able to extract metadata from the first reference sequence line in this file's header.
        You must set a custom rule (pattern + keys) before parsing the header, by using:
            `self.set_header_parsing_rules(custom_pattern, custom_list_of_keys)`
                """
                warnings.warn(msg, category=UserWarning)

            if verbose:
                print(divider_line)
                print(f"Selected rule with most matches: {self.re_header_rule_name}")

            # Reset the iterator to make all read lines available again (only done after automatic selection)
            self.reset_iterator()

In [93]:
show_doc(AlnFileReader)

---

[source](https://github.com/vtecftwy/metagentools/blob/main/metagentools/cnn_virus/data.py#L162){target="_blank" style="float:right; font-size:smaller"}

### AlnFileReader

>      AlnFileReader (path:str|pathlib.Path)

Iterator going through an ALN file

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| path | str \| Path | path to the aln file |
| **Returns** | **dict** | **key/value with keys:** |

In [94]:
it = AlnFileReader(p2aln)

`AlnFileReader` iterator returns elements one by one, as dictionaries with each data line related to the read, accessible through the following keys: 

- key `'definition line'`: **read definition line**, including read metadata 
- key `'ref_seq_aligned'`: **aligned reference sequence**, that is the sequence segment in the original reference corresponding to the read
- key `'read_seq_aligned'`: **aligned read**, that is the simulated read sequence, where each bp corresponds to the reference sequence bp in the same position.

In [95]:
one_iteration = next(it)
one_iteration.keys()

dict_keys(['definition line', 'ref_seq_aligned', 'read_seq_aligned'])

In [96]:
pprint(one_iteration)

{'definition line': '>2591237:ncbi:1\t2591237:ncbi:1-20100\t26865\t-',
 'read_seq_aligned': 'TTGCGCCTCCTTCACGATGTCCACACACTTAATGGCAACATTGTCAGTAAGTTTTAAATAACCAGTAAACTGGTTAACTGGTTCCTCAGGTGTTGGTTCTGATTCTAGTTCATGCTCCGATAATTCGGTAGCATCACCAAGCCAGTCCTC',
 'ref_seq_aligned': 'TTGCGCCTCCTTCACGATGTCCACACACTTAATGGCAACATTGTCAGTAAGTTTTAAATAACCAGTAAACTGGTTAACTGGTTCCTCAGGTGTTGGTTCTGATTCTAGTTCATGCTCCGATAATTCGGTAGCATCACCAAGCCAGTCCTC'}


In [97]:
dfn_line, ref_seq_aligned, read_seq_aligned = one_iteration.values()

In [98]:
dfn_line

'>2591237:ncbi:1\t2591237:ncbi:1-20100\t26865\t-'

In [99]:
ref_seq_aligned[:100]

'TTGCGCCTCCTTCACGATGTCCACACACTTAATGGCAACATTGTCAGTAAGTTTTAAATAACCAGTAAACTGGTTAACTGGTTCCTCAGGTGTTGGTTCT'

In [100]:
read_seq_aligned[:100]

'TTGCGCCTCCTTCACGATGTCCACACACTTAATGGCAACATTGTCAGTAAGTTTTAAATAACCAGTAAACTGGTTAACTGGTTCCTCAGGTGTTGGTTCT'

In [101]:
another_iteration = next(it)
pprint(another_iteration)

{'definition line': '>2591237:ncbi:1\t2591237:ncbi:1-20099\t7219\t-',
 'read_seq_aligned': 'TAATTTAGGTCCACAAACTGTAGCAGGTGCATTTAGAAGTTCAAATGAAAGCACAACAACCCTAGTAGCCTGATATTCAATAGGCACATTAGGATAGAAGTCATAAGTACTAAGAGTACGTACACCATTTTCGTCAGAAGTTAGATCCCT',
 'ref_seq_aligned': 'TAATTTAGGTCCACAAACTGTAGCAGGTGCATTTAGAAGTTCAAATGAAAGCACAACAACCCTAGTAGCCTGATATTCAATAGGCACATTAGGATAGAAGTCATAAGTACTAAGAGTACGTACACCATTTTCGTCAGAAGTTAGATCCCT'}


In [102]:
# Restart from the first read (`reset_iterator` skips the header), then show the first four alignments
it.reset_iterator()
for i, d in enumerate(it):
    print(d['definition line'])
    print(d['ref_seq_aligned'][:80], '...')
    print(d['read_seq_aligned'][:80], '...\n')
    if i >= 3: break

>2591237:ncbi:1	2591237:ncbi:1-20100	26865	-
TTGCGCCTCCTTCACGATGTCCACACACTTAATGGCAACATTGTCAGTAAGTTTTAAATAACCAGTAAACTGGTTAACTG ...
TTGCGCCTCCTTCACGATGTCCACACACTTAATGGCAACATTGTCAGTAAGTTTTAAATAACCAGTAAACTGGTTAACTG ...

>2591237:ncbi:1	2591237:ncbi:1-20099	7219	-
TAATTTAGGTCCACAAACTGTAGCAGGTGCATTTAGAAGTTCAAATGAAAGCACAACAACCCTAGTAGCCTGATATTCAA ...
TAATTTAGGTCCACAAACTGTAGCAGGTGCATTTAGAAGTTCAAATGAAAGCACAACAACCCTAGTAGCCTGATATTCAA ...

>2591237:ncbi:1	2591237:ncbi:1-20098	25514	-
CCACAAAGTGCTCCTCAGATGTCTTTGATGACGAAGTGAGGTATCCATTATATGTAGTAACAGCATCTGGTGATGATACT ...
CCACAAAGTGCTCCTCAGATGTCTTTGATGACGAAGTGAGGTATCCATTATATGTAGTAACAGCAGCTGGTGATGATACT ...

>2591237:ncbi:1	2591237:ncbi:1-20097	17747	-
ATGTAAAAGTGTTACCATCACAAGTGTTCTTGTAGGTACCATAATCAGGGACAACAACCATAAGTTTGGCTGCTGTAGTC ...
ATGTAAAAGTGTTACCATCACAAGTGTTCTTGTAGGTACCATAATCAGGGACAACAACCATAAGTTTGGCTGCTGTAGTC ...



Once instantiated, the `AlnFileReader` iterator gives access to the file's header information through the `header` instance attribute. It is a dictionary with two keys: `'command'` and `'reference sequences'`:

```
    {'command':             'art-illumina command used to create the reads',
     'reference sequences': ['@SQ metadata on reference sequence 1 used for the reads',
                             '@SQ metadata on reference sequence 2 used for the reads', 
                             ...
                            ]
    }
```

In [103]:
print(it.header['command'])

/bin/art_illumina -i /home/vtec/projects/bio/metagentools/nbs-dev/data_dev/cov_virus_sequence_one.fa -ss HS25 -l 150 -f 100 -o /home/vtec/projects/bio/metagentools/nbs-dev/data_dev/single_1seq_150bp/single_1seq_150bp -rs 1704169422


In [104]:
for seq_info in it.header['reference sequences']:
    print(seq_info)

@SQ	2591237:ncbi:1 [MK211378]	2591237	ncbi	1 [MK211378] 2591237	Coronavirus BtRs-BetaCoV/YN2018D		scientific name	30213


The **read definition line** includes key metadata, which need to be parsed using the appropriate parsing rule.

In [105]:
show_doc(AlnFileReader.parse_text)

---

[source](https://github.com/vtecftwy/metagentools/blob/main/metagentools/core.py#LNone){target="_blank" style="float:right; font-size:smaller"}

### TextFileBaseReader.parse_text

>      TextFileBaseReader.parse_text (txt:str, pattern:str=None,
>                                     keys:list[str]=None)

Parse text using regex pattern and key. Return a metadata dictionary

The passed text is parsed using the regex pattern. The method return a dictionary in the format:
    {
        'key_1': 'metadata 1',
        'key_2': 'metadata 2',
        ...
    }

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| txt | str |  | text to parse |
| pattern | str | None | If None, uses standard regex pattern to extract metadata, otherwise, uses passed regex |
| keys | list | None | If None, uses standard regex list of keys, otherwise, uses passed list of keys (str) |
| **Returns** | **dict** |  | **parsed metadata in key/value format** |

In [106]:
#| hide
pattern, keys = it.re_pattern, it.re_keys

In [107]:
it.parse_text(dfn_line, pattern, keys)

{'aln_start_pos': '26865',
 'readid': '2591237:ncbi:1-20100',
 'readnb': '20100',
 'refseq_strand': '-',
 'refseqid': '2591237:ncbi:1',
 'refseqnb': '1',
 'refsource': 'ncbi',
 'reftaxonomyid': '2591237'}

In [108]:
show_doc(AlnFileReader.parse_definition_line_with_position)

---

[source](https://github.com/vtecftwy/metagentools/blob/main/metagentools/cnn_virus/data.py#L230){target="_blank" style="float:right; font-size:smaller"}

### AlnFileReader.parse_definition_line_with_position

>      AlnFileReader.parse_definition_line_with_position (dfn_line:str)

Parse definition line and adds relative position

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| dfn_line | str | fefinition line string to be parsed |
| **Returns** | **dict** | **parsed metadata in key/value format + relative position of the read** |

Upon instance creation, `AlnFileReader` automatically checks the `default_parsing_rules.json` file for a workable rule among saved rules. Saved rules include the rule for ART Illumina ALN files.

In [109]:
it.re_rule_name

'aln_art_illumina'

It is therefore not required to pass a specific `pattern` and `keys` parameter.


In [110]:
it.parse_text(dfn_line)

{'aln_start_pos': '26865',
 'readid': '2591237:ncbi:1-20100',
 'readnb': '20100',
 'refseq_strand': '-',
 'refseqid': '2591237:ncbi:1',
 'refseqnb': '1',
 'refsource': 'ncbi',
 'reftaxonomyid': '2591237'}

ART Illumina ALN file definition lines consist of:

- The **read** ID: `readid`, e.g. `2591237:ncbi:1-20100`
- The **read** number (order in the file): `readnb`, e.g. `20100`
- The **read** start position in the reference sequence: `aln_start_pos`, e.g. `26865`
- The **reference sequence** ID: `refseqid`, e.g. `2591237:ncbi:1`
- The **reference sequence** number: `refseqnb`, e.g. `1`
- The **reference sequence** source: `refsource`, e.g. `ncbi`
- The **reference sequence** taxonomy: `reftaxonomyid`, e.g. `2591237`
- The **reference sequence** strand: `refseq_strand`, which is either `+` or `-`


In [111]:
show_doc(AlnFileReader.parse_file)

---

[source](https://github.com/vtecftwy/metagentools/blob/main/metagentools/cnn_virus/data.py#L239){target="_blank" style="float:right; font-size:smaller"}

### AlnFileReader.parse_file

>      AlnFileReader.parse_file (add_ref_seq_aligned:bool=False,
>                                add_read_seq_aligned:bool=False)

Read ALN file, return a dict w/ alignment info for each read and optionaly aligned reference sequence & read

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| add_ref_seq_aligned | bool | False | Add the reference sequence aligned to the parsed dictionary when True |
| add_read_seq_aligned | bool | False | Add the read sequence aligned to the parsed dictionary when True |
| **Returns** | **dict[str]** |  |  |

In [112]:
parsed = it.parse_file()

for i, (k, v) in enumerate(parsed.items()):
    print(k)
    pprint(v)
    if i > 3: break

2591237:ncbi:1-20100
{'aln_start_pos': '26865',
 'readid': '2591237:ncbi:1-20100',
 'readnb': '20100',
 'refseq_strand': '-',
 'refseqid': '2591237:ncbi:1',
 'refseqnb': '1',
 'refsource': 'ncbi',
 'reftaxonomyid': '2591237'}
2591237:ncbi:1-20099
{'aln_start_pos': '7219',
 'readid': '2591237:ncbi:1-20099',
 'readnb': '20099',
 'refseq_strand': '-',
 'refseqid': '2591237:ncbi:1',
 'refseqnb': '1',
 'refsource': 'ncbi',
 'reftaxonomyid': '2591237'}
2591237:ncbi:1-20098
{'aln_start_pos': '25514',
 'readid': '2591237:ncbi:1-20098',
 'readnb': '20098',
 'refseq_strand': '-',
 'refseqid': '2591237:ncbi:1',
 'refseqnb': '1',
 'refsource': 'ncbi',
 'reftaxonomyid': '2591237'}
2591237:ncbi:1-20097
{'aln_start_pos': '17747',
 'readid': '2591237:ncbi:1-20097',
 'readnb': '20097',
 'refseq_strand': '-',
 'refseqid': '2591237:ncbi:1',
 'refseqnb': '1',
 'refsource': 'ncbi',
 'reftaxonomyid': '2591237'}
2591237:ncbi:1-20096
{'aln_start_pos': '19819',
 'readid': '2591237:ncbi:1-20096',
 'readnb': '20

In [113]:
show_doc(AlnFileReader.parse_header_reference_sequences)

---

[source](https://github.com/vtecftwy/metagentools/blob/main/metagentools/cnn_virus/data.py#L259){target="_blank" style="float:right; font-size:smaller"}

### AlnFileReader.parse_header_reference_sequences

>      AlnFileReader.parse_header_reference_sequences (pattern:str|None=None,
>                                                      keys:list[str]|None=None)

Extract metadata from all header reference sequences

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| pattern | str \| None | None | regex pattern to apply to parse the reference sequence info |
| keys | list[str] \| None | None | list of keys: keys are both regex match group names and corresponding output dict keys |
| **Returns** | **dict[str]** |  | **parsed metadata in key/value format** |

In [114]:
pprint(it.parse_header_reference_sequences())

{'2591237:ncbi:1': {'refseq_accession': 'MK211378',
                    'refseq_length': '30213',
                    'refseqid': '2591237:ncbi:1',
                    'refseqnb': '1',
                    'refsource': 'ncbi',
                    'reftaxonomyid': '2591237',
                    'species': 'Coronavirus BtRs-BetaCoV/YN2018D  scientific '
                               'name'}}


In [115]:
show_doc(AlnFileReader.set_header_parsing_rules)

---

[source](https://github.com/vtecftwy/metagentools/blob/main/metagentools/cnn_virus/data.py#L274){target="_blank" style="float:right; font-size:smaller"}

### AlnFileReader.set_header_parsing_rules

>      AlnFileReader.set_header_parsing_rules (pattern:str|bool=None,
>                                              keys:list[str]=None,
>                                              verbose:bool=False)

Set the regex parsing rule for reference sequence in ALN header.

Updates 3 class attributes: `re_header_rule_name`, `re_header_pattern`, `re_header_keys`

TODO: refactor this and the method in Core: to use a single function for the common part and a parameter for the text_to_parse

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| pattern | str \| bool | None | regex pattern to apply to parse the text, search in parsing rules json if None |
| keys | list[str] | None | list of keys/group for regex, search in parsing rules json if None |
| verbose | bool | False | when True, provides information on each rule |
| **Returns** | **None** |  |  |

In [116]:
it.set_header_parsing_rules(verbose=True)

--------------------------------------------------------------------------------
Rule <fasta_cov_ncbi> generated an error
No match on this line
--------------------------------------------------------------------------------
Rule <fasta_rhinolophus_ferrumequinum> generated an error
No match on this line
--------------------------------------------------------------------------------
Rule <fastq_art_illumina> generated an error
No match on this line
--------------------------------------------------------------------------------
Rule <aln_art_illumina> generated an error
No match on this line
--------------------------------------------------------------------------------
Rule <aln_art_illumina-refseq> generated 7 matches
--------------------------------------------------------------------------------
^@SQ[\t\s]*(?P<refseqid>(?P<reftaxonomyid>\d*):(?P<refsource>\w*):(?P<refseqnb>\d*))[\t\s]*\[(?P<refseq_accession>[\d\w]*)\][\t\s]*(?P=reftaxonomyid)[\s\t]*(?P=refsource)[\s\t]*(?P=refseqn

In [117]:
print(it.re_header_rule_name)
print(it.re_header_pattern)
print(it.re_header_keys)

aln_art_illumina-refseq
^@SQ[\t\s]*(?P<refseqid>(?P<reftaxonomyid>\d*):(?P<refsource>\w*):(?P<refseqnb>\d*))[\t\s]*\[(?P<refseq_accession>[\d\w]*)\][\t\s]*(?P=reftaxonomyid)[\s\t]*(?P=refsource)[\s\t]*(?P=refseqnb)[\s\t]*\[(?P=refseq_accession)\][\s\t]*(?P=reftaxonomyid)[\s\t]*(?P<species>\w[\w\d\/\s\-\.]*)[\s\t](?P<refseq_length>\d*)$
['refseqid', 'reftaxonomyid', 'refsource', 'refseqnb', 'refseq_accession', 'species', 'refseq_length']


# Build datasets

Sequences and reads are provided in various formats (text for the original data, FASTQ + ALN for simulated reads), while the model expects a specific format for training, validation and testing datasets.

The following functions build the datasets in the format expected by the model from the available raw data.

In addition, text-based datasets are not efficient, especially for training. Additional functions save and parse datasets in TFRecord format.

There are two pipelines for building inference datasets:
- via a text inference dataset, in the same format as the original paper's data
- via a TFRecord inference dataset for faster operations.

## Text based inference datasets

In this pipeline, the steps are:

1. Create a text inference file and a metadata file from FASTQ and ALN with `create_infer_ds_from_fastq`
2. Create a `tf.data.TextLineDataset` from the text inference dataset
3. Transform it into an inference/training dataset with `.map` and `strings_to_tensors`

In [118]:
#| export
def create_infer_ds_from_fastq(
    p2fastq: str|Path,             # Path to the fastq file (aln file path is inferred)
    output_dir:str|Path|None=None, # Path to directory where ds file will be saved
    overwrite_ds:bool=False,       # If True, overwrite existing ds file. If False, error is raised if ds file exists
    nsamples:int|None=None         # Used to limit the number of reads to use for inference, use all if None
)-> (Path, Path, pd.DataFrame):    # Paths to dataset file, path to metadata file, dataframe with metadata
    """Build an inference dataset file as required by the CNN Virus model from a simreads fastq (ART format).
    
    Also extract the read metadata from the associated ALN file, save it in a csv metadata file and 
    return it as a DataFrame.

    The ALN file is expected next to the fastq file, with the same stem and the `.aln` extension.
    Each line of the dataset file is `<read sequence>\\t0\\t0`: label and position are placeholders.

    Raises a `ValueError` when the dataset file already exists and `overwrite_ds` is False.
    """
    # The signature accepts str or Path: normalize before using `.parent` and `.stem`
    p2fastq = Path(p2fastq)
    fastq = FastqFileReader(p2fastq)
    aln = AlnFileReader(p2fastq.parent / f"{p2fastq.stem}.aln")
    
    if output_dir is None:
        p2outdir = Path()
    else:
        validate_path(output_dir, path_type='dir', raise_error=True)
        p2outdir = Path(output_dir)
    
    p2dataset = p2outdir / f"{p2fastq.stem}_ds"
    p2metadata = p2outdir / f"{p2fastq.stem}_metadata.csv"
    
    if p2dataset.is_file():
        if overwrite_ds: 
            p2dataset.unlink()
            if p2metadata.is_file(): p2metadata.unlink()
        else:
            raise ValueError(f"{p2dataset.name} already exists in {p2dataset.absolute()}")
    
    read_ids, read_refseqs, read_start_pos, read_strand = [], [], [], []
    n_reads = 0
    
    # Any pre-existing dataset file was removed above, so write mode starts from a clean file
    with open(p2dataset, 'w') as fp:
        for fastq_chunk, aln_chunk in tqdm(zip(fastq, aln)):
            seq = fastq_chunk['sequence']
            
            aln_meta = aln.parse_text(aln_chunk['definition line'])
            read_ids.append(aln_meta['readid'])
            read_refseqs.append(aln_meta['refseqid'])
            read_start_pos.append(aln_meta['aln_start_pos'])
            read_strand.append(aln_meta['refseq_strand'])

            fp.write(f"{seq}\t{0}\t{0}\n")
            n_reads += 1

            # A falsy `nsamples` (None or 0) means no limit
            if nsamples and n_reads >= nsamples: break
                    
    print(f"Dataset with {n_reads:,d} reads")

    metadata = pd.DataFrame(data={
                'read_ids': read_ids,
                'read_refseqs': read_refseqs,
                'read_start_pos': read_start_pos,
                'read_strand': read_strand})
    metadata.to_csv(p2metadata, index=True)
    
    return p2dataset, p2metadata, metadata

In [119]:
show_doc(create_infer_ds_from_fastq)

---

[source](https://github.com/vtecftwy/metagentools/blob/main/metagentools/cnn_virus/data.py#L356){target="_blank" style="float:right; font-size:smaller"}

### create_infer_ds_from_fastq

>      create_infer_ds_from_fastq (p2fastq:str|pathlib.Path,
>                                  output_dir:str|pathlib.Path|None=None,
>                                  overwrite_ds:bool=False,
>                                  nsamples:int|None=None)

Build an inference dataset file as required by the CNN Virus model from a simreads fastq (ART format).

Also extract the fastq read sequence metadata, saves it in a metadata file and returns them as a DataFrame

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| p2fastq | str \| Path |  | Path to the fastq file (aln file path is inferred) |
| output_dir | str \| Path \| None | None | Path to directory where ds file will be saved |
| overwrite_ds | bool | False | If True, overwrite existing ds file. If False, error is raised if ds file exists |
| nsamples | int \| None | None | Used to limit the number of reads to use for inference, use all if None |
| **Returns** | **(Path, Path, pd.DataFrame)** |  | **Paths to dataset file, path to metadata file, dataframe with metadata** |

In [120]:
path2ds, path2meta, meta = create_infer_ds_from_fastq(
    p2fastq=p2fastq, 
    output_dir=Path('data_dev'),
    overwrite_ds=True, 
    nsamples=100
)

0it [00:00, ?it/s]

Dataset with 100 reads


In [121]:
print(f"FASTAQ file name: {p2fastq.absolute()}")
print(f"Path to dataset:  {path2ds.absolute()} \nPath to metadata: {path2meta.absolute()}")

FASTAQ file name: /home/vtec/projects/bio/metagentools/nbs-dev/data_dev/single_1seq_150bp/single_1seq_150bp.fq
Path to dataset:  /home/vtec/projects/bio/metagentools/nbs-dev/data_dev/single_1seq_150bp_ds 
Path to metadata: /home/vtec/projects/bio/metagentools/nbs-dev/data_dev/single_1seq_150bp_metadata.csv


In [122]:
TextFileBaseReader(path2ds, nlines=5).print_first_chunks(nchunks=1)

5-line chunk 1
TTGCGCCTCCTTCACGATGTCCACACACTTAATGGCAACATTGTCAGTAAGTTTTAAATAACCAGTAAACTGGTTAACTGGTTCCTCAGGTGTTGGTTCTGATTCTAGTTCATGCTCCGATAATTCGGTAGCATCACCAAGCCAGTCCTC	0	0
TAATTTAGGTCCACAAACTGTAGCAGGTGCATTTAGAAGTTCAAATGAAAGCACAACAACCCTAGTAGCCTGATATTCAATAGGCACATTAGGATAGAAGTCATAAGTACTAAGAGTACGTACACCATTTTCGTCAGAAGTTAGATCCCT	0	0
CCACAAAGTGCTCCTCAGATGTCTTTGATGACGAAGTGAGGTATCCATTATATGTAGTAACAGCAGCTGGTGATGATACTGACACTACGGCAGGAGCTTTAAGAGAACGCATACAGCGCGCAGCCTCTTCAAGATTAAAACCATGTGTCA	0	0
ATGTAAAAGTGTTACCATCACAAGTGTTCTTGTAGGTACCATAATCAGGGACAACAACCATAAGTTTGGCTGCTGTAGTCAATGGTATGATGTTGAGTGGAACACAACCATCACGCGCATTGTTGATAATGTTGTTAAGTGCATCATTAT	0	0
AGAAGCACCAGCACATATGTCAACAATAGGTGTCTGCACAATGACTGACATTGCTAAGAAACCTACTGAGAGTGCTTGTTCCTCGCTTACTGTCTTATTTGATGGTAGAGTGGAAGGACAGGTAGACCTTTTTAGAAATGCCCGTAATGG	0	0



In [123]:
meta.head()

Unnamed: 0,read_ids,read_refseqs,read_start_pos,read_strand
0,2591237:ncbi:1-20100,2591237:ncbi:1,26865,-
1,2591237:ncbi:1-20099,2591237:ncbi:1,7219,-
2,2591237:ncbi:1-20098,2591237:ncbi:1,25514,-
3,2591237:ncbi:1-20097,2591237:ncbi:1,17747,-
4,2591237:ncbi:1-20096,2591237:ncbi:1,19819,+


In [124]:
#| export
def strings_to_tensors(
    b: tf.Tensor        # batch of strings 
    ):
    """Convert a batch of text dataset lines into three tensors: (x_seqs, (y_labels, y_pos))

    Each string is a tab separated triplet: read sequence, label and position.
    The sequence is base-hot encoded (BHE), each base letter being replaced by a vector:
        "A" -> [1,0,0,0,0]
        "C" -> [0,1,0,0,0]
        "G" -> [0,0,1,0,0]
        "T" -> [0,0,0,1,0]
        "N" -> [0,0,0,0,1]
    Labels and positions are one-hot encoded over 187 and 10 classes respectively.
    """
    n_bases, n_labels, n_pos = 50, 187, 10
    # ASCII codes of A, C, G, T, N, in the order of the BHE columns
    base_codes = (65, 67, 71, 84, 78)

    # Splitting returns a ragged tensor: convert it into a regular (batch_size, 3) tensor
    fields = tf.strings.split(b, '\t').to_tensor(default_value = '', shape=[None, 3])

    # 'TCAAAATAATCA' -> ['T','C','A','A','A','A','T','A','A','T','C','A'], as a (batch_size, 50) tensor
    bases = tf.strings.bytes_split(fields[:, 0]).to_tensor(shape=(None, n_bases))

    # tf.io.decode_raw adds one dimension at the end:
    #   [b'C', b'A', b'T'] gives [[67], [65], [84]] and not [67, 65, 84]
    # This extra axis is the one along which the five base indicators are concatenated,
    # giving a tensor of shape (batch_size, 50, 5)
    codes = tf.io.decode_raw(bases, out_type=tf.uint8)
    x_seqs = tf.concat([tf.cast(codes == code, tf.float32) for code in base_codes], axis=2)

    # OHE labels: pick the rows of the identity matrix matching each label index
    label_idx = tf.strings.to_number(fields[:, 1], out_type=tf.int32)
    y_labels = tf.gather(tf.eye(n_labels), label_idx)

    # OHE positions, same approach
    pos_idx = tf.strings.to_number(fields[:, 2], out_type=tf.int32)
    y_pos = tf.gather(tf.eye(n_pos), pos_idx)

    return (x_seqs, (y_labels, y_pos))

## TFRecord based inference datasets

In this pipeline, the steps are:

1. Go from FASTQ and ALN to a TFRecord file and a metadata file with `tfrecord_from_fastq`, or from a text dataset to a TFRecord file with `tfrecord_from_text`
2. Create a `tf.data.TFRecordDataset` from the saved TFRecord file
3. Transform it into an inference/training dataset with `.map` and `tfr_to_tensors`

In [125]:
#| export
def _bytes_feature(value):
    """Returns a bytes_list Feature from a string / byte."""
    # A tensor of the type created by `tf.constant` is replaced by its underlying value
    tensor_type = type(tf.constant(0))
    raw = value.numpy() if isinstance(value, tensor_type) else value
    bytes_list = tf.train.BytesList(value=[raw])
    return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
    """Returns a float_list Feature from a float / double."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
    """Returns an int64_list Feature from a bool / enum / int / uint."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

def _serialize_array(array):
    """Returns `array` serialized with `tensorflow.io.serialize_tensor`."""
    return serialize_tensor(array)

In [126]:
#| export
def _base_hot_encode(
    line: str        # one string (one line in text dataset)
    ):
    """Convert a line from text dataset into three tensors: read sequence (BHE), virus label and position

    The line is a tab separated triplet: read, label, position.
    Base Hot Encoding (BHE) replaces each base letter (A, C, G, T, N) by a vector:
        "A" -> [1,0,0,0,0]
        "C" -> [0,1,0,0,0]
        "G" -> [0,0,1,0,0]
        "T" -> [0,0,0,1,0]
        "N" -> [0,0,0,0,1]
    """
    n_labels, n_pos = 187, 10
    # ASCII codes for A, C, G, T and N, in the order of the BHE columns
    base_codes = (65, 67, 71, 84, 78)

    # Drop the end of line, then split into read, label and position
    fields = tf.strings.split(line.replace('\n', ''), '\t')

    # 'TCAAAATAATCA' -> ['T','C','A','A','A','A','T','A','A','T','C','A']
    bases = tf.strings.bytes_split(fields[0])

    # tf.io.decode_raw adds one dimension at the end:
    #   [b'C', b'A', b'T'] gives [[67], [65], [84]] and not [67, 65, 84]
    # This extra axis is the one along which the five base indicators are concatenated
    codes = tf.io.decode_raw(bases, out_type=tf.uint8)
    x_reads = tf.concat([tf.cast(codes == code, tf.float32) for code in base_codes], axis=1)

    # OHE label: int32 so that it can be used as an index in gather
    label_idx = tf.strings.to_number(fields[1], out_type=tf.int32)
    y_labels = tf.gather(tf.eye(n_labels, dtype=tf.float32), label_idx)

    # OHE position: int32 so that it can be used as an index in gather
    pos_idx = tf.strings.to_number(fields[2], out_type=tf.int32)
    y_pos = tf.gather(tf.eye(n_pos, dtype=tf.float32), pos_idx)

    return x_reads, y_labels, y_pos

In [127]:
#| export
def tfrecord_from_fastq(
    p2fastq:Path,              # Path to the fastq file (should be associated with an aln file)
    p2tfrds:Path|None=None,    # Path to the TFRecord file, default creates a file in saved directory
    overwrite:bool=False       # When True, overrides any existing file, When False, raises an error
    ) -> (Path, Path, pd.DataFrame): # Paths to the saved TFRecord file and the metadata csv file, metadata DataFrame
    """Creates a TFRecord dataset for inference from fastq and aln files, as well as a csv metadata file

    The TFRecord dataset can be used for training or prediction, using the original CNN Virus model.
    The metadata file is a Pandas DataFrame converted into csv

    The ALN file is expected next to the fastq file, with the same stem and the `.aln` extension.
    Label and position are both set to 0 for every read (placeholders).

    Raises a `ValueError` when the TFRecord file already exists and `overwrite` is False, 
    and an `AssertionError` when no ALN file is found.
    """
    # Setup paths
    if p2tfrds is None:
        p2tfrds = ProjectFileSystem().data / 'saved/cnn_virus_datasets' / f"{p2fastq.stem}.tfrecords"
    p2metadata = p2tfrds.parent / f"{p2tfrds.stem}.metadata"

    if p2tfrds.exists():
        if overwrite:
            p2tfrds.unlink()
            if p2metadata.exists(): p2metadata.unlink()
        else: 
            raise ValueError(f"{p2tfrds.name} already exists. To overwrite, set parameter `overwrite` to True")

    p2aln = p2fastq.parent / f"{p2fastq.stem}.aln"
    # The message must use `p2fastq`: the `fastq` reader is not created yet at this point
    assert p2aln.is_file(), f"No ALN file associated with {p2fastq.name}"
    
    fastq = FastqFileReader(p2fastq)
    aln = AlnFileReader(p2aln)
    read_ids, read_refseqs, read_start_pos, read_strand = [], [], [], []
    n_reads = 0

    # The context manager closes the writer even when encoding one of the reads fails
    with tf.io.TFRecordWriter(str(p2tfrds.absolute())) as writer:
        for fastq_element, aln_element in tqdm(zip(fastq, aln)):
            # Extract read text sequence from fastq and metadata from aln files
            seq = fastq_element['sequence']           
            aln_meta = aln.parse_text(aln_element['definition line'])
            read_ids.append(aln_meta['readid'])
            read_refseqs.append(aln_meta['refseqid'])
            read_start_pos.append(aln_meta['aln_start_pos'])
            read_strand.append(aln_meta['refseq_strand'])

            # Create and write one Example, including BHE sequence, the label and the position
            bhe_seq, label, pos = _base_hot_encode(f"{seq}\t{0}\t{0}\n")
            data = {
                'read' : _bytes_feature(_serialize_array(bhe_seq)),
                'label' : _bytes_feature(_serialize_array(label)),
                'pos' : _bytes_feature(_serialize_array(pos))
            }
            out = tf.train.Example(features=tf.train.Features(feature=data))
            writer.write(out.SerializeToString())
            n_reads += 1

    # Explicit counter: stays defined (0) when the input files hold no read
    print(f"Wrote {n_reads} reads to TFRecord file {p2tfrds.name}")

    metadata = pd.DataFrame(data={
                'read_ids': read_ids,
                'read_refseqs': read_refseqs,
                'read_start_pos': read_start_pos,
                'read_strand': read_strand})
    metadata.to_csv(p2metadata, index=True)
    
    return p2tfrds, p2metadata, metadata

In [128]:
#| export
def tfrecord_from_text(
    p2ds,                      # Path to the text dataset, in the format of original CNN Virus data
    p2tfrds:Path|None=None,    # Path to the TFRecord file, default creates a file in saved directory
    overwrite:bool=False       # When True, overrides any existing file, When False, raises an error
    ) -> Path:                 # Path to the saved TFRecord file
    """Creates a TFRecord dataset from a text dataset in the format of the original CNN Virus data

    Each line of the text dataset is a tab separated triplet: read sequence, label and position.
    Each line is base-hot encoded and saved as one Example with features `read`, `label` and `pos`.

    Raises a `ValueError` when the TFRecord file already exists and `overwrite` is False.
    """
    # Setup paths
    if p2tfrds is None:
        # `Path(...)` so that the default file name also works when `p2ds` is passed as a str
        p2tfrds = ProjectFileSystem().data / 'saved/cnn_virus_datasets' / f"{Path(p2ds).stem}.tfrecords"

    if p2tfrds.exists():
        if overwrite:
            p2tfrds.unlink()
        else: 
            raise ValueError(f"{p2tfrds.name} already exists. To overwrite, set parameter `overwrite` to True")

    reads = TextFileBaseReader(p2ds, nlines=1)
    n_reads = 0

    # The context manager closes the writer even when encoding one of the lines fails
    with tf.io.TFRecordWriter(str(p2tfrds.absolute())) as writer:
        for line in reads:
            # Create and write one Example, including BHE sequence, the label and the position
            bhe_seq, label, pos = _base_hot_encode(line)
            data = {
                'read' : _bytes_feature(_serialize_array(bhe_seq)),
                'label' : _bytes_feature(_serialize_array(label)),
                'pos' : _bytes_feature(_serialize_array(pos))
                }
            out = tf.train.Example(features=tf.train.Features(feature=data))
            writer.write(out.SerializeToString())
            n_reads += 1

    # Explicit counter: stays defined (0) when the text dataset is empty
    print(f"Wrote {n_reads} reads to TFRecord")
    return p2tfrds

In [129]:
#| export
def _parse_tfr_element(element):
    """Parse one serialized Example into `(read, (label, pos))`, three float32 tensors"""
    feature_names = ['read', 'label', 'pos']

    # Structure of the data, mirroring the one used when writing the TFRecord file:
    # each feature is a single byte string holding a serialized tensor
    feature_spec = {name: FixedLenFeature([], tf.string) for name in feature_names}
    content = tf.io.parse_single_example(element, feature_spec)

    # Parse each string tensor into a real tensor, with proper type
    read, label, pos = [tf.io.parse_tensor(content[name], out_type=tf.float32) for name in feature_names]

    return (read, (label, pos))

In [130]:
#| export
def get_dataset_from_tfr(
    p2tfrds:Path   # Path to the TFRecord dataset
    ) -> tf.data.Dataset: # dataset
    """Create a dataset from a TFRecord file, each element being parsed into `(read, (label, pos))`"""
    # Records are read as strings, then converted into tensors by the parsing function
    return tf.data.TFRecordDataset(p2tfrds).map(_parse_tfr_element)

Create a dataset from an existing TFRecord file

In [131]:
# TODO: Check how to define shape of the elements
p2ds = Path('data_dev/single_1seq_50bp-10reads.tfrecords')
ds = get_dataset_from_tfr(p2ds)
ds.element_spec

(TensorSpec(shape=<unknown>, dtype=tf.float32, name=None),
 (TensorSpec(shape=<unknown>, dtype=tf.float32, name=None),
  TensorSpec(shape=<unknown>, dtype=tf.float32, name=None)))

In [132]:
for r, (l, p) in ds.take(8):
    print(r.shape, l.shape, p.shape)

(50, 5) (187,) (10,)
(50, 5) (187,) (10,)
(50, 5) (187,) (10,)
(50, 5) (187,) (10,)
(50, 5) (187,) (10,)
(50, 5) (187,) (10,)
(50, 5) (187,) (10,)
(50, 5) (187,) (10,)


We can convert this dataset into a batch dataset

In [133]:
ds_batched = ds.batch(2)
for r, (l, p) in ds_batched.take(8):
    print(r.shape, l.shape, p.shape)

(2, 50, 5) (2, 187) (2, 10)
(2, 50, 5) (2, 187) (2, 10)
(2, 50, 5) (2, 187) (2, 10)
(2, 50, 5) (2, 187) (2, 10)
(2, 50, 5) (2, 187) (2, 10)


# Original Code (Refactored)

Selected classes and functions, refactored from the original code published with the paper.

In [89]:
#| export
class DataGenerator_from_50mer(Sequence):
    """data generator for generating batches of data from 50-mers"""

    # Index of each base letter in the one-hot encoded vector
    d_nucl = {"A": 0,"C": 1,"G": 2,"T": 3,"N":4}

    def __init__(self, f_matrix, f_labels, f_pos, batch_size=1024,n_classes=187, shuffle=True):
        """Store sequences (`f_matrix`), labels (`f_labels`), positions (`f_pos`) and set the initial sample order"""
        self.batch_size = batch_size
        self.labels = f_labels
        self.matrix = f_matrix
        self.pos = f_pos
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        """Number of batches, the last one being possibly smaller than `batch_size`"""
        return int(np.ceil(len(self.labels) / self.batch_size))

    def __getitem__(self, index):
        """Return batch number `index` as `(X, y)`"""
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        return self.__data_generation(indexes)

    def on_epoch_end(self):
        """Reset the order of the samples, and shuffle it when `shuffle` is set"""
        self.indexes = np.arange(len(self.labels))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, index):
        """Encode the samples listed in `index`: one-hot sequences, and a dict of one-hot labels and positions"""
        # One row per sequence, one column per base letter
        x_train = np.array([list(self.matrix[i]) for i in index])
        x_tensor = np.zeros(list(x_train.shape)+[5])
        for row in range(len(x_train)):
            # Sequence length is hard-coded: only the first 50 bases are encoded
            for col in range(50):
                x_tensor[row,col,self.d_nucl[x_train[row,col]]] = 1

        y_label = np.array([self.labels[i] for i in index])
        y_label = to_categorical(y_label, num_classes=self.n_classes)
        # Number of position classes is hard-coded to 10
        y_pos = np.array([self.pos[i] for i in index])
        y_pos = to_categorical(y_pos, num_classes=10)
        return x_tensor,{'labels': y_label, 'pos': y_pos}

In [90]:
show_doc(DataGenerator_from_50mer)

---

[source](https://github.com/vtecftwy/metagentools/blob/main/metagentools/cnn_virus/data.py#L470){target="_blank" style="float:right; font-size:smaller"}

### DataGenerator_from_50mer

>      DataGenerator_from_50mer (f_matrix, f_labels, f_pos, batch_size=1024,
>                                n_classes=187, shuffle=True)

data generator for generating batches of data from 50-mers

In [91]:
#| export
def get_learning_weights(filepath):
    """get different learning weights for different classes, from file

    Each non-empty line of the file holds a class and its weight, separated by a tab.
    Returns a dictionary `{class: weight}`, where both are converted to float.
    """
    d_weights = {}
    # Context manager so that the file handle is closed once read
    with open(filepath, "r") as fp:
        for line in fp:
            fields = line.strip().split("\t")
            # Skip blank lines, e.g. a trailing empty line at the end of the file
            if fields == ['']:
                continue
            d_weights[float(fields[0])] = float(fields[1])
    return d_weights

def get_params_50mer():
    """Default parameters to generate batches of 50-mers: shuffled batches of 1024 samples, 187 classes"""
    return dict(batch_size=1024, n_classes=187, shuffle=True)

def get_params_150mer():
    """Default parameters to generate batches of 150-mers: unshuffled batches of 101 samples, 187 classes"""
    return dict(batch_size=101, n_classes=187, shuffle=False)

def get_kmer_from_50mer(filepath, max_seqs=None):
    """Load sequences, labels and positions from a sequence file, as three lists of strings

    Each line of the file is a tab separated triplet: sequence, label, position.
    At most `max_seqs` lines are loaded, all lines when `max_seqs` is None.
    """
    seqs, labels, positions = [], [], []
    with open(filepath, 'r') as fp:
        for line_nbr, line in enumerate(fp, start=1):
            # Reached max number of k-mers to load from file
            if max_seqs is not None and line_nbr > max_seqs:
                break
            seq, label, pos = line.strip().split('\t')
            seqs.append(seq)
            labels.append(label)
            positions.append(pos)
    return seqs, labels, positions

def get_kmer_from_150mer(filepath, max_seqs=None):
    """Load sequences from a tab-separated file and split each one into 50-mers.

    Every line must hold exactly three tab-separated fields: sequence, label and
    position. Each sequence is cut into all its 50-nucleotide windows, shifted by
    one nucleotide (101 windows for a 150-mer); every window inherits the label
    and position of the sequence it comes from. A sequence shorter than 50
    nucleotides yields no window.

    Args:
        filepath: path of the sequence file (anything accepted by `open`).
        max_seqs: maximum number of source sequences (lines) to read, not the
            number of generated 50-mers. `None` reads the whole file.

    Returns:
        Three parallel lists of strings: 50-mers, labels and positions.

    Raises:
        ValueError: a line does not hold exactly three tab-separated fields.
    """
    f_matrix = []
    f_labels = []
    f_pos = []
    with open(filepath, "r") as fp:
        # `n_read` counts the lines read so far. It must stay distinct from the
        # window index below: sharing one name overwrote the counter, so
        # `max_seqs` was only honoured by coincidence.
        for n_read, line in enumerate(fp, start=1):
            # Reached max number of sequences to load from file
            if max_seqs is not None and n_read > max_seqs:
                break
            seq, label, pos = line.strip().split('\t')
            # Split the sequence into 50-mers, shifted by one nucleotide
            for start in range(len(seq) - 49):
                f_matrix.append(seq[start:start + 50])
                f_labels.append(label)
                f_pos.append(pos)
    return f_matrix, f_labels, f_pos

In [92]:
show_doc(get_kmer_from_50mer)

---

[source](https://github.com/vtecftwy/metagentools/blob/main/metagentools/cnn_virus/data.py#L537){target="_blank" style="float:right; font-size:smaller"}

### get_kmer_from_50mer

>      get_kmer_from_50mer (filepath, max_seqs=None)

Load data from sequence file and returns three tensors, with max nbr sequences

In [93]:
show_doc(get_kmer_from_150mer)

---

[source](https://github.com/vtecftwy/metagentools/blob/main/metagentools/cnn_virus/data.py#L560){target="_blank" style="float:right; font-size:smaller"}

### get_kmer_from_150mer

>      get_kmer_from_150mer (filepath, max_seqs=None)

Load data from sequence file and returns three tensors, with max nbr sequences

In [94]:
# Paths of the 50-mer and 150-mer sequence files loaded in the cells below
filepath_50mer= Path('data_dev/50mer_ds_100_seq')
filepath_150mer= Path('data_dev/150mer_ds_100_seq')
# Fail early, showing the offending path, when one of the files is missing
assert filepath_50mer.is_file(), filepath_50mer
assert filepath_150mer.is_file(), filepath_150mer

In [95]:
# Load at most 5 records from the 50-mer file and display the sequences
f_matrix,f_labels,f_pos = get_kmer_from_50mer(filepath_50mer, max_seqs=5)
f_matrix

['AAAAAGATTTTGAGAGAGGTCGACCTGTCCTCCTAAAACGTTTACAAAAG',
 'CATGTAACGCAGCTTAGTCCGATCGTGGCTATAATCCGTCTTTCGATTTG',
 'AACAACATCTTGTTGATGATAACCGTCAAAGTGTTTTGGGTCTGGAGGGA',
 'AGTACCTGGAGAGCGTTAAGAAACACAAACGGCTGGATGTAGTGCCGCGC',
 'CCACGTCGATGAAGCTCCGACGAGAGTCGGCGCTGAGCCCGCGCACCTCC']

In [96]:
f_labels, f_pos

(['71', '1', '158', '6', '71'], ['0', '7', '6', '7', '6'])

In [97]:
# Read a single sequence (max_seqs=1) from the 150-mer file and show the first five 50-mers cut from it
f_matrix,f_labels,f_pos = get_kmer_from_150mer(filepath_150mer, max_seqs=1)
f_matrix[:5]

['CTACATGACCCTGACACTCAGCTACGAGATGTCAAATTTTGGGGGCAATG',
 'TACATGACCCTGACACTCAGCTACGAGATGTCAAATTTTGGGGGCAATGA',
 'ACATGACCCTGACACTCAGCTACGAGATGTCAAATTTTGGGGGCAATGAA',
 'CATGACCCTGACACTCAGCTACGAGATGTCAAATTTTGGGGGCAATGAAA',
 'ATGACCCTGACACTCAGCTACGAGATGTCAAATTTTGGGGGCAATGAAAG']

In [98]:
f_labels[:5], f_pos[:5]

(['120', '120', '120', '120', '120'], ['3', '3', '3', '3', '3'])

In [99]:
#| hide
nbdev_export()