In [None]:
#|default_exp art

In [None]:
#|hide
import nbdev

# from __future__ import annotations
from fastcore.test import *
from metagentorch.cnn_virus.utils import setup_nb
from nbdev.showdoc import *

ON_COLAB, _, _ = setup_nb(_dev=True)

assert not ON_COLAB, f"Art Illumina cannot be run on Colab"

%load_ext autoreload
%autoreload 2

Running locally


In [None]:
#|export
# Imports all dependencies
import os
import subprocess
import shlex

from fastcore.basics import patch_to
from fastcore.utils import run, join_path_file
from pathlib import Path
from typing import Tuple, List, Optional

# art

> Use **ART**, next gen read simulation tool, from within a python notebook

`ART` is an open source package that simulates next-generation sequencing reads from genomes, available on the website of the National Institute of Environmental Health Sciences [here](https://www.niehs.nih.gov/research/resources/software/biostatistics/art/index.cfm). It is a command line interface package. This module makes the package accessible from a Jupyter notebook.

**Typical usage**

- read simulation with paired reads:
    - `art_illumina -ss HS25 -sam -i file.fa -p -l 150 -f 20 -m 200 -s 10 -o paired_seq_1`
- read simulation  with single reads:
    - `art_illumina -ss HS25 -sam -i file.fa -l 150 -f 10 -o single_seq_1`

Where the parameters are:

```ASCII
  -f   --fcov   the fold of read coverage to be simulated or number of reads/read pairs generated for each amplicon
  -i   --in     the filename of input DNA/RNA reference
  -l   --len    the length of reads to be simulated
  -m   --mflen  the mean size of DNA/RNA fragments for paired-end simulations
  -o   --out    the prefix of output filename
  -p   --paired indicate a paired-end read simulation or to generate reads from both ends of amplicons
                NOTE: art will automatically switch to a mate-pair simulation if the given mean fragment size >= 2000
  -s   --sdev   the standard deviation of DNA/RNA fragment size for paired-end simulations.
  -sam --samout indicate to generate SAM alignment file
  -ss  --seqSys The name of Illumina sequencing system of the built-in profile used for simulation
                NOTE: sequencing system ID names are:
                GA1 - GenomeAnalyzer I (36bp,44bp), GA2 - GenomeAnalyzer II (50bp, 75bp)
                HS10 - HiSeq 1000 (100bp),          HS20 - HiSeq 2000 (100bp),      HS25 - HiSeq 2500 (125bp, 150bp)
                HSXn - HiSeqX PCR free (150bp),     HSXt - HiSeqX TruSeq (150bp),   MinS - MiniSeq TruSeq (50bp)
                MSv1 - MiSeq v1 (250bp),            MSv3 - MiSeq v3 (250bp),        NS50 - NextSeq500 v2 (75bp)
```
Notes:

- For single-end simulation, ART requires input sequence file, output file prefix, read length, and read count/fold coverage.
- For paired-end simulation (except for amplicon sequencing), ART also requires the parameter values of the mean and standard deviation of DNA/RNA fragment lengths

In [None]:
#| export

# Private Utility functions to export ==============================================

def _run(args:List[str], shell:bool=False, print_output=True):
    """Wrapper subprocess.run and prints the output"""
    p = subprocess.run(args=args, stdout=subprocess.PIPE, shell=shell)
    if print_output:
        print('return code: ',p.returncode, '\n')
        print(str(p.stdout, 'utf-8'))

def _validate_path(p:str|Path) -> Path:
    """checks that path is a string or a Path, and returns a Path"""
    if isinstance(p, str): 
        p = Path(p)
    elif not isinstance(p, Path): 
        raise TypeError(f"a path must be a string or a Path, not a {type(p)}")
    return p

In [None]:
#| export
class ArtIllumina:
    """Class to handle all aspects of simulating sequencing with art_illumina"""

    QUALITY_PROFILES = {
        'GA1': 'GenomeAnalyzer I (36bp,44bp)', 'GA2': 'GenomeAnalyzer II (50bp, 75bp)',
        'HS10': 'HiSeq 1000 (100bp)', 'HS20': 'HiSeq 2000 (100bp)',
        'HS25': 'HiSeq 2500 (125bp, 150bp)', 'HSXn': 'HiSeqX PCR free (150bp)', 'HSXt': 'HiSeqX TruSeq (150bp)',
        'MinS': 'MiniSeq TruSeq (50bp)', 'MSv1': 'MiSeq v1 (250bp)', 'MSv3': 'MiSeq v3 (250bp)',
        'NS50': 'NextSeq500 v2 (75bp)'
        }

    def __init__(
        self, 
        path2app: str|Path,            # full path to art_illumina application on the system
        input_dir: str|Path,           # full path to dir where input files are
        output_dir: str|Path=None,     # full path to dir where to save output files, if different from input_dir
        app_in_system_path:bool=False, # whether `art_illumina` is in the system path or not
        ):
        """Initialize the art_illumina instance"""

        # Validate and save paths
        path2app = _validate_path(path2app)
        if app_in_system_path:
            self.app_cmd = 'art_illumina'
        elif path2app.is_file():
            self.app_cmd = str(path2app.absolute())
        else:
            raise ValueError(f"{path2app.name} is not a file, please check the path to the application")

        input_dir = _validate_path(input_dir)
        if input_dir.is_dir():
            self.input_dir = input_dir
        else:
            raise ValueError(f"{input_dir.name} is not a directory, please check the path")

        if output_dir is None: 
            self.output_dir = input_dir
        else:
            output_dir = _validate_path(output_dir)
            if output_dir.is_dir():
                self.output_dir = output_dir
            else:
                raise ValueError(f"{output_dir.name} is not a directory, please check the path")

        print(f"Ready to operate with art: {self.app_cmd}")
        print(f"Input files from : {self.input_dir.absolute()}")
        print(f"Output files to :  {self.output_dir.absolute()}")

    def sim_reads(
        self,
        input_file: str,          # name of the fasta file to use as input
        output_seed: str,         # seed to use for the output files
        sim_type: str='single',   # type of read simmulation: 'single' or 'paired'
        read_length: int=150,     # length of the read in bp
        fold: int=10,             # fold
        mean_read: int=None,      # mean length of the read for paired reads
        std_read: int=None,       # std of the read length, for paired reads
        ss: str='HS25',           # quality profile to use for simulation,
        overwrite: bool=False,    # overwrite existing output files if true, raise error if false 
        print_output:bool=True    # if True, prints art ilumina's CLI output
        ):
        """Simulates reads with art_illumina. Output files saved in a separate directory"""

        # validate input file and save in instance
        path2inputfile = self.input_dir / input_file
        if path2inputfile.is_file(): 
            self.last_input_file = path2inputfile
        else:
            raise ValueError(f"{input_file} is not a file in {self.input_dir}")

        
        # validate output seed and save in instance
        if not overwrite and len(list(self.output_dir.glob(f"{output_seed}*"))) > 0: 
            raise ValueError(f"Existing output directory starting with {output_seed}")
        else:
            self.last_output_seed = output_seed
            self.last_read_output_dir = self.output_dir/self.last_output_seed
            os.makedirs(self.last_read_output_dir, exist_ok=True)

        # validate quality profile
        if ss not in list(self.QUALITY_PROFILES.keys()): 
            raise ValueError(f"{ss} is not a built-in profile.\nPick one of these: {self.QUALITY_PROFILES}")

        # build art_illumina command
        if sim_type == 'single':
            params = f"-ss {ss} -l {read_length} -f {fold}"
        elif sim_type == 'paired':
            if mean_read is None or std_read is None:
                raise ValueError(f"mean_read and std_read are required for a paired reads simulation")
            else:
                params = f"-ss {ss} -p -l {read_length} -f {fold} -m {mean_read} -s {std_read}"
        else:
            raise RuntimeError(f"{sim_type} in not a type or is not implemented yet")

        p2in = self.last_input_file.absolute()
        p2out = (self.output_dir / self.last_output_seed / self.last_output_seed).absolute()

        cmd = f"{self.app_cmd} -i {p2in} {params} -o {p2out}"
        _run(args=shlex.split(cmd), print_output=print_output)

    def get_last_output_files(self):
        """Returns the path to all output files from last simulation"""
        return [f for f in self.last_read_output_dir.iterdir()]

    def list_last_output_files(self):
        """Prints a list of the last simulation's output files"""
        for f in self.get_last_output_files():
            print(f.name)

    def list_all_input_files(self):
        for f in sorted(list(self.input_dir.iterdir())):
            if f.suffix == '.fa': print(f.name)

    def get_all_output_files(self):
        """Return a dictionary with k as output file subdirectory and v as list of output files"""
        all_output_files = {}
        for d in sorted([p for p in self.output_dir.iterdir() if p.is_dir()]):
            files = []
            for f in d.iterdir():
                files.append(f)
            all_output_files[d.name] = files

        return all_output_files

    def list_all_output_files(self):
        all_files = self.get_all_output_files()
        for k, v in all_files.items():
            print(k)
            print('\n'.join([f"- {p.name}" for p in v]))
        
    def print_last_output_file_excerpts(
        self, 
        suffix: str='fq',        # suffix of the files to explore: 'fq', 'aln', 'sam' 
        nlines: int=12          # number of lines to print for each file
        ):
        """Print the first lines of all the latest file with given suffix"""

        for p in [p for p in self.get_last_output_files() if p.suffix == f".{suffix}"]:
            print(f"{'='*120}")
            print(f"File Name: {p.name}.")
            print(f"{'-'*80}")
            with open(p, 'r') as f:
                n, lines = 0, []
                while True:
                    n += 1
                    line = f.readline()
                    if line == '': break
                    elif n > nlines: break
                    else:
                        lines.append(line)

            print(''.join(lines))

#### Usage

1. Create an instance of `ArtIllumina`
2. Run a simulation
3. Export output files

Create an instance of `ArtIllumina` with:
- the path to the application on the local system
- the directories for input and output files (optional)

In [None]:
#|eval: false
# Location of the `art_illumina` executable and of the directory with the reference sequences.
# NOTE(review): both paths are hard-coded and presumably specific to the author's system;
# adjust them before running. The asserts stop the notebook early if either is missing.
p2art = Path('/bin/art_illumina')
assert p2art.exists()
p2data = Path('data_dev/ncbi/refsequences/cov')
assert p2data.exists()

In [None]:
#|eval: false
# `output_dir` is not passed, so the output files are saved under the input directory
art = ArtIllumina(
    path2app=p2art,
    input_dir=p2data,
    )

Ready to operate with art: /bin/art_illumina
Input files from : /home/vtec/projects/bio/metagentools/nbs-dev/data_dev/ncbi/refsequences/cov
Output files to :  /home/vtec/projects/bio/metagentools/nbs-dev/data_dev/ncbi/refsequences/cov


In [None]:
nbdev.show_doc(ArtIllumina.sim_reads)

---

[source](https://github.com/vtecftwy/metagentools/blob/main/metagentools/art.py#L86){target="_blank" style="float:right; font-size:smaller"}

### ArtIllumina.sim_reads

>      ArtIllumina.sim_reads (input_file:str, output_seed:str,
>                             sim_type:str='single', read_length:int=150,
>                             fold:int=10, mean_read:int=None,
>                             std_read:int=None, ss:str='HS25',
>                             overwrite:bool=False, print_output:bool=True)

*Simulates reads with art_illumina. Output files saved in a separate directory*

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| input_file | str |  | name of the fasta file to use as input |
| output_seed | str |  | seed to use for the output files |
| sim_type | str | single | type of read simmulation: 'single' or 'paired' |
| read_length | int | 150 | length of the read in bp |
| fold | int | 10 | fold |
| mean_read | int | None | mean length of the read for paired reads |
| std_read | int | None | std of the read length, for paired reads |
| ss | str | HS25 | quality profile to use for simulation, |
| overwrite | bool | False | overwrite existing output files if true, raise error if false |
| print_output | bool | True | if True, prints art ilumina's CLI output |

#### Run a single read simulation

- Provide an input file and a seed for the names of the output files
- Prints out the log messages issued by `art_illumina`

In [None]:
#|eval: false
# Single read simulation: 150 bp reads at 100x fold coverage, with the default 'HS25' quality profile.
# The output files are written into a subdirectory of the output directory named after `output_seed`.
input_fname = 'cov_virus_sequence_one.fa'
output_seed = 'single_1seq_150bp'

art.sim_reads(
    input_file=input_fname,
    output_seed=output_seed,
    sim_type="single",
    read_length=150,
    fold=100,
    overwrite=True  # do not raise if output starting with this seed already exists
)

return code:  0 


             ART_Illumina (2008-2016)          
          Q Version 2.5.8 (June 6, 2016)       
     Contact: Weichun Huang <whduke@gmail.com> 
    -------------------------------------------

                  Single-end Simulation

Total CPU time used: 0.436844

The random seed for the run: 1738405239

Parameters used during run
	Read Length:	150
	Genome masking 'N' cutoff frequency: 	1 in 150
	Fold Coverage:            100X
	Profile Type:             Combined
	ID Tag:                   

Quality Profile(s)
	First Read:   HiSeq 2500 Length 150 R1 (built-in profile) 

Output files

  FASTQ Sequence File:
	/home/vtec/projects/bio/metagentools/nbs-dev/data_dev/ncbi/refsequences/cov/single_1seq_150bp/single_1seq_150bp.fq

  ALN Alignment File:
	/home/vtec/projects/bio/metagentools/nbs-dev/data_dev/ncbi/refsequences/cov/single_1seq_150bp/single_1seq_150bp.aln




Inspect the first lines of the FASTQ file generated by the single read simulation, then run a paired read simulation with the same input file.

In [None]:
#|eval: false
art.print_last_output_file_excerpts()

File Name: single_1seq_150bp.fq.
--------------------------------------------------------------------------------
@2591237:ncbi:1-20100
GTACCACAGATGTGCACTTTACGTCAGACATTTTAGACTGTACAGTAGCAACCTTGATACATGGTTTACCTCCAATACCTAACAACTTAATGTTAAGCTTGAAAGCATCAATACTACTCTTAGGAGGCAAAAGCCCCTGGGAGTTCATAT
+
CCCGGGGGG1GGCGJJGJJGJJGJGJJJJJJJGJJ=GGJGGJJGJJGCCJGJGGGJGCGCC=GGJCGCGJGJGGCC=GGCGGGGGGGGG8GG=GG8GGCCJGCCCGGCCCGG=CGGGGCGGCGGGGGGGGGGGGGGCGGGGCCGGGGGCG
@2591237:ncbi:1-20099
TACACCCTTTGCCAGCTCGCTATGAGCTGTAGCAACGAGTACCTTAAGTTTTTCCATAGGAACACTAAAAGTTGCTGAAAAGGTGTCGACATAAGCATCAAACATCTTAACAGAAACTTCAGTACTATCTCCAACATCTGATACGAGAGC
+
=CCG=GGGGGGGGJJGJJJCGJGJJJJJJGGJJJJJJGJJJJCJJGJCGGGJGGGGJGJJG(J=JGGGCG=G=CGGGGG=GGGCG8GGGGGGGC8C=GGCJ8G=CGGGGGGGGG=GGGG=1G8G==GCGGGGGCGGGGGGGGCCGCCCGC
@2591237:ncbi:1-20098
ATGTCCTGCCTGTCAAGACCCAGAGATTGGACCTGAGCATAGTGTTGCAGATTATCACAACCACTCAAACATTGAAACTCGACTCCGCAAGGGAGGTAGGACTAGATGTTTTGGAGGCTGTGTGTTTGCCTATGTCGGCTGCTATAACAA
+
CCC1CGG1GGGGGJJJJCGJJ1JGJGJGJJGJGGJGJGGJJGJGJGJJGCGJCJ=JJGC

In [None]:
#|eval: false
# Paired read simulation on the same input file: `mean_read` and `std_read` are both required
# for `sim_type="paired"` (passed to art_illumina as `-m` and `-s`).
input_fname = 'cov_virus_sequence_one.fa'

art.sim_reads(
    input_file=input_fname,
    output_seed='paired_1seq_150bp',
    sim_type="paired",
    read_length=150,
    fold=100,
    mean_read=200,
    std_read=10,
    overwrite=True  # do not raise if output starting with this seed already exists
)   

return code:  0 


             ART_Illumina (2008-2016)          
          Q Version 2.5.8 (June 6, 2016)       
     Contact: Weichun Huang <whduke@gmail.com> 
    -------------------------------------------

                  Paired-end sequencing simulation

Total CPU time used: 0.437712

The random seed for the run: 1738405243

Parameters used during run
	Read Length:	150
	Genome masking 'N' cutoff frequency: 	1 in 150
	Fold Coverage:            100X
	Mean Fragment Length:     200
	Standard Deviation:       10
	Profile Type:             Combined
	ID Tag:                   

Quality Profile(s)
	First Read:   HiSeq 2500 Length 150 R1 (built-in profile) 
	First Read:   HiSeq 2500 Length 150 R2 (built-in profile) 

Output files

  FASTQ Sequence Files:
	 the 1st reads: /home/vtec/projects/bio/metagentools/nbs-dev/data_dev/ncbi/refsequences/cov/paired_1seq_150bp/paired_1seq_150bp1.fq
	 the 2nd reads: /home/vtec/projects/bio/metagentools/nbs-dev/data_dev/ncbi/refsequences/cov/paired_1s

In [None]:
#|eval: false
art.print_last_output_file_excerpts()

File Name: paired_1seq_150bp2.fq.
--------------------------------------------------------------------------------
@2591237:ncbi:1-20100/2
TTATAGCAGCCGACATAGGCAAACACACAGCCTCCAAAACATCTAGTCCTACCTCCCTTGCGGAGTCGAGTTTCAATGTTTGAGTGGTTGTGATAATCTGCAACACTATGCTCAGGTCCAATCTCTGGGTCTTGACAGGCAGGACATGGC
+
CCCGGGGGCGGGGGJJJJJGJJJJG8JJ=GJJCGGJCJ1GGCJJGCGGJJJJGGCJGGCGJJ=JCGGG=GGG(C=GCCGC=GGGGCGCGGGGGGGGGG=GCGCCJJJCGGGGCCGCGGCCGGCCGGGCC8CGGGCGC=GGGCGCGCCCCC
@2591237:ncbi:1-20098/2
ATCATTACCGGTCTTCATCCAACACAGGCACCTACACACCTCAGCGTTGACACAAAATTTAAGACTGAGGGACTATGTGTTGACATACCAGGCATACCAAAGGACATGACCTACCGTAGACTCATCTCTATGATGGGTTTTAAAATGAAT
+
=CCGG=G1GGGGGCGJJGGJJJJ8JJJJ=JCGJJGJJGJJJJCJJGGGJJGGGGG=CJGCGGGCCJJ8CG8J=CGGGGCCGGGGGCGCCCGGGGGCGGCG=CJCJCJ=C=GGGCGGCGGGG=CGGGGGGCGGGCGGGGGGGCGCGCGGGC
@2591237:ncbi:1-20096/2
CGGTACTAGACATACCTATCAGCTTCGTGCAAGATCAGTTTCACCAAAACTTTTCATCAGACAAGAGGAAGTTCACCAAGAGCTCTACTCACCGCTTTTTCTCATTGTTGCTGCTCTAGTATTTATAATACTTTGCTTCACCATTAAGAG
+
CCCGGGGGGG=GC1JJJJJJJJ1JJJCGCJJJGCJGJCG(JGGJGJGJGJGG

In [None]:
#|eval: false
art.list_all_output_files()

paired_1seq_150bp
- paired_1seq_150bp2.aln
- paired_1seq_150bp2.fq
- paired_1seq_150bp1.fq
- paired_1seq_150bp1.aln
single_1seq_150bp
- single_1seq_150bp.fq
- single_1seq_150bp.aln


In [None]:
#| hide
nbdev.nbdev_export()