In [2]:
#|default_exp art

In [3]:
#|hide
import nbdev

from pathlib import Path
from __future__ import annotations
from fastcore.test import *
from nbdev.showdoc import *
from pdb import set_trace

# Todo: move the google code in utils module
# Environment detection: `google.colab` can only be imported on Colab. When the import
# fails with ModuleNotFoundError, the local branch at the bottom is used instead.
try:
    from google.colab import drive
    ON_COLAB = True
    print('Running on colab')
    print('Installing wandb and project code')
    # Install the project code from the `refactor_cnn_virus` branch, then wandb
    !pip install -U git+https://github.com/vtecftwy/metagenomics.git@refactor_cnn_virus
    !pip install -qqU wandb
    
    # Assumes shared gdrive dir accessible through shortcut `Metagenomics` under the root of gdrive.
    # NOTE(review): the path below is spelled 'Metagenonics', not 'Metagenomics' - confirm which one is correct
    drive.mount('/content/gdrive')
    p2drive = Path('/content/gdrive/MyDrive/Metagenonics')
    p2data =  p2drive / 'CNN_Virus_data'
    # Stop here if the shared drive or the data directory is not where it is expected
    assert p2drive.is_dir()
    assert p2data.is_dir()

except ModuleNotFoundError:
    ON_COLAB = False
    print('Running locally')
    # Fail early, with an actionable message, when the project package is not installed
    try:
        import metagentools
    except ModuleNotFoundError:
        raise ModuleNotFoundError('Cannot find package metagentools. Make sure you pip -e install it in your environment')
    # Local data directory, given relative to the current working directory
    p2data = Path('../data/CNN_Virus_data').resolve()
    assert p2data.is_dir()

%load_ext autoreload
%autoreload 2

Running locally


In [4]:
#|export
# Imports all dependencies
import os
import subprocess
import shlex
import sys

from pathlib import Path
from typing import Tuple, List, Optional

# art

> Use **ART**, a next-generation read simulation tool, from a Python notebook

`ART` is an open source package for simulating next-generation sequencing reads of genomes, available on the website of the National Institute of Environmental Health Sciences [here](https://www.niehs.nih.gov/research/resources/software/biostatistics/art/index.cfm). It is a command line interface package. This module makes the package accessible from a Jupyter notebook.

**Typical usage**

- read simulation with paired reads:
    - `art_illumina -ss HS25 -sam -i file.fa -p -l 150 -f 20 -m 200 -s 10 -o paired_seq_1`
- read simulation  with single reads:
    - `art_illumina -ss HS25 -sam -i file.fa -l 150 -f 10 -o single_seq_1`

Where the parameters are:

```ASCII
  -f   --fcov   the fold of read coverage to be simulated or number of reads/read pairs generated for each amplicon
  -i   --in     the filename of input DNA/RNA reference
  -l   --len    the length of reads to be simulated
  -m   --mflen  the mean size of DNA/RNA fragments for paired-end simulations
  -o   --out    the prefix of output filename
  -p   --paired indicate a paired-end read simulation or to generate reads from both ends of amplicons
                NOTE: art will automatically switch to a mate-pair simulation if the given mean fragment size >= 2000
  -s   --sdev   the standard deviation of DNA/RNA fragment size for paired-end simulations.
  -sam --samout indicate to generate SAM alignment file
  -ss  --seqSys The name of Illumina sequencing system of the built-in profile used for simulation
                NOTE: sequencing system ID names are:
                GA1 - GenomeAnalyzer I (36bp,44bp), GA2 - GenomeAnalyzer II (50bp, 75bp)
                HS10 - HiSeq 1000 (100bp),          HS20 - HiSeq 2000 (100bp),      HS25 - HiSeq 2500 (125bp, 150bp)
                HSXn - HiSeqX PCR free (150bp),     HSXt - HiSeqX TruSeq (150bp),   MinS - MiniSeq TruSeq (50bp)
                MSv1 - MiSeq v1 (250bp),            MSv3 - MiSeq v3 (250bp),        NS50 - NextSeq500 v2 (75bp)
```
Notes:

- For single-end simulation, ART requires input sequence file, output file prefix, read length, and read count/fold coverage.
- For paired-end simulation (except for amplicon sequencing), ART also requires the parameter values of the mean and standard deviation of DNA/RNA fragment lengths

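The command line application is called through the `subprocess` module of the Python standard library: https://docs.python.org/3/library/subprocess.html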

In [5]:
#| export
def _run(args: List[str], shell: bool=False):
    """Run a command with `subprocess.run`, then print its return code and its output

    `args` and `shell` are passed unchanged to `subprocess.run`. The function prints
    and returns nothing.
    """
    # stderr is merged into stdout: without this, the error messages of a failing command
    # are not captured and never show up next to the return code
    p = subprocess.run(args=args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=shell)
    print('return code: ',p.returncode, '\n')
    # errors='replace': a stray non UTF-8 byte in the output must not hide the rest of the output
    print(p.stdout.decode('utf-8', errors='replace'))

In [7]:
#| export
def _validate_path(p:str|Path) -> Path:
    """Return `p` as a `Path`; raise a `TypeError` when `p` is neither a `str` nor a `Path`"""
    # A Path is handed back untouched
    if isinstance(p, Path):
        return p
    # Anything that is not a string at this point is rejected
    if not isinstance(p, str):
        raise TypeError(f"a path must be a string or a Path, not a {type(p)}")
    return Path(p)

In [8]:
#| export
class ArtIllumina:
    """Simulate sequence reads with art_illumina

    Wrapper around the `art_illumina` command line application. An instance stores the path
    to the application, an input directory and an output directory. `sim_reads` builds the
    command line for one simulation and runs it with `_run`.
    """

    def __init__(
        self,
        path2app: str|Path,           # path to the art_illumina application on the system
        input_dir: str|Path,          # path to the dir where input files are
        output_dir: str|Path=None     # path to the dir where to save output files, if different from input_dir
        ):
        """Initialize the art_illumina instance

        Raises a `ValueError` when `path2app` is not an existing file, or when `input_dir` or
        `output_dir` is not an existing directory. Paths are first checked by `_validate_path`.
        """

        # Validate and save paths
        path2app = _validate_path(path2app)
        if not path2app.is_file():
            raise ValueError(f"{path2app.name} is not a file, please check the path to the application")
        self.app = path2app

        input_dir = _validate_path(input_dir)
        if not input_dir.is_dir():
            raise ValueError(f"{input_dir.name} is not a directory, please check the path")
        self.input_dir = input_dir

        if output_dir is None:
            # No output directory given: output files are saved in the input directory
            self.output_dir = input_dir
        else:
            output_dir = _validate_path(output_dir)
            if not output_dir.is_dir():
                # The message must name the output directory, which is the one being rejected
                raise ValueError(f"{output_dir.name} is not a directory, please check the path")
            self.output_dir = output_dir

        print(f"Ready to operate with art: {self.app.absolute()}")
        print(f"Input files from : {self.input_dir.absolute()}")
        print(f"Output files to :  {self.output_dir.absolute()}")

    def sim_reads(
        self,
        input_file: str,     # name of the fasta file to use as input
        output_seed: str,    # seed to use for the output files
        sim_type='paired',   # type of read simulation: 'single' or 'paired'
        read_length=150,     # length of the read in bp
        fold=10,             # fold of read coverage to simulate
        mean_read=None,      # mean size of the fragments, required for paired reads
        std_read=None,       # standard deviation of the fragment size, required for paired reads
        ss='HS25',           # quality profile to use for simulation
        ):
        """Simulates reads with art_illumina

        The input file is taken from `self.input_dir` and the output prefix is placed in
        `self.output_dir`. The last input file and output seed are stored on the instance.

        Raises a `ValueError` when `sim_type` is 'paired' and `mean_read` or `std_read` is
        missing, and a `RuntimeError` when `sim_type` is neither 'single' nor 'paired'.
        """

        self.last_input_file = self.input_dir / input_file
        p2in = self.last_input_file.absolute()
        # `last_ouput_seed` is the original, misspelled attribute name. It is kept so that
        # code already reading it keeps working.
        self.last_output_seed = self.last_ouput_seed = output_seed
        p2out = (self.output_dir / output_seed).absolute()

        if sim_type == 'single':
            params = ['-ss', ss, '-sam', '-l', read_length, '-f', fold]
        elif sim_type == 'paired':
            if mean_read is None or std_read is None:
                raise ValueError(f"mean_read and std_read are required for a paired reads simulation")
            params = ['-ss', ss, '-sam', '-p', '-l', read_length, '-f', fold, '-m', mean_read, '-s', std_read]
        else:
            raise RuntimeError(f"{sim_type} in not a type or is not implemented yet")

        # The command is an argument list with one item per argument. A path containing spaces
        # or quotes therefore reaches art_illumina as a single, unchanged argument.
        cmd = [self.app.absolute(), '-i', p2in, *params, '-o', p2out]

        _run(args=[str(arg) for arg in cmd])

Create an instance of ArtIllumina with the path to the application on the system, and the directories for input and output files

In [9]:
# Location of the art_illumina application on this system; adjust it to your own installation
p2art = Path('/bin/art_illumina')
assert p2art.is_file()

# Directory and name of the fasta file used as input in the examples below.
# Note: this rebinds `p2data`, which is also set in the setup cell at the top of the notebook.
p2data = Path('data_dev')
input_fname = 'cov_virus_sequence_1.fa'
assert (p2data/input_fname).is_file()

In [10]:
# `output_dir` is not passed, so the output files are saved in the input directory
art = ArtIllumina(
    path2app=p2art,
    input_dir=p2data,
    )

Ready to operate with art: /bin/art_illumina
Input files from : /home/vtec/projects/bio/metagentools/nbs-dev/data_dev
Output files to :  /home/vtec/projects/bio/metagentools/nbs-dev/data_dev


In [16]:
#| echo: false
# Render the signature and the parameter table of `ArtIllumina.sim_reads`
nbdev.show_doc(ArtIllumina.sim_reads)

---

[source](https://github.com/vtecftwy/metagentools/blob/main/metagentools/art.py#L70){target="_blank" style="float:right; font-size:smaller"}

### ArtIllumina.sim_reads

>      ArtIllumina.sim_reads (input_file:str, output_seed:str,
>                             sim_type='paired', read_length=150, fold=10,
>                             mean_read=None, std_read=None, ss='HS25')

Simulates reads with art_illumina

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| input_file | str |  | name of the fasta file to use as input |
| output_seed | str |  | seed to use for the output files |
| sim_type | str | paired | type of read simmulation: 'single' or 'paired' |
| read_length | int | 150 | length of the read in bp |
| fold | int | 10 | fold |
| mean_read | NoneType | None | mean length of the read for paired reads |
| std_read | NoneType | None | std of the read length, for paired reads |
| ss | str | HS25 | quality profile to use for simulation |

#### Run a read simulation

Run a single-read simulation with the input file.

In [11]:
# Single-read simulation: `mean_read` and `std_read` are not used for this type
art.sim_reads(
    input_file=input_fname,
    output_seed='single_1_',
    sim_type="single",
    read_length=150,
    fold=20,
)   
    

return code:  0 


             ART_Illumina (2008-2016)          
          Q Version 2.5.8 (June 6, 2016)       
     Contact: Weichun Huang <whduke@gmail.com> 
    -------------------------------------------

                  Single-end Simulation

Total CPU time used: 0.244098

The random seed for the run: 1669285217

Parameters used during run
	Read Length:	150
	Genome masking 'N' cutoff frequency: 	1 in 150
	Fold Coverage:            20X
	Profile Type:             Combined
	ID Tag:                   

Quality Profile(s)
	First Read:   HiSeq 2500 Length 150 R1 (built-in profile) 

Output files

  FASTQ Sequence File:
	/home/vtec/projects/bio/metagentools/nbs-dev/data_dev/single_1_.fq

  ALN Alignment File:
	/home/vtec/projects/bio/metagentools/nbs-dev/data_dev/single_1_.aln

  SAM Alignment File:
	/home/vtec/projects/bio/metagentools/nbs-dev/data_dev/single_1_.sam




Run a paired-read simulation with the input file.

In [12]:
# Paired-read simulation: `mean_read` and `std_read` are both required for this type
art.sim_reads(
    input_file=input_fname,
    output_seed='paired_seq_1_',
    sim_type="paired",
    read_length=150,
    fold=20,
    mean_read=200,
    std_read=10
)   

return code:  0 


             ART_Illumina (2008-2016)          
          Q Version 2.5.8 (June 6, 2016)       
     Contact: Weichun Huang <whduke@gmail.com> 
    -------------------------------------------

                  Paired-end sequencing simulation

Total CPU time used: 0.26764

The random seed for the run: 1669285250

Parameters used during run
	Read Length:	150
	Genome masking 'N' cutoff frequency: 	1 in 150
	Fold Coverage:            20X
	Mean Fragment Length:     200
	Standard Deviation:       10
	Profile Type:             Combined
	ID Tag:                   

Quality Profile(s)
	First Read:   HiSeq 2500 Length 150 R1 (built-in profile) 
	First Read:   HiSeq 2500 Length 150 R2 (built-in profile) 

Output files

  FASTQ Sequence Files:
	 the 1st reads: /home/vtec/projects/bio/metagentools/nbs-dev/data_dev/paired_seq_1_1.fq
	 the 2nd reads: /home/vtec/projects/bio/metagentools/nbs-dev/data_dev/paired_seq_1_2.fq

  ALN Alignment Files:
	 the 1st reads: /home/vtec/project

In [13]:
#| hide
# Run the nbdev export step, which writes the exported cells to the library's python modules
nbdev.nbdev_export()