# Handle covid sequence fasta files

Notebook objective:
- Load fasta file with reference sequences
- Create an instance of ArtIllumina to simulate reads
- Create read files (fastq and aln) with several options

# Imports and setup environment

### Install and import packages

In [None]:
# Install required custom packages if not installed yet.
import importlib.util
if not importlib.util.find_spec('ecutilities'):
    print('installing package: `ecutilities`')
    ! pip install -qqU ecutilities
else:
    print('`ecutilities` already installed')
if not importlib.util.find_spec('metagentools'):
    print('installing package: `metagentools')
    ! pip install -qqU metagentools
else:
    print('`metagentools` already installed')

`ecutilities` already installed
`metagentools` already installed


In [None]:
# Import all required packages
import re
import shutil
from pathlib import Path
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from ecutilities.core import files_in_tree
from ecutilities.ipython import nb_setup
from metagentools.art import ArtIllumina
from metagentools.cnn_virus.data import FastaFileReader
from metagentools.core import ProjectFileSystem

# Setup the notebook for development.
# NOTE(review): the recorded output is "Set autoreload mode", so this presumably
# enables IPython autoreload — confirm against `ecutilities.ipython.nb_setup`.
nb_setup()

Set autoreload mode


## Setup project file system

In [None]:
# Instantiate the project file system helper and display the directories it
# resolves (home, project root, data and notebooks) to check the environment.
pfs = ProjectFileSystem()
pfs.home, pfs.project_root, pfs.data, pfs.nbs

(Path('/home/vtec'),
 Path('/home/vtec/projects/bio/metagentools'),
 Path('/home/vtec/projects/bio/metagentools/data'),
 Path('/home/vtec/projects/bio/metagentools/nbs'))

# Explore reference file directory with fasta files

Define the path to the directory containing all reference sequence fasta files

In [None]:
# Directory expected to hold the reference sequence fasta files
p2inputs = pfs.data / 'ncbi/refsequences'

# Print the resolved path first so it is visible even when the check below fails
print(p2inputs.absolute())
assert p2inputs.is_dir(), f"reference sequence directory not found: {p2inputs.absolute()}"

/home/vtec/projects/bio/metagentools/data/ncbi/refsequences


Explore files in the directory:

In [None]:
# Collect the files under `p2inputs` (presumably those whose name matches 'sequence').
# The call prints a directory tree with an index next to each file; the returned
# collection is indexed with those numbers in the cells below.
files = files_in_tree(p2inputs, pattern='sequence')

ncbi
  |--refsequences
  |    |--cov_virus_sequence_001-seq1.fa (0)
  |    |--cov_virus_sequences.txt (1)
  |    |--cov_virus_sequences-original.txt (2)
  |    |--cov_virus_sequences_100-seqs.fa (3)
  |    |--cov_virus_sequences_002-seqs.fa (4)
  |    |--cov_virus_sequences_025-seqs.fa (5)
  |    |--cov_virus_sequences_010-seqs.fa (6)
  |    |--cov_virus_sequence_001-seq2.fa (7)
  |    |--cov_virus_original_cnn_sequences.json (8)
  |    |--cov_virus_sequences.fa (9)
  |    |--groups_1


Pick the fasta file with 100 sequences (file with index 3) and read it

In [None]:
# Select the 100-sequence fasta file by name rather than by a hard-coded index:
# the order of the listing depends on the file system, so a fixed index can
# silently point to a different file on another machine or after adding files.
fasta_name = 'cov_virus_sequences_100-seqs.fa'
matches = [idx for idx, path in enumerate(files) if path.name == fasta_name]
assert matches, f"`{fasta_name}` not found among the listed files"

# `n` is kept because later cells use `files[n]`
n = matches[0]
files[n].name

'cov_virus_sequences_100-seqs.fa'

Create instance of FastaFileReader pointing to the selected file

In [None]:
# Reader over the selected fasta file; it is consumed as an iterator below (`next(fasta)`)
fasta = FastaFileReader(files[n])

Explore one sequence

In [None]:
# Read the first record of the fasta file
seq_1 = next(fasta)

# Show a compact preview instead of the whole record: the raw sequence is far
# too long to dump in the cell output.
{
    'definition line': seq_1['definition line'],
    'sequence (first 100 bases)': seq_1['sequence'][:100],
    'sequence length': len(seq_1['sequence']),
}

{'definition line': '>2591237:ncbi:1 [MK211378]\t2591237\tncbi\t1 [MK211378] 2591237\tCoronavirus BtRs-BetaCoV/YN2018D\t\tscientific name',
 'sequence': 'TATTAGGTTTTCTACCTACCCAGGAAAAGCCAACCAACCTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTAGCTGTCGCTCGGCTGCATGCCTAGTGCACCTACGCAGTATAAACAATAATAAATTTTACTGTCGTTGACAAGAAACGAGTAACTCGTCCCTCTTCTGCAGACTGCTTACGGTTCCGTCCGTGTTGCAGTCGATCATCAGCATACCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTTCTTGGTGTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTCTTCAGGTTAGAGACGTGCTAGTGCGTGGCTTCGGGGACTCTGTGGAAGAGGCCCTATCGGAGGCACGTGAACATCTTAAAAATGGCACTTGTGGTTTAGTAGAGCTGGAAAAAGGCGTACTGCCCCAGCTTGAACAGCCCTATGTGTTCATTAAACGTTCTGATGCCTTAAGCACCAATCACGGCCACAAGGTCGTTGAGCTGGTTGCAGAATTGGACGGCATTCAGTACGGTCGTAGCGGTATAACTCTGGGAGTACTCGTGCCACATGTGGGCGAAACCCCAATCGCATACCGCAATGTTCTTCTTCGTAAGAACGGTAATAAGGGAGCCGGTGGCCATAGCTTTGGCATCGATCTAAAGTCTTATGACTTAGGTGACGAGCTTGGTACTGATCCCATTGAAGATTATGAACAAAACTGGAACACTAAGCATGGCAGTGGTGTACTCCGTGAACTCACTCGTGAGCTCAATGGAGGTGCAGTCACTCGCTATGTCGACAACAACTTCTGTGGCCCAGAT

In [None]:
# Definition line (the header starting with '>') of the first record
seq_1['definition line']

'>2591237:ncbi:1 [MK211378]\t2591237\tncbi\t1 [MK211378] 2591237\tCoronavirus BtRs-BetaCoV/YN2018D\t\tscientific name'

FASTA file structure for the `cov_data` files:

- `2591237` is the NCBI taxonomy ID a.k.a. `NCBI:txid2591237`. See reference [on NCBI site](https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=2591237)
- `ncbi` is the source of info
- `1` is the sequence number in the file
- `MK211378` is the GenBank accession number for the sequence. See reference [on NCBI site](https://www.ncbi.nlm.nih.gov/nuccore/MK211378)
- `Coronavirus BtRs-BetaCoV/YN2018D` is the species

`FastaFileReader.parse_text()` uses predefined regex patterns to parse/extract each piece of metadata from the definition line.

In [None]:
# Parse the definition line into a dict of metadata fields
# (accession, seqid, seqnb, source, species, taxonomyid)
fasta.parse_text(seq_1['definition line'])

{'accession': 'MK211378',
 'seqid': '2591237:ncbi:1',
 'seqnb': '1',
 'source': 'ncbi',
 'species': 'Coronavirus BtRs-BetaCoV/YN2018D  scientific name',
 'taxonomyid': '2591237'}

# Create reads from fasta file with 10 sequences

In [None]:
# Number of reference sequences in the input fasta file; it is stored in
# `sim_params` and used to build the `output_seed` label of each simulation.
nb_sequences = 10

## Single read simulation - 50 bp read

In [None]:
# Locate the `art_illumina` executable on the PATH so the notebook also works when
# ART is installed elsewhere (e.g. in a virtual/conda environment); fall back to
# the system-wide location used originally.
art_bin = shutil.which('art_illumina') or '/usr/bin/art_illumina'

# ART wrapper reading reference files from `p2inputs` and writing reads to `simreads`
art = ArtIllumina(
    path2app=Path(art_bin),
    input_dir=p2inputs,
    output_dir=pfs.data / 'ncbi/simreads',
)

Ready to operate with art: /usr/bin/art_illumina
Input files from : /home/vtec/projects/bio/metagentools/data/ncbi/refsequences
Output files to :  /home/vtec/projects/bio/metagentools/data/ncbi/simreads


### Prepare simulated read files

In [None]:
# Print the names of the fasta files available in the input directory
art.list_all_input_files()

cov_virus_sequence_001-seq1.fa
cov_virus_sequence_001-seq2.fa
cov_virus_sequences.fa
cov_virus_sequences_002-seqs.fa
cov_virus_sequences_010-seqs.fa
cov_virus_sequences_025-seqs.fa
cov_virus_sequences_100-seqs.fa


Pick fasta file with 10 sequences

In [None]:
# Derive the file name from `nb_sequences` so that the input file and the output
# labels built from `nb_sequences` cannot drift apart. The multi-sequence files
# listed above use a zero-padded, 3-digit sequence count (002, 010, 025, 100).
input_fname = f'cov_virus_sequences_{nb_sequences:03d}-seqs.fa'
assert (p2inputs / input_fname).is_file(), f"`{input_fname}` not found in {p2inputs}"

Run a single-read simulation with the input file and the following parameters:

In [None]:
# Parameters for a single-end simulation with 50 bp reads
sim_params = dict(
    input_file=input_fname,
    sim_type='single',
    read_length=50,
    nb_sequences=nb_sequences,
    fold=100,
    q_profile='HS25',
)

# Label used to name the outputs: <sim_type>_<nb_sequences>seq_<read_length>bp
sim_params['output_seed'] = '{sim_type}_{nb_sequences}seq_{read_length}bp'.format(**sim_params)
sim_params

{'input_file': 'cov_virus_sequences_010-seqs.fa',
 'sim_type': 'single',
 'read_length': 50,
 'nb_sequences': 10,
 'fold': 100,
 'q_profile': 'HS25',
 'output_seed': 'single_10seq_50bp'}

In [None]:
# Forward the simulation settings to ART. `q_profile` is passed under the `ss`
# keyword, and `overwrite=True` is set so the cell can be re-run
# (presumably replacing any existing output files — confirm in `sim_reads`).
call_keys = ('input_file', 'output_seed', 'sim_type', 'read_length', 'fold')
art.sim_reads(
    **{key: sim_params[key] for key in call_keys},
    ss=sim_params['q_profile'],
    overwrite=True,
)

return code:  0 


             ART_Illumina (2008-2016)          
          Q Version 2.5.8 (June 6, 2016)       
     Contact: Weichun Huang <whduke@gmail.com> 
    -------------------------------------------

                  Single-end Simulation

Total CPU time used: 8.01683

The random seed for the run: 1705653351

Parameters used during run
	Read Length:	50
	Genome masking 'N' cutoff frequency: 	1 in 50
	Fold Coverage:            100X
	Profile Type:             Combined
	ID Tag:                   

Quality Profile(s)
	First Read:   HiSeq 2500 Length 126 R1 (built-in profile) 

Output files

  FASTQ Sequence File:
	/home/vtec/projects/bio/metagentools/data/ncbi/simreads/single_10seq_50bp/single_10seq_50bp.fq

  ALN Alignment File:
	/home/vtec/projects/bio/metagentools/data/ncbi/simreads/single_10seq_50bp/single_10seq_50bp.aln




In [None]:
# Print the files produced by the latest simulation (here one .fq and one .aln file)
art.list_last_output_files()

single_10seq_50bp.fq
single_10seq_50bp.aln


## Single read simulation - 150 bp read

### Prepare simulated read files

Run a single-read simulation with the input file and the following parameters:

In [None]:
# Parameters for a single-end simulation with 150 bp reads
sim_params = dict(
    input_file=input_fname,
    sim_type='single',
    read_length=150,
    nb_sequences=nb_sequences,
    fold=100,
    q_profile='HS25',
)

# Label used to name the outputs: <sim_type>_<nb_sequences>seq_<read_length>bp
sim_params['output_seed'] = '{sim_type}_{nb_sequences}seq_{read_length}bp'.format(**sim_params)
sim_params

{'input_file': 'cov_virus_sequences_010-seqs.fa',
 'sim_type': 'single',
 'read_length': 150,
 'nb_sequences': 10,
 'fold': 100,
 'q_profile': 'HS25',
 'output_seed': 'single_10seq_150bp'}

In [None]:
# Forward the simulation settings to ART. `q_profile` is passed under the `ss`
# keyword, and `overwrite=True` is set so the cell can be re-run
# (presumably replacing any existing output files — confirm in `sim_reads`).
call_keys = ('input_file', 'output_seed', 'sim_type', 'read_length', 'fold')
art.sim_reads(
    **{key: sim_params[key] for key in call_keys},
    ss=sim_params['q_profile'],
    overwrite=True,
)

return code:  0 


             ART_Illumina (2008-2016)          
          Q Version 2.5.8 (June 6, 2016)       
     Contact: Weichun Huang <whduke@gmail.com> 
    -------------------------------------------

                  Single-end Simulation

Total CPU time used: 3.89042

The random seed for the run: 1705653405

Parameters used during run
	Read Length:	150
	Genome masking 'N' cutoff frequency: 	1 in 150
	Fold Coverage:            100X
	Profile Type:             Combined
	ID Tag:                   

Quality Profile(s)
	First Read:   HiSeq 2500 Length 150 R1 (built-in profile) 

Output files

  FASTQ Sequence File:
	/home/vtec/projects/bio/metagentools/data/ncbi/simreads/single_10seq_150bp/single_10seq_150bp.fq

  ALN Alignment File:
	/home/vtec/projects/bio/metagentools/data/ncbi/simreads/single_10seq_150bp/single_10seq_150bp.aln




In [None]:
# Print the files produced by the latest simulation (here one .fq and one .aln file)
art.list_last_output_files()

single_10seq_150bp.fq
single_10seq_150bp.aln


## Paired read simulation - 50 bp read

### Prepare simulated read files

Run a paired-read simulation with the input file and the following parameters:

In [None]:
# Parameters for a paired-end simulation with 50 bp reads.
# `mean_read` / `std_read` are reported by ART as the mean fragment length
# and its standard deviation.
sim_params = dict(
    input_file=input_fname,
    sim_type='paired',
    read_length=50,
    nb_sequences=nb_sequences,
    fold=100,
    mean_read=200,
    std_read=10,
    q_profile='HS25',
)

# Label used to name the outputs: <sim_type>_<nb_sequences>seq_<read_length>bp
sim_params['output_seed'] = '{sim_type}_{nb_sequences}seq_{read_length}bp'.format(**sim_params)
sim_params

{'input_file': 'cov_virus_sequences_010-seqs.fa',
 'sim_type': 'paired',
 'read_length': 50,
 'nb_sequences': 10,
 'fold': 100,
 'mean_read': 200,
 'std_read': 10,
 'q_profile': 'HS25',
 'output_seed': 'paired_10seq_50bp'}

In [None]:
# Run the paired-end simulation with the settings defined in `sim_params`.
# The quality profile declared in `sim_params['q_profile']` is now passed
# explicitly through `ss`, as in the single-end cells, instead of silently relying
# on the default of `sim_reads` (the recorded run shows that default resolving to
# the same HiSeq 2500 built-in profile).
art.sim_reads(
    input_file=sim_params['input_file'],
    output_seed=sim_params['output_seed'],
    sim_type=sim_params['sim_type'],
    read_length=sim_params['read_length'],
    fold=sim_params['fold'],
    mean_read=sim_params['mean_read'],
    std_read=sim_params['std_read'],
    ss=sim_params['q_profile'],
    overwrite=True,
)

return code:  0 


             ART_Illumina (2008-2016)          
          Q Version 2.5.8 (June 6, 2016)       
     Contact: Weichun Huang <whduke@gmail.com> 
    -------------------------------------------

                  Paired-end sequencing simulation

Total CPU time used: 8.1579

The random seed for the run: 1705653428

Parameters used during run
	Read Length:	50
	Genome masking 'N' cutoff frequency: 	1 in 50
	Fold Coverage:            100X
	Mean Fragment Length:     200
	Standard Deviation:       10
	Profile Type:             Combined
	ID Tag:                   

Quality Profile(s)
	First Read:   HiSeq 2500 Length 126 R1 (built-in profile) 
	First Read:   HiSeq 2500 Length 126 R2 (built-in profile) 

Output files

  FASTQ Sequence Files:
	 the 1st reads: /home/vtec/projects/bio/metagentools/data/ncbi/simreads/paired_10seq_50bp/paired_10seq_50bp1.fq
	 the 2nd reads: /home/vtec/projects/bio/metagentools/data/ncbi/simreads/paired_10seq_50bp/paired_10seq_50bp2.fq

  ALN Alignm

In [None]:
# Print the files produced by the latest simulation
# (for a paired run: two .fq and two .aln files, one per read direction)
art.list_last_output_files()

paired_10seq_50bp2.aln
paired_10seq_50bp1.fq
paired_10seq_50bp1.aln
paired_10seq_50bp2.fq


List all the output files present in `simreads`

In [None]:
# Print every simulation directory in the output folder together with the files it contains
art.list_all_output_files()

paired_100seq_150bp
- paired_100seq_150bp1.aln
- paired_100seq_150bp2.aln
- paired_100seq_150bp1.fq
- paired_100seq_150bp2.fq
paired_100seq_50bp
- paired_100seq_50bp2.aln
- paired_100seq_50bp1.aln
- paired_100seq_50bp2.fq
- paired_100seq_50bp1.fq
paired_10seq_150bp
- paired_10seq_150bp1.aln
- paired_10seq_150bp2.fq
- paired_10seq_150bp2.aln
- paired_10seq_150bp1.fq
paired_10seq_50bp
- paired_10seq_50bp2.aln
- paired_10seq_50bp1.fq
- paired_10seq_50bp1.aln
- paired_10seq_50bp2.fq
paired_25seq_150bp
- paired_25seq_150bp2.fq
- paired_25seq_150bp1.aln
- paired_25seq_150bp2.aln
- paired_25seq_150bp1.fq
paired_25seq_50bp
- paired_25seq_50bp1.fq
- paired_25seq_50bp2.aln
- paired_25seq_50bp2.fq
- paired_25seq_50bp1.aln
single_100seq_150bp
- single_100seq_150bp.fq
- single_100seq_150bp.aln
single_100seq_50bp
- single_100seq_50bp.fq
- single_100seq_50bp.aln
single_10seq_150bp
- single_10seq_150bp.fq
- single_10seq_150bp.aln
single_10seq_50bp
- single_10seq_50bp.fq
- single_10seq_50bp.aln
single_

## Paired read simulation - 150 bp read


### Prepare simulated read files

In [None]:
# Parameters for a paired-end simulation with 150 bp reads.
# `mean_read` / `std_read` are reported by ART as the mean fragment length
# and its standard deviation.
sim_params = dict(
    input_file=input_fname,
    sim_type='paired',
    read_length=150,
    nb_sequences=nb_sequences,
    fold=100,
    mean_read=200,
    std_read=10,
    q_profile='HS25',
)

# Label used to name the outputs: <sim_type>_<nb_sequences>seq_<read_length>bp
sim_params['output_seed'] = '{sim_type}_{nb_sequences}seq_{read_length}bp'.format(**sim_params)
sim_params

{'input_file': 'cov_virus_sequences_010-seqs.fa',
 'sim_type': 'paired',
 'read_length': 150,
 'nb_sequences': 10,
 'fold': 100,
 'mean_read': 200,
 'std_read': 10,
 'q_profile': 'HS25',
 'output_seed': 'paired_10seq_150bp'}

In [None]:
# Run the paired-end simulation with the settings defined in `sim_params`.
# The quality profile declared in `sim_params['q_profile']` is now passed
# explicitly through `ss`, as in the single-end cells, instead of silently relying
# on the default of `sim_reads` (the recorded run shows that default resolving to
# the same HiSeq 2500 built-in profile).
art.sim_reads(
    input_file=sim_params['input_file'],
    output_seed=sim_params['output_seed'],
    sim_type=sim_params['sim_type'],
    read_length=sim_params['read_length'],
    fold=sim_params['fold'],
    mean_read=sim_params['mean_read'],
    std_read=sim_params['std_read'],
    ss=sim_params['q_profile'],
    overwrite=True,
)

return code:  0 


             ART_Illumina (2008-2016)          
          Q Version 2.5.8 (June 6, 2016)       
     Contact: Weichun Huang <whduke@gmail.com> 
    -------------------------------------------

                  Paired-end sequencing simulation

Total CPU time used: 4.21687

The random seed for the run: 1705653489

Parameters used during run
	Read Length:	150
	Genome masking 'N' cutoff frequency: 	1 in 150
	Fold Coverage:            100X
	Mean Fragment Length:     200
	Standard Deviation:       10
	Profile Type:             Combined
	ID Tag:                   

Quality Profile(s)
	First Read:   HiSeq 2500 Length 150 R1 (built-in profile) 
	First Read:   HiSeq 2500 Length 150 R2 (built-in profile) 

Output files

  FASTQ Sequence Files:
	 the 1st reads: /home/vtec/projects/bio/metagentools/data/ncbi/simreads/paired_10seq_150bp/paired_10seq_150bp1.fq
	 the 2nd reads: /home/vtec/projects/bio/metagentools/data/ncbi/simreads/paired_10seq_150bp/paired_10seq_150bp2.fq

  ALN

In [None]:
# Print the files produced by the latest simulation
# (for a paired run: two .fq and two .aln files, one per read direction)
art.list_last_output_files()

paired_10seq_150bp1.aln
paired_10seq_150bp2.fq
paired_10seq_150bp2.aln
paired_10seq_150bp1.fq
