# Art Simulated Reads

Notebook objective:
- create sets of simulated reads from a 10-sequence input file
    - single-end reads of length 50 bp and 150 bp
    - paired-end reads of length 50 bp and 150 bp
    
- upload the generated read (`.fq`) and alignment (`.aln`) files to WandB as artifacts

> This notebook must be run locally on a computer with ART Illumina installed

# Setup


In [None]:
# Project notebook bootstrap; the cell output reports that it sets autoreload mode.
from ecutils.ipython import nb_setup
nb_setup()

Set autoreload mode


In [None]:
from metagentools.art import ArtIllumina
from metagentools import wandb
from pathlib import Path

## Login and create WandB run

In [None]:
# Notebook file name handed to the project's W&B login helper
# (the cell output shows it resolved to the notebook's full path).
nb_fname = '01_load_art_sim_reads_onto_wandb.ipynb'
wandb.login_nb(nb_fname)

Logging in from notebook: /home/vtec/projects/bio/metagentools/nbs/art/01_load_art_sim_reads_onto_wandb.ipynb


[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# W&B entity whose projects are listed below and under which the run is created.
entity = 'metagenomics_sh'

In [None]:
# Print one "index: name" line per project available under the entity.
projs = wandb.entity_projects(entity)
for idx, proj in enumerate(projs):
    print(idx, proj.name, sep=": ")

0: reproduce_cnn_virus


In [None]:
# W&B project that will hold the run.
project = 'reproduce_cnn_virus'

# Presumably appends a date-time suffix to keep run names unique
# (see the displayed value) -- confirm in metagentools.wandb.
run_name = wandb.unique_run_name('load_cov_sim_reads_10seq')
run_name

'load_cov_sim_reads_10seq-230125-2236'

In [None]:
# Run metadata passed to WandbRun in the next cell: job category and a free-text note.
job_type='load_datasets'
notes='load simulated reads for cov, based on 10 sequences fasta file'

In [None]:
# Start the W&B run that every dataset artifact in this notebook is logged against.
wandb_run = wandb.WandbRun(
    entity=entity, 
    project=project, 
    run_name=run_name, 
    job_type=job_type, 
    notes=notes
    )

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01667030833292908, max=1.0)…

## ArtIllumina instance

Create an instance of `ArtIllumina` with the path to the application on the system, and the directories for input and output files

In [None]:
# Location of the ART Illumina executable on this machine; adjust if it is installed elsewhere.
p2art = Path('/bin/art_illumina')
assert p2art.is_file(), f"ART Illumina executable not found at {p2art}"

# Input and output directories are given relative to the notebook's working directory.
# resolve() already returns an absolute path, so no extra .absolute() call is needed.
p2inputs = Path('../../data/cov_data').resolve()
print(p2inputs)
assert p2inputs.is_dir(), f"Input directory not found: {p2inputs}"

p2outputs = Path('../../data/cov_simreads').resolve()
print(p2outputs)
assert p2outputs.is_dir(), f"Output directory not found: {p2outputs}"

/home/vtec/projects/bio/metagentools/data/cov_data
/home/vtec/projects/bio/metagentools/data/cov_simreads


In [None]:
# Wrapper around the ART executable, bound to the input and output directories checked above.
art = ArtIllumina(
    path2app=p2art,
    input_dir=p2inputs,
    output_dir=p2outputs
    )

Ready to operate with art: /bin/art_illumina
Input files from : /home/vtec/projects/bio/metagentools/data/cov_data
Output files to :  /home/vtec/projects/bio/metagentools/data/cov_simreads


# Using 10 sequences


## Single read simulation - 50 bp read

### Prepare simulated read files

In [None]:
# Show the files available in the input directory.
art.list_all_input_files()

cov_virus_list.txt
cov_virus_sequence_one_1.fa
cov_virus_sequence_one_2.fa
cov_virus_sequences.fa
cov_virus_sequences_ten.fa
cov_virus_sequences_two.fa


In [None]:
# Reference sequence file used for this simulation (the ten-sequence file).
input_fname = 'cov_virus_sequences_ten.fa'

Run a single-read simulation with the input file and the following parameters:

In [None]:
# Parameters for this simulation; the full dict is later attached to the artifact as metadata.
sim_params = dict(
    input_file=input_fname,
    sim_type="single",
    read_length=50,
    nb_sequences=10,
    fold=100,
    q_profile='HS25',
)

# The output seed names both the output folder and the files inside it: <type>_<n>seq_<len>bp.
sim_params['output_seed'] = f"{sim_params['sim_type']}_{sim_params['nb_sequences']}seq_{sim_params['read_length']}bp"
sim_params

{'input_file': 'cov_virus_sequences_ten.fa',
 'sim_type': 'single',
 'read_length': 50,
 'nb_sequences': 10,
 'fold': 100,
 'q_profile': 'HS25',
 'output_seed': 'single_10seq_50bp'}

In [None]:
# Run ART with the parameters defined above. Keys whose names match the
# sim_reads keyword arguments are forwarded as-is; the quality profile is
# mapped onto `ss`. `overwrite=True` presumably lets a re-run replace
# existing output files -- confirm in ArtIllumina.sim_reads.
art.sim_reads(
    **{key: sim_params[key] for key in ('input_file', 'output_seed', 'sim_type', 'read_length', 'fold')},
    ss=sim_params['q_profile'],
    overwrite=True,
)

return code:  0 


             ART_Illumina (2008-2016)          
          Q Version 2.5.8 (June 6, 2016)       
     Contact: Weichun Huang <whduke@gmail.com> 
    -------------------------------------------

                  Single-end Simulation

Total CPU time used: 6.44633

The random seed for the run: 1674657570

Parameters used during run
	Read Length:	50
	Genome masking 'N' cutoff frequency: 	1 in 50
	Fold Coverage:            100X
	Profile Type:             Combined
	ID Tag:                   

Quality Profile(s)
	First Read:   HiSeq 2500 Length 126 R1 (built-in profile) 

Output files

  FASTQ Sequence File:
	/home/vtec/projects/bio/metagentools/data/cov_simreads/single_10seq_50bp/single_10seq_50bp.fq

  ALN Alignment File:
	/home/vtec/projects/bio/metagentools/data/cov_simreads/single_10seq_50bp/single_10seq_50bp.aln




In [None]:
# List the files written by the simulation that just ran.
art.list_last_output_files()

single_10seq_50bp.fq
single_10seq_50bp.aln


In [None]:
# Preview the beginning of the generated read file (the default excerpt shown is for .fq).
art.print_last_output_file_excerpts()

File Name: single_10seq_50bp.fq.
--------------------------------------------------------------------------------
@2591237:ncbi:1-60400
GTGGCCAGTAACACTTGCTTGCTTTGTGCTTGCTGCTGTTTACAGAATTA
+
BCCCC;GGGGGGGGGGGGGGGGGGGGGGGB1G=GGG1GGGFGG1GGGGGG
@2591237:ncbi:1-60399
TGTAGATGCCACATTGATCATCCAAATCCTAAAGGATTTTGTGACTTGAA
+
CCCBCB11GG0GGGGGGGGGGCGGGGGGG9GCGGGGGGGGGG=GGGEGGE
@2591237:ncbi:1-60398
GCTCCCTCAGTTGCGACCCATACGATGCCTTCTTTGTTAGCGCCGTAGGG
+
CCCCCFGGGGGGFG>G1GGGGGGGGGBGGGGGGGGGGGGGGGGGG1GGGG



In [None]:
# Same preview for the alignment (.aln) output.
art.print_last_output_file_excerpts(suffix='aln')

File Name: single_10seq_50bp.aln.
--------------------------------------------------------------------------------
##ART_Illumina	read_length	50
@CM	/bin/art_illumina -i /home/vtec/projects/bio/metagentools/data/cov_data/cov_virus_sequences_ten.fa -ss HS25 -l 50 -f 100 -o /home/vtec/projects/bio/metagentools/data/cov_simreads/single_10seq_50bp/single_10seq_50bp -rs 1674657570
@SQ	2591237:ncbi:1 [MK211378]	2591237	ncbi	1 [MK211378] 2591237	Coronavirus BtRs-BetaCoV/YN2018D		scientific name	30213
@SQ	11128:ncbi:2 [LC494191]	11128	ncbi	2 [LC494191] 11128	Bovine coronavirus		scientific name	30942
@SQ	31631:ncbi:3 [KY967361]	31631	ncbi	3 [KY967361] 31631	Human coronavirus OC43		scientific name	30661
@SQ	277944:ncbi:4 [LC654455]	277944	ncbi	4 [LC654455] 277944	Human coronavirus NL63		scientific name	27516
@SQ	11120:ncbi:5 [MN987231]	11120	ncbi	5 [MN987231] 11120	Infectious bronchitis virus		scientific name	27617
@SQ	28295:ncbi:6 [KU893866]	28295	ncbi	6 [KU893866] 28295	Porcine epidemic diarrh

### Create dataset artifact and log it

In [None]:
# Directory holding the files produced by the most recent simulation.
ds_dir = art.output_dir / art.last_output_seed
print(ds_dir)
assert ds_dir.is_dir(), f"Simulation output directory not found: {ds_dir}"

# Artifact name, type and description are derived from the simulation parameters
# so each dataset is identifiable on WandB.
ds_name = f"cov_sim_reads_{sim_params['sim_type']}_{sim_params['nb_sequences']}seq_{sim_params['read_length']}bp"
ds_type = 'sim_reads'
# Keep the space before "covid" so the description reads "... out of 10 covid sequences".
ds_descr = f"Simulated reads ({sim_params['sim_type']}) of length {sim_params['read_length']}bp out of {sim_params['nb_sequences']} covid sequences"

print(ds_name)
print(ds_descr)

/home/vtec/projects/bio/metagentools/data/cov_simreads/single_10seq_50bp
cov_sim_reads_single_10seq_50bp
Simulated reads of length 50bp out of 10covid sequences


In [None]:
# Log the simulation output directory as a dataset artifact on the active run,
# attaching the simulation parameters as artifact metadata.
afx = wandb_run.upload_dataset(
    ds_path=ds_dir,
    ds_name=ds_name, 
    ds_type=ds_type,
    ds_descr=ds_descr,
    ds_metadata=sim_params,
    load_type='dir'
)

[34m[1mwandb[0m: Adding directory to artifact (/home/vtec/projects/bio/metagentools/data/cov_simreads/single_10seq_50bp)... Done. 1.3s


Dataset cov_sim_reads_single_10seq_50bp is being logged as artifact ...


## Single read simulation - 150 bp read

### Prepare simulated read files

In [None]:
# Show the files available in the input directory.
art.list_all_input_files()

cov_virus_list.txt
cov_virus_sequence_one_1.fa
cov_virus_sequence_one_2.fa
cov_virus_sequences.fa
cov_virus_sequences_ten.fa
cov_virus_sequences_two.fa


In [None]:
# Reference sequence file used for this simulation (the ten-sequence file).
input_fname = 'cov_virus_sequences_ten.fa'

Run a single-read simulation with the input file and the following parameters:

In [None]:
# Parameters for this simulation; the full dict is later attached to the artifact as metadata.
sim_params = dict(
    input_file=input_fname,
    sim_type="single",
    read_length=150,
    nb_sequences=10,
    fold=100,
    q_profile='HS25',
)

# The output seed names both the output folder and the files inside it: <type>_<n>seq_<len>bp.
sim_params['output_seed'] = f"{sim_params['sim_type']}_{sim_params['nb_sequences']}seq_{sim_params['read_length']}bp"
sim_params

{'input_file': 'cov_virus_sequences_ten.fa',
 'sim_type': 'single',
 'read_length': 150,
 'nb_sequences': 10,
 'fold': 100,
 'q_profile': 'HS25',
 'output_seed': 'single_10seq_150bp'}

In [None]:
# Run ART with the parameters defined above. Keys whose names match the
# sim_reads keyword arguments are forwarded as-is; the quality profile is
# mapped onto `ss`. `overwrite=True` presumably lets a re-run replace
# existing output files -- confirm in ArtIllumina.sim_reads.
art.sim_reads(
    **{key: sim_params[key] for key in ('input_file', 'output_seed', 'sim_type', 'read_length', 'fold')},
    ss=sim_params['q_profile'],
    overwrite=True,
)

return code:  0 


             ART_Illumina (2008-2016)          
          Q Version 2.5.8 (June 6, 2016)       
     Contact: Weichun Huang <whduke@gmail.com> 
    -------------------------------------------

                  Single-end Simulation

Total CPU time used: 3.2806

The random seed for the run: 1674657712

Parameters used during run
	Read Length:	150
	Genome masking 'N' cutoff frequency: 	1 in 150
	Fold Coverage:            100X
	Profile Type:             Combined
	ID Tag:                   

Quality Profile(s)
	First Read:   HiSeq 2500 Length 150 R1 (built-in profile) 

Output files

  FASTQ Sequence File:
	/home/vtec/projects/bio/metagentools/data/cov_simreads/single_10seq_150bp/single_10seq_150bp.fq

  ALN Alignment File:
	/home/vtec/projects/bio/metagentools/data/cov_simreads/single_10seq_150bp/single_10seq_150bp.aln




In [None]:
# List the files written by the simulation that just ran.
art.list_last_output_files()

single_10seq_150bp.fq
single_10seq_150bp.aln


### Create dataset artifact and log it

In [None]:
# Directory holding the files produced by the most recent simulation.
ds_dir = art.output_dir / art.last_output_seed
print(ds_dir)
assert ds_dir.is_dir(), f"Simulation output directory not found: {ds_dir}"

# Artifact name, type and description are derived from the simulation parameters
# so each dataset is identifiable on WandB.
ds_name = f"cov_sim_reads_{sim_params['sim_type']}_{sim_params['nb_sequences']}seq_{sim_params['read_length']}bp"
ds_type = 'sim_reads'
# Keep the space before "covid" so the description reads "... out of 10 covid sequences".
ds_descr = f"Simulated reads ({sim_params['sim_type']}) of length {sim_params['read_length']}bp out of {sim_params['nb_sequences']} covid sequences"

print(ds_name)
print(ds_descr)

/home/vtec/projects/bio/metagentools/data/cov_simreads/single_10seq_150bp
cov_sim_reads_single_10seq_150bp
Simulated reads of length 150bp out of 10 covid sequences


In [None]:
# Log the simulation output directory as a dataset artifact on the active run,
# attaching the simulation parameters as artifact metadata.
afx = wandb_run.upload_dataset(
    ds_path=ds_dir,
    ds_name=ds_name, 
    ds_type=ds_type,
    ds_descr=ds_descr,
    ds_metadata=sim_params,
    load_type='dir'
)

[34m[1mwandb[0m: Adding directory to artifact (/home/vtec/projects/bio/metagentools/data/cov_simreads/single_10seq_150bp)... Done. 0.2s


Dataset cov_sim_reads_single_10seq_150bp is being logged as artifact ...


## Paired read simulation - 50 bp read

### Prepare simulated read files

Run a paired-read simulation with the input file.

In [None]:
# Show the files available in the input directory.
art.list_all_input_files()

cov_virus_list.txt
cov_virus_sequence_one_1.fa
cov_virus_sequence_one_2.fa
cov_virus_sequences.fa
cov_virus_sequences_ten.fa
cov_virus_sequences_two.fa


In [None]:
# Reference sequence file used for this simulation (the ten-sequence file).
input_fname = 'cov_virus_sequences_ten.fa'

In [None]:
# Parameters for this paired-read simulation; the full dict is later attached
# to the artifact as metadata. `mean_read` / `std_read` are reported by ART as
# the mean fragment length and its standard deviation.
sim_params = dict(
    input_file=input_fname,
    sim_type="paired",
    read_length=50,
    nb_sequences=10,
    fold=100,
    mean_read=200,
    std_read=10,
    q_profile='HS25',
)

# The output seed names both the output folder and the files inside it: <type>_<n>seq_<len>bp.
sim_params['output_seed'] = f"{sim_params['sim_type']}_{sim_params['nb_sequences']}seq_{sim_params['read_length']}bp"
sim_params

{'input_file': 'cov_virus_sequences_ten.fa',
 'sim_type': 'paired',
 'read_length': 50,
 'nb_sequences': 10,
 'fold': 100,
 'mean_read': 200,
 'std_read': 10,
 'q_profile': 'HS25',
 'output_seed': 'paired_10seq_50bp'}

In [None]:
# Paired-end simulation. The quality profile is passed explicitly (as in the
# single-read cells) so the run is guaranteed to match the `q_profile` value
# recorded in `sim_params`, which is later attached to the artifact as metadata.
art.sim_reads(
    input_file=sim_params['input_file'],
    output_seed=sim_params['output_seed'],
    sim_type=sim_params['sim_type'],
    read_length=sim_params['read_length'],
    fold=sim_params['fold'],
    mean_read=sim_params['mean_read'],
    std_read=sim_params['std_read'],
    ss=sim_params['q_profile'],
    overwrite=True
)

return code:  0 


             ART_Illumina (2008-2016)          
          Q Version 2.5.8 (June 6, 2016)       
     Contact: Weichun Huang <whduke@gmail.com> 
    -------------------------------------------

                  Paired-end sequencing simulation

Total CPU time used: 7.49074

The random seed for the run: 1674657793

Parameters used during run
	Read Length:	50
	Genome masking 'N' cutoff frequency: 	1 in 50
	Fold Coverage:            100X
	Mean Fragment Length:     200
	Standard Deviation:       10
	Profile Type:             Combined
	ID Tag:                   

Quality Profile(s)
	First Read:   HiSeq 2500 Length 126 R1 (built-in profile) 
	First Read:   HiSeq 2500 Length 126 R2 (built-in profile) 

Output files

  FASTQ Sequence Files:
	 the 1st reads: /home/vtec/projects/bio/metagentools/data/cov_simreads/paired_10seq_50bp/paired_10seq_50bp1.fq
	 the 2nd reads: /home/vtec/projects/bio/metagentools/data/cov_simreads/paired_10seq_50bp/paired_10seq_50bp2.fq

  ALN Alignme

In [None]:
# List the files written by the simulation that just ran.
art.list_last_output_files()

paired_10seq_50bp2.aln
paired_10seq_50bp1.fq
paired_10seq_50bp1.aln
paired_10seq_50bp2.fq


In [None]:
# Preview the beginning of each generated read file (one .fq per mate for paired runs).
art.print_last_output_file_excerpts()

File Name: paired_10seq_50bp1.fq.
--------------------------------------------------------------------------------
@2591237:ncbi:1-60400/1
ATAGGGTTTGTGTTCCTCCAGAAAATGTAGTTAGCATGCATGGTATAGCC
+
C:BCCGGGGGGGGGEEGGGGGGG<GGGFGGGFGFGGGGD1GGGGFGG0GG
@2591237:ncbi:1-60398/1
TCTGTTATTGACCTCTTACTCGATGACTTTGTTGAGATAATAAAGTCACA
+
CC:ACEG;FGGGGGGGGGGGGGGGGGE>1GDGGEGGGGGGGG>GFGGGGG
@2591237:ncbi:1-60396/1
TACAGAAGTACCTGCCAATTCAACTGTGCTTTCCTTCTGTGCCTTTGCAG
+
CCCCAGGGGF1GGGGGGGGGGGG>GGGGGGGGGGGGF<GGFGGEGGGGGG

File Name: paired_10seq_50bp2.fq.
--------------------------------------------------------------------------------
@2591237:ncbi:1-60400/2
GATAACAGAGCATTCTTGGAATGCTGATCTTTACAAGCTTATGGGACATT
+
BCCCCGGGGG1G1GGGGGGGGGGGGG<GGGGGGGGG1FG1@GGCGG1FGG
@2591237:ncbi:1-60398/2
TAGGCATTGCAACCCCCGGTTGCCACGCTTGACTTGCTTGTAATTTTGGG
+
CBCCCGGGGGGGGGGGGDGGGGGGGGGGGGGGG/GDGD?C1GGGGGGGGG
@2591237:ncbi:1-60396/2
TACAGACAGCATGAAGCACCACCAAAGGACTCTTGGTCCATGTTAGCTTC
+
CBCBC;GGGGGGFG1GCFGGGGGDGGGGGG/GGGGGGEGGGG>DGDGGGG



### Create dataset artifact and log it

In [None]:
# Directory holding the files produced by the most recent simulation.
ds_dir = art.output_dir / art.last_output_seed
print(ds_dir)
assert ds_dir.is_dir(), f"Simulation output directory not found: {ds_dir}"

# Artifact name, type and description are derived from the simulation parameters
# so each dataset is identifiable on WandB.
ds_name = f"cov_sim_reads_{sim_params['sim_type']}_{sim_params['nb_sequences']}seq_{sim_params['read_length']}bp"
ds_type = 'sim_reads'
# Keep the space before "covid" so the description reads "... out of 10 covid sequences".
ds_descr = f"Simulated reads ({sim_params['sim_type']}) of length {sim_params['read_length']}bp out of {sim_params['nb_sequences']} covid sequences"

print(ds_name)
print(ds_descr)

/home/vtec/projects/bio/metagentools/data/cov_simreads/paired_10seq_50bp
cov_sim_reads_paired_10seq_50bp
Simulated reads (paired) of length 50bp out of 10covid sequences


In [None]:
# Log the simulation output directory as a dataset artifact on the active run,
# attaching the simulation parameters as artifact metadata.
afx = wandb_run.upload_dataset(
    ds_path=ds_dir,
    ds_name=ds_name, 
    ds_type=ds_type,
    ds_descr=ds_descr,
    ds_metadata=sim_params,
    load_type='dir'
)

[34m[1mwandb[0m: Adding directory to artifact (/home/vtec/projects/bio/metagentools/data/cov_simreads/paired_10seq_50bp)... Done. 0.2s


Dataset cov_sim_reads_paired_10seq_50bp is being logged as artifact ...


## Paired read simulation - 150 bp read


### Prepare simulated read files

In [None]:
# Show the files available in the input directory.
art.list_all_input_files()

cov_virus_list.txt
cov_virus_sequence_one_1.fa
cov_virus_sequence_one_2.fa
cov_virus_sequences.fa
cov_virus_sequences_ten.fa
cov_virus_sequences_two.fa


In [None]:
# Reference sequence file used for this simulation (the ten-sequence file).
input_fname = 'cov_virus_sequences_ten.fa'

In [None]:
# Parameters for this paired-read simulation; the full dict is later attached
# to the artifact as metadata. `mean_read` / `std_read` are reported by ART as
# the mean fragment length and its standard deviation.
sim_params = dict(
    input_file=input_fname,
    sim_type="paired",
    read_length=150,
    nb_sequences=10,
    fold=100,
    mean_read=200,
    std_read=10,
    q_profile='HS25',
)

# The output seed names both the output folder and the files inside it: <type>_<n>seq_<len>bp.
sim_params['output_seed'] = f"{sim_params['sim_type']}_{sim_params['nb_sequences']}seq_{sim_params['read_length']}bp"
sim_params

{'input_file': 'cov_virus_sequences_ten.fa',
 'sim_type': 'paired',
 'read_length': 150,
 'nb_sequences': 10,
 'fold': 100,
 'mean_read': 200,
 'std_read': 10,
 'q_profile': 'HS25',
 'output_seed': 'paired_10seq_150bp'}

In [None]:
# Paired-end simulation. The quality profile is passed explicitly (as in the
# single-read cells) so the run is guaranteed to match the `q_profile` value
# recorded in `sim_params`, which is later attached to the artifact as metadata.
art.sim_reads(
    input_file=sim_params['input_file'],
    output_seed=sim_params['output_seed'],
    sim_type=sim_params['sim_type'],
    read_length=sim_params['read_length'],
    fold=sim_params['fold'],
    mean_read=sim_params['mean_read'],
    std_read=sim_params['std_read'],
    ss=sim_params['q_profile'],
    overwrite=True
)

return code:  0 


             ART_Illumina (2008-2016)          
          Q Version 2.5.8 (June 6, 2016)       
     Contact: Weichun Huang <whduke@gmail.com> 
    -------------------------------------------

                  Paired-end sequencing simulation

Total CPU time used: 3.39643

The random seed for the run: 1674657923

Parameters used during run
	Read Length:	150
	Genome masking 'N' cutoff frequency: 	1 in 150
	Fold Coverage:            100X
	Mean Fragment Length:     200
	Standard Deviation:       10
	Profile Type:             Combined
	ID Tag:                   

Quality Profile(s)
	First Read:   HiSeq 2500 Length 150 R1 (built-in profile) 
	First Read:   HiSeq 2500 Length 150 R2 (built-in profile) 

Output files

  FASTQ Sequence Files:
	 the 1st reads: /home/vtec/projects/bio/metagentools/data/cov_simreads/paired_10seq_150bp/paired_10seq_150bp1.fq
	 the 2nd reads: /home/vtec/projects/bio/metagentools/data/cov_simreads/paired_10seq_150bp/paired_10seq_150bp2.fq

  ALN A

In [None]:
# List the files written by the simulation that just ran.
art.list_last_output_files()

paired_10seq_150bp1.aln
paired_10seq_150bp2.fq
paired_10seq_150bp2.aln
paired_10seq_150bp1.fq


In [None]:
# Show every simulation output folder together with the files it contains.
art.list_all_output_files()

paired_10seq_150bp
- paired_10seq_150bp1.aln
- paired_10seq_150bp2.fq
- paired_10seq_150bp2.aln
- paired_10seq_150bp1.fq
paired_10seq_50bp
- paired_10seq_50bp2.aln
- paired_10seq_50bp1.fq
- paired_10seq_50bp1.aln
- paired_10seq_50bp2.fq
single_10seq_150bp
- single_10seq_150bp.fq
- single_10seq_150bp.aln
single_10seq_50bp
- single_10seq_50bp.fq
- single_10seq_50bp.aln


### Create dataset artifact and log it

In [None]:
# Directory holding the files produced by the most recent simulation.
ds_dir = art.output_dir / art.last_output_seed
print(ds_dir)
assert ds_dir.is_dir(), f"Simulation output directory not found: {ds_dir}"

# Artifact name, type and description are derived from the simulation parameters
# so each dataset is identifiable on WandB.
ds_name = f"cov_sim_reads_{sim_params['sim_type']}_{sim_params['nb_sequences']}seq_{sim_params['read_length']}bp"
ds_type = 'sim_reads'
# Keep the space before "covid" so the description reads "... out of 10 covid sequences".
ds_descr = f"Simulated reads ({sim_params['sim_type']}) of length {sim_params['read_length']}bp out of {sim_params['nb_sequences']} covid sequences"

print(ds_name)
print(ds_descr)

/home/vtec/projects/bio/metagentools/data/cov_simreads/paired_10seq_150bp
cov_sim_reads_paired_10seq_150bp
Simulated reads (paired) of length 150bp out of 10covid sequences


In [None]:
# Log the simulation output directory as a dataset artifact on the active run,
# attaching the simulation parameters as artifact metadata.
afx = wandb_run.upload_dataset(
    ds_path=ds_dir,
    ds_name=ds_name, 
    ds_type=ds_type,
    ds_descr=ds_descr,
    ds_metadata=sim_params,
    load_type='dir'
)

[34m[1mwandb[0m: Adding directory to artifact (/home/vtec/projects/bio/metagentools/data/cov_simreads/paired_10seq_150bp)... Done. 0.1s


Dataset cov_sim_reads_paired_10seq_150bp is being logged as artifact ...


## Close current WandB active run

In [None]:
# Close the active W&B run now that all four datasets have been logged.
wandb_run.finish()

# Others