# Handle covid sequence fasta files

Notebook objective:
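- Explore the covid sequence FASTA files and parse the metadata stored in their sequence header lines.
- Simulate single and paired reads (50 bp and 150 bp) from a FASTA file with ART Illumina, and log each set of simulated reads as a WandB dataset artifact.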

# Imports and setup environment

### Install and import packages

In [1]:
# Install required custom packages if not installed yet.
import importlib.util
if not importlib.util.find_spec('ecutilities'):
    print('installing package: `ecutilities`')
    ! pip install -qqU ecutilities
else:
    print('`ecutilities` already installed')
if not importlib.util.find_spec('metagentools'):
    print('installing package: `metagentools')
    ! pip install -qqU metagentools
else:
    print('`metagentools` already installed')

`ecutilities` already installed
`metagentools` already installed


In [6]:
# Import all required packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re

from ecutilities.core import files_in_tree, path_to_parent_dir
from ecutilities.ipython import nb_setup
from metagentools.art import ArtIllumina
from metagentools.core import ProjectFileSystem
from pathlib import Path
from pprint import pprint

# Setup the notebook for development.
# Judging by the messages printed under this cell, this loads the IPython `autoreload`
# extension and sets the autoreload mode.
nb_setup()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Set autoreload mode


## Setup project file system

In [7]:
# Create the project file system helper, then display the locations it resolves
# (home, project root, data and notebooks directories) as the cell output.
pfs = ProjectFileSystem()
pfs.home, pfs.project_root, pfs.data, pfs.nbs

(Path('/home/vtec'),
 Path('/home/vtec/projects/bio/metagentools'),
 Path('/home/vtec/projects/bio/metagentools/data'),
 Path('/home/vtec/projects/bio/metagentools/nbs'))

## Login and create WandB run (Optional)

In [None]:
# from metagentools import wandb

In [None]:
# nb_fname = '00_load_cov_sequence_files.ipynb'
# wandb.login_nb(nb_fname)

In [None]:
# entity = 'metagenomics_sh'

In [None]:
# projs = wandb.entity_projects(entity)
# for i, p in enumerate(projs):
#     print(f"{i}: {p.name}")

In [None]:
# project = 'create_and_load_sim_reads'

# run_name = wandb.unique_run_name('seq_files')
# run_name

In [None]:
# job_type='load_datasets'
# notes='load covid sequence files (fasta)'

In [None]:
# wandb_run = wandb.WandbRun(
#     entity=entity, 
#     project=project, 
#     run_name=run_name, 
#     job_type=job_type, 
#     notes=notes
#     )

## Load `fa` file and parse data

Define path to files

In [8]:
# Directory with the covid FASTA files, located under the project data directory
p2inputs = pfs.data.joinpath('cov_data')
print(p2inputs.absolute())
# Stop here when the directory is missing: every following cell reads from it
assert p2inputs.is_dir()

/home/vtec/projects/bio/metagentools/data/cov_data


Explore files in the directory:

In [9]:
# Collect the files under `p2inputs`, filtered with pattern 'sequence'. The tree printed under this cell
# shows a number next to each file: presumably its index in the returned `files` collection -- TODO confirm.
files = files_in_tree(p2inputs, pattern='sequence')

data
  |--cov_data
  |    |--cov_virus_sequences_hundred.fa (0)
  |    |--cov_virus_sequence_one_2.fa (1)
  |    |--cov_virus_sequences_two.fa (2)
  |    |--cov_virus_sequences_twenty_five.fa (3)
  |    |--cov_virus_sequences_ten.fa (4)
  |    |--cov_virus_original_cnn_sequences.json (5)
  |    |--cov_virus_sequence_one_1.fa (6)
  |    |--cov_virus_sequences.fa (7)
  |    |--groups_1


Pick fasta file with one sequence and read it

In [10]:
# Select the single-sequence FASTA file by name instead of by its position in `files`:
# the order of a directory listing is not guaranteed to be the same on every machine or run.
p2fasta = p2inputs / 'cov_virus_sequence_one_1.fa'
assert p2fasta.is_file(), f"FASTA file not found: {p2fasta}"

# Keep only the sequence header lines, i.e. the lines starting with '>'
with open(p2fasta, 'r') as fp:
    first_lines = [line for line in fp if line.startswith('>')]

display(''.join(first_lines[:10]))

'>2591237:ncbi:1 [MK211378]\t2591237\tncbi\t1 [MK211378] 2591237\tCoronavirus BtRs-BetaCoV/YN2018D\t\tscientific name\n'

FASTA file structure for the `cov_data` files:

- `2591237` is the NCBI taxonomy ID a.k.a. `NCBI:txid2591237`. See reference [on NCBI site](https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=2591237)
- `ncbi` is the source of info
- `1` is the sequence number in the file
- `MK211378` is the GenBank accession number for the sequence. See reference [on NCBI site](https://www.ncbi.nlm.nih.gov/nuccore/MK211378)
- `Coronavirus BtRs-BetaCoV/YN2018D` is the species

### Parse sequence metadata

In [11]:
# Regular expression for a sequence header line (line starting with '>') of the `cov_data` FASTA files.
# Named groups:
#   txid      - digits between '>' and ':ncbi:'
#   accession - text between the first pair of square brackets
#   species   - free text located before the trailing 'scientific name' (long header form only)
# Two header forms are accepted: a short one ending right after '[accession]', and a long one which
# repeats txid, 'ncbi' and accession before the species and 'scientific name'.
# `species` is greedy: it also captures whitespace (e.g. tabs) located before 'scientific name'.
fasta_seq_metadata_pattern = r"^>(?P<txid>\d*):ncbi:\d*(\s*|\t)\[(?P<accession>\w*\d*)\](\s*\n|((\s*|\t)(?P=txid)(\s*|\t)ncbi(\s*|\t)\d*(\s*|\t)\[(?P=accession)\](\s*|\t)(?P=txid)(\s*|\t)(?P<species>.*)(\s*|\t)scientific name\n))"

In [12]:
# Build a dictionary with the metadata of each header line, keyed by accession number.
# Header lines which do not match the pattern are printed with their index for inspection.
seq_metadata = {}
for i, line in enumerate(first_lines):
    matches = re.match(fasta_seq_metadata_pattern, line)
    if matches is None:
        print(i, line)
        continue
    subdict = {g: matches.group(g) for g in ('txid', 'accession', 'species')}
    seq_metadata[subdict['accession']] = subdict

In [14]:
# Inspect the metadata parsed for one sequence; entries are keyed by accession number
seq_metadata['MK211378']

{'txid': '2591237',
 'accession': 'MK211378',
 'species': 'Coronavirus BtRs-BetaCoV/YN2018D\t\t'}

In [None]:
def parse_seq_metadata(line):
    """Parse the metadata in one sequence header line of the `cov_data` FASTA files.

    Two header forms are supported:
    - short: `>txid:ncbi:n [accession]`
    - long: the short form followed by `txid ncbi n [accession] txid species scientific name`

    Args:
        line (str): header line, with or without its end-of-line character(s).

    Returns:
        dict: keys `txid`, `accession` and `species`. Tabs in a value are replaced by a space and
        the surrounding whitespace is removed. `species` is `None` for the short header form.

    Raises:
        ValueError: when `line` matches none of the supported header forms.
    """
    pattern = r"^>(?P<txid>\d*):ncbi:\d*(\s*|\t)\[(?P<accession>\w*\d*)\](\s*\n|((\s*|\t)(?P=txid)(\s*|\t)ncbi(\s*|\t)\d*(\s*|\t)\[(?P=accession)\](\s*|\t)(?P=txid)(\s*|\t)(?P<species>.*)(\s*|\t)scientific name\n))"
    # The pattern requires a '\n' right after the header. Normalize the end of the line so that headers
    # without newline (last line of a file, `str.splitlines()`) or with '\r\n' are parsed as well.
    matches = re.match(pattern, line.rstrip('\r\n') + '\n')
    if matches is None:
        # Include the offending line to make the failure actionable
        raise ValueError(f"No match on this line: {line!r}")
    # `groupdict()` only returns the named groups: txid, accession and species
    return {
        field: None if value is None else value.replace('\t', ' ').strip()
        for field, value in matches.groupdict().items()
    }
        
# NOTE(review): index 72 requires at least 73 header lines in `first_lines`, but the cell building
# `first_lines` in this notebook reads a file with a single sequence. This call presumably relied on
# `first_lines` loaded from a larger FASTA file in an earlier kernel state -- confirm and make it explicit.
parse_seq_metadata(first_lines[72])

{'txid': '2877474', 'accession': 'MZ328303', 'species': None}

# Using 10 sequences


In [None]:
# Number of sequences in the input FASTA file. In this notebook it is only used to label the
# simulation outputs and their metadata.
# NOTE(review): it is set independently from `input_fname`; keep both in sync.
nb_sequences = 10

## Single read simulation - 50 bp read

### Prepare simulated read files

In [None]:
# List the input files available for the simulations.
# NOTE(review): `art` is not created in any visible cell of this notebook. Presumably it is an instance of
# `ArtIllumina` (imported at the top) -- TODO: add the cell creating it so the notebook runs on a fresh kernel.
art.list_all_input_files()

cov_virus_list.txt
cov_virus_sequence_one_1.fa
cov_virus_sequence_one_2.fa
cov_virus_sequences.fa
cov_virus_sequences_ten.fa
cov_virus_sequences_twenty_five.fa
cov_virus_sequences_two.fa


In [None]:
# FASTA file used as input by every simulation in this notebook
input_fname = 'cov_virus_sequences_ten.fa'

Run a single-read simulation with the input file and the following parameters:

In [None]:
# Parameters of the simulation; the same dictionary is later logged as metadata of the dataset
sim_params = dict(
    input_file=input_fname,
    sim_type='single',
    read_length=50,
    nb_sequences=nb_sequences,
    fold=100,
    q_profile='HS25',
)

# Label of the simulation outputs, derived from the parameters above
sim_params['output_seed'] = '{sim_type}_{nb_sequences}seq_{read_length}bp'.format(**sim_params)
sim_params

{'input_file': 'cov_virus_sequences_ten.fa',
 'sim_type': 'single',
 'read_length': 50,
 'nb_sequences': 10,
 'fold': 100,
 'q_profile': 'HS25',
 'output_seed': 'single_10seq_50bp'}

In [None]:
# Run the ART single-end read simulation with the parameters stored in `sim_params`.
# `overwrite=True` presumably replaces the output of a previous run with the same `output_seed` -- TODO confirm.
# NOTE(review): no random seed is passed and the logs in this notebook report a different seed for each
# run, so the simulated reads are presumably not reproducible from one execution to the next.
art.sim_reads( 
    input_file=sim_params['input_file'],
    output_seed=sim_params['output_seed'],
    sim_type=sim_params['sim_type'],
    read_length=sim_params['read_length'],
    fold=sim_params['fold'],
    ss=sim_params['q_profile'],
    overwrite=True
)

return code:  0 


             ART_Illumina (2008-2016)          
          Q Version 2.5.8 (June 6, 2016)       
     Contact: Weichun Huang <whduke@gmail.com> 
    -------------------------------------------

                  Single-end Simulation

Total CPU time used: 6.32933

The random seed for the run: 1674660835

Parameters used during run
	Read Length:	50
	Genome masking 'N' cutoff frequency: 	1 in 50
	Fold Coverage:            100X
	Profile Type:             Combined
	ID Tag:                   

Quality Profile(s)
	First Read:   HiSeq 2500 Length 126 R1 (built-in profile) 

Output files

  FASTQ Sequence File:
	/home/vtec/projects/bio/metagentools/data/cov_simreads/single_10seq_50bp/single_10seq_50bp.fq

  ALN Alignment File:
	/home/vtec/projects/bio/metagentools/data/cov_simreads/single_10seq_50bp/single_10seq_50bp.aln




In [None]:
# List the files created by the last simulation
art.list_last_output_files()

single_10seq_50bp.fq
single_10seq_50bp.aln


### Create dataset artifact and log it

In [None]:
# Directory in which the last simulation saved its files; it must exist before being logged
ds_dir = art.output_dir / art.last_output_seed
print(ds_dir)
assert ds_dir.is_dir()

# Type, name and description of the dataset artifact, derived from the simulation parameters
ds_type = 'sim_reads'
ds_name = 'cov_sim_reads_{sim_type}_{nb_sequences}seq_{read_length}bp'.format(**sim_params)
ds_descr = 'Simulated reads ({sim_type}) of length {read_length}bp out of {nb_sequences} covid sequences'.format(**sim_params)

print(ds_name, ds_descr, sep='\n')

/home/vtec/projects/bio/metagentools/data/cov_simreads/single_10seq_50bp
cov_sim_reads_single_10seq_50bp
Simulated reads (single) of length 50bp out of 10 covid sequences


In [None]:
# Upload the directory with the simulated reads as a dataset artifact of the active WandB run.
# The WandB run is optional in this notebook: when the cells creating `wandb_run` were not executed,
# the upload is skipped instead of failing with a NameError.
wandb_run = globals().get('wandb_run')
if wandb_run is None:
    print('No active WandB run: dataset upload skipped')
    afx = None
else:
    afx = wandb_run.upload_dataset(
        ds_path=ds_dir,
        ds_name=ds_name,
        ds_type=ds_type,
        ds_descr=ds_descr,
        ds_metadata=sim_params,
        load_type='dir'
    )

[34m[1mwandb[0m: Adding directory to artifact (/home/vtec/projects/bio/metagentools/data/cov_simreads/single_10seq_50bp)... Done. 0.2s


Dataset cov_sim_reads_single_10seq_50bp is being logged as artifact ...


## Single read simulation - 150 bp read

### Prepare simulated read files

Run a single-read simulation with the input file and the following parameters:

In [None]:
# Parameters of the simulation; the same dictionary is later logged as metadata of the dataset
sim_params = dict(
    input_file=input_fname,
    sim_type='single',
    read_length=150,
    nb_sequences=nb_sequences,
    fold=100,
    q_profile='HS25',
)

# Label of the simulation outputs, derived from the parameters above
sim_params['output_seed'] = '{sim_type}_{nb_sequences}seq_{read_length}bp'.format(**sim_params)
sim_params

{'input_file': 'cov_virus_sequences_ten.fa',
 'sim_type': 'single',
 'read_length': 150,
 'nb_sequences': 10,
 'fold': 100,
 'q_profile': 'HS25',
 'output_seed': 'single_10seq_150bp'}

In [None]:
# Run the ART single-end read simulation with the parameters stored in `sim_params`.
# `overwrite=True` presumably replaces the output of a previous run with the same `output_seed` -- TODO confirm.
# NOTE(review): no random seed is passed and the logs in this notebook report a different seed for each
# run, so the simulated reads are presumably not reproducible from one execution to the next.
art.sim_reads( 
    input_file=sim_params['input_file'],
    output_seed=sim_params['output_seed'],
    sim_type=sim_params['sim_type'],
    read_length=sim_params['read_length'],
    fold=sim_params['fold'],
    ss=sim_params['q_profile'],
    overwrite=True
)

return code:  0 


             ART_Illumina (2008-2016)          
          Q Version 2.5.8 (June 6, 2016)       
     Contact: Weichun Huang <whduke@gmail.com> 
    -------------------------------------------

                  Single-end Simulation

Total CPU time used: 3.30155

The random seed for the run: 1674660897

Parameters used during run
	Read Length:	150
	Genome masking 'N' cutoff frequency: 	1 in 150
	Fold Coverage:            100X
	Profile Type:             Combined
	ID Tag:                   

Quality Profile(s)
	First Read:   HiSeq 2500 Length 150 R1 (built-in profile) 

Output files

  FASTQ Sequence File:
	/home/vtec/projects/bio/metagentools/data/cov_simreads/single_10seq_150bp/single_10seq_150bp.fq

  ALN Alignment File:
	/home/vtec/projects/bio/metagentools/data/cov_simreads/single_10seq_150bp/single_10seq_150bp.aln




In [None]:
# List the files created by the last simulation
art.list_last_output_files()

single_10seq_150bp.fq
single_10seq_150bp.aln


### Create dataset artifact and log it

In [None]:
# Directory in which the last simulation saved its files; it must exist before being logged
ds_dir = art.output_dir / art.last_output_seed
print(ds_dir)
assert ds_dir.is_dir()

# Type, name and description of the dataset artifact, derived from the simulation parameters
ds_type = 'sim_reads'
ds_name = 'cov_sim_reads_{sim_type}_{nb_sequences}seq_{read_length}bp'.format(**sim_params)
ds_descr = 'Simulated reads ({sim_type}) of length {read_length}bp out of {nb_sequences} covid sequences'.format(**sim_params)

print(ds_name, ds_descr, sep='\n')

/home/vtec/projects/bio/metagentools/data/cov_simreads/single_10seq_150bp
cov_sim_reads_single_10seq_150bp
Simulated reads (single) of length 150bp out of 10 covid sequences


In [None]:
# Upload the directory with the simulated reads as a dataset artifact of the active WandB run.
# The WandB run is optional in this notebook: when the cells creating `wandb_run` were not executed,
# the upload is skipped instead of failing with a NameError.
wandb_run = globals().get('wandb_run')
if wandb_run is None:
    print('No active WandB run: dataset upload skipped')
    afx = None
else:
    afx = wandb_run.upload_dataset(
        ds_path=ds_dir,
        ds_name=ds_name,
        ds_type=ds_type,
        ds_descr=ds_descr,
        ds_metadata=sim_params,
        load_type='dir'
    )

[34m[1mwandb[0m: Adding directory to artifact (/home/vtec/projects/bio/metagentools/data/cov_simreads/single_10seq_150bp)... Done. 0.2s


Dataset cov_sim_reads_single_10seq_150bp is being logged as artifact ...


## Paired read simulation - 50 bp read

### Prepare simulated read files

Run a paired-read simulation with the input file and the following parameters:

In [None]:
# Parameters of the simulation; the same dictionary is later logged as metadata of the dataset
sim_params = dict(
    input_file=input_fname,
    sim_type='paired',
    read_length=50,
    nb_sequences=nb_sequences,
    fold=100,
    mean_read=200,
    std_read=10,
    q_profile='HS25',
)

# Label of the simulation outputs, derived from the parameters above
sim_params['output_seed'] = '{sim_type}_{nb_sequences}seq_{read_length}bp'.format(**sim_params)
sim_params

{'input_file': 'cov_virus_sequences_ten.fa',
 'sim_type': 'paired',
 'read_length': 50,
 'nb_sequences': 10,
 'fold': 100,
 'mean_read': 200,
 'std_read': 10,
 'q_profile': 'HS25',
 'output_seed': 'paired_10seq_50bp'}

In [None]:
# Run the ART paired-end read simulation with the parameters stored in `sim_params`.
# The quality profile is passed explicitly (`ss`), as in the single-read simulations, so that the
# `q_profile` value logged with the dataset metadata is the one actually used by the simulator.
# `overwrite=True` presumably replaces the output of a previous run with the same `output_seed` -- TODO confirm.
# NOTE(review): no random seed is passed, so the simulated reads are presumably not reproducible.
art.sim_reads(
    input_file=sim_params['input_file'],
    output_seed=sim_params['output_seed'],
    sim_type=sim_params['sim_type'],
    read_length=sim_params['read_length'],
    fold=sim_params['fold'],
    mean_read=sim_params['mean_read'],
    std_read=sim_params['std_read'],
    ss=sim_params['q_profile'],
    overwrite=True
)

return code:  0 


             ART_Illumina (2008-2016)          
          Q Version 2.5.8 (June 6, 2016)       
     Contact: Weichun Huang <whduke@gmail.com> 
    -------------------------------------------

                  Paired-end sequencing simulation

Total CPU time used: 6.32218

The random seed for the run: 1674660947

Parameters used during run
	Read Length:	50
	Genome masking 'N' cutoff frequency: 	1 in 50
	Fold Coverage:            100X
	Mean Fragment Length:     200
	Standard Deviation:       10
	Profile Type:             Combined
	ID Tag:                   

Quality Profile(s)
	First Read:   HiSeq 2500 Length 126 R1 (built-in profile) 
	First Read:   HiSeq 2500 Length 126 R2 (built-in profile) 

Output files

  FASTQ Sequence Files:
	 the 1st reads: /home/vtec/projects/bio/metagentools/data/cov_simreads/paired_10seq_50bp/paired_10seq_50bp1.fq
	 the 2nd reads: /home/vtec/projects/bio/metagentools/data/cov_simreads/paired_10seq_50bp/paired_10seq_50bp2.fq

  ALN Alignme

In [None]:
# List the files created by the last simulation
art.list_last_output_files()

paired_10seq_50bp2.aln
paired_10seq_50bp1.fq
paired_10seq_50bp1.aln
paired_10seq_50bp2.fq


### Create dataset artifact and log it

In [None]:
# Directory in which the last simulation saved its files; it must exist before being logged
ds_dir = art.output_dir / art.last_output_seed
print(ds_dir)
assert ds_dir.is_dir()

# Type, name and description of the dataset artifact, derived from the simulation parameters
ds_type = 'sim_reads'
ds_name = 'cov_sim_reads_{sim_type}_{nb_sequences}seq_{read_length}bp'.format(**sim_params)
ds_descr = 'Simulated reads ({sim_type}) of length {read_length}bp out of {nb_sequences} covid sequences'.format(**sim_params)

print(ds_name, ds_descr, sep='\n')

/home/vtec/projects/bio/metagentools/data/cov_simreads/paired_10seq_50bp
cov_sim_reads_paired_10seq_50bp
Simulated reads (paired) of length 50bp out of 10 covid sequences


In [None]:
# Upload the directory with the simulated reads as a dataset artifact of the active WandB run.
# The WandB run is optional in this notebook: when the cells creating `wandb_run` were not executed,
# the upload is skipped instead of failing with a NameError.
wandb_run = globals().get('wandb_run')
if wandb_run is None:
    print('No active WandB run: dataset upload skipped')
    afx = None
else:
    afx = wandb_run.upload_dataset(
        ds_path=ds_dir,
        ds_name=ds_name,
        ds_type=ds_type,
        ds_descr=ds_descr,
        ds_metadata=sim_params,
        load_type='dir'
    )

[34m[1mwandb[0m: Adding directory to artifact (/home/vtec/projects/bio/metagentools/data/cov_simreads/paired_10seq_50bp)... Done. 0.2s


Dataset cov_sim_reads_paired_10seq_50bp is being logged as artifact ...


## Paired read simulation - 150 bp read


### Prepare simulated read files

In [None]:
# Parameters of the simulation; the same dictionary is later logged as metadata of the dataset
sim_params = dict(
    input_file=input_fname,
    sim_type='paired',
    read_length=150,
    nb_sequences=nb_sequences,
    fold=100,
    mean_read=200,
    std_read=10,
    q_profile='HS25',
)

# Label of the simulation outputs, derived from the parameters above
sim_params['output_seed'] = '{sim_type}_{nb_sequences}seq_{read_length}bp'.format(**sim_params)
sim_params

{'input_file': 'cov_virus_sequences_ten.fa',
 'sim_type': 'paired',
 'read_length': 150,
 'nb_sequences': 10,
 'fold': 100,
 'mean_read': 200,
 'std_read': 10,
 'q_profile': 'HS25',
 'output_seed': 'paired_10seq_150bp'}

In [None]:
# Run the ART paired-end read simulation with the parameters stored in `sim_params`.
# The quality profile is passed explicitly (`ss`), as in the single-read simulations, so that the
# `q_profile` value logged with the dataset metadata is the one actually used by the simulator.
# `overwrite=True` presumably replaces the output of a previous run with the same `output_seed` -- TODO confirm.
# NOTE(review): no random seed is passed, so the simulated reads are presumably not reproducible.
art.sim_reads(
    input_file=sim_params['input_file'],
    output_seed=sim_params['output_seed'],
    sim_type=sim_params['sim_type'],
    read_length=sim_params['read_length'],
    fold=sim_params['fold'],
    mean_read=sim_params['mean_read'],
    std_read=sim_params['std_read'],
    ss=sim_params['q_profile'],
    overwrite=True
)

return code:  0 


             ART_Illumina (2008-2016)          
          Q Version 2.5.8 (June 6, 2016)       
     Contact: Weichun Huang <whduke@gmail.com> 
    -------------------------------------------

                  Paired-end sequencing simulation

Total CPU time used: 3.4763

The random seed for the run: 1674660994

Parameters used during run
	Read Length:	150
	Genome masking 'N' cutoff frequency: 	1 in 150
	Fold Coverage:            100X
	Mean Fragment Length:     200
	Standard Deviation:       10
	Profile Type:             Combined
	ID Tag:                   

Quality Profile(s)
	First Read:   HiSeq 2500 Length 150 R1 (built-in profile) 
	First Read:   HiSeq 2500 Length 150 R2 (built-in profile) 

Output files

  FASTQ Sequence Files:
	 the 1st reads: /home/vtec/projects/bio/metagentools/data/cov_simreads/paired_10seq_150bp/paired_10seq_150bp1.fq
	 the 2nd reads: /home/vtec/projects/bio/metagentools/data/cov_simreads/paired_10seq_150bp/paired_10seq_150bp2.fq

  ALN Al

In [None]:
# List the files created by the last simulation
art.list_last_output_files()

paired_10seq_150bp1.aln
paired_10seq_150bp2.fq
paired_10seq_150bp2.aln
paired_10seq_150bp1.fq


### Create dataset artifact and log it

In [None]:
# Directory in which the last simulation saved its files; it must exist before being logged
ds_dir = art.output_dir / art.last_output_seed
print(ds_dir)
assert ds_dir.is_dir()

# Type, name and description of the dataset artifact, derived from the simulation parameters
ds_type = 'sim_reads'
ds_name = 'cov_sim_reads_{sim_type}_{nb_sequences}seq_{read_length}bp'.format(**sim_params)
ds_descr = 'Simulated reads ({sim_type}) of length {read_length}bp out of {nb_sequences} covid sequences'.format(**sim_params)

print(ds_name, ds_descr, sep='\n')

/home/vtec/projects/bio/metagentools/data/cov_simreads/paired_10seq_150bp
cov_sim_reads_paired_10seq_150bp
Simulated reads (paired) of length 150bp out of 10 covid sequences


In [None]:
# Upload the directory with the simulated reads as a dataset artifact of the active WandB run.
# The WandB run is optional in this notebook: when the cells creating `wandb_run` were not executed,
# the upload is skipped instead of failing with a NameError.
wandb_run = globals().get('wandb_run')
if wandb_run is None:
    print('No active WandB run: dataset upload skipped')
    afx = None
else:
    afx = wandb_run.upload_dataset(
        ds_path=ds_dir,
        ds_name=ds_name,
        ds_type=ds_type,
        ds_descr=ds_descr,
        ds_metadata=sim_params,
        load_type='dir'
    )

[34m[1mwandb[0m: Adding directory to artifact (/home/vtec/projects/bio/metagentools/data/cov_simreads/paired_10seq_150bp)... Done. 0.1s


Dataset cov_sim_reads_paired_10seq_150bp is being logged as artifact ...


## Close current WandB active run

In [None]:
# Close the active WandB run, if any. The WandB run is optional in this notebook: nothing is done
# when the cells creating `wandb_run` were not executed (instead of failing with a NameError).
wandb_run = globals().get('wandb_run')
if wandb_run is not None:
    wandb_run.finish()

# Others