# Handle covid sequence fasta files

Notebook objective:
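- Explore the coronavirus reference sequence files (FASTA) stored in `data/ncbi/refsequences`
- Parse the metadata (taxonomy ID, accession number, species) held in each sequence definition line
- Simulate single and paired reads (50 bp and 150 bp) from the 10-sequence file with ART Illumina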

# Imports and setup environment

### Install and import packages

In [1]:
# Install required custom packages if not installed yet.
import importlib.util
if not importlib.util.find_spec('ecutilities'):
    print('installing package: `ecutilities`')
    ! pip install -qqU ecutilities
else:
    print('`ecutilities` already installed')
if not importlib.util.find_spec('metagentools'):
    print('installing package: `metagentools')
    ! pip install -qqU metagentools
else:
    print('`metagentools` already installed')

`ecutilities` already installed
`metagentools` already installed


In [2]:
# Import all required packages
import re
import shutil
from pathlib import Path
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from ecutilities.core import files_in_tree, path_to_parent_dir
from ecutilities.ipython import nb_setup
from metagentools.art import ArtIllumina
from metagentools.core import ProjectFileSystem

# Setup the notebook for development
# (the recorded output of this call reports that autoreload mode is set)
nb_setup()

Set autoreload mode


## Setup project file system

In [3]:
# Create the project file system helper and display the directories it exposes
# (home, project root, data and notebooks) to check that they resolve as expected.
pfs = ProjectFileSystem()
pfs.home, pfs.project_root, pfs.data, pfs.nbs

(Path('/home/vtec'),
 Path('/home/vtec/projects/bio/metagentools'),
 Path('/home/vtec/projects/bio/metagentools/data'),
 Path('/home/vtec/projects/bio/metagentools/nbs'))

## Login and create WandB run (Optional)

In [4]:
# from metagentools import wandb

In [5]:
# nb_fname = '00_load_cov_sequence_files.ipynb'
# wandb.login_nb(nb_fname)

In [6]:
# entity = 'metagenomics_sh'

In [7]:
# projs = wandb.entity_projects(entity)
# for i, p in enumerate(projs):
#     print(f"{i}: {p.name}")

In [8]:
# project = 'create_and_load_sim_reads'

# run_name = wandb.unique_run_name('seq_files')
# run_name

In [9]:
# job_type='load_datasets'
# notes='load covid sequence files (fasta)'

In [10]:
# wandb_run = wandb.WandbRun(
#     entity=entity, 
#     project=project, 
#     run_name=run_name, 
#     job_type=job_type, 
#     notes=notes
#     )

## Load `fa` file and parse data

Define path to files

In [11]:
# Directory holding the NCBI reference sequence files, under the project data directory
p2inputs = pfs.data/ 'ncbi/refsequences'
print(p2inputs.absolute())
# Stop here if the input directory does not exist
assert p2inputs.is_dir()

/home/vtec/projects/bio/metagentools/data/ncbi/refsequences


Explore files in the directory:

In [12]:
# Collect the files found under `p2inputs`, filtered with `pattern='sequence'`.
# The call also prints the directory tree; the number shown in parentheses next to
# each file is the index used to access it in `files`.
files = files_in_tree(p2inputs, pattern='sequence')

ncbi
  |--refsequences
  |    |--cov_virus_sequence_001-seq1.fa (0)
  |    |--cov_virus_sequences.txt (1)
  |    |--cov_virus_sequences-original.txt (2)
  |    |--cov_virus_sequences_100-seqs.fa (3)
  |    |--cov_virus_sequences_002-seqs.fa (4)
  |    |--cov_virus_sequences_025-seqs.fa (5)
  |    |--cov_virus_sequences_010-seqs.fa (6)
  |    |--cov_virus_sequence_001-seq2.fa (7)
  |    |--cov_virus_original_cnn_sequences.json (8)
  |    |--cov_virus_sequences.fa (9)
  |    |--groups_1


Pick fasta file with 100 sequences and read it

In [13]:
# Select the 100-sequence FASTA file by name instead of by a hard-coded position:
# the order of `files` follows the directory listing (not sorted), so a fixed index
# can silently point to another file on a different machine or after files are added.
# `list.index` raises a ValueError if the file is missing.
target_fname = 'cov_virus_sequences_100-seqs.fa'
n = [f.name for f in files].index(target_fname)
files[n].name

'cov_virus_sequences_100-seqs.fa'

In [14]:
# Keep only the FASTA definition lines, i.e. the lines starting with '>'
with open(files[n], 'r') as fp:
    first_lines = [row for row in fp if row[0] == '>']

# Preview the first ten definition lines as a single string
display(''.join(first_lines[:10]))

'>2591237:ncbi:1 [MK211378]\t2591237\tncbi\t1 [MK211378] 2591237\tCoronavirus BtRs-BetaCoV/YN2018D\t\tscientific name\n>11128:ncbi:2 [LC494191]\t11128\tncbi\t2 [LC494191] 11128\tBovine coronavirus\t\tscientific name\n>31631:ncbi:3 [KY967361]\t31631\tncbi\t3 [KY967361] 31631\tHuman coronavirus OC43\t\tscientific name\n>277944:ncbi:4 [LC654455]\t277944\tncbi\t4 [LC654455] 277944\tHuman coronavirus NL63\t\tscientific name\n>11120:ncbi:5 [MN987231]\t11120\tncbi\t5 [MN987231] 11120\tInfectious bronchitis virus\t\tscientific name\n>28295:ncbi:6 [KU893866]\t28295\tncbi\t6 [KU893866] 28295\tPorcine epidemic diarrhea virus\t\tscientific name\n>28295:ncbi:7 [KJ645638]\t28295\tncbi\t7 [KJ645638] 28295\tPorcine epidemic diarrhea virus\t\tscientific name\n>28295:ncbi:8 [KJ645678]\t28295\tncbi\t8 [KJ645678] 28295\tPorcine epidemic diarrhea virus\t\tscientific name\n>28295:ncbi:9 [KR873434]\t28295\tncbi\t9 [KR873434] 28295\tPorcine epidemic diarrhea virus\t\tscientific name\n>1699095:ncbi:10 [KT36890

FASTA file structure for the `cov_data` files:

- `2591237` is the NCBI taxonomy ID a.k.a. `NCBI:txid2591237`. See reference [on NCBI site](https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=2591237)
- `ncbi` is the source of info
- `1` is the sequence number in the file
- `MK211378` is the GenBank accession number for the sequence. See reference [on NCBI site](https://www.ncbi.nlm.nih.gov/nuccore/MK211378)
- `Coronavirus BtRs-BetaCoV/YN2018D` is the species

### Parse sequence metadata

Use a regular expression to extract the metadata from each definition line

In [15]:
# Regex for one definition line. Named groups: `txid`, `accession` and `species`.
# After `>{txid}:ncbi:{number} [{accession}]`, two alternatives are accepted:
#   - the line ends there (optional whitespace, then newline): `species` is not captured
#   - the txid and accession are repeated in further fields, followed by the species
#     and the literal text `scientific name` right before the newline
# NOTE: `species` is captured with a greedy `.*`, so it can include trailing whitespace.
fasta_seq_metadata_pattern = r"^>(?P<txid>\d*):ncbi:\d*(\s*|\t)\[(?P<accession>\w*\d*)\](\s*\n|((\s*|\t)(?P=txid)(\s*|\t)ncbi(\s*|\t)\d*(\s*|\t)\[(?P=accession)\](\s*|\t)(?P=txid)(\s*|\t)(?P<species>.*)(\s*|\t)scientific name\n))"

In [16]:
# Build a dictionary of metadata keyed by accession number.
# Definition lines that do not match the pattern are printed with their index.
seq_metadata = {}
for idx, defline in enumerate(first_lines):
    found = re.match(fasta_seq_metadata_pattern, defline)
    if found is None:
        print(idx, defline)
        continue
    record = {field: found.group(field) for field in ('txid', 'accession', 'species')}
    seq_metadata[record['accession']] = record

In [17]:
# Accession numbers of all the definition lines that were parsed
seq_metadata.keys()

dict_keys(['MK211378', 'LC494191', 'KY967361', 'LC654455', 'MN987231', 'KU893866', 'KJ645638', 'KJ645678', 'KR873434', 'KT368904', 'KT779556', 'KT374055', 'AF208066', 'KF961222', 'MF598676', 'KY581695', 'MW792514', 'MN477899', 'EU526388', 'JQ900257', 'LC654446', 'MG987420', 'KJ645699', 'KT368912', 'KF186564', 'KF192507', 'KU847996', 'KX219795', 'MH006962', 'LT905451', 'MW351626', 'LQ068527', 'MG546690', 'MH726387', 'KF923892', 'LC063846', 'KT336560', 'KU981060', 'KJ425512', 'KJ425487', 'JN129834', 'KJ128295', 'MW086531', 'KX185057', 'MF598677', 'MG837133', 'MN655002', 'JQ900256', 'LC654445', 'MG600030', 'MT576585', 'KP118894', 'KJ645654', 'LC063847', 'KT253272', 'OK500305', 'KX259256', 'MF598717', 'MC425624', 'MH539771', 'MK140814', 'MK071638', 'LC494181', 'MN594506', 'KJ650098', 'KP728470', 'KT368831', 'KR265862', 'KT323979', 'KJ425506', 'KU982975', 'KU886219', 'MZ328303', 'AF391542', 'FJ938054', 'JF330898', 'JF828980', 'KT326819', 'KX722530', 'MG977451', 'MA347651', 'MH061342', 'KJ42

In [18]:
# Inspect one entry: the raw `species` value still ends with the tab characters captured by the regex
seq_metadata['KY967361']

{'txid': '31631',
 'accession': 'KY967361',
 'species': 'Human coronavirus OC43\t\t'}

In [19]:
def parse_seq_metadata(line):
    """Extract the metadata held in one FASTA definition line of the `cov_data` files.

    Two layouts are recognised:
    - short: `>{txid}:ncbi:{number} [{accession}]`
    - long: the short layout followed by fields repeating the txid and the accession,
      then the species and the literal text `scientific name`

    Args:
        line (str): definition line, with or without its terminating newline.

    Returns:
        dict: keys `txid`, `accession` and `species`. In each value, tabs are replaced
        by spaces and surrounding whitespace is stripped. `species` is None when the
        line uses the short layout.

    Raises:
        ValueError: if `line` matches neither layout; the message includes the line.
    """
    pattern = r"^>(?P<txid>\d*):ncbi:\d*(\s*|\t)\[(?P<accession>\w*\d*)\](\s*\n|((\s*|\t)(?P=txid)(\s*|\t)ncbi(\s*|\t)\d*(\s*|\t)\[(?P=accession)\](\s*|\t)(?P=txid)(\s*|\t)(?P<species>.*)(\s*|\t)scientific name\n))"
    # The pattern requires a terminating newline: add it when the caller passes a
    # line without one, so that such lines can still be parsed.
    candidate = line if line.endswith('\n') else line + '\n'
    matches = re.match(pattern, candidate)
    if matches is None:
        # Report the offending line so that the failure can be diagnosed
        raise ValueError(f"No match on this line: {line!r}")
    metadata = {}
    for g in ('txid', 'accession', 'species'):
        m = matches.group(g)
        # The greedy species group captures trailing tabs: normalise all values
        metadata[g] = m.replace('\t', ' ').strip() if m is not None else None
    return metadata
        
# Check the function on a definition line for which no species is captured
parse_seq_metadata(first_lines[72])

{'txid': '2877474', 'accession': 'MZ328303', 'species': None}

# Using 10 sequences


In [20]:
# Number of sequences used for the simulations. In the cells below it is only used
# to label the simulation parameters and the output seed.
nb_sequences = 10

## Single read simulation - 50 bp read

In [27]:
# Locate the ART Illumina executable on the PATH instead of relying only on a
# hard-coded absolute path; fall back to the usual system location when the
# executable is not found on the PATH.
path2art = Path(shutil.which('art_illumina') or '/usr/bin/art_illumina')
art = ArtIllumina(path2app=path2art, input_dir=p2inputs, output_dir=Path('reads'))

Ready to operate with art: /usr/bin/art_illumina
Input files from : /home/vtec/projects/bio/metagentools/data/ncbi/refsequences
Output files to :  /home/vtec/projects/bio/metagentools/nbs/art/reads


### Prepare simulated read files

In [30]:
# List the FASTA files available as simulation input
art.list_all_input_files()

cov_virus_sequence_001-seq1.fa
cov_virus_sequence_001-seq2.fa
cov_virus_sequences.fa
cov_virus_sequences_002-seqs.fa
cov_virus_sequences_010-seqs.fa
cov_virus_sequences_025-seqs.fa
cov_virus_sequences_100-seqs.fa


In [31]:
# Derive the input file name from `nb_sequences` so that the file used for the
# simulation and the number of sequences recorded in the parameters cannot diverge.
# With nb_sequences = 10 this is 'cov_virus_sequences_010-seqs.fa'.
input_fname = f'cov_virus_sequences_{nb_sequences:03d}-seqs.fa'

Run a single-read simulation with the input file and the following parameters:

In [32]:
# Parameters of the single-read simulation with 50 bp reads
sim_params = dict(
    input_file=input_fname,
    sim_type="single",
    read_length=50,
    nb_sequences=nb_sequences,
    fold=100,
    q_profile='HS25',
)

# Output seed built from the simulation type, the number of sequences and the read length
sim_params.update(
    output_seed=f"{sim_params['sim_type']}_{sim_params['nb_sequences']}seq_{sim_params['read_length']}bp"
)
sim_params

{'input_file': 'cov_virus_sequences_010-seqs.fa',
 'sim_type': 'single',
 'read_length': 50,
 'nb_sequences': 10,
 'fold': 100,
 'q_profile': 'HS25',
 'output_seed': 'single_10seq_50bp'}

In [33]:
# Run the single-read simulation with the parameters defined in `sim_params`.
# `ss` receives the quality profile ('HS25'), reported by ART as the HiSeq 2500 profile.
# NOTE(review): `overwrite=True` presumably replaces existing output for the same seed — confirm in `ArtIllumina.sim_reads`.
art.sim_reads( 
    input_file=sim_params['input_file'],
    output_seed=sim_params['output_seed'],
    sim_type=sim_params['sim_type'],
    read_length=sim_params['read_length'],
    fold=sim_params['fold'],
    ss=sim_params['q_profile'],
    overwrite=True
)

return code:  0 


             ART_Illumina (2008-2016)          
          Q Version 2.5.8 (June 6, 2016)       
     Contact: Weichun Huang <whduke@gmail.com> 
    -------------------------------------------

                  Single-end Simulation

Total CPU time used: 7.52899

The random seed for the run: 1705634853

Parameters used during run
	Read Length:	50
	Genome masking 'N' cutoff frequency: 	1 in 50
	Fold Coverage:            100X
	Profile Type:             Combined
	ID Tag:                   

Quality Profile(s)
	First Read:   HiSeq 2500 Length 126 R1 (built-in profile) 

Output files

  FASTQ Sequence File:
	/home/vtec/projects/bio/metagentools/nbs/art/reads/single_10seq_50bp/single_10seq_50bp.fq

  ALN Alignment File:
	/home/vtec/projects/bio/metagentools/nbs/art/reads/single_10seq_50bp/single_10seq_50bp.aln




In [34]:
# List the files created by the last simulation (reads in `.fq`, alignments in `.aln`)
art.list_last_output_files()

single_10seq_50bp.fq
single_10seq_50bp.aln


### Create WandB dataset artifact and log it (optional)

In [35]:
# ds_dir = art.output_dir / art.last_output_seed
# print(ds_dir)
# assert ds_dir.is_dir()

# ds_name = f"cov_sim_reads_{sim_params['sim_type']}_{sim_params['nb_sequences']}seq_{sim_params['read_length']}bp"
# ds_type = 'sim_reads'
# ds_descr = f"Simulated reads ({sim_params['sim_type']}) of length {sim_params['read_length']}bp out of {sim_params['nb_sequences']} covid sequences"

# print(ds_name)
# print(ds_descr)

In [52]:
# afx = wandb_run.upload_dataset(
#     ds_path=ds_dir,
#     ds_name=ds_name, 
#     ds_type=ds_type,
#     ds_descr=ds_descr,
#     ds_metadata=sim_params,
#     load_type='dir'
# )

## Single read simulation - 150 bp read

### Prepare simulated read files

Run a single-read simulation with the input file and the following parameters:

In [36]:
# Parameters of the single-read simulation with 150 bp reads
sim_params = dict(
    input_file=input_fname,
    sim_type="single",
    read_length=150,
    nb_sequences=nb_sequences,
    fold=100,
    q_profile='HS25',
)

# Output seed built from the simulation type, the number of sequences and the read length
sim_params.update(
    output_seed=f"{sim_params['sim_type']}_{sim_params['nb_sequences']}seq_{sim_params['read_length']}bp"
)
sim_params

{'input_file': 'cov_virus_sequences_010-seqs.fa',
 'sim_type': 'single',
 'read_length': 150,
 'nb_sequences': 10,
 'fold': 100,
 'q_profile': 'HS25',
 'output_seed': 'single_10seq_150bp'}

In [37]:
# Run the single-read simulation with the parameters defined in `sim_params`.
# `ss` receives the quality profile ('HS25'), reported by ART as the HiSeq 2500 profile.
# NOTE(review): `overwrite=True` presumably replaces existing output for the same seed — confirm in `ArtIllumina.sim_reads`.
art.sim_reads( 
    input_file=sim_params['input_file'],
    output_seed=sim_params['output_seed'],
    sim_type=sim_params['sim_type'],
    read_length=sim_params['read_length'],
    fold=sim_params['fold'],
    ss=sim_params['q_profile'],
    overwrite=True
)

return code:  0 


             ART_Illumina (2008-2016)          
          Q Version 2.5.8 (June 6, 2016)       
     Contact: Weichun Huang <whduke@gmail.com> 
    -------------------------------------------

                  Single-end Simulation

Total CPU time used: 4.19221

The random seed for the run: 1705634871

Parameters used during run
	Read Length:	150
	Genome masking 'N' cutoff frequency: 	1 in 150
	Fold Coverage:            100X
	Profile Type:             Combined
	ID Tag:                   

Quality Profile(s)
	First Read:   HiSeq 2500 Length 150 R1 (built-in profile) 

Output files

  FASTQ Sequence File:
	/home/vtec/projects/bio/metagentools/nbs/art/reads/single_10seq_150bp/single_10seq_150bp.fq

  ALN Alignment File:
	/home/vtec/projects/bio/metagentools/nbs/art/reads/single_10seq_150bp/single_10seq_150bp.aln




In [38]:
# List the files created by the last simulation (reads in `.fq`, alignments in `.aln`)
art.list_last_output_files()

single_10seq_150bp.fq
single_10seq_150bp.aln


In [39]:
# List all the output files created so far, grouped by output seed
art.list_all_output_files()

single_10seq_150bp
- single_10seq_150bp.fq
- single_10seq_150bp.aln
single_10seq_50bp
- single_10seq_50bp.fq
- single_10seq_50bp.aln


### Create WandB dataset artifact and log it (optional)

In [40]:
# ds_dir = art.output_dir / art.last_output_seed
# print(ds_dir)
# assert ds_dir.is_dir()

# ds_name = f"cov_sim_reads_{sim_params['sim_type']}_{sim_params['nb_sequences']}seq_{sim_params['read_length']}bp"
# ds_type = 'sim_reads'
# ds_descr = f"Simulated reads ({sim_params['sim_type']}) of length {sim_params['read_length']}bp out of {sim_params['nb_sequences']} covid sequences"

# print(ds_name)
# print(ds_descr)

In [41]:
# afx = wandb_run.upload_dataset(
#     ds_path=ds_dir,
#     ds_name=ds_name, 
#     ds_type=ds_type,
#     ds_descr=ds_descr,
#     ds_metadata=sim_params,
#     load_type='dir'
# )

## Paired read simulation - 50 bp read

### Prepare simulated read files

Run a paired-read simulation with the input file and the following parameters:

In [42]:
# Parameters of the paired-read simulation with 50 bp reads
sim_params = dict(
    input_file=input_fname,
    sim_type="paired",
    read_length=50,
    nb_sequences=nb_sequences,
    fold=100,
    mean_read=200,
    std_read=10,
    q_profile='HS25',
)

# Output seed built from the simulation type, the number of sequences and the read length
sim_params.update(
    output_seed=f"{sim_params['sim_type']}_{sim_params['nb_sequences']}seq_{sim_params['read_length']}bp"
)
sim_params

{'input_file': 'cov_virus_sequences_010-seqs.fa',
 'sim_type': 'paired',
 'read_length': 50,
 'nb_sequences': 10,
 'fold': 100,
 'mean_read': 200,
 'std_read': 10,
 'q_profile': 'HS25',
 'output_seed': 'paired_10seq_50bp'}

In [43]:
# Run the paired-read simulation with the parameters defined in `sim_params`.
# `mean_read` and `std_read` are reported by ART as the mean fragment length and
# its standard deviation.
# `ss` is passed explicitly, as in the single-read simulations, so that the quality
# profile recorded in `sim_params['q_profile']` is the one actually used, rather
# than whatever default `sim_reads` applies.
art.sim_reads(
    input_file=sim_params['input_file'],
    output_seed=sim_params['output_seed'],
    sim_type=sim_params['sim_type'],
    read_length=sim_params['read_length'],
    fold=sim_params['fold'],
    mean_read=sim_params['mean_read'],
    std_read=sim_params['std_read'],
    ss=sim_params['q_profile'],
    overwrite=True
)

return code:  0 


             ART_Illumina (2008-2016)          
          Q Version 2.5.8 (June 6, 2016)       
     Contact: Weichun Huang <whduke@gmail.com> 
    -------------------------------------------

                  Paired-end sequencing simulation

Total CPU time used: 7.99872

The random seed for the run: 1705634905

Parameters used during run
	Read Length:	50
	Genome masking 'N' cutoff frequency: 	1 in 50
	Fold Coverage:            100X
	Mean Fragment Length:     200
	Standard Deviation:       10
	Profile Type:             Combined
	ID Tag:                   

Quality Profile(s)
	First Read:   HiSeq 2500 Length 126 R1 (built-in profile) 
	First Read:   HiSeq 2500 Length 126 R2 (built-in profile) 

Output files

  FASTQ Sequence Files:
	 the 1st reads: /home/vtec/projects/bio/metagentools/nbs/art/reads/paired_10seq_50bp/paired_10seq_50bp1.fq
	 the 2nd reads: /home/vtec/projects/bio/metagentools/nbs/art/reads/paired_10seq_50bp/paired_10seq_50bp2.fq

  ALN Alignment Files

In [44]:
# List the files created by the last simulation (one `.fq` and one `.aln` file per read of the pair)
art.list_last_output_files()

paired_10seq_50bp2.aln
paired_10seq_50bp1.fq
paired_10seq_50bp1.aln
paired_10seq_50bp2.fq


In [45]:
# List all the output files created so far, grouped by output seed
art.list_all_output_files()

paired_10seq_50bp
- paired_10seq_50bp2.aln
- paired_10seq_50bp1.fq
- paired_10seq_50bp1.aln
- paired_10seq_50bp2.fq
single_10seq_150bp
- single_10seq_150bp.fq
- single_10seq_150bp.aln
single_10seq_50bp
- single_10seq_50bp.fq
- single_10seq_50bp.aln


### Create WandB dataset artifact and log it (optional)

In [46]:
# ds_dir = art.output_dir / art.last_output_seed
# print(ds_dir)
# assert ds_dir.is_dir()

# ds_name = f"cov_sim_reads_{sim_params['sim_type']}_{sim_params['nb_sequences']}seq_{sim_params['read_length']}bp"
# ds_type = 'sim_reads'
# ds_descr = f"Simulated reads ({sim_params['sim_type']}) of length {sim_params['read_length']}bp out of {sim_params['nb_sequences']} covid sequences"

# print(ds_name)
# print(ds_descr)

In [47]:
# afx = wandb_run.upload_dataset(
#     ds_path=ds_dir,
#     ds_name=ds_name, 
#     ds_type=ds_type,
#     ds_descr=ds_descr,
#     ds_metadata=sim_params,
#     load_type='dir'
# )

## Paired read simulation - 150 bp read


### Prepare simulated read files

In [48]:
# Parameters of the paired-read simulation with 150 bp reads
sim_params = dict(
    input_file=input_fname,
    sim_type="paired",
    read_length=150,
    nb_sequences=nb_sequences,
    fold=100,
    mean_read=200,
    std_read=10,
    q_profile='HS25',
)

# Output seed built from the simulation type, the number of sequences and the read length
sim_params.update(
    output_seed=f"{sim_params['sim_type']}_{sim_params['nb_sequences']}seq_{sim_params['read_length']}bp"
)
sim_params

{'input_file': 'cov_virus_sequences_010-seqs.fa',
 'sim_type': 'paired',
 'read_length': 150,
 'nb_sequences': 10,
 'fold': 100,
 'mean_read': 200,
 'std_read': 10,
 'q_profile': 'HS25',
 'output_seed': 'paired_10seq_150bp'}

In [49]:
# Run the paired-read simulation with the parameters defined in `sim_params`.
# `mean_read` and `std_read` are reported by ART as the mean fragment length and
# its standard deviation.
# `ss` is passed explicitly, as in the single-read simulations, so that the quality
# profile recorded in `sim_params['q_profile']` is the one actually used, rather
# than whatever default `sim_reads` applies.
art.sim_reads(
    input_file=sim_params['input_file'],
    output_seed=sim_params['output_seed'],
    sim_type=sim_params['sim_type'],
    read_length=sim_params['read_length'],
    fold=sim_params['fold'],
    mean_read=sim_params['mean_read'],
    std_read=sim_params['std_read'],
    ss=sim_params['q_profile'],
    overwrite=True
)

return code:  0 


             ART_Illumina (2008-2016)          
          Q Version 2.5.8 (June 6, 2016)       
     Contact: Weichun Huang <whduke@gmail.com> 
    -------------------------------------------

                  Paired-end sequencing simulation

Total CPU time used: 4.15743

The random seed for the run: 1705634947

Parameters used during run
	Read Length:	150
	Genome masking 'N' cutoff frequency: 	1 in 150
	Fold Coverage:            100X
	Mean Fragment Length:     200
	Standard Deviation:       10
	Profile Type:             Combined
	ID Tag:                   

Quality Profile(s)
	First Read:   HiSeq 2500 Length 150 R1 (built-in profile) 
	First Read:   HiSeq 2500 Length 150 R2 (built-in profile) 

Output files

  FASTQ Sequence Files:
	 the 1st reads: /home/vtec/projects/bio/metagentools/nbs/art/reads/paired_10seq_150bp/paired_10seq_150bp1.fq
	 the 2nd reads: /home/vtec/projects/bio/metagentools/nbs/art/reads/paired_10seq_150bp/paired_10seq_150bp2.fq

  ALN Alignment

In [50]:
# List the files created by the last simulation (one `.fq` and one `.aln` file per read of the pair)
art.list_last_output_files()

paired_10seq_150bp1.aln
paired_10seq_150bp2.fq
paired_10seq_150bp2.aln
paired_10seq_150bp1.fq


### Create WandB dataset artifact and log it (optional)

In [51]:
# ds_dir = art.output_dir / art.last_output_seed
# print(ds_dir)
# assert ds_dir.is_dir()

# ds_name = f"cov_sim_reads_{sim_params['sim_type']}_{sim_params['nb_sequences']}seq_{sim_params['read_length']}bp"
# ds_type = 'sim_reads'
# ds_descr = f"Simulated reads ({sim_params['sim_type']}) of length {sim_params['read_length']}bp out of {sim_params['nb_sequences']} covid sequences"

# print(ds_name)
# print(ds_descr)

In [52]:
# afx = wandb_run.upload_dataset(
#     ds_path=ds_dir,
#     ds_name=ds_name, 
#     ds_type=ds_type,
#     ds_descr=ds_descr,
#     ds_metadata=sim_params,
#     load_type='dir'
# )

## Close current WandB active run

In [53]:
# wandb_run.finish()