# Handle NCBI sequence fasta files

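Notebook objective:
- Load NCBI coronavirus reference sequences from fasta files and parse their definition lines.
- Simulate single-end and paired-end reads from these sequences with ART Illumina.
- Inspect the resulting `.fq` and `.aln` files.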

# Imports and setup environment

### Install and import packages

In [None]:
# Install required custom packages if not installed yet.
import importlib.util
if not importlib.util.find_spec('eccore'):
    print('installing package: `eccore`')
    ! pip install -qqU eccore
else:
    print('`eccore` already installed')
if not importlib.util.find_spec('metagentorch'):
    print('installing package: `metagentorch')
    ! pip install -qqU metagentorch
else:
    print('`metagentorch` already installed')

`eccore` already installed
`metagentorch` already installed


In [None]:
# Import all required packages
import re
import shutil
from pathlib import Path
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from eccore.core import files_in_tree, path_to_parent_dir
from eccore.ipython import nb_setup
from metagentorch.art import ArtIllumina
from metagentorch.cnn_virus.data import FastaFileReader, FastqFileReader, AlnFileReader
from metagentorch.core import ProjectFileSystem, TextFileBaseReader

# Setup the notebook for development.
# The call prints "Set autoreload mode"; presumably it enables IPython's autoreload so
# that edits to imported modules are picked up without a restart — confirm in `eccore`.
nb_setup()

Set autoreload mode


## Setup project file system

In [None]:
# Create the project file-system helper and display the main paths it exposes.
# NOTE(review): how these paths are resolved is not visible here — see `ProjectFileSystem`.
pfs = ProjectFileSystem()
pfs.home, pfs.project_root, pfs.data, pfs.nbs

(Path('/home/vtec'),
 Path('/home/vtec/projects/bio/metagentorch'),
 Path('/home/vtec/projects/bio/metagentorch/data'),
 Path('/home/vtec/projects/bio/metagentorch/nbs'))

## Load `fa` file and parse data

Define path to files

In [None]:
# Directory holding the NCBI reference sequences, inside the project data directory.
p2inputs = pfs.data / 'ncbi/refsequences'
print(p2inputs.absolute())
# Fail early, with an explicit message, if the directory is missing.
assert p2inputs.is_dir(), f"reference sequence directory not found: {p2inputs}"

/home/vtec/projects/bio/metagentorch/data/ncbi/refsequences


Explore files in the directory:

In [None]:
# Print an indexed tree of the files found under `p2inputs` for the pattern 'cov'
# and keep the returned collection so that a file can be picked from it below.
files = files_in_tree(p2inputs, pattern='cov')

ncbi
  |--refsequences
  |    |--mRhiFer1
  |    |--yf
  |    |--cov
  |    |    |--cov_refseq_original.fa (0)
  |    |    |--cov_original_cnn_sequences.json (1)
  |    |    |--cov_refseq_002-seqs.fa (2)
  |    |    |--cov_refseq_025-seqs.fa (3)
  |    |    |--cov_refseq_001-seq2.fa (4)
  |    |    |--cov_refseq_010-seqs.fa (5)
  |    |    |--cov_refseq_list.txt (6)
  |    |    |--cov_refseq.fa (7)
  |    |    |--cov_refseq_001-seq1.fa (8)
  |    |    |--cov_refseq_100-seqs.fa (9)


Pick fasta file and read it

In [None]:
# Select the fasta file by name instead of a hard-coded position: the listing printed
# above is not sorted, so positions are not guaranteed to be stable across machines.
# Raises ValueError if the file is not in `files`.
n = [f.name for f in files].index('cov_refseq_010-seqs.fa')
files[n].name

'cov_refseq_010-seqs.fa'

In [None]:
# Open the selected fasta file and print its first sequence
# (definition line followed by the beginning of the sequence).
fasta = FastaFileReader(files[n])
fasta.print_first_chunks(1)


Sequence 1:
>2591237:ncbi:1	1	MK211378	2591237	ncbi	Coronavirus BtRs-BetaCoV/YN2018D
TATTAGGTTTTCTACCTACCCAGGAAAAGCCAACCAACCTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAAT ...


In [None]:
# Go back to the first sequence, then show every definition line
# together with the metadata parsed from it.
fasta.reset_iterator()
for refseq in fasta:
    defline = refseq['definition line']
    print(defline)
    print(fasta.parse_text(defline))

>2591237:ncbi:1	1	MK211378	2591237	ncbi	Coronavirus BtRs-BetaCoV/YN2018D
{'accession': 'MK211378', 'organism': 'Coronavirus BtRs-BetaCoV/YN2018D', 'seqid': '2591237:ncbi:1', 'seqnb': '1', 'source': 'ncbi', 'taxonomyid': '2591237'}
>11128:ncbi:2	2	LC494191	11128	ncbi	Bovine coronavirus
{'accession': 'LC494191', 'organism': 'Bovine coronavirus', 'seqid': '11128:ncbi:2', 'seqnb': '2', 'source': 'ncbi', 'taxonomyid': '11128'}
>31631:ncbi:3	3	KY967361	31631	ncbi	Human coronavirus OC43
{'accession': 'KY967361', 'organism': 'Human coronavirus OC43', 'seqid': '31631:ncbi:3', 'seqnb': '3', 'source': 'ncbi', 'taxonomyid': '31631'}
>277944:ncbi:4	4	LC654455	277944	ncbi	Human coronavirus NL63
{'accession': 'LC654455', 'organism': 'Human coronavirus NL63', 'seqid': '277944:ncbi:4', 'seqnb': '4', 'source': 'ncbi', 'taxonomyid': '277944'}
>11120:ncbi:5	5	MN987231	11120	ncbi	Infectious bronchitis virus
{'accession': 'MN987231', 'organism': 'Infectious bronchitis virus', 'seqid': '11120:ncbi:5', 'seqnb

# Simulate reads using 1 sequence

In [None]:
# Locate the single-sequence fasta file by name (positions in `files` are not guaranteed
# to be stable) and use its parent directory as input directory for the simulations.
# Raises ValueError if the file is not in `files`.
n = [f.name for f in files].index('cov_refseq_001-seq1.fa')
p2inputs = files[n].parent
p2inputs.absolute()

Path('/home/vtec/projects/bio/metagentorch/data/ncbi/refsequences/cov')

## Single read simulation - 50 bp read

In [None]:
# Directory receiving the simulated reads; the assert requires it to exist already.
p2simread_outputs = pfs.data / 'ncbi/simreads/cov'
assert p2simread_outputs.is_dir()
p2simread_outputs

Path('/home/vtec/projects/bio/metagentorch/data/ncbi/simreads/cov')

In [None]:
# Use the `art_illumina` executable found on PATH; fall back to the usual system
# location when it is not on PATH, so the previous behavior is preserved.
art = ArtIllumina(
    path2app=Path(shutil.which('art_illumina') or '/usr/bin/art_illumina'),
    input_dir=p2inputs,
    output_dir=p2simread_outputs,
)

Ready to operate with art: /usr/bin/art_illumina
Input files from : /home/vtec/projects/bio/metagentorch/data/ncbi/refsequences/cov
Output files to :  /home/vtec/projects/bio/metagentorch/data/ncbi/simreads/cov


### Prepare simulated read files

In [None]:
# Print the names of the input files available to ART (only `.fa` files show up in the output).
art.list_all_input_files()

cov_refseq.fa
cov_refseq_001-seq1.fa
cov_refseq_001-seq2.fa
cov_refseq_002-seqs.fa
cov_refseq_010-seqs.fa
cov_refseq_025-seqs.fa
cov_refseq_100-seqs.fa
cov_refseq_original.fa


In [None]:
# Reference file used by the single-sequence simulations below.
input_fname = 'cov_refseq_001-seq1.fa'
# Presumably the number of sequences in that file; used below to build `output_seed`.
nb_sequences = 1

Run a single-read simulation with the input file and the following parameters:

Parameter `fold`:

Fold coverage, also known as sequencing depth or read depth, represents the average number of times each base in the reference genome is expected to be sequenced. For example:
- If you set -f 20, it means you're simulating a sequencing run that would cover each base in the reference genome an average of 20 times.
- If you set -f 100, it would simulate coverage where each base is sequenced an average of 100 times.

The fold coverage is an important parameter because it affects:
- The total number of reads generated: Higher fold coverage results in more reads.
- The likelihood of capturing rare variants or sequencing errors: Higher coverage generally improves the ability to detect rare variants and distinguish true variants from sequencing errors.
- The overall quality of the simulated dataset: Higher coverage typically leads to more accurate representation of the reference genome in the simulated data.

It's worth noting that ART Illumina uses this fold coverage value along with the read length and reference genome size to calculate the total number of reads to generate. The actual formula is:

```Total number of reads = (Genome size * Fold coverage) / Read length```

In [None]:
def expected_read_count(genome_size: int, fold: int, read_length: int) -> int:
    """Return the number of reads expected for a genome size, fold coverage and read length.

    Applies `(genome size * fold coverage) / read length`, rounded down.
    """
    return (genome_size * fold) // read_length


# NOTE(review): 10_238 bp and 150 bp are kept from the original cell; confirm that they
# match the reference sequence and the read length simulated in this section (50 bp below).
genome_size = 10_238
expected_read_count(genome_size, fold=200, read_length=150)

13650

In [None]:
# Parameters of the simulation. `output_seed` is derived from them and is used as the
# name of the output folder and as the stem of the output files.
sim_params = dict(
    input_file=input_fname,
    sim_type="single",
    read_length=50,
    nb_sequences=nb_sequences,
    fold=200,
    q_profile='HS25',
)
sim_params['output_seed'] = f"{sim_params['sim_type']}_{sim_params['nb_sequences']}seq_{sim_params['read_length']}bp"
sim_params

{'input_file': 'cov_refseq_001-seq1.fa',
 'sim_type': 'single',
 'read_length': 50,
 'nb_sequences': 1,
 'fold': 200,
 'q_profile': 'HS25',
 'output_seed': 'single_1seq_50bp'}

In [None]:
# Run ART with the parameters prepared in `sim_params`.
# `ss` receives the quality profile; `overwrite=True` presumably replaces existing output.
art.sim_reads(
    **{key: sim_params[key] for key in ('input_file', 'output_seed', 'sim_type', 'read_length', 'fold')},
    ss=sim_params['q_profile'],
    overwrite=True,
)

return code:  0 


             ART_Illumina (2008-2016)          
          Q Version 2.5.8 (June 6, 2016)       
     Contact: Weichun Huang <whduke@gmail.com> 
    -------------------------------------------

                  Single-end Simulation

Total CPU time used: 1.64234

The random seed for the run: 1738485338

Parameters used during run
	Read Length:	50
	Genome masking 'N' cutoff frequency: 	1 in 50
	Fold Coverage:            200X
	Profile Type:             Combined
	ID Tag:                   

Quality Profile(s)
	First Read:   HiSeq 2500 Length 126 R1 (built-in profile) 

Output files

  FASTQ Sequence File:
	/home/vtec/projects/bio/metagentorch/data/ncbi/simreads/cov/single_1seq_50bp/single_1seq_50bp.fq

  ALN Alignment File:
	/home/vtec/projects/bio/metagentorch/data/ncbi/simreads/cov/single_1seq_50bp/single_1seq_50bp.aln




In [None]:
# Print the files produced by the last simulation.
art.list_last_output_files()

single_1seq_50bp.aln
single_1seq_50bp.fq


## Single read simulation - 150 bp read

### Prepare simulated read files

Run a single-read simulation with the input file and the following parameters:

In [None]:
# Same simulation as before, with 150 bp reads. `output_seed` is derived from the
# parameters and names the output folder and files.
sim_params = dict(
    input_file=input_fname,
    sim_type="single",
    read_length=150,
    nb_sequences=nb_sequences,
    fold=200,
    q_profile='HS25',
)
sim_params['output_seed'] = f"{sim_params['sim_type']}_{sim_params['nb_sequences']}seq_{sim_params['read_length']}bp"
sim_params

{'input_file': 'cov_refseq_001-seq1.fa',
 'sim_type': 'single',
 'read_length': 150,
 'nb_sequences': 1,
 'fold': 200,
 'q_profile': 'HS25',
 'output_seed': 'single_1seq_150bp'}

In [None]:
# Run ART with the parameters prepared in `sim_params`.
# `ss` receives the quality profile; `overwrite=True` presumably replaces existing output.
art.sim_reads(
    **{key: sim_params[key] for key in ('input_file', 'output_seed', 'sim_type', 'read_length', 'fold')},
    ss=sim_params['q_profile'],
    overwrite=True,
)

return code:  0 


             ART_Illumina (2008-2016)          
          Q Version 2.5.8 (June 6, 2016)       
     Contact: Weichun Huang <whduke@gmail.com> 
    -------------------------------------------

                  Single-end Simulation

Total CPU time used: 0.903706

The random seed for the run: 1738485340

Parameters used during run
	Read Length:	150
	Genome masking 'N' cutoff frequency: 	1 in 150
	Fold Coverage:            200X
	Profile Type:             Combined
	ID Tag:                   

Quality Profile(s)
	First Read:   HiSeq 2500 Length 150 R1 (built-in profile) 

Output files

  FASTQ Sequence File:
	/home/vtec/projects/bio/metagentorch/data/ncbi/simreads/cov/single_1seq_150bp/single_1seq_150bp.fq

  ALN Alignment File:
	/home/vtec/projects/bio/metagentorch/data/ncbi/simreads/cov/single_1seq_150bp/single_1seq_150bp.aln




In [None]:
# Print the files produced by the last simulation.
art.list_last_output_files()

single_1seq_150bp.fq
single_1seq_150bp.aln


In [None]:
# Print every simulation folder in the output directory, with the files it contains.
art.list_all_output_files()

paired_1seq_150bp
- paired_1seq_150bp2.aln
- paired_1seq_150bp2.fq
- paired_1seq_150bp1.fq
- paired_1seq_150bp1.aln
single_1seq_150bp
- single_1seq_150bp.fq
- single_1seq_150bp.aln
single_1seq_50bp
- single_1seq_50bp.aln
- single_1seq_50bp.fq


## Paired read simulation - 50 bp read

### Prepare simulated read files

Run a paired-read simulation with the input file.

In [None]:
# sim_params = {
#     'input_file': input_fname,
#     "sim_type": "paired",
#     "read_length": 50,
#     'nb_sequences': nb_sequences,
#     "fold": 200,
#     'mean_read':200,
#     'std_read':10,
#     'q_profile': 'HS25'
# }

# sim_params['output_seed'] = f"{sim_params['sim_type']}_{sim_params['nb_sequences']}seq_{sim_params['read_length']}bp"
# sim_params

In [None]:
# art.sim_reads(
#     input_file=sim_params['input_file'],
#     output_seed=sim_params['output_seed'],
#     sim_type=sim_params['sim_type'],
#     read_length=sim_params['read_length'],
#     fold=sim_params['fold'],
#     mean_read=sim_params['mean_read'],
#     std_read=sim_params['std_read'],
#     overwrite=True
# )   

In [None]:
# art.list_last_output_files()

In [None]:
# art.list_all_output_files()

## Paired read simulation - 150 bp read


### Prepare simulated read files

In [None]:
# Parameters of the paired-end simulation; `mean_read` and `std_read` are reported by ART
# as mean fragment length and standard deviation. `output_seed` names the output folder and files.
sim_params = dict(
    input_file=input_fname,
    sim_type="paired",
    read_length=150,
    nb_sequences=nb_sequences,
    fold=200,
    mean_read=200,
    std_read=10,
    q_profile='HS25',
)
sim_params['output_seed'] = f"{sim_params['sim_type']}_{sim_params['nb_sequences']}seq_{sim_params['read_length']}bp"
sim_params

{'input_file': 'cov_refseq_001-seq1.fa',
 'sim_type': 'paired',
 'read_length': 150,
 'nb_sequences': 1,
 'fold': 200,
 'mean_read': 200,
 'std_read': 10,
 'q_profile': 'HS25',
 'output_seed': 'paired_1seq_150bp'}

In [None]:
# Run the paired-end simulation. The quality profile is now passed explicitly, as in the
# single-end runs, so that `q_profile` defined in `sim_params` is actually applied
# instead of relying on the default of `sim_reads`.
art.sim_reads(
    input_file=sim_params['input_file'],
    output_seed=sim_params['output_seed'],
    sim_type=sim_params['sim_type'],
    read_length=sim_params['read_length'],
    fold=sim_params['fold'],
    mean_read=sim_params['mean_read'],
    std_read=sim_params['std_read'],
    ss=sim_params['q_profile'],
    overwrite=True
)

return code:  0 


             ART_Illumina (2008-2016)          
          Q Version 2.5.8 (June 6, 2016)       
     Contact: Weichun Huang <whduke@gmail.com> 
    -------------------------------------------

                  Paired-end sequencing simulation

Total CPU time used: 0.818589

The random seed for the run: 1738485343

Parameters used during run
	Read Length:	150
	Genome masking 'N' cutoff frequency: 	1 in 150
	Fold Coverage:            200X
	Mean Fragment Length:     200
	Standard Deviation:       10
	Profile Type:             Combined
	ID Tag:                   

Quality Profile(s)
	First Read:   HiSeq 2500 Length 150 R1 (built-in profile) 
	First Read:   HiSeq 2500 Length 150 R2 (built-in profile) 

Output files

  FASTQ Sequence Files:
	 the 1st reads: /home/vtec/projects/bio/metagentorch/data/ncbi/simreads/cov/paired_1seq_150bp/paired_1seq_150bp1.fq
	 the 2nd reads: /home/vtec/projects/bio/metagentorch/data/ncbi/simreads/cov/paired_1seq_150bp/paired_1seq_150bp2.fq



In [None]:
# Print the files produced by the last simulation (two `.fq` and two `.aln` files for paired reads).
art.list_last_output_files()

paired_1seq_150bp2.aln
paired_1seq_150bp2.fq
paired_1seq_150bp1.fq
paired_1seq_150bp1.aln


# Simulate using all sequences


In [None]:
# List the files again; `p2inputs` now points to the `cov` directory itself.
files = files_in_tree(p2inputs, pattern='cov')

refsequences
  |--cov
  |    |--cov_refseq_original.fa (0)
  |    |--cov_original_cnn_sequences.json (1)
  |    |--cov_refseq_002-seqs.fa (2)
  |    |--cov_refseq_025-seqs.fa (3)
  |    |--cov_refseq_001-seq2.fa (4)
  |    |--cov_refseq_010-seqs.fa (5)
  |    |--cov_refseq_list.txt (6)
  |    |--cov_refseq.fa (7)
  |    |--cov_refseq_001-seq1.fa (8)
  |    |--cov_refseq_100-seqs.fa (9)


In [None]:
# Locate the fasta file with all sequences by name (positions in `files` are not
# guaranteed to be stable) and use its parent directory as input directory.
# Raises ValueError if the file is not in `files`.
n = [f.name for f in files].index('cov_refseq.fa')
p2inputs = files[n].parent
p2inputs.absolute()

Path('/home/vtec/projects/bio/metagentorch/data/ncbi/refsequences/cov')

In [None]:
# Directory receiving the simulated reads; the assert requires it to exist already.
p2simread_outputs = pfs.data / 'ncbi/simreads/cov'
assert p2simread_outputs.is_dir()

In [None]:
# Use the `art_illumina` executable found on PATH; fall back to the usual system
# location when it is not on PATH, so the previous behavior is preserved.
art = ArtIllumina(
    path2app=Path(shutil.which('art_illumina') or '/usr/bin/art_illumina'),
    input_dir=p2inputs,
    output_dir=p2simread_outputs,
)

Ready to operate with art: /usr/bin/art_illumina
Input files from : /home/vtec/projects/bio/metagentorch/data/ncbi/refsequences/cov
Output files to :  /home/vtec/projects/bio/metagentorch/data/ncbi/simreads/cov


In [None]:
# Print the names of the input files available to ART (only `.fa` files show up in the output).
art.list_all_input_files()

cov_refseq.fa
cov_refseq_001-seq1.fa
cov_refseq_001-seq2.fa
cov_refseq_002-seqs.fa
cov_refseq_010-seqs.fa
cov_refseq_025-seqs.fa
cov_refseq_100-seqs.fa
cov_refseq_original.fa


In [None]:
# Reference file used by the "all sequences" simulations below.
input_fname = 'cov_refseq.fa'

## Single read simulation - 50 bp read

Run a single-read simulation with the input file and the following parameters:

In [None]:
# sim_params = {
#     'input_file': input_fname,
#     "sim_type": "single",
#     "read_length": 50,
#     'nb_sequences': 'all_',
#     "fold": 200,
#     'q_profile': 'HS25'
# }

# sim_params['output_seed'] = f"{sim_params['sim_type']}_{sim_params['nb_sequences']}seq_{sim_params['read_length']}bp"
# sim_params

In [None]:
# art.sim_reads( 
#     input_file=sim_params['input_file'],
#     output_seed=sim_params['output_seed'],
#     sim_type=sim_params['sim_type'],
#     read_length=sim_params['read_length'],
#     fold=sim_params['fold'],
#     ss=sim_params['q_profile'],
#     overwrite=True
# )

In [None]:
# art.list_last_output_files()

## Single read simulation - 150 bp read

### Prepare simulated read files

Run a single-read simulation with the input file and the following parameters:

In [None]:
# Parameters of the single-end simulation on all sequences. 'all_' replaces the sequence
# count in `output_seed`, which names the output folder and files.
sim_params = dict(
    input_file=input_fname,
    sim_type="single",
    read_length=150,
    nb_sequences='all_',
    fold=200,
    q_profile='HS25',
)
sim_params['output_seed'] = f"{sim_params['sim_type']}_{sim_params['nb_sequences']}seq_{sim_params['read_length']}bp"
sim_params

{'input_file': 'cov_refseq.fa',
 'sim_type': 'single',
 'read_length': 150,
 'nb_sequences': 'all_',
 'fold': 200,
 'q_profile': 'HS25',
 'output_seed': 'single_all_seq_150bp'}

In [None]:
# Run ART with the parameters prepared in `sim_params`.
# `ss` receives the quality profile; `overwrite=True` presumably replaces existing output.
art.sim_reads(
    **{key: sim_params[key] for key in ('input_file', 'output_seed', 'sim_type', 'read_length', 'fold')},
    ss=sim_params['q_profile'],
    overwrite=True,
)

In [None]:
# Print the files produced by the last simulation.
art.list_last_output_files()

single_all_seq_150bp.fq
single_all_seq_150bp.aln


In [None]:
# Print every simulation folder in the output directory, with the files it contains.
art.list_all_output_files()

paired_1seq_150bp
- paired_1seq_150bp2.aln
- paired_1seq_150bp2.fq
- paired_1seq_150bp1.fq
- paired_1seq_150bp1.aln
paired_1seq_50bp
- paired_1seq_50bp2.aln
- paired_1seq_50bp1.fq
- paired_1seq_50bp2.fq
- paired_1seq_50bp1.aln
single_1seq_150bp
- single_1seq_150bp.fq
- single_1seq_150bp.aln
single_1seq_50bp
- single_1seq_50bp.aln
- single_1seq_50bp.fq
single_all_seq_150bp
- single_all_seq_150bp.fq
- single_all_seq_150bp.aln


## Paired read simulation - 50 bp read

### Prepare simulated read files

Run a paired-read simulation with the input file.

In [None]:
# sim_params = {
#     'input_file': input_fname,
#     "sim_type": "paired",
#     "read_length": 50,
#     'nb_sequences': 'all_',
#     "fold": 200,
#     'mean_read':200,
#     'std_read':10,
#     'q_profile': 'HS25'
# }

# sim_params['output_seed'] = f"{sim_params['sim_type']}_{sim_params['nb_sequences']}seq_{sim_params['read_length']}bp"
# sim_params

In [None]:
# art.sim_reads(
#     input_file=sim_params['input_file'],
#     output_seed=sim_params['output_seed'],
#     sim_type=sim_params['sim_type'],
#     read_length=sim_params['read_length'],
#     fold=sim_params['fold'],
#     mean_read=sim_params['mean_read'],
#     std_read=sim_params['std_read'],
#     overwrite=True
# )   

In [None]:
# art.list_last_output_files()

In [None]:
# art.list_all_output_files()

## Paired read simulation - 150 bp read


### Prepare simulated read files

In [None]:
# Parameters of the paired-end simulation on all sequences; `mean_read` and `std_read` are
# reported by ART as mean fragment length and standard deviation.
# `output_seed` names the output folder and files.
sim_params = dict(
    input_file=input_fname,
    sim_type="paired",
    read_length=150,
    nb_sequences='all_',
    fold=200,
    mean_read=200,
    std_read=10,
    q_profile='HS25',
)
sim_params['output_seed'] = f"{sim_params['sim_type']}_{sim_params['nb_sequences']}seq_{sim_params['read_length']}bp"
sim_params

{'input_file': 'yf_2023_yellow_fever.fa',
 'sim_type': 'paired',
 'read_length': 150,
 'nb_sequences': 'all_',
 'fold': 200,
 'mean_read': 200,
 'std_read': 10,
 'q_profile': 'HS25',
 'output_seed': 'paired_all_seq_150bp'}

In [None]:
# Run the paired-end simulation. The quality profile is now passed explicitly, as in the
# single-end runs, so that `q_profile` defined in `sim_params` is actually applied
# instead of relying on the default of `sim_reads`.
art.sim_reads(
    input_file=sim_params['input_file'],
    output_seed=sim_params['output_seed'],
    sim_type=sim_params['sim_type'],
    read_length=sim_params['read_length'],
    fold=sim_params['fold'],
    mean_read=sim_params['mean_read'],
    std_read=sim_params['std_read'],
    ss=sim_params['q_profile'],
    overwrite=True
)

return code:  0 


             ART_Illumina (2008-2016)          
          Q Version 2.5.8 (June 6, 2016)       
     Contact: Weichun Huang <whduke@gmail.com> 
    -------------------------------------------

                  Paired-end sequencing simulation

Total CPU time used: 19.8468

The random seed for the run: 1723183668

Parameters used during run
	Read Length:	150
	Genome masking 'N' cutoff frequency: 	1 in 150
	Fold Coverage:            200X
	Mean Fragment Length:     200
	Standard Deviation:       10
	Profile Type:             Combined
	ID Tag:                   

Quality Profile(s)
	First Read:   HiSeq 2500 Length 150 R1 (built-in profile) 
	First Read:   HiSeq 2500 Length 150 R2 (built-in profile) 

Output files

  FASTQ Sequence Files:
	 the 1st reads: /home/vtec/projects/bio/metagentools/data/ncbi/simreads/yf/paired_all_seq_150bp/paired_all_seq_150bp1.fq
	 the 2nd reads: /home/vtec/projects/bio/metagentools/data/ncbi/simreads/yf/paired_all_seq_150bp/paired_all_seq_15

In [None]:
# Print the files produced by the last simulation (two `.fq` and two `.aln` files for paired reads).
art.list_last_output_files()

paired_all_seq_150bp2.fq
paired_all_seq_150bp2.aln
paired_all_seq_150bp1.fq
paired_all_seq_150bp1.aln


In [None]:
# Print every simulation folder in the output directory, with the files it contains.
art.list_all_output_files()

paired_1seq_150bp
- paired_1seq_150bp2.aln
- paired_1seq_150bp2.fq
- paired_1seq_150bp1.fq
- paired_1seq_150bp1.aln
paired_1seq_50bp
- paired_1seq_50bp2.aln
- paired_1seq_50bp1.fq
- paired_1seq_50bp2.fq
- paired_1seq_50bp1.aln
paired_all_seq_150bp
- paired_all_seq_150bp2.fq
- paired_all_seq_150bp2.aln
- paired_all_seq_150bp1.fq
- paired_all_seq_150bp1.aln
single_1seq_150bp
- single_1seq_150bp.fq
- single_1seq_150bp.aln
single_1seq_50bp
- single_1seq_50bp.aln
- single_1seq_50bp.fq
single_all_seq_150bp
- single_all_seq_150bp.fq
- single_all_seq_150bp.aln


In [None]:
# Keep the paths of the files produced by the last simulation and display them.
last_output = art.get_last_output_files()
last_output

[Path('/home/vtec/projects/bio/metagentools/data/ncbi/simreads/yf/paired_all_seq_150bp/paired_all_seq_150bp2.fq'),
 Path('/home/vtec/projects/bio/metagentools/data/ncbi/simreads/yf/paired_all_seq_150bp/paired_all_seq_150bp2.aln'),
 Path('/home/vtec/projects/bio/metagentools/data/ncbi/simreads/yf/paired_all_seq_150bp/paired_all_seq_150bp1.fq'),
 Path('/home/vtec/projects/bio/metagentools/data/ncbi/simreads/yf/paired_all_seq_150bp/paired_all_seq_150bp1.aln')]

In [None]:
# Read the single-end reads simulated above from this notebook's own output directory,
# instead of a hard-coded absolute path pointing to another project and dataset.
fq = FastqFileReader(p2simread_outputs / 'single_all_seq_150bp' / 'single_all_seq_150bp.fq')

In [None]:
# Print the first reads of the fastq file (read identifier and beginning of the sequence).
fq.print_first_chunks(3)


Sequence 1:
@11089:ncbi:1-13596
GGGCTGTTTGGTGGCTTGAGTTGGATTACGAAAGTCATCATGGGAGCCGTACTCATCTGGGTGGGAATAAACACCCGCAA ...

Sequence 2:
@11089:ncbi:1-13595
TCAAAGCTGGCTCAAAAAAGAGTTTTTCATGGAGTGGCAAAAAATCCAGTTGTTGATGGTAATCCAACTGCTGACATTGA ...

Sequence 3:
@11089:ncbi:1-13594
GGTTGTCATTGTCATAGAACCATGTAGTGGCATATTCATTCTTGATTCTCTCAACTCTTTCCTCAATAGCATCCTTATCC ...

Sequence 4:
@11089:ncbi:1-13593
TAGGCCACTCTGACATTTTCCACTCCATAACACCAGCAATCTATGTCATCTGGCTCTTCTCTTGGACTGAGATTGGGACA ...


In [None]:
# Read the alignment file written with the single-end reads from this notebook's own output
# directory, instead of a hard-coded absolute path pointing to another project and dataset.
aln = AlnFileReader(p2simread_outputs / 'single_all_seq_150bp' / 'single_all_seq_150bp.aln')
aln.print_first_chunks(3)

3-line chunk 1
{'definition line': '>11089:ncbi:1\t11089:ncbi:1-13600\t5511\t-', 'ref_seq_aligned': 'CCCATCTCCCGTCTAGTTTCCAGGAACCACCATAAGCCACCAGATCTTCCTTCACAGAAGCCCAAGACGGAACCAGTTTCTTTCCATTCCTCAAGAGGAATGCTCCCCTAGTGACATGCCACATTGTGTGGAAGACCCCTCCCTGCGCCA', 'read_seq_aligned': 'CCCATCTCCCGTCTAGTTTCCAGGAACCACCATAAGCCACCAGATCTTCCTTCACAGAAGCCCAAGACGGAACCAGTTTCTTTCCATTCCTCAAGAGGAATGCTCCCCTAGTGACATGCCACATTGTGTGGAAGACCCCTCCCTGCGCCA'}
3-line chunk 2
{'definition line': '>11089:ncbi:1\t11089:ncbi:1-13599\t6568\t+', 'ref_seq_aligned': 'CAATGGGTACCATGGCTGGCAGTGGATATCTCATGTTTTTGGGGGGAGTAAAACCAACCCACATCTCTTACGTCATGTTAATATTCTTTGTCCTCATGGTCGTCGTAATTCCCGAACCAGGACAGCAGAGAACAATCCAGGATAACCAAG', 'read_seq_aligned': 'CAATGGGTACCATGGCTGGCAGTGGATATCTCATGTTTTTGGGGGGAGTAAAACCAACCCACATCTCTTACGTCATGTTAATATTCTTTGTCCTCATGGTCGTCGTAATTCCCGAACCAGGACAGCAGAGAACAATCCAGGATAACCAAG'}
3-line chunk 3
{'definition line': '>11089:ncbi:1\t11089:ncbi:1-13598\t4267\t+', 'ref_seq_aligned': 'GAAGCTCTAGCCGCTATGATGTGGCACTCAGTGAGCAGGGTGA