# Handle NCBI sequence fasta files

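Notebook objective:
- Load and parse NCBI reference sequence fasta files (yellow fever) with `FastaFileReader`.
- Simulate single-end and paired-end reads (50 bp and 150 bp) from these sequences with ART Illumina.
- Inspect the generated `.fq` and `.aln` files with `FastqFileReader` and `AlnFileReader`.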

# Imports and setup environment

### Install and import packages

In [1]:
# Install required custom packages if not installed yet.
import importlib.util
if not importlib.util.find_spec('ecutilities'):
    print('installing package: `ecutilities`')
    ! pip install -qqU ecutilities
else:
    print('`ecutilities` already installed')
if not importlib.util.find_spec('metagentools'):
    print('installing package: `metagentools')
    ! pip install -qqU metagentools
else:
    print('`metagentools` already installed')

`ecutilities` already installed
`metagentools` already installed


In [2]:
# Import all required packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re

from ecutilities.core import files_in_tree, path_to_parent_dir
from ecutilities.ipython import nb_setup
from metagentools.art import ArtIllumina
from metagentools.core import ProjectFileSystem
from metagentools.cnn_virus.data import FastaFileReader, FastqFileReader, AlnFileReader
from pathlib import Path
from pprint import pprint

# Setup the notebook for development.
# The recorded output of this cell is "Set autoreload mode"; presumably the
# helper enables IPython autoreload — confirm in `ecutilities.ipython`.
nb_setup()

Set autoreload mode


## Setup project file system

In [3]:
# Build the project file system helper and display the key directories it exposes.
pfs = ProjectFileSystem()
(pfs.home, pfs.project_root, pfs.data, pfs.nbs)

(Path('/home/vtec'),
 Path('/home/vtec/projects/bio/metagentools'),
 Path('/home/vtec/projects/bio/metagentools/data'),
 Path('/home/vtec/projects/bio/metagentools/nbs'))

## Load `fa` file and parse data

Define path to files

In [4]:
# Folder holding the NCBI reference sequences, located under the project data
# directory. The cell stops here if the folder does not exist.
p2inputs = pfs.data / 'ncbi' / 'refsequences'
print(p2inputs.absolute())
assert p2inputs.is_dir()

/home/vtec/projects/bio/metagentools/data/ncbi/refsequences


Explore files in the directory:

In [5]:
# Print the directory tree around `p2inputs` and keep the indexed files in `files`.
# Presumably `pattern='yf'` restricts the listed files to paths containing 'yf' — TODO confirm.
files = files_in_tree(p2inputs, pattern='yf')

ncbi
  |--refsequences
  |    |--mRhiFer1
  |    |--yf
  |    |    |--yf_2023_yellow_fever.fa (0)
  |    |    |--yf_2023_multiple_alignment_original.fa (1)
  |    |    |--yf_1971_Angola.fa (2)
  |    |--cov


Pick fasta file and read it

In [6]:
# Select the single-sequence Angola 1971 fasta file by name instead of by a
# hard-coded position: the listing printed above is not alphabetical, so the
# index of a given file is not guaranteed to be the same on another machine.
# `n` is kept because later cells use `files[n]`; a missing file raises
# ValueError here instead of silently selecting another file.
# Assumes `files` is a sequence of `Path` objects — TODO confirm.
n = [f.name for f in files].index('yf_1971_Angola.fa')
files[n].name

'yf_1971_Angola.fa'

In [7]:
# Open the selected fasta file and preview its content.
# Presumably the argument is the number of sequences to print — TODO confirm.
fasta = FastaFileReader(files[n])
fasta.print_first_chunks(1)


Sequence 1:
>11089:ncbi:1	[AY968064]	11089	ncbi	1	[AY968064]	11089	Angola_1971
ATGTCTGGTCGAAAAGCTCAGGGTAAAACCCTGGGCGTCAATATGGTAAGACGAGGGGTTCGCTCCTTGTCAAACAAAAT ...


In [8]:
# Reset the reader's iterator, then for every sequence show the raw definition line
# followed by the metadata dictionary parsed from it.
fasta.reset_iterator()
for refseq in fasta:
    defline = refseq['definition line']
    print(defline)
    print(fasta.parse_text(defline))

>11089:ncbi:1	[AY968064]	11089	ncbi	1	[AY968064]	11089	Angola_1971
{'accession': 'AY968064', 'seqid': '11089:ncbi:1', 'seqnb': '1', 'source': 'ncbi', 'species': 'Angola_1971', 'taxonomyid': '11089'}


# Simulate reads using 1 sequence

In [9]:
# Re-point `p2inputs` at the folder that contains the selected fasta file
# (entry 2 of the `files` listing), then display its absolute path.
n = 2
p2inputs = files[n].parent
p2inputs.absolute()

Path('/home/vtec/projects/bio/metagentools/data/ncbi/refsequences/yf')

## Single read simulation - 50 bp read

In [11]:
# Folder that receives the simulated reads; it must already exist.
p2simread_outputs = pfs.data / 'ncbi' / 'simreads' / 'yf'
assert p2simread_outputs.is_dir()
p2simread_outputs

Path('/home/vtec/projects/bio/metagentools/data/ncbi/simreads/yf')

In [12]:
# Wrapper around the local ART Illumina executable, reading reference files
# from `p2inputs` and writing simulated reads to `p2simread_outputs`.
art = ArtIllumina(
    path2app=Path('/usr/bin/art_illumina'),
    input_dir=p2inputs,
    output_dir=p2simread_outputs,
)

Ready to operate with art: /usr/bin/art_illumina
Input files from : /home/vtec/projects/bio/metagentools/data/ncbi/refsequences/yf
Output files to :  /home/vtec/projects/bio/metagentools/data/ncbi/simreads/yf


### Prepare simulated read files

In [13]:
# Show the fasta files available in the input directory given to `art`.
art.list_all_input_files()

yf_1971_Angola.fa
yf_2023_multiple_alignment_original.fa
yf_2023_yellow_fever.fa


In [14]:
# Reference fasta used for the single-sequence simulations.
input_fname = 'yf_1971_Angola.fa'
# Number of sequences in that file; only used below to build the output name.
nb_sequences = 1

Run a single-read simulation with the input file and the following parameters:

Parameter `fold`:

Fold coverage, also known as sequencing depth or read depth, represents the average number of times each base in the reference genome is expected to be sequenced. For example:
- If you set -f 20, it means you're simulating a sequencing run that would cover each base in the reference genome an average of 20 times.
- If you set -f 100, it would simulate coverage where each base is sequenced an average of 100 times.

The fold coverage is an important parameter because it affects:
- The total number of reads generated: Higher fold coverage results in more reads.
- The likelihood of capturing rare variants or sequencing errors: Higher coverage generally improves the ability to detect rare variants and distinguish true variants from sequencing errors.
- The overall quality of the simulated dataset: Higher coverage typically leads to more accurate representation of the reference genome in the simulated data.

It's worth noting that ART Illumina uses this fold coverage value along with the read length and reference genome size to calculate the total number of reads to generate. The actual formula is:

```Total number of reads = (Genome size * Fold coverage) / Read length```

In [15]:
# Worked example of the formula above, for 200x fold coverage and 150 bp reads.
# NOTE(review): 10_238 is presumably the reference length in bases — confirm.
genome_size = 10_238
genome_size * 200 // 150

13650

In [16]:
# Parameters of the single-end simulation with 50 bp reads.
sim_params = dict(
    input_file=input_fname,
    sim_type='single',
    read_length=50,
    nb_sequences=nb_sequences,
    fold=200,
    q_profile='HS25',
)

# The output seed names both the output folder and the files written by ART.
sim_params['output_seed'] = f"{sim_params['sim_type']}_{sim_params['nb_sequences']}seq_{sim_params['read_length']}bp"
sim_params

{'input_file': 'yf_1971_Angola.fa',
 'sim_type': 'single',
 'read_length': 50,
 'nb_sequences': 1,
 'fold': 200,
 'q_profile': 'HS25',
 'output_seed': 'single_1seq_50bp'}

In [17]:
# Run the single-end simulation. Entries of `sim_params` whose key matches a
# `sim_reads` argument are forwarded as they are; the quality profile is
# passed under its argument name `ss`.
art.sim_reads(
    **{key: sim_params[key] for key in ('input_file', 'output_seed', 'sim_type', 'read_length', 'fold')},
    ss=sim_params['q_profile'],
    overwrite=True,
)

return code:  0 


             ART_Illumina (2008-2016)          
          Q Version 2.5.8 (June 6, 2016)       
     Contact: Weichun Huang <whduke@gmail.com> 
    -------------------------------------------

                  Single-end Simulation

Total CPU time used: 0.594318

The random seed for the run: 1723183515

Parameters used during run
	Read Length:	50
	Genome masking 'N' cutoff frequency: 	1 in 50
	Fold Coverage:            200X
	Profile Type:             Combined
	ID Tag:                   

Quality Profile(s)
	First Read:   HiSeq 2500 Length 126 R1 (built-in profile) 

Output files

  FASTQ Sequence File:
	/home/vtec/projects/bio/metagentools/data/ncbi/simreads/yf/single_1seq_50bp/single_1seq_50bp.fq

  ALN Alignment File:
	/home/vtec/projects/bio/metagentools/data/ncbi/simreads/yf/single_1seq_50bp/single_1seq_50bp.aln




In [18]:
# Show the files written by the most recent simulation.
art.list_last_output_files()

single_1seq_50bp.aln
single_1seq_50bp.fq


## Single read simulation - 150 bp read

### Prepare simulated read files

Run a single-read simulation with the input file and the following parameters:

In [19]:
# Parameters of the single-end simulation with 150 bp reads.
sim_params = dict(
    input_file=input_fname,
    sim_type='single',
    read_length=150,
    nb_sequences=nb_sequences,
    fold=200,
    q_profile='HS25',
)

# The output seed names both the output folder and the files written by ART.
sim_params['output_seed'] = f"{sim_params['sim_type']}_{sim_params['nb_sequences']}seq_{sim_params['read_length']}bp"
sim_params

{'input_file': 'yf_1971_Angola.fa',
 'sim_type': 'single',
 'read_length': 150,
 'nb_sequences': 1,
 'fold': 200,
 'q_profile': 'HS25',
 'output_seed': 'single_1seq_150bp'}

In [20]:
# Run the single-end simulation. Entries of `sim_params` whose key matches a
# `sim_reads` argument are forwarded as they are; the quality profile is
# passed under its argument name `ss`.
art.sim_reads(
    **{key: sim_params[key] for key in ('input_file', 'output_seed', 'sim_type', 'read_length', 'fold')},
    ss=sim_params['q_profile'],
    overwrite=True,
)

return code:  0 


             ART_Illumina (2008-2016)          
          Q Version 2.5.8 (June 6, 2016)       
     Contact: Weichun Huang <whduke@gmail.com> 
    -------------------------------------------

                  Single-end Simulation

Total CPU time used: 0.360058

The random seed for the run: 1723183534

Parameters used during run
	Read Length:	150
	Genome masking 'N' cutoff frequency: 	1 in 150
	Fold Coverage:            200X
	Profile Type:             Combined
	ID Tag:                   

Quality Profile(s)
	First Read:   HiSeq 2500 Length 150 R1 (built-in profile) 

Output files

  FASTQ Sequence File:
	/home/vtec/projects/bio/metagentools/data/ncbi/simreads/yf/single_1seq_150bp/single_1seq_150bp.fq

  ALN Alignment File:
	/home/vtec/projects/bio/metagentools/data/ncbi/simreads/yf/single_1seq_150bp/single_1seq_150bp.aln




In [21]:
# Show the files written by the most recent simulation.
art.list_last_output_files()

single_1seq_150bp.fq
single_1seq_150bp.aln


In [22]:
# Show every simulation folder in the output directory together with its files.
art.list_all_output_files()

single_1seq_150bp
- single_1seq_150bp.fq
- single_1seq_150bp.aln
single_1seq_50bp
- single_1seq_50bp.aln
- single_1seq_50bp.fq


## Paired read simulation - 50 bp read

### Prepare simulated read files

Run a paired-read simulation with the input file.

In [23]:
# Parameters of the paired-end simulation with 50 bp reads.
# `mean_read` / `std_read` are reported by ART as the mean fragment length and
# its standard deviation (see the run log below).
sim_params = dict(
    input_file=input_fname,
    sim_type='paired',
    read_length=50,
    nb_sequences=nb_sequences,
    fold=200,
    mean_read=200,
    std_read=10,
    q_profile='HS25',
)

# The output seed names both the output folder and the files written by ART.
sim_params['output_seed'] = f"{sim_params['sim_type']}_{sim_params['nb_sequences']}seq_{sim_params['read_length']}bp"
sim_params

{'input_file': 'yf_1971_Angola.fa',
 'sim_type': 'paired',
 'read_length': 50,
 'nb_sequences': 1,
 'fold': 200,
 'mean_read': 200,
 'std_read': 10,
 'q_profile': 'HS25',
 'output_seed': 'paired_1seq_50bp'}

In [24]:
# Run the paired-end simulation with the parameters prepared above.
# The quality profile stored in `sim_params['q_profile']` is passed explicitly
# through `ss`, as in the single-end runs, instead of relying on whatever
# default `sim_reads` applies when `ss` is omitted.
art.sim_reads(
    input_file=sim_params['input_file'],
    output_seed=sim_params['output_seed'],
    sim_type=sim_params['sim_type'],
    read_length=sim_params['read_length'],
    fold=sim_params['fold'],
    mean_read=sim_params['mean_read'],
    std_read=sim_params['std_read'],
    ss=sim_params['q_profile'],
    overwrite=True
)

return code:  0 


             ART_Illumina (2008-2016)          
          Q Version 2.5.8 (June 6, 2016)       
     Contact: Weichun Huang <whduke@gmail.com> 
    -------------------------------------------

                  Paired-end sequencing simulation

Total CPU time used: 0.648225

The random seed for the run: 1723183550

Parameters used during run
	Read Length:	50
	Genome masking 'N' cutoff frequency: 	1 in 50
	Fold Coverage:            200X
	Mean Fragment Length:     200
	Standard Deviation:       10
	Profile Type:             Combined
	ID Tag:                   

Quality Profile(s)
	First Read:   HiSeq 2500 Length 126 R1 (built-in profile) 
	First Read:   HiSeq 2500 Length 126 R2 (built-in profile) 

Output files

  FASTQ Sequence Files:
	 the 1st reads: /home/vtec/projects/bio/metagentools/data/ncbi/simreads/yf/paired_1seq_50bp/paired_1seq_50bp1.fq
	 the 2nd reads: /home/vtec/projects/bio/metagentools/data/ncbi/simreads/yf/paired_1seq_50bp/paired_1seq_50bp2.fq

  ALN Al

In [25]:
# Show the files written by the most recent simulation.
art.list_last_output_files()

paired_1seq_50bp2.aln
paired_1seq_50bp1.fq
paired_1seq_50bp2.fq
paired_1seq_50bp1.aln


In [26]:
# Show every simulation folder in the output directory together with its files.
art.list_all_output_files()

paired_1seq_50bp
- paired_1seq_50bp2.aln
- paired_1seq_50bp1.fq
- paired_1seq_50bp2.fq
- paired_1seq_50bp1.aln
single_1seq_150bp
- single_1seq_150bp.fq
- single_1seq_150bp.aln
single_1seq_50bp
- single_1seq_50bp.aln
- single_1seq_50bp.fq


## Paired read simulation - 150 bp read


### Prepare simulated read files

In [27]:
# Parameters of the paired-end simulation with 150 bp reads.
# `mean_read` / `std_read` are reported by ART as the mean fragment length and
# its standard deviation (see the run log below).
sim_params = dict(
    input_file=input_fname,
    sim_type='paired',
    read_length=150,
    nb_sequences=nb_sequences,
    fold=200,
    mean_read=200,
    std_read=10,
    q_profile='HS25',
)

# The output seed names both the output folder and the files written by ART.
sim_params['output_seed'] = f"{sim_params['sim_type']}_{sim_params['nb_sequences']}seq_{sim_params['read_length']}bp"
sim_params

{'input_file': 'yf_1971_Angola.fa',
 'sim_type': 'paired',
 'read_length': 150,
 'nb_sequences': 1,
 'fold': 200,
 'mean_read': 200,
 'std_read': 10,
 'q_profile': 'HS25',
 'output_seed': 'paired_1seq_150bp'}

In [28]:
# Run the paired-end simulation with the parameters prepared above.
# The quality profile stored in `sim_params['q_profile']` is passed explicitly
# through `ss`, as in the single-end runs, instead of relying on whatever
# default `sim_reads` applies when `ss` is omitted.
art.sim_reads(
    input_file=sim_params['input_file'],
    output_seed=sim_params['output_seed'],
    sim_type=sim_params['sim_type'],
    read_length=sim_params['read_length'],
    fold=sim_params['fold'],
    mean_read=sim_params['mean_read'],
    std_read=sim_params['std_read'],
    ss=sim_params['q_profile'],
    overwrite=True
)

return code:  0 


             ART_Illumina (2008-2016)          
          Q Version 2.5.8 (June 6, 2016)       
     Contact: Weichun Huang <whduke@gmail.com> 
    -------------------------------------------

                  Paired-end sequencing simulation

Total CPU time used: 0.400434

The random seed for the run: 1723183564

Parameters used during run
	Read Length:	150
	Genome masking 'N' cutoff frequency: 	1 in 150
	Fold Coverage:            200X
	Mean Fragment Length:     200
	Standard Deviation:       10
	Profile Type:             Combined
	ID Tag:                   

Quality Profile(s)
	First Read:   HiSeq 2500 Length 150 R1 (built-in profile) 
	First Read:   HiSeq 2500 Length 150 R2 (built-in profile) 

Output files

  FASTQ Sequence Files:
	 the 1st reads: /home/vtec/projects/bio/metagentools/data/ncbi/simreads/yf/paired_1seq_150bp/paired_1seq_150bp1.fq
	 the 2nd reads: /home/vtec/projects/bio/metagentools/data/ncbi/simreads/yf/paired_1seq_150bp/paired_1seq_150bp2.fq

  

In [29]:
# Show the files written by the most recent simulation.
art.list_last_output_files()

paired_1seq_150bp2.aln
paired_1seq_150bp2.fq
paired_1seq_150bp1.fq
paired_1seq_150bp1.aln


# Simulate using all sequences


In [30]:
# Refresh the file listing; `p2inputs` now points at the `yf` folder (set in an earlier cell).
files = files_in_tree(p2inputs, pattern='yf')

refsequences
  |--yf
  |    |--yf_2023_yellow_fever.fa (0)
  |    |--yf_2023_multiple_alignment_original.fa (1)
  |    |--yf_1971_Angola.fa (2)


In [31]:
# Use the folder of entry 0 of the listing above (the multi-sequence yellow fever file)
# as input directory, then display its absolute path.
n = 0
p2inputs = files[n].parent
p2inputs.absolute()

Path('/home/vtec/projects/bio/metagentools/data/ncbi/refsequences/yf')

In [32]:
# Folder that receives the simulated reads; it must already exist.
p2simread_outputs = pfs.data / 'ncbi' / 'simreads' / 'yf'
assert p2simread_outputs.is_dir()

In [33]:
# Wrapper around the local ART Illumina executable, reading reference files
# from `p2inputs` and writing simulated reads to `p2simread_outputs`.
art = ArtIllumina(
    path2app=Path('/usr/bin/art_illumina'),
    input_dir=p2inputs,
    output_dir=p2simread_outputs,
)

Ready to operate with art: /usr/bin/art_illumina
Input files from : /home/vtec/projects/bio/metagentools/data/ncbi/refsequences/yf
Output files to :  /home/vtec/projects/bio/metagentools/data/ncbi/simreads/yf


In [34]:
# Show the fasta files available in the input directory given to `art`.
art.list_all_input_files()

yf_1971_Angola.fa
yf_2023_multiple_alignment_original.fa
yf_2023_yellow_fever.fa


In [35]:
# Reference fasta used for the "all sequences" simulations below.
input_fname = 'yf_2023_yellow_fever.fa'

## Single read simulation - 50 bp read

Run a single-read simulation with the input file and the following parameters:

In [36]:
# sim_params = {
#     'input_file': input_fname,
#     "sim_type": "single",
#     "read_length": 50,
#     'nb_sequences': 'all_',
#     "fold": 200,
#     'q_profile': 'HS25'
# }

# sim_params['output_seed'] = f"{sim_params['sim_type']}_{sim_params['nb_sequences']}seq_{sim_params['read_length']}bp"
# sim_params

In [37]:
# art.sim_reads( 
#     input_file=sim_params['input_file'],
#     output_seed=sim_params['output_seed'],
#     sim_type=sim_params['sim_type'],
#     read_length=sim_params['read_length'],
#     fold=sim_params['fold'],
#     ss=sim_params['q_profile'],
#     overwrite=True
# )

In [38]:
# art.list_last_output_files()

## Single read simulation - 150 bp read

### Prepare simulated read files

Run a single-read simulation with the input file and the following parameters:

In [39]:
# Parameters of the single-end simulation with 150 bp reads over all sequences.
# `nb_sequences` is the label 'all_' here; it only feeds the output name.
sim_params = dict(
    input_file=input_fname,
    sim_type='single',
    read_length=150,
    nb_sequences='all_',
    fold=200,
    q_profile='HS25',
)

# The output seed names both the output folder and the files written by ART.
sim_params['output_seed'] = f"{sim_params['sim_type']}_{sim_params['nb_sequences']}seq_{sim_params['read_length']}bp"
sim_params

{'input_file': 'yf_2023_yellow_fever.fa',
 'sim_type': 'single',
 'read_length': 150,
 'nb_sequences': 'all_',
 'fold': 200,
 'q_profile': 'HS25',
 'output_seed': 'single_all_seq_150bp'}

In [40]:
# Run the single-end simulation. Entries of `sim_params` whose key matches a
# `sim_reads` argument are forwarded as they are; the quality profile is
# passed under its argument name `ss`.
art.sim_reads(
    **{key: sim_params[key] for key in ('input_file', 'output_seed', 'sim_type', 'read_length', 'fold')},
    ss=sim_params['q_profile'],
    overwrite=True,
)

return code:  0 


             ART_Illumina (2008-2016)          
          Q Version 2.5.8 (June 6, 2016)       
     Contact: Weichun Huang <whduke@gmail.com> 
    -------------------------------------------

                  Single-end Simulation

Total CPU time used: 19.3748

The random seed for the run: 1723183621

Parameters used during run
	Read Length:	150
	Genome masking 'N' cutoff frequency: 	1 in 150
	Fold Coverage:            200X
	Profile Type:             Combined
	ID Tag:                   

Quality Profile(s)
	First Read:   HiSeq 2500 Length 150 R1 (built-in profile) 

Output files

  FASTQ Sequence File:
	/home/vtec/projects/bio/metagentools/data/ncbi/simreads/yf/single_all_seq_150bp/single_all_seq_150bp.fq

  ALN Alignment File:
	/home/vtec/projects/bio/metagentools/data/ncbi/simreads/yf/single_all_seq_150bp/single_all_seq_150bp.aln




In [41]:
# Show the files written by the most recent simulation.
art.list_last_output_files()

single_all_seq_150bp.fq
single_all_seq_150bp.aln


In [42]:
# Show every simulation folder in the output directory together with its files.
art.list_all_output_files()

paired_1seq_150bp
- paired_1seq_150bp2.aln
- paired_1seq_150bp2.fq
- paired_1seq_150bp1.fq
- paired_1seq_150bp1.aln
paired_1seq_50bp
- paired_1seq_50bp2.aln
- paired_1seq_50bp1.fq
- paired_1seq_50bp2.fq
- paired_1seq_50bp1.aln
single_1seq_150bp
- single_1seq_150bp.fq
- single_1seq_150bp.aln
single_1seq_50bp
- single_1seq_50bp.aln
- single_1seq_50bp.fq
single_all_seq_150bp
- single_all_seq_150bp.fq
- single_all_seq_150bp.aln


## Paired read simulation - 50 bp read

### Prepare simulated read files

Run a paired-read simulation with the input file.

In [43]:
# sim_params = {
#     'input_file': input_fname,
#     "sim_type": "paired",
#     "read_length": 50,
#     'nb_sequences': 'all_',
#     "fold": 200,
#     'mean_read':200,
#     'std_read':10,
#     'q_profile': 'HS25'
# }

# sim_params['output_seed'] = f"{sim_params['sim_type']}_{sim_params['nb_sequences']}seq_{sim_params['read_length']}bp"
# sim_params

In [44]:
# art.sim_reads(
#     input_file=sim_params['input_file'],
#     output_seed=sim_params['output_seed'],
#     sim_type=sim_params['sim_type'],
#     read_length=sim_params['read_length'],
#     fold=sim_params['fold'],
#     mean_read=sim_params['mean_read'],
#     std_read=sim_params['std_read'],
#     overwrite=True
# )   

In [45]:
# art.list_last_output_files()

In [46]:
# art.list_all_output_files()

## Paired read simulation - 150 bp read


### Prepare simulated read files

In [47]:
# Parameters of the paired-end simulation with 150 bp reads over all sequences.
# `nb_sequences` is the label 'all_' here; it only feeds the output name.
sim_params = dict(
    input_file=input_fname,
    sim_type='paired',
    read_length=150,
    nb_sequences='all_',
    fold=200,
    mean_read=200,
    std_read=10,
    q_profile='HS25',
)

# The output seed names both the output folder and the files written by ART.
sim_params['output_seed'] = f"{sim_params['sim_type']}_{sim_params['nb_sequences']}seq_{sim_params['read_length']}bp"
sim_params

{'input_file': 'yf_2023_yellow_fever.fa',
 'sim_type': 'paired',
 'read_length': 150,
 'nb_sequences': 'all_',
 'fold': 200,
 'mean_read': 200,
 'std_read': 10,
 'q_profile': 'HS25',
 'output_seed': 'paired_all_seq_150bp'}

In [48]:
# Run the paired-end simulation with the parameters prepared above.
# The quality profile stored in `sim_params['q_profile']` is passed explicitly
# through `ss`, as in the single-end runs, instead of relying on whatever
# default `sim_reads` applies when `ss` is omitted.
art.sim_reads(
    input_file=sim_params['input_file'],
    output_seed=sim_params['output_seed'],
    sim_type=sim_params['sim_type'],
    read_length=sim_params['read_length'],
    fold=sim_params['fold'],
    mean_read=sim_params['mean_read'],
    std_read=sim_params['std_read'],
    ss=sim_params['q_profile'],
    overwrite=True
)

return code:  0 


             ART_Illumina (2008-2016)          
          Q Version 2.5.8 (June 6, 2016)       
     Contact: Weichun Huang <whduke@gmail.com> 
    -------------------------------------------

                  Paired-end sequencing simulation

Total CPU time used: 19.8468

The random seed for the run: 1723183668

Parameters used during run
	Read Length:	150
	Genome masking 'N' cutoff frequency: 	1 in 150
	Fold Coverage:            200X
	Mean Fragment Length:     200
	Standard Deviation:       10
	Profile Type:             Combined
	ID Tag:                   

Quality Profile(s)
	First Read:   HiSeq 2500 Length 150 R1 (built-in profile) 
	First Read:   HiSeq 2500 Length 150 R2 (built-in profile) 

Output files

  FASTQ Sequence Files:
	 the 1st reads: /home/vtec/projects/bio/metagentools/data/ncbi/simreads/yf/paired_all_seq_150bp/paired_all_seq_150bp1.fq
	 the 2nd reads: /home/vtec/projects/bio/metagentools/data/ncbi/simreads/yf/paired_all_seq_150bp/paired_all_seq_15

In [49]:
# Show the files written by the most recent simulation.
art.list_last_output_files()

paired_all_seq_150bp2.fq
paired_all_seq_150bp2.aln
paired_all_seq_150bp1.fq
paired_all_seq_150bp1.aln


In [50]:
# Show every simulation folder in the output directory together with its files.
art.list_all_output_files()

paired_1seq_150bp
- paired_1seq_150bp2.aln
- paired_1seq_150bp2.fq
- paired_1seq_150bp1.fq
- paired_1seq_150bp1.aln
paired_1seq_50bp
- paired_1seq_50bp2.aln
- paired_1seq_50bp1.fq
- paired_1seq_50bp2.fq
- paired_1seq_50bp1.aln
paired_all_seq_150bp
- paired_all_seq_150bp2.fq
- paired_all_seq_150bp2.aln
- paired_all_seq_150bp1.fq
- paired_all_seq_150bp1.aln
single_1seq_150bp
- single_1seq_150bp.fq
- single_1seq_150bp.aln
single_1seq_50bp
- single_1seq_50bp.aln
- single_1seq_50bp.fq
single_all_seq_150bp
- single_all_seq_150bp.fq
- single_all_seq_150bp.aln


In [52]:
# Keep the paths of the files written by the most recent simulation
# (a list of `Path` objects, per the recorded output) and display them.
last_output = art.get_last_output_files()
last_output

[Path('/home/vtec/projects/bio/metagentools/data/ncbi/simreads/yf/paired_all_seq_150bp/paired_all_seq_150bp2.fq'),
 Path('/home/vtec/projects/bio/metagentools/data/ncbi/simreads/yf/paired_all_seq_150bp/paired_all_seq_150bp2.aln'),
 Path('/home/vtec/projects/bio/metagentools/data/ncbi/simreads/yf/paired_all_seq_150bp/paired_all_seq_150bp1.fq'),
 Path('/home/vtec/projects/bio/metagentools/data/ncbi/simreads/yf/paired_all_seq_150bp/paired_all_seq_150bp1.aln')]

In [53]:
# Open the fastq file of the single-end, all-sequences, 150 bp simulation.
# The path is built from `p2simread_outputs` so the notebook does not depend
# on the absolute home directory of one particular machine.
fq = FastqFileReader(p2simread_outputs / 'single_all_seq_150bp' / 'single_all_seq_150bp.fq')

In [55]:
# Preview the beginning of the fastq file.
# NOTE(review): the recorded output shows four sequences although 3 is passed —
# confirm whether the saved output is stale or the reader prints one extra chunk.
fq.print_first_chunks(3)


Sequence 1:
@11089:ncbi:1-13596
GGGCTGTTTGGTGGCTTGAGTTGGATTACGAAAGTCATCATGGGAGCCGTACTCATCTGGGTGGGAATAAACACCCGCAA ...

Sequence 2:
@11089:ncbi:1-13595
TCAAAGCTGGCTCAAAAAAGAGTTTTTCATGGAGTGGCAAAAAATCCAGTTGTTGATGGTAATCCAACTGCTGACATTGA ...

Sequence 3:
@11089:ncbi:1-13594
GGTTGTCATTGTCATAGAACCATGTAGTGGCATATTCATTCTTGATTCTCTCAACTCTTTCCTCAATAGCATCCTTATCC ...

Sequence 4:
@11089:ncbi:1-13593
TAGGCCACTCTGACATTTTCCACTCCATAACACCAGCAATCTATGTCATCTGGCTCTTCTCTTGGACTGAGATTGGGACA ...


In [57]:
# Open the alignment file of the single-end, all-sequences, 150 bp simulation
# and preview its first chunks.
# The path is built from `p2simread_outputs` so the notebook does not depend
# on the absolute home directory of one particular machine.
aln = AlnFileReader(p2simread_outputs / 'single_all_seq_150bp' / 'single_all_seq_150bp.aln')
aln.print_first_chunks(3)

3-line chunk 1
{'definition line': '>11089:ncbi:1\t11089:ncbi:1-13600\t5511\t-', 'ref_seq_aligned': 'CCCATCTCCCGTCTAGTTTCCAGGAACCACCATAAGCCACCAGATCTTCCTTCACAGAAGCCCAAGACGGAACCAGTTTCTTTCCATTCCTCAAGAGGAATGCTCCCCTAGTGACATGCCACATTGTGTGGAAGACCCCTCCCTGCGCCA', 'read_seq_aligned': 'CCCATCTCCCGTCTAGTTTCCAGGAACCACCATAAGCCACCAGATCTTCCTTCACAGAAGCCCAAGACGGAACCAGTTTCTTTCCATTCCTCAAGAGGAATGCTCCCCTAGTGACATGCCACATTGTGTGGAAGACCCCTCCCTGCGCCA'}
3-line chunk 2
{'definition line': '>11089:ncbi:1\t11089:ncbi:1-13599\t6568\t+', 'ref_seq_aligned': 'CAATGGGTACCATGGCTGGCAGTGGATATCTCATGTTTTTGGGGGGAGTAAAACCAACCCACATCTCTTACGTCATGTTAATATTCTTTGTCCTCATGGTCGTCGTAATTCCCGAACCAGGACAGCAGAGAACAATCCAGGATAACCAAG', 'read_seq_aligned': 'CAATGGGTACCATGGCTGGCAGTGGATATCTCATGTTTTTGGGGGGAGTAAAACCAACCCACATCTCTTACGTCATGTTAATATTCTTTGTCCTCATGGTCGTCGTAATTCCCGAACCAGGACAGCAGAGAACAATCCAGGATAACCAAG'}
3-line chunk 3
{'definition line': '>11089:ncbi:1\t11089:ncbi:1-13598\t4267\t+', 'ref_seq_aligned': 'GAAGCTCTAGCCGCTATGATGTGGCACTCAGTGAGCAGGGTGA