# Convert real CoV read file from FASTA into FASTQ

The real CoV read file received is in FASTA format.

The pipeline used to infer virus label from reads is designed to work with FASTQ files.

We will convert the real CoV read file into a FASTQ file.

# 1. Imports and setup environment

In [None]:
# Install required custom packages if not installed yet.
import importlib.util
if not importlib.util.find_spec('ecutilities'):
    print('installing package: `ecutilities`')
    ! pip install -qqU ecutilities
else:
    print('`ecutilities` already installed')
if not importlib.util.find_spec('metagentools'):
    print('installing package: `metagentools')
    ! pip install -qqU metagentools
else:
    print('`metagentools` already installed')

`ecutilities` already installed
`metagentools` already installed


In [None]:
# Import all required packages
import os

from ecutilities.core import files_in_tree
from ecutilities.ipython import nb_setup
from IPython.display import display, update_display, Markdown, HTML
from pathlib import Path
from pprint import pprint
from tqdm.notebook import tqdm, trange

# Setup the notebook for development.
# Called before the `metagentools` imports below; the recorded cell output
# reports that it sets autoreload mode.
nb_setup()

from metagentools.cnn_virus.data import FastaFileReader, FastqFileReader
from metagentools.cnn_virus.data import OriginalLabels
from metagentools.core import ProjectFileSystem, TextFileBaseReader

Set autoreload mode


List all computing devices available on the machine

# 2. Setup paths to files

Key folders and system information

In [None]:
# Create the project file system helper, used below to locate the data directory,
# then print the detected platform, home directory and project folders.
pfs = ProjectFileSystem()
pfs.info()

Running linux on local computer
Device's home directory: /home/vtec
Project file structure:
 - Root ........ /home/vtec/projects/bio/metagentools 
 - Data Dir .... /home/vtec/projects/bio/metagentools/data 
 - Notebooks ... /home/vtec/projects/bio/metagentools/nbs


In [None]:
# pfs.set_project_root('/home/vtec/projects/bio/metagentools');

- `p2fasta`: path to the real CoV read file in FASTA format, to be converted into FASTQ
- `p2model`: path to file with saved original pretrained model
- `p2virus_labels`: path to file with virus names and labels mapping for original model
- `p2simreads`: path to folder where reads files are located (FASTQ and ALN)

In [None]:
# Path to the real read file (FASTA format), located under the project data directory.
p2fasta = pfs.data / 'ncov_data/reads/yf/yf-mapped.fa'
# Fail early, reporting the absolute path, if the input file is missing.
assert p2fasta.is_file(), f"No file found at {p2fasta.absolute()}"

In [None]:
# Show the tree of the folder holding the FASTA file.
# The trailing `;` suppresses the display of any value returned by the call.
files_in_tree(path=p2fasta.parent);

reads
  |--yf
  |    |--yf-reads-10.fq (0)
  |    |--yf-mapped.fa (1)


# 3. Load data and review

In [None]:
# Open the FASTA file with the project's reader. As used in the cells below, iterating
# over it yields one read at a time, indexed by 'definition line' and 'sequence'.
fasta = FastaFileReader(p2fasta)

In [None]:
# Walk through the whole file once to count the reads, keeping only the last
# read as a sample of the record format.
fasta.reset_iterator()
n_reads, fa_read = 0, None
for n_reads, fa_read in enumerate(fasta, start=1):
    pass

if fa_read is None:
    # Empty file: there is no read to show, and nothing was bound by the loop.
    print("This file includes 0 reads.")
else:
    seq = fa_read['sequence']
    defline = fa_read['definition line']
    print(f"This file includes {n_reads:,d} reads, with the following format:\n")
    # Only the first 60 bases are shown, followed by the full read length.
    print(f"{defline}\n{seq[:60]} ... ({len(seq)} bp)")

This file includes 11,736 reads, with the following format:

>A00551:791:HFLNGDSX7:1:2247:19289:3270/2
GGCCGATTAAAGTCCGAATTCGGTGGATCACCAGGTGGATGTGTGAGGCCCATGTAGCCC ... (150 bp)


# 4. Conversion Loop

In [None]:
# The FASTQ file is written next to the FASTA file: same folder, same base name,
# with the extension swapped for `.fq`.
p2fastq = p2fasta.with_suffix('.fq')
p2fastq.absolute()

PosixPath('/home/vtec/projects/bio/metagentools/data/ncov_data/reads/yf/yf-mapped.fq')

In [None]:
# FASTA carries no quality information, so every base receives the same placeholder
# quality character. 'K' is ASCII 75, i.e. Q42 if the file is read as Phred+33.
# TODO confirm this is the encoding expected by the downstream pipeline.
DUMMY_QUALITY_CHAR = 'K'

# Restart from the first read: the reader was already consumed by the review cell.
fasta.reset_iterator()
with open(p2fastq, 'w') as fp:
    for fa_read in tqdm(fasta, desc='FASTA -> FASTQ', unit=' reads'):
        fa_defline = fa_read['definition line']
        seq = fa_read['sequence']
        # A FASTQ header starts with '@' where FASTA uses '>'. Only strip the marker
        # when it is actually there, so that no real character is ever dropped.
        read_id = fa_defline[1:] if fa_defline.startswith('>') else fa_defline
        q_score = DUMMY_QUALITY_CHAR * len(seq)
        # One FASTQ record is four lines: header, sequence, separator, quality string.
        fp.write(f"@{read_id}\n{seq}\n+\n{q_score}\n")

0it [00:00, ?it/s]

In [None]:
# Read the newly written FASTQ file back and print its first records as a sanity check.
fastq = FastqFileReader(p2fastq)
fastq.print_first_chunks()


Sequence 1:
@A00551:791:HFLNGDSX7:1:2427:6262:35321/2
CCCAGTTAGGATGTTGAACAAAAAGAAGAAAGTAAATCCTGTGTGCTAATTGAGGTGCATTGGTCTGCAAATCGAGTTGC ...

Sequence 2:
@A00551:791:HFLNGDSX7:1:2427:6180:35368/2
CCCAGTTAGGATGTTGAACAAAAAGAAGAAAGTAAATCCTGTGTGCTAATTGAGGTGCATTGGTCTGCAAATCGAGTTGC ...

Sequence 3:
@A00551:791:HFLNGDSX7:1:2428:6406:5353/2
CCCAGTTAGGATGTTGAACAAAAAGAAGAAAGTAAATCCTGTGTGCTAATTGAGGTGCATTGGTCTGCAAATCGAGTTGC ...

Sequence 4:
@A00551:791:HFLNGDSX7:1:1124:13404:31062/1
CCAGGGTTTTACCCTGAGCTTTTCGACCAGACATTATTCTTGTCAGTTCTCTGCAAATCGAGTTGCTAGGCAATAAACAC ...


# New Section

## end of section