# Inference from simple fastq file including k-mer reads

This notebook provides a reference implementation of inference with the pretrained CNN Virus model, applied to k-mer reads stored in a simple FASTQ file.

# 1. Imports and setup environment

In [5]:
# Install required custom packages if not installed yet.
import importlib.util
if not importlib.util.find_spec('ecutilities'):
    print('installing package: `ecutilities`')
    ! pip install -qqU ecutilities
else:
    print('`ecutilities` already installed')
if not importlib.util.find_spec('metagentools'):
    print('installing package: `metagentools')
    ! pip install -qqU metagentools
else:
    print('`metagentools` already installed')

`ecutilities` already installed
`metagentools` already installed


In [6]:
# Import all required packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os

from datetime import datetime
from ecutilities.core import files_in_tree
from ecutilities.ipython import nb_setup
from functools import partial
from IPython.display import display, update_display, Markdown, HTML
from pathlib import Path
from pprint import pprint
from tqdm.notebook import tqdm, trange

# Setup the notebook for development
# NOTE(review): judging by the cell output, `nb_setup` configures the autoreload extension - confirm in `ecutilities.ipython`
nb_setup()

# The log level is set BEFORE tensorflow is imported, which is why the tensorflow imports are placed
# here and not grouped with the other imports above.
# NOTE(review): '3' is presumably the quietest level of TensorFlow's C++ logging - confirm in TensorFlow documentation
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # or any {'0', '1', '2'}
import tensorflow as tf
from tensorflow.python.client import device_lib
from tensorflow.keras.models import load_model
print(f"Tensorflow version: {tf.__version__}\n")

from metagentools.cnn_virus.data import _base_hot_encode, split_kmer_into_50mers, combine_predictions
from metagentools.cnn_virus.data import FastqFileReader, AlnFileReader
from metagentools.cnn_virus.data import OriginalLabels
from metagentools.cnn_virus.architecture import create_model_original
from metagentools.core import ProjectFileSystem, TextFileBaseReader

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Set autoreload mode
Tensorflow version: 2.8.2



List all computing devices available on the machine

In [7]:
# Print one line per computing device seen by TensorFlow: device type, device name and hardware name.
devices = device_lib.list_local_devices()
print('\nDevices:')
for device in devices:
    # The description is a list of `key: value` items separated by ', '; items without a ':' are ignored
    description_items = (item.split(':', 1) for item in device.physical_device_desc.split(', '))
    description = {pair[0]: pair[1] for pair in description_items if len(pair) == 2}
    # Fall back to a blank hardware name when the description has no `name` item (e.g. for the CPU)
    hardware_name = description.get('name', ' ')
    print(f"  - {device.device_type}  {device.name} {hardware_name:25s}")


Devices:
  - CPU  /device:CPU:0                          
  - GPU  /device:GPU:0  NVIDIA GeForce GTX 1050 


# 2. Setup paths to files

Key folders and system information

In [8]:
# `pfs` gives access to the key project folders (e.g. `pfs.data`, used below to build all file paths).
# `info()` prints a summary of the system and of the project file structure.
pfs = ProjectFileSystem()
pfs.info()

Running linux on local computer
Device's home directory: /home/vtec
Project file structure:
 - Root ........ /home/vtec/projects/bio/metagentools 
 - Data Dir .... /home/vtec/projects/bio/metagentools/data 
 - Notebooks ... /home/vtec/projects/bio/metagentools/nbs


- `p2model`: path to the file with the saved original pretrained model
- `p2virus_labels`: path to the file with the virus names and labels mapping for the original model
- `p2fastq`: path to the FASTQ file with the simulated reads; the matching ALN file is expected in the same folder, with the same name and the `.aln` extension

In [11]:
# Files required by this notebook, all located under the project data directory
p2model = pfs.data / 'saved/cnn_virus_original/pretrained_model.h5'
p2virus_labels = pfs.data / 'CNN_Virus_data/virus_name_mapping'

# Reads to run the inference on. Other read files which can be used instead:
#   pfs.data / 'ncbi/simreads/single_1seq_50bp_10reads/single_1seq_50bp_10reads.fq'
#   pfs.data / 'ncbi/simreads/single_10seq_150bp_20reads/single_10seq_150bp_20reads.fq'
p2fastq = pfs.data / 'ncbi/simreads/single_10seq_150bp/single_10seq_150bp.fq'

# Fail early, with an explicit message, when one of the files is missing
for required_file in (p2model, p2virus_labels, p2fastq):
    assert required_file.is_file(), f"No file found at {required_file.absolute()}"

In [12]:
# Show the content of the folder holding the reads files; the trailing `;` hides the value returned by the call
files_in_tree(path=p2fastq.parent);

simreads
  |--single_10seq_150bp
  |    |--single_10seq_150bp.fq (0)
  |    |--single_10seq_150bp.aln (1)


# 3. Load data and review

In [13]:
# The ALN file sits next to the FASTQ file and only differs by its extension
p2aln = p2fastq.with_suffix('.aln')

# One reader per file; both are iterated over in the cells below
fastq = FastqFileReader(p2fastq)
aln = AlnFileReader(p2aln)

In [14]:
# Go through the whole file once to count the reads. The last read is kept to illustrate the format.
fastq.reset_iterator()
n_reads = 0
for n_reads, fq_read in enumerate(fastq, start=1):
    pass

if n_reads == 0:
    # Without this guard, an empty file raises a NameError because no read was ever assigned in the loop
    print(f"No read found in {p2fastq.name}")
else:
    seq = fq_read['sequence']
    defline = fq_read['definition line']
    print(f"This file includes {n_reads:,d} reads, with the following format:\n")
    print(f"{defline}\n{seq[:60]} ... ({len(seq)} bp)")

This file includes 190,270 reads, with the following format:

@1699095:ncbi:10-1
GCAAGCTGGTGTTGGTGTTAAGTACTTTTGTGGCATGACTCTTAAGTTTGTTGCAAACAT ... (150 bp)


# 4. Prediction Loop

- define output file
- load model
- iterate through the fastq file, read by read
    - get read k-mer sequence
    - base encode k-mer sequence and preprocess into (k-50+1) 50-mer sequences
    - prediction for each 50-mer sequence
    - combine all predictions into a single label and position prediction
    - add prediction and metadata to the output file
    

In [15]:
# The results file is named after the fastq file and is time-stamped, so that each run gets its own file
run_timestamp = datetime.now().strftime('%Y%m%d_%Hh%Mm%S')
fname_seed = p2fastq.stem
p2results = pfs.data / f"ncbi/infer_results/csv/{fname_seed}_results_{run_timestamp}.csv"
p2results.name

'single_10seq_150bp_results_20240129_16h02m56.csv'

In [16]:
# Create the original CNN model with the pretrained parameters stored in the file at `p2model`
model = create_model_original(path2parameters=p2model)
# Uncomment the next line to display the architecture of the model
# model.summary()

Creating CNN Model (Original)
Loading parameters from pretrained_model.h5
Created pretrained model


Create an instance of the class used to convert the original labels into species names

In [17]:
# Converter from the labels of the original model to species names, checked on a few labels
labels = OriginalLabels()
for label in (94, 117, 118):
    species_name = labels.label2species(label)
    print(species_name)

Middle_East_respiratory_syndrome-related_coronavirus
Severe_acute_respiratory_syndrome-related_coronavirus
Yellow_fever_virus


In [18]:
# Run the inference read by read and save one tab separated line per read in the results file.

# Maximum number of reads to process. Keep `None` for a full run, or set a small integer for a quick test.
max_reads = None

fastq.reset_iterator()
aln.reset_iterator()

# Make sure the destination folder exists, otherwise `open` raises a FileNotFoundError
p2results.parent.mkdir(parents=True, exist_ok=True)

# Mode 'w' and not 'a': running this cell again must not add a second header and a second set of
# predictions to a results file which already exists.
with open(p2results, 'w') as fp:
    line = 'ID\tSequence\tQ Score\tMax Prob Error\tNCBI Ref Species\tPredicted Label\tPredicted Species\tPredicted Position\n'
    fp.write(line)
    # NOTE(review): `alnelement` is not used in the loop, but zipping both readers stops the loop at the
    # end of the shortest of the two files - confirm that this is intended.
    for n_done, (fqelement, alnelement) in tqdm(enumerate(zip(fastq, aln), start=1)):
        # Get k-mer read and some metadata
        readid = fqelement['definition line']
        read = fqelement['sequence']
        read_qscores = fqelement['read_qscores']
        max_prob_error = fqelement['probs error'].max()
        # The key of the reference sequence is the read id without '@' and without what follows the first '-'
        ncbi_species = aln.ref_sequences[readid.replace('@', '').split('-')[0]]['species']

        # Split one k-mer read into 50-mer reads
        # The two '0' fields are placeholders for the fields following the sequence (not used for inference)
        bhe_kmer, _, _ = _base_hot_encode(f"{read}\t0\t0")
        bhe_50mer = split_kmer_into_50mers(kmer=bhe_kmer)

        # Predict labels and positions for each 50-mer reads and combine
        # `verbose=0` prevents one progress bar per read with versions of keras where it is not the default
        label_probs, pos_probs = model.predict(bhe_50mer, verbose=0)
        label_preds = tf.argmax(label_probs, axis=-1)
        pos_preds = tf.argmax(pos_probs, axis=-1)
        combined_label, combined_pos = combine_predictions(label_preds, label_probs, pos_preds)

        line = f"{readid}\t{read}\t{read_qscores}\t{max_prob_error}\t{ncbi_species}\t{combined_label}\t{labels.label2species(combined_label)}\t{combined_pos}\n"
        fp.write(line)

        if max_reads is not None and n_done >= max_reads:
            break

0it [00:00, ?it/s]

In [None]:
# Load the predictions from the results file.
# The file is written without any quoting, so quoting is also disabled for parsing (3 is `csv.QUOTE_NONE`):
# `"` is a valid character in a FASTQ quality string and must not be handled as a quote.
df = pd.read_csv(p2results, sep='\t', quoting=3)

# Columns of interest, in display order. The names must match the header of the results file exactly,
# hence `Sequence` with a capital: `sequence` raises a KeyError.
coi = ['ID', 'NCBI Ref Species', 'Predicted Species', 'Predicted Label', 'Predicted Position', 'Max Prob Error', 'Sequence', 'Q Score']
df.loc[:, coi]

Unnamed: 0,ID,NCBI Ref Species,Predicted Species,Predicted Label,Predicted Position,Max Prob Error,sequence,Q Score
0,@2591237:ncbi:1-20100,Coronavirus BtRs-BetaCoV/YN2018D scientific name,Severe_acute_respiratory_syndrome-related_coro...,117,2,0.001585,TTTGACTCATGGTTTAGCCAGCGTGGTGGTTCATACAAAAATGACA...,CCCGGGGGGGGGGJGJGJJJGJJJJJJGJGJJJGJJGJJCCGGJGJ...
1,@2591237:ncbi:1-20099,Coronavirus BtRs-BetaCoV/YN2018D scientific name,Severe_acute_respiratory_syndrome-related_coro...,117,9,0.025119,TCACCAAGAGCTCTACTCACCGCTTTTTCTCATTGTTGCTGCTCTA...,CCCCCGC1GGGGCCJJJJJJJJJJJJGGJJJ=GJGJGGJJC8GGCJ...
2,@11128:ncbi:2-20600,Bovine coronavirus scientific name,Rotavirus_C,115,1,0.199526,TTTCAAGTCTAGCCGGTGATGAGGGATTGATAGTGATTTTATAGTT...,1C1GGCGGCGCGGJGJGGGJJGJG(JJJJJJJGJCGJJJJJJJJGJ...
3,@11128:ncbi:2-20599,Bovine coronavirus scientific name,Cercopithecine_alphaherpesvirus_9,22,1,0.199526,TATCTGAATACTACAACATTAGCTGTACCTGTTAATATGCGAGTTT...,CC1GGGGGGGGGGJJGJ=GJJGJGGGGJJJCJ8CGGJJJJJJGGGJ...
4,@31631:ncbi:3-20400,Human coronavirus OC43 scientific name,Human_gammaherpesvirus_4,62,8,0.199526,TGGTCTTCACAATAATATGACATACCACCTAGATACAATTTGGTAA...,CCC1GGGGGGGGGJCJGGJJCGJJ11JGJJJGJGGG=JGJ=JGGGG...
5,@31631:ncbi:3-20399,Human coronavirus OC43 scientific name,Human_betaherpesvirus_6B,4,1,0.025119,ATCATACGGTTACTGATGTTAAGCAAGTTGGTTGTTCTATGCGCTT...,C=CGG=GGCGGGGJJGJGGJGJJJJCGGJJJJJGGJG8JJJCJ=JG...
6,@277944:ncbi:4-18300,Human coronavirus NL63 scientific name,Puumala_orthohantavirus,48,2,0.025119,TACCTCATGATCAAATTCAGACTTTGCAATATTCATGGCACGCTTC...,1CCCGGGGGGGGCJJJGCJGJJJGJGJJGJGGGGJJJJGCJG=JGG...
7,@277944:ncbi:4-18299,Human coronavirus NL63 scientific name,Alphapapillomavirus_3,9,5,0.025119,TTATTTGGTTTAAAGCCACTATAACACTCAACCCGAGCTCTTGCAG...,CCCGGGGGGCGGGJGGJJGGJJJJJJJJJGGGCGJJGGJJGCJGJJ...
8,@11120:ncbi:5-18400,Infectious bronchitis virus scientific name,Variola_virus,0,8,0.199526,ATAGACAATATGTTGTCGCAAACGGGACCATACTGTTGAAACAACT...,CCCGGGG=CGGGGJJJJJJJJJJJJJGCJJJJJJCJJJJJGJJJGG...
9,@11120:ncbi:5-18399,Infectious bronchitis virus scientific name,Monkeypox_virus,18,3,0.199526,ATTTTGTGGTAGTGGAAGACATGTTCTTTCGATACCACAAAATGCA...,CC=GCGGCGGGGGJJJJJJJCGGJJGJJJJGJCJCGGJGJJ1CGGJ...


In [None]:
# Reads predicted as MERS-related (label 94) or SARS-related (label 117) coronavirus
is_predicted_94_or_117 = df['Predicted Label'].isin([94, 117])
df.loc[is_predicted_94_or_117, coi]

Unnamed: 0,ID,NCBI Ref Species,Predicted Species,Predicted Label,Predicted Position,Max Prob Error,sequence,Q Score
0,@2591237:ncbi:1-20100,Coronavirus BtRs-BetaCoV/YN2018D scientific name,Severe_acute_respiratory_syndrome-related_coro...,117,2,0.001585,TTTGACTCATGGTTTAGCCAGCGTGGTGGTTCATACAAAAATGACA...,CCCGGGGGGGGGGJGJGJJJGJJJJJJGJGJJJGJJGJJCCGGJGJ...
1,@2591237:ncbi:1-20099,Coronavirus BtRs-BetaCoV/YN2018D scientific name,Severe_acute_respiratory_syndrome-related_coro...,117,9,0.025119,TCACCAAGAGCTCTACTCACCGCTTTTTCTCATTGTTGCTGCTCTA...,CCCCCGC1GGGGCCJJJJJJJJJJJJGGJJJ=GJGJGGJJC8GGCJ...
11,@28295:ncbi:6-18599,Porcine epidemic diarrhea virus scientific name,Severe_acute_respiratory_syndrome-related_coro...,117,8,0.025119,GGTTACTAATGGCCTTGGTACTGTTGATGAAGACTATAAGCGCTGT...,CCC8GCGGG=GGGGJJGJJJCJJCJJJJJJJJJJJGJGJJGJJJJJ...
12,@28295:ncbi:7-18600,Porcine epidemic diarrhea virus scientific name,Middle_East_respiratory_syndrome-related_coron...,94,3,0.199526,AGTGTTCCATCATAATAGAAAGCAAAGCCATCAACAATAGCAATAC...,CCC1GGGGGGGGGGJJGGJJJJJGJJ8J1JJJJGJGJJJCCGJJJG...
13,@28295:ncbi:7-18599,Porcine epidemic diarrhea virus scientific name,Severe_acute_respiratory_syndrome-related_coro...,117,7,0.199526,CACTTTGTGTGAAAGCTTCCACTGCTTATAGCAATGACAAATGTTC...,CCCGGGGGGGC=GCGGJ=GJJJJCJJJJJJG=8GJJJGGJJGJGJJ...


In [None]:
# Reads predicted as any species other than those with label 94 or 117
is_predicted_other = ~df['Predicted Label'].isin([94, 117])
df.loc[is_predicted_other, coi]

Unnamed: 0,ID,NCBI Ref Species,Predicted Species,Predicted Label,Predicted Position,Max Prob Error,sequence,Q Score
2,@11128:ncbi:2-20600,Bovine coronavirus scientific name,Rotavirus_C,115,1,0.199526,TTTCAAGTCTAGCCGGTGATGAGGGATTGATAGTGATTTTATAGTT...,1C1GGCGGCGCGGJGJGGGJJGJG(JJJJJJJGJCGJJJJJJJJGJ...
3,@11128:ncbi:2-20599,Bovine coronavirus scientific name,Cercopithecine_alphaherpesvirus_9,22,1,0.199526,TATCTGAATACTACAACATTAGCTGTACCTGTTAATATGCGAGTTT...,CC1GGGGGGGGGGJJGJ=GJJGJGGGGJJJCJ8CGGJJJJJJGGGJ...
4,@31631:ncbi:3-20400,Human coronavirus OC43 scientific name,Human_gammaherpesvirus_4,62,8,0.199526,TGGTCTTCACAATAATATGACATACCACCTAGATACAATTTGGTAA...,CCC1GGGGGGGGGJCJGGJJCGJJ11JGJJJGJGGG=JGJ=JGGGG...
5,@31631:ncbi:3-20399,Human coronavirus OC43 scientific name,Human_betaherpesvirus_6B,4,1,0.025119,ATCATACGGTTACTGATGTTAAGCAAGTTGGTTGTTCTATGCGCTT...,C=CGG=GGCGGGGJJGJGGJGJJJJCGGJJJJJGGJG8JJJCJ=JG...
6,@277944:ncbi:4-18300,Human coronavirus NL63 scientific name,Puumala_orthohantavirus,48,2,0.025119,TACCTCATGATCAAATTCAGACTTTGCAATATTCATGGCACGCTTC...,1CCCGGGGGGGGCJJJGCJGJJJGJGJJGJGGGGJJJJGCJG=JGG...
7,@277944:ncbi:4-18299,Human coronavirus NL63 scientific name,Alphapapillomavirus_3,9,5,0.025119,TTATTTGGTTTAAAGCCACTATAACACTCAACCCGAGCTCTTGCAG...,CCCGGGGGGCGGGJGGJJGGJJJJJJJJJGGGCGJJGGJJGCJGJJ...
8,@11120:ncbi:5-18400,Infectious bronchitis virus scientific name,Variola_virus,0,8,0.199526,ATAGACAATATGTTGTCGCAAACGGGACCATACTGTTGAAACAACT...,CCCGGGG=CGGGGJJJJJJJJJJJJJGCJJJJJJCJJJJJGJJJGG...
9,@11120:ncbi:5-18399,Infectious bronchitis virus scientific name,Monkeypox_virus,18,3,0.199526,ATTTTGTGGTAGTGGAAGACATGTTCTTTCGATACCACAAAATGCA...,CC=GCGGCGGGGGJJJJJJJCGGJJGJJJJGJCJCGGJGJJ1CGGJ...
10,@28295:ncbi:6-18600,Porcine epidemic diarrhea virus scientific name,Human_gammaherpesvirus_8,32,1,0.199526,CTACGGCTCCACACTTTTCAAGTCTGCTATAGATGGATTGCTCGTG...,CCCGGGGGGGGGGJJJ=JJJGGJJJGJJJGGJGJGCJJ8JJJGJJC...
14,@28295:ncbi:8-18600,Porcine epidemic diarrhea virus scientific name,Primate_erythroparvovirus_2,106,4,0.025119,TTAACACGTTACGTAGGGGTGCTGTTCTCGGCTACATAGGTGCCAC...,CCCCCGGGGGGGGJGJJGJJJGGJJGJJJJJJGJJJGJJJJ8JJJG...


# New Section

## end of section