# Inference from fastq file including k-mer reads

This notebook provides a reference implementation of inference on k-mer reads stored in a FASTQ file. It uses simulated reads (simreads) generated from NCBI CoV sequences.

# 1. Imports and setup environment

In [None]:
# Install required custom packages if not installed yet.
import importlib.util
if not importlib.util.find_spec('ecutilities'):
    print('installing package: `ecutilities`')
    ! pip install -qqU ecutilities
else:
    print('`ecutilities` already installed')
if not importlib.util.find_spec('metagentools'):
    print('installing package: `metagentools')
    ! pip install -qqU metagentools
else:
    print('`metagentools` already installed')

`ecutilities` already installed
`metagentools` already installed


In [None]:
# Import all required packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os

from datetime import datetime
from ecutilities.core import files_in_tree
from ecutilities.ipython import nb_setup
from functools import partial
from IPython.display import display, update_display, Markdown, HTML
from pathlib import Path
from pprint import pprint
from tqdm.notebook import tqdm, trange

# Setup the notebook for development
# (the recorded output of this cell shows it reports "Set autoreload mode")
nb_setup()

# The log level is set before `import tensorflow` so that it is already in the
# environment when the library loads.
# NOTE(review): '3' is presumably the quietest level -- confirm against the TensorFlow docs.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # or any {'0', '1', '2'}
import tensorflow as tf
# `device_lib` is used in a later cell to list the local computing devices.
from tensorflow.python.client import device_lib
# NOTE(review): `load_model` is not used in the visible cells; the model is built
# with `create_model_original` instead.
from tensorflow.keras.models import load_model
print(f"Tensorflow version: {tf.__version__}\n")

from metagentools.cnn_virus.data import _base_hot_encode, split_kmer_into_50mers, combine_predictions
from metagentools.cnn_virus.data import FastqFileReader, AlnFileReader
from metagentools.cnn_virus.data import OriginalLabels
from metagentools.cnn_virus.architecture import create_model_original
from metagentools.core import ProjectFileSystem, TextFileBaseReader

Set autoreload mode
Tensorflow version: 2.8.2



List all computing devices available on the machine

In [None]:
# Query TensorFlow for the computing devices visible on this machine.
devices = device_lib.list_local_devices()
print('\nDevices:')
for device in devices:
    # Break the description into "key:value" chunks; chunks without a ':' are ignored.
    attributes = {}
    for chunk in device.physical_device_desc.split(', '):
        key, sep, value = chunk.partition(':')
        if sep:
            attributes[key] = value
    # Fall back to a blank when the description carries no 'name' entry.
    model_name = attributes.get('name', ' ')
    print(f"  - {device.device_type}  {device.name} {model_name:25s}")


Devices:
  - CPU  /device:CPU:0                          
  - GPU  /device:GPU:0  NVIDIA GeForce GTX 1050 


# 2. Setup paths to files

Key folders and system information

In [None]:
# Project file system helper; `pfs.data` is used in the cells below to build all file paths.
pfs = ProjectFileSystem()
# Print a summary of the platform, the home directory and the project folders.
pfs.info()

Running linux on local computer
Device's home directory: /home/vtec
Project file structure:
 - Root ........ /home/vtec/projects/bio/metagentools 
 - Data Dir .... /home/vtec/projects/bio/metagentools/data 
 - Notebooks ... /home/vtec/projects/bio/metagentools/nbs


In [None]:
# pfs.set_project_root('/home/vtec/projects/bio/metagentools');

- `p2model`: path to the file with the saved original pretrained model
- `p2virus_labels`: path to the file mapping virus names to labels for the original model
- `p2fastq`: path to the FASTQ file with the reads (the matching ALN file is located in the same folder)

In [None]:
# Saved parameters of the original pretrained model
p2model = pfs.data / 'saved/cnn_virus_original/pretrained_model.h5'
# Mapping between virus names and labels for the original model
p2virus_labels = pfs.data / 'CNN_Virus_data/virus_name_mapping'
# FASTQ file with the reads to classify
# (alternative input: pfs.data / 'ncov_data/reads/yf/yf-reads-10.fq')
p2fastq = pfs.data / 'ncbi/simreads/cov/single_1seq_150bp/single_1seq_150bp.fq'

# Fail early, in definition order, when one of the required files is missing.
for p2file in (p2model, p2virus_labels, p2fastq):
    assert p2file.is_file(), f"No file found at {p2file.absolute()}"

In [None]:
# Show the tree of the folder holding the FASTQ file; the trailing `;` hides the cell's return value.
files_in_tree(path=p2fastq.parent);

cov
  |--single_1seq_150bp
  |    |--single_1seq_150bp.fq (0)
  |    |--single_1seq_150bp.aln (1)


# 3. Load data and review

In [None]:
# Reader used in the cells below to iterate over the FASTQ file, read by read.
fastq = FastqFileReader(p2fastq)
# The matching ALN file is not used anywhere in this notebook; the line is kept for reference only.
# aln = AlnFileReader(p2fastq.with_suffix('.aln'))

In [None]:
# Walk through the whole file once to count the reads, keeping the last one as a format sample.
fastq.reset_iterator()
n_reads = 0
seq, defline = None, None
for n_reads, fq_read in enumerate(fastq, start=1):
    seq = fq_read['sequence']
    defline = fq_read['definition line']

if n_reads == 0:
    # Without this guard, an empty file fails with a NameError on the loop variables.
    print(f"No reads found in {p2fastq.absolute()}")
else:
    print(f"This file includes {n_reads:,d} reads, with the following format:\n")
    print(f"{defline}\n{seq[:60]} ... ({len(seq)} bp)")

This file includes 40,200 reads, with the following format:

@2591237:ncbi:1-1
ATGTGACTCCATTGACACTAGCTTGTGCTGGTCCTTTTGAAGGTGTTAAACCTTTAACTG ... (150 bp)


# 4. Prediction Loop

- load model
- define output file
- iterate through the fastq file, read by read
    - get read k-mer sequence
    - base encode k-mer sequence and preprocess into (k-50+1) 50-mer sequences
    - prediction for each 50-mer sequence
    - combine all predictions into a single label and position prediction
    - add prediction and metadata to the output file
    

In [None]:
# Create the original model from the file at `p2model`.
# NOTE(review): presumably this builds the architecture and loads the pretrained weights -- confirm in `create_model_original`.
model = create_model_original(path2parameters=p2model)
# Uncomment to print the model summary.
# model.summary()

Create an `OriginalLabels` object to convert the original model's labels into species names

In [None]:
# Converter from the original model's numerical labels to species names.
labels = OriginalLabels()

# Spot-check the conversion on a few labels.
for label_id in (94, 117, 118):
    species = labels.label2species(label_id)
    print(species)

Middle_East_respiratory_syndrome-related_coronavirus
Severe_acute_respiratory_syndrome-related_coronavirus
Yellow_fever_virus


In [None]:
# Preview only the first few reads: printing every read floods the output on large files.
n_preview = 5

fastq.reset_iterator()
fq = None
for idx, fq in enumerate(fastq, start=1):
    print(fq['sequence'])
    print(len(fq['sequence']))
    if idx >= n_preview:
        break

# Keys available in each parsed read (nothing is displayed when the file has no reads).
fq.keys() if fq is not None else None

CNCACCCAAAGGATGCCCTTTGGACCTCAGTAATGAGACTTCCATCATTTTTCAGTGGTTGATGGATGACTTGGAGGCTGATCTAATAGTTTCAGTATTTTATTCTTCATTCAGTCAGGGATCACAGCTTGAAAACTTCCATCTCTAGTC
150
CNTTTTTATGCTTTTTTGTATCTGTTTTAAAAATATTTTTATATATTTGTTTACTTATTTTATGTATCTCAGTACACTGTTGCTATCTTTAGACGATTCTTTGAAGAGGGCATTGAATCCCATTACAGATGGTTGTGAGCCACCATGTAG
150
CNATCTCTCTGTCTCTGTGTTTCTCTCTCTATGTCTCTCCATCTCTGTCTCTACATTTCGGTCTCTATCTCTCTGTGTTTGCCTGTCTCCATCGTCTCTCTGTGTCTCTGTGTCTCTGTTTCTCTGTTTCTGTCTCTCTGTCTCCCTTTC
150
GTCAGAGTACTCTCTGCAGGCAAGCTCTCCTCTTGCAGGGAAGGTGCCAAGATATCTGGTGTTTGAACCTGCCTCCTGGCAGAAGTTGTGTTCCACTCACCAGAGGTCCTATGATCCTGTGGAGAGTCCTCTGGGGACTTTGCACCCAAG
150
GGTGGCCCAGGTGTGGGCGGAGGGGGTTGTCGCGGCGTGGATCGGAGGCACTGGCTCTCAGAATGCAAGGCTAAGCAGTCCTATGTGCGGGCGTTGACTGCAGACTCCCAGGGCCGCGTAGGCTGGCGCTGGATTCGGATCGACACAGCT
150
CTTCCATGAAGCCTTCCCTGATCTGCTCTGCTTTTCTCTGGAAGAATTAACCAGTTCTTTCTTCATCTATAATCTCCTACTGGAGTCTTTCACAGAAGGATGCACTTACGTTCCGGGCTCCTCTTCCTCCTAGTGGTGGTGAATAAAGAT
150
TNAAAGCTGTCCACTGTGTTGGATATCAAAATATTTACCTCTCCCAACTTGAATAACCACTATTAAAATT

dict_keys(['definition line', 'sequence', 'read_qscores', 'probs error'])

In [None]:
# Reference stored with every prediction; it must describe the file selected in `p2fastq`.
# dataset_reference = 'YF'
dataset_reference = 'NCBI Simulated CoV Reads'

# Set to a small integer for a quick dry run on the first reads only; None processes the whole file.
max_reads = None

fname_seed = p2fastq.stem
p2results = pfs.data / f"ncov_data/infer_results/{fname_seed}_results_{datetime.now().strftime('%Y%m%d_%Hh%Mm%S')}.csv"
# Create the results folder when it is missing, otherwise `open` fails with FileNotFoundError.
p2results.parent.mkdir(parents=True, exist_ok=True)
print(f"Results will be saved into {p2results.absolute()}")

fastq.reset_iterator()

columns = [
    'ID', 'Sequence', 'Q Score', 'Max Prob Error',
    'Predicted Label', 'Predicted Species', 'Predicted Position', 'Dataset Reference',
]

# 'w' rather than 'a': the file is specific to this run and must hold exactly one header line.
with open(p2results, 'w') as fp:
    fp.write('\t'.join(columns) + '\n')
    for i, fqelement in tqdm(enumerate(fastq), desc='Predicting', unit=' reads'):
        if max_reads is not None and i >= max_reads:
            break

        # Get k-mer read and some metadata
        readid = fqelement['definition line']
        read = fqelement['sequence']
        read_qscores = fqelement['read_qscores']
        max_prob_error = fqelement['probs error'].max()

        # Split one k-mer read into 50-mer reads
        # (the two trailing tab-separated fields are placeholder values, discarded below)
        bhe_kmer, _, _ = _base_hot_encode(f"{read}\t0\t0")
        bhe_50mer = split_kmer_into_50mers(kmer=bhe_kmer)

        # Predict labels and positions for each 50-mer read and combine them.
        # `verbose=0` keeps Keras from printing a progress bar for every single read.
        label_probs, pos_probs = model.predict(bhe_50mer, verbose=0)
        label_preds = tf.argmax(label_probs, axis=-1)
        pos_preds = tf.argmax(pos_probs, axis=-1)
        combined_label, combined_pos = combine_predictions(label_preds, label_probs, pos_preds)

        line = f"{readid}\t{read}\t{read_qscores}\t{max_prob_error}\t{combined_label}\t{labels.label2species(combined_label)}\t{combined_pos}\t{dataset_reference}\n"
        fp.write(line)

Results will be saved into /home/vtec/projects/bio/metagentools/data/ncov_data/infer_results/yf-reads-10_results_20240131_18h54m19.csv


0it [00:00, ?it/s]

In [None]:
# Columns of interest, in display order.
coi = ['ID', 'Predicted Species', 'Predicted Label', 'Predicted Position', 'Max Prob Error', 'Sequence', 'Q Score', 'Dataset Reference']

# Load the tab-separated results written by the prediction loop.
df = pd.read_csv(p2results, sep='\t')
df[coi]

Unnamed: 0,ID,Predicted Species,Predicted Label,Predicted Position,Max Prob Error,Sequence,Q Score,Dataset Reference
0,@A00551:791:HFLNGDSX7:1:1101:3441:1031 1:N:0:C...,Isfahan_vesiculovirus,104,1,0.630957,CNCACCCAAAGGATGCCCTTTGGACCTCAGTAATGAGACTTCCATC...,F#FFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFF...,YF
1,@A00551:791:HFLNGDSX7:1:1101:14642:1047 1:N:0:...,Variola_virus,0,0,0.630957,CNTTTTTATGCTTTTTTGTATCTGTTTTAAAAATATTTTTATATAT...,F#FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF...,YF
2,@A00551:791:HFLNGDSX7:1:1101:20699:1047 1:N:0:...,Human_betaherpesvirus_6A,12,9,0.630957,CNATCTCTCTGTCTCTGTGTTTCTCTCTCTATGTCTCTCCATCTCT...,F#FFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFF...,YF
3,@A00551:791:HFLNGDSX7:1:1101:15194:1063 1:N:0:...,Human_gammaherpesvirus_4,62,9,0.003162,GTCAGAGTACTCTCTGCAGGCAAGCTCTCCTCTTGCAGGGAAGGTG...,FFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF...,YF
4,@A00551:791:HFLNGDSX7:1:1101:16839:1063 1:N:0:...,Human_gammaherpesvirus_8,32,0,0.003162,GGTGGCCCAGGTGTGGGCGGAGGGGGTTGTCGCGGCGTGGATCGGA...,FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF...,YF
5,@A00551:791:HFLNGDSX7:1:1101:17562:1063 1:N:0:...,Human_respirovirus_3,84,1,0.079433,CTTCCATGAAGCCTTCCCTGATCTGCTCTGCTTTTCTCTGGAAGAA...,FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF...,YF
6,@A00551:791:HFLNGDSX7:1:1101:29658:1063 1:N:0:...,Severe_acute_respiratory_syndrome-related_coro...,117,1,0.630957,TNAAAGCTGTCCACTGTGTTGGATATCAAAATATTTACCTCTCCCA...,"F#FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF,FF...",YF
7,@A00551:791:HFLNGDSX7:1:1101:15420:1078 1:N:0:...,Human_gammaherpesvirus_8,32,3,0.003162,GGAGTTTGAGGCAAGCCTGGGATACATAGAACCTATCCCAAAACAG...,FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF...,YF
8,@A00551:791:HFLNGDSX7:1:1101:30789:1078 1:N:0:...,Human_gammaherpesvirus_4,62,7,0.079433,CTCAGGACCTGCGGTAGTCTGGAAGACTGGCCGTGAAGTCCTGCCC...,FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFF...,YF
9,@A00551:791:HFLNGDSX7:1:1101:1633:1094 1:N:0:C...,Influenza_D_virus,111,6,0.079433,GAGGGAAGTTGTTATGAAATTTTGCTTTGGTCCTATAAATGTCCCT...,"FFF,FFFFFFFFFFFFFFFFFFFF:FFF,FFFFFFFF:FFFFFFFF...",YF


In [None]:
# Reads predicted as one of the coronavirus labels expected for the CoV simreads in `p2fastq`
# (94: MERS-related coronavirus, 117: SARS-related coronavirus).
# Use [118] (Yellow fever virus) instead when running on the yellow fever reads.
cov_labels = [94, 117]
df.loc[df.loc[:, 'Predicted Label'].isin(cov_labels), coi].reset_index()

Unnamed: 0,index,ID,Predicted Species,Predicted Label,Predicted Position,Max Prob Error,Sequence,Q Score,Dataset Reference


In [None]:
# Reads NOT predicted as one of the coronavirus labels expected for the CoV simreads in `p2fastq`
# (94: MERS-related coronavirus, 117: SARS-related coronavirus).
# Use [118] (Yellow fever virus) instead when running on the yellow fever reads.
cov_labels = [94, 117]
df.loc[~df.loc[:, 'Predicted Label'].isin(cov_labels), coi].reset_index()

Unnamed: 0,index,ID,Predicted Species,Predicted Label,Predicted Position,Max Prob Error,Sequence,Q Score,Dataset Reference
0,0,@A00551:791:HFLNGDSX7:1:1101:3441:1031 1:N:0:C...,Isfahan_vesiculovirus,104,1,0.630957,CNCACCCAAAGGATGCCCTTTGGACCTCAGTAATGAGACTTCCATC...,F#FFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFFFFFFF...,YF
1,1,@A00551:791:HFLNGDSX7:1:1101:14642:1047 1:N:0:...,Variola_virus,0,0,0.630957,CNTTTTTATGCTTTTTTGTATCTGTTTTAAAAATATTTTTATATAT...,F#FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF...,YF
2,2,@A00551:791:HFLNGDSX7:1:1101:20699:1047 1:N:0:...,Human_betaherpesvirus_6A,12,9,0.630957,CNATCTCTCTGTCTCTGTGTTTCTCTCTCTATGTCTCTCCATCTCT...,F#FFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFFFFFFFFFFF...,YF
3,3,@A00551:791:HFLNGDSX7:1:1101:15194:1063 1:N:0:...,Human_gammaherpesvirus_4,62,9,0.003162,GTCAGAGTACTCTCTGCAGGCAAGCTCTCCTCTTGCAGGGAAGGTG...,FFFFF:FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF...,YF
4,4,@A00551:791:HFLNGDSX7:1:1101:16839:1063 1:N:0:...,Human_gammaherpesvirus_8,32,0,0.003162,GGTGGCCCAGGTGTGGGCGGAGGGGGTTGTCGCGGCGTGGATCGGA...,FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF...,YF
5,5,@A00551:791:HFLNGDSX7:1:1101:17562:1063 1:N:0:...,Human_respirovirus_3,84,1,0.079433,CTTCCATGAAGCCTTCCCTGATCTGCTCTGCTTTTCTCTGGAAGAA...,FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF...,YF
6,6,@A00551:791:HFLNGDSX7:1:1101:29658:1063 1:N:0:...,Severe_acute_respiratory_syndrome-related_coro...,117,1,0.630957,TNAAAGCTGTCCACTGTGTTGGATATCAAAATATTTACCTCTCCCA...,"F#FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF,FF...",YF
7,7,@A00551:791:HFLNGDSX7:1:1101:15420:1078 1:N:0:...,Human_gammaherpesvirus_8,32,3,0.003162,GGAGTTTGAGGCAAGCCTGGGATACATAGAACCTATCCCAAAACAG...,FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF...,YF
8,8,@A00551:791:HFLNGDSX7:1:1101:30789:1078 1:N:0:...,Human_gammaherpesvirus_4,62,7,0.079433,CTCAGGACCTGCGGTAGTCTGGAAGACTGGCCGTGAAGTCCTGCCC...,FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:FFFFF...,YF
9,9,@A00551:791:HFLNGDSX7:1:1101:1633:1094 1:N:0:C...,Influenza_D_virus,111,6,0.079433,GAGGGAAGTTGTTATGAAATTTTGCTTTGGTCCTATAAATGTCCCT...,"FFF,FFFFFFFFFFFFFFFFFFFF:FFF,FFFFFFFF:FFFFFFFF...",YF


# New Section

## end of section