# Experiments for Inference and Analysis NCBI Yellow Fever

TBC

# 1. Imports and setup environment

In [2]:
# Install required custom packages if not installed yet.
import importlib.util
if not importlib.util.find_spec('ecutilities'):
    print('installing package: `ecutilities`')
    ! pip install -qqU ecutilities
else:
    print('`ecutilities` already installed')
if not importlib.util.find_spec('metagentools'):
    print('installing package: `metagentools')
    ! pip install -qqU metagentools
else:
    print('`metagentools` already installed')

`ecutilities` already installed
`metagentools` already installed


In [4]:
# Import all required packages
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import re
import time

from datetime import datetime
from ecutilities.core import files_in_tree
from ecutilities.ipython import nb_setup
from functools import partial
from IPython.display import display, update_display, Markdown, HTML
from pandas import HDFStore
from pathlib import Path
from pprint import pprint
from tqdm.notebook import tqdm, trange

# Setup the notebook for development
nb_setup()

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # or any {'0', '1', '2'}
import tensorflow as tf
from tensorflow.python.client import device_lib
from tensorflow.keras.models import load_model
print(f"Tensorflow version: {tf.__version__}\n")

from metagentools.cnn_virus.data import _base_hot_encode, strings_to_tensors
from metagentools.cnn_virus.data import split_kmer_into_50mers, combine_predictions
from metagentools.cnn_virus.data import FastaFileReader, FastqFileReader, AlnFileReader
from metagentools.cnn_virus.data import OriginalLabels
from metagentools.cnn_virus.data import string_input_batch_to_tensors, split_kmer_batch_into_50mers
from metagentools.cnn_virus.architecture import create_model_original
from metagentools.core import ProjectFileSystem, TextFileBaseReader

Set autoreload mode
Tensorflow version: 2.8.2



List all computing devices available on the machine

In [5]:
devices = device_lib.list_local_devices()
print('\nDevices:')
for d in devices:
    t = d.device_type
    name = d.physical_device_desc
    l = [item.split(':', 1) for item in name.split(', ')]
    name_attr = dict([x for x in l if len(x)==2])
    dev = name_attr.get('name', ' ')
    print(f"  - {t}  {d.name} {dev:25s}")


Devices:
  - CPU  /device:CPU:0                          
  - GPU  /device:GPU:0  NVIDIA GeForce GTX 1050 


# 2. Setup paths to files

Key folders and system information

In [6]:
pfs = ProjectFileSystem()
pfs.info()

Running linux on local computer
Device's home directory: /home/vtec
Project file structure:
 - Root ........ /home/vtec/projects/bio/metagentools 
 - Data Dir .... /home/vtec/projects/bio/metagentools/data 
 - Notebooks ... /home/vtec/projects/bio/metagentools/nbs


- `p2model`: path to file with saved original pretrained model
- `p2virus_labels` path to file with virus names and labels mapping for original model

In [7]:
p2model = pfs.data / 'saved/cnn_virus_original/pretrained_model.h5'
assert p2model.is_file(), f"No file found at {p2model.absolute()}"

p2virus_labels = pfs.data / 'CNN_Virus_data/virus_name_mapping'
assert p2virus_labels.is_file(), f"No file found at {p2virus_labels.absolute()}"

p2fasta = pfs.data / 'ncbi/refsequences/yf/yf_2023_yellow_fever.fa'
assert p2fasta.is_file(), f"No file found at {p2fasta.absolute()}"

# 3. Load reference sequence data and review

In [8]:
fasta = FastaFileReader(p2fasta);

In [9]:
fasta.review()

There are 69 sequences in this file

First Sequence:
>11089:ncbi:1	1	AY968064	11089	ncbi	Angola_1971
ATGTCTGGTCGAAAAGCTCAGGGTAAAACCCTGGGCGTCAATATGGTAAGACGAGGGGTTCGCTCCTTGTCAAACAAAAT ...
{'accession': 'AY968064', 'organism': 'Angola_1971', 'seqid': '11089:ncbi:1', 'seqnb': '1', 'source': 'ncbi', 'taxonomyid': '11089'}

Last Sequence:
>11089:ncbi:69	69	OM066737	11089	ncbi	VHF-21-037/GHA/Damongo/2021
ATGTCTGGTCGTAAAGCTCAGGGCAAAACCCTGGGCGTCAATATGGTACGACGAGGAGTCCGCTCCNNNNNNNNNAAAAT ...
{'accession': 'OM066737', 'organism': 'VHF-21-037/GHA/Damongo/2021', 'seqid': '11089:ncbi:69', 'seqnb': '69', 'source': 'ncbi', 'taxonomyid': '11089'}


69

# 4. Load simulated reads and review

Check which simread files are already created:

In [10]:
files_in_tree(pfs.data / 'ncbi/simreads/yf');

simreads
  |--yf
  |    |--readme.md (0)
  |    |--single_1seq_150bp
  |    |    |--single_1seq_150bp.fq (1)
  |    |    |--single_1seq_150bp.aln (2)
  |    |--single_69seq_150bp
  |    |    |--single_69seq_150bp.fq (3)
  |    |    |--single_69seq_150bp.aln (4)
  |    |--paired_1seq_150bp
  |    |    |--paired_1seq_150bp2.aln (5)
  |    |    |--paired_1seq_150bp2.fq (6)
  |    |    |--paired_1seq_150bp1.fq (7)
  |    |    |--paired_1seq_150bp1.aln (8)
  |    |--paired_69seq_150bp
  |    |    |--paired_69seq_150bp1.fq (9)
  |    |    |--paired_69seq_150bp2.fq (10)
  |    |    |--paired_69seq_150bp1.aln (11)
  |    |    |--paired_69seq_150bp2.aln (12)


In [11]:
p2fastq = pfs.data / 'ncbi/simreads/yf/single_1seq_150bp/single_1seq_150bp.fq'
assert p2fastq.exists()
p2aln = pfs.data / 'ncbi/simreads/yf/single_1seq_150bp/single_1seq_150bp.aln'
assert p2aln.exists()

In [12]:
fastq = FastqFileReader(p2fastq)
for i, fq_read in enumerate(fastq):
    pass
print(f"{i:,d} simulated reads")

16,999 simulated reads


In [13]:
aln = AlnFileReader(p2aln)
for i, aln_read in enumerate(aln):
    pass
print(f"{i:,d} simulated reads")

16,999 simulated reads


In [14]:
print(aln.header['command'])
print('\n'.join(aln.header['reference sequences']))

/usr/bin/art_illumina -i /home/vtec/projects/bio/metagentools/data/ncbi/refsequences/yf/yf_1971_angola.fa -ss HS25 -l 150 -f 250 -o /home/vtec/projects/bio/metagentools/data/ncbi/simreads/yf/single_1seq_150bp/single_1seq_150bp -rs 1724156163
@SQ	11089:ncbi:1	1	AY968064	11089	ncbi	Angola_1971	10237


# Prediction Loop

## Define Functions

In [16]:
[(k,v) for k, v in OriginalLabels()._species2label.items() if 'yellow' in k.lower()]

[('Yellow_fever_virus', 118)]

In [19]:
aln.reset_iterator()
for batch in aln.cnn_virus_input_generator(bs=8, label=118):
    kmer_tensor, (label_tensor, position_tensor) = string_input_batch_to_tensors(batch, k=150)
    break

kmer_tensor.shape, label_tensor.shape, position_tensor.shape

(TensorShape([8, 150, 5]), TensorShape([8, 187]), TensorShape([8, 10]))

In [20]:
split_kmer_batch_into_50mers(kmer_tensor).shape

TensorShape([808, 50, 5])

In [22]:
kmer_tensor.shape, (kmer_tensor.shape[1]-49) * kmer_tensor.shape[0]

(TensorShape([8, 150, 5]), 808)

In [23]:
def combine_label_predictions(probs_elements, prob_threshold=0.9):
    """???"""

    label_probs = probs_elements[0]
    position_probs = probs_elements[1]

    labels_preds = tf.argmax(label_probs, axis=1)
    positions_preds = tf.argmax(position_probs, axis=1)

    valid_labels_filter = tf.reduce_max(label_probs, axis=1) > prob_threshold
    valid_labels_preds = labels_preds[valid_labels_filter]
    
    valid_positions_preds = positions_preds[valid_labels_filter]


    if valid_labels_preds.shape[0] == 0:
        combined_label = tf.constant(187, shape=(1,), dtype=tf.int64)
        combined_position = tf.constant(10, shape=(1,), dtype=tf.int64)

    else:
        uniques, _, counts = tf.unique_with_counts(valid_labels_preds)
        combined_label = uniques[tf.argmax(counts)]

        # filter which reads give the majority label prediction
        combined_label_filter = valid_labels_preds == combined_label

        # pick the corresponding position predictions
        filtered_positions = valid_positions_preds[combined_label_filter]
        unique_positions, _, counts = tf.unique_with_counts(filtered_positions)
        combined_position = unique_positions[tf.argmax(counts)]

        combined_pred = tf.concat([combined_label, combined_position], axis=0)

    return combined_pred

In [24]:
def top_predictions(probs, n=3):

    def top_n_most_frequent(preds, n=3):
        """Returns the top n most frequent predictions for each read"""
        # print(preds.shape)
        uniques, counts = np.unique(preds, return_counts=True)
        top_idx = np.argsort(counts)[-n:]
        return uniques.take(top_idx)

    top_preds_in_50mers = np.argsort(probs, axis=-1)[:, :, -n:]
    nb_seq, nb_50mer, nb_lbls = top_preds_in_50mers.shape
    # print(top_preds_in_50mers.shape)
    top_preds_in_kmer = top_preds_in_50mers.reshape(nb_seq,nb_50mer * nb_lbls)
    # print(top_preds_in_kmer.shape)

    return np.apply_along_axis(top_n_most_frequent, axis=1, arr=top_preds_in_kmer, n=n)

# top_predictions(label_probs_per_kmer, n=3)

In [25]:
def count_successive_label_preds(label_probs_per_kmer):
    series = []
    label_preds_per_kmer = tf.argmax(label_probs_per_kmer, axis=2)
    same_as_next = tf.roll(label_preds_per_kmer, shift=-1, axis=1) == label_preds_per_kmer
    same_as_previous = tf.roll(label_preds_per_kmer, shift=+1, axis=1) == label_preds_per_kmer
    xor = tf.bitwise.bitwise_xor(tf.cast(same_as_next, dtype=tf.int16), tf.cast(same_as_previous, dtype=tf.int16))
    for i in range(b):
        kmer_series_idxs = tf.range(k-49)[tf.cast(xor[i], dtype=tf.bool)]
        start_idxs = kmer_series_idxs[0::2]
        end_idxs = kmer_series_idxs[1::2]
        kmer_series = {key: [] for key in range(187)}
        for s,e in zip(start_idxs, end_idxs):
            kmer_series[label_preds_per_kmer[i,s].numpy().item()].append((e-s+1).numpy().item())
        series.append(kmer_series)
    return series

# Run the Loop

Before running the next cell, define:
- size of the batch `b`
- the number of bp in a k-mer
- the number n for top-n predictions to save
- whether to run the full loop or only a few batches

In [26]:
fastq = FastqFileReader(p2fastq);
aln = AlnFileReader(p2aln)
p2fastq.stem

'single_1seq_150bp'

In [None]:
b = 1024     # number of k-mer in a batch
k = 150
top_n = 5   # n for top-n prediction to keep
run_all_batches = False
nb_batches_to_run = 2

uid = datetime.today().strftime('%Y-%m-%d_%H_%M_%S')

p2result_folder = pfs.data /'ncbi/infer_results/yf-ncbi'
p2results = p2result_folder / f"{p2fastq.stem}-{uid}-results.csv"
p2probs = p2result_folder / f"{p2fastq.stem}-{uid}-probs.csv"

nb_50mer = k - 49
print(f"Run prediction loop with the following parameters:")
print(f"   {b} k-mer per batch; {k} bp per sequence; keep top-{top_n} predictions")

fastq.reset_iterator()
aln.reset_iterator()
model = create_model_original(path2parameters=p2model)

# create a dataframe to store results
pred_cols = ['lbl_pred','pos_pred']
top_pred_cols = [f"top_{top_n}_lbl_pred_{i}" for i in range(top_n)]
prob_cols = [f"R{i}ProbL{j}" for i in range(k-49) for j in range(187)]
df = pd.DataFrame(
    columns=pred_cols + top_pred_cols + ['series']
)

df_probs = pd.DataFrame(
    columns= prob_cols,
    dtype=float
)

df.to_csv(p2results)
df_probs.to_csv(p2probs)
print(f"results will be saved to {p2results.absolute()} and \nprobs will be saved to {p2probs.absolute()}")

batch_seq_strings = []
batch_seq_refs = []
batch_nb = 1
print(f"Batch {batch_nb:3d} ...")
load_seq_starts = time.time()

for i,fqelement in enumerate(fastq):
    seq = fqelement['sequence'][:k]
    batch_seq_strings.append(f"{seq}\t0\t0")
    batch_seq_refs.append(fqelement['definition line'])
 
    # After reading b sequences from the fq file, make a prediction and load results in a dataframe
    if (i+1) % b == 0:
        pred_starts = time.time()
        print(f"   {b} sequences loaded in {time.time() - load_seq_starts:.2f} s")
        print(f"   prediction for batch {int((i+1)/b)} starting ...")
        # Prediction
        strings_ts = tf.convert_to_tensor(batch_seq_strings)
        seqs_kmer, (labels, positions) = base_string_kmers_to_tensors(strings_ts, k)
        last_step = pred_starts
        current_step = time.time()
        # print(f"       str to tensor: {current_step - last_step:.2f} s at {datetime.now().strftime('%H:%M:%S')}")

        seqs_50mer = split_kmer_batch_into_50mers(seqs_kmer)
        last_step = current_step
        current_step = time.time()
        # print(f"       split_kmer: {current_step - last_step:.2f} s at {datetime.now().strftime('%H:%M:%S')}")

        labels_probs, positions_probs = model.predict(seqs_50mer)
        last_step = current_step
        current_step = time.time()
        print(f"       model predict: {current_step - last_step:.2f} s at {datetime.now().strftime('%H:%M:%S')}")

        label_probs_per_kmer = tf.reshape(labels_probs, shape=(b,nb_50mer,-1))
        position_probs_per_kmer = tf.reshape(positions_probs, shape=(b,nb_50mer,-1))
        last_step = current_step
        current_step = time.time()
        # print(f"       reshape: {current_step - last_step:.2f} s at {datetime.now().strftime('%H:%M:%S')}")

        successive_preds = count_successive_label_preds(label_probs_per_kmer)

        combined_predictions = tf.map_fn(
            fn=combine_label_predictions, 
            elems=[label_probs_per_kmer, position_probs_per_kmer], 
            fn_output_signature=tf.int64
            )
        last_step = current_step
        current_step = time.time()
        print(f"       combine:       {current_step - last_step:.2f} s at {datetime.now().strftime('%H:%M:%S')}")

        label_predictions = combined_predictions[:,0]
        position_predictions = combined_predictions[:,1]
        top_preds = top_predictions(label_probs_per_kmer, n=top_n)

        # Add results for batch
        res = np.concatenate(
            [
                np.expand_dims(label_predictions.numpy(), axis=1),
                np.expand_dims(position_predictions.numpy(), axis=1),
                top_preds[:, ::-1],
                np.expand_dims(np.array(successive_preds), axis=1)
            ],
            axis=1
        )
        df = pd.DataFrame(data=res, index=batch_seq_refs, columns=df.columns)
        df.to_csv(p2results, mode='a', header=False)
        
        df_probs = pd.DataFrame(label_probs_per_kmer.numpy().reshape(b, -1), index=batch_seq_refs, columns=df_probs.columns)
        df_probs.to_csv(p2probs, mode='a', header=False)
        
        # Reset batch
        batch_seq_strings = []
        batch_seq_refs = []
        print(f"   prediction done in {time.time() - pred_starts:.2f} s")
        batch_nb += 1
        print(f"Batch {batch_nb:3d} ...")
        load_seq_starts = time.time()

    # Stop after a few batches
    if not run_all_batches and (i+1) >= b * nb_batches_to_run: 
        break

print('Done')
print(f"Predicted virus class for {df.shape[0]} {k}-mer sequences")

# Analysis Ideas

Other Ideas:
- Analysing the False Negatives (top-k)
- Can we identify a False negative vs a True Positive from the probabilities of all 101 50-mers?

Generate features from extracted series lengths, for each label or the top k labels evalaute for each k-mer
- number of series
- average length
- maximum length
- cummulative lengths of series ($\sum_{series}{}$ lengths of serie)
- standard deviation of length

Generate statistics feature over all 50-mer for each k-mer (for all labels or the top k):
- min probability across 101 50-mer
- max probability
- mean probability
- standard deviation probability
- count 50-mer probability in given quantile [(pd doc)](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.quantile.html)


Generate bio features:
- position of read for TP and FN
- mapping with the probabilities

## Analyse accuracy per accession

In [None]:
p2result_folder = pfs.data /'ncbi/infer_results/yf'
files_in_tree(path=p2result_folder);

In [None]:
# p2resfile = pfs.project_root /'data/ncov_data/reads/yf/yf-mapped-results-2024-04-29_17_31_10.csv'
p2resfile = p2result_folder /'yf_2023-single-69seq-150bp-2024-05-02_16_29_18-results.csv'
assert p2resfile.is_file()

df = pd.read_csv(p2resfile, index_col=0)
print(df.shape)
df.head()

In [None]:
lbls = OriginalLabels(p2mapping=p2virus_labels)
lbls.label2species(118)

In [None]:
df.loc[df.lbl_pred == 118, :].shape

In [None]:
29367 / df.shape[0]

In [None]:
df['accession'] = df.index
pattern = r'^@(?P<refseqid>(?P<refseq_accession>[a-zA-Z0-9]+).*)-(?P<readnb>\d+)$'
df['accession'] = df.loc[:, 'accession'].str.replace(pattern, r'\g<refseq_accession>', regex=True)
df.head(3)

In [None]:
by_accession = df.groupby('accession', sort=True)

In [None]:
count_by_accession = by_accession['lbl_pred'].count()
count_by_accession

In [None]:
correct_by_accession = (df.loc[df.lbl_pred == 118, :]).groupby('accession')
count_correct_by_accession = correct_by_accession['lbl_pred'].count()
count_correct_by_accession

In [None]:
accuracy_by_accession = count_correct_by_accession / count_by_accession
accuracy_by_accession

In [None]:
display(accuracy_by_accession.sort_values(ascending=True)[:20])
display(accuracy_by_accession.sort_values(ascending=False)[:20])

In [None]:
read_ids = list(df.index)
pattern = r'^@(?P<refseqid>(?P<refseq_accession>[a-zA-Z0-9]+).*)-(?P<readnb>\d+)$'

ids_set = set([re.sub(pattern, r'\g<refseqid>', s) for s in read_ids])
accessions_set = set([re.sub(pattern, r'\g<refseq_accession>', s) for s in read_ids])

In [None]:
len(ids_set), len(accessions_set)

In [None]:
mapping_accession_refseq_id = {}
for accession in accessions_set:
    temp_accession = None
    for read_id in ids_set:
        if accession in read_id:
            if temp_accession is None:
                temp_accession = accession
                mapping_accession_refseq_id[accession] = read_id
                continue
            else:
                print(f"{accession} in two read ids: {temp_accession} and {read_id}")
                continue
mapping_accession_refseq_id

In [None]:
accuracy_by_accession.index = [mapping_accession_refseq_id[idx] for idx in accuracy_by_accession.index]

In [None]:
accuracy_by_accession.sort_values(ascending=False)[:33]

In [None]:
accuracy_by_accession.sort_values(ascending=True)[:20]

## Analyse same prediction series

Define the top k predicted label

In [None]:
cols = 'top_5_lbl_pred_0 top_5_lbl_pred_1 top_5_lbl_pred_2 top_5_lbl_pred_3 top_5_lbl_pred_4'.split(' ')
uniques, counts = np.unique(df.loc[:, cols].values.reshape(5 * df.shape[0], -1), return_counts=True)
uniques, counts

In [None]:
sorted_idxs = np.flip(np.argsort(counts))

In [None]:
k = 15
top_k_lbl = uniques[sorted_idxs[:k]].tolist()
coverage = (counts[sorted_idxs[:k]]/counts.sum()).sum()
print(f"The following {k} labels represents {coverage*100:2.0f}% of all predicted labels")
print(top_k_lbl)

In [None]:
json

In [None]:
def extract_top_k_lengths(d, top_k):
    top_k_dict = {}
    for lbl in top_k:
        top_k_dict[lbl] = d.get(lbl, [])
    return top_k_dict

extract_top_k_lengths(df.series.to_list()[0], top_k_lbl)

## Analyse prediction probabilities

### Define functions

In [None]:
def plot_label_probs(probs, top_preds, targets):
    nb_seq, _ , nb_lbls = probs.shape
    assert nb_lbls == 187
    fig = plt.figure(figsize=(16, 2 * nb_seq - 1));
    # fig.suptitle('Suptitle')
    axs = []
    for i in range(nb_seq):
        if top_preds[i, 0] in targets: cmap = 'YlGn_r'
        else: cmap = 'YlOrRd_r'
        axs.append(plt.subplot2grid((nb_seq,1), (i,0), rowspan=1, colspan=1))
        # axs[i].axis('off')
        axs[i].imshow(probs[i,:,:], cmap=cmap, aspect='auto')
        axs[i].set_title(f"Seq {i+1} - Pred: {top_preds[i,0]} - Top Predictions; {top_preds[i]}")

    fig.subplots_adjust(top=1.5, bottom=0.2, hspace=0.5)
    plt.show()

## Handle entire result file

Next cell load the entire result file. It takes a while to load.

If the file is too big, it may fail. In that case, load the file in chunks and handle each chunck separately (see lower cells).

In [None]:
paths = files_in_tree(path=p2fastq.parent)

In [None]:
paths[2]

In [None]:
# p2resfile = pfs.project_root /'data/ncov_data/reads/yf/yf-mapped-results-2024-04-29_17_31_10.csv'
p2resfile = paths[2]

df = pd.read_csv(p2resfile, index_col=0)
print(df.shape)
df.head()

In [None]:
# p2probsfile = pfs.project_root /'data/ncov_data/reads/yf/yf-mapped-probs-2024-04-29_17_27_31.csv'
# p2probsfile = paths[5]
fnbr = 0
assert paths[fnbr].is_file()

print(f"Reading file {paths[fnbr]}")
df_probs = pd.read_hdf(paths[fnbr], key='df')
print(df_probs.shape)
df_probs.head()

In [None]:
df_probs.index

In [None]:
p2hdf5 = paths[fnbr].parent / f"{paths[fnbr].stem}.h5"
assert p2hdf5.suffix == '.h5'
df_probs.to_hdf(p2hdf5, key='df', mode='w', format='table', complevel=0)

In [None]:
187 * 101

df_probs.dtypes

In [None]:
probs_per_kmer = df_probs[prob_cols].to_numpy().reshape(df_probs.shape[0], -1, 187)
top_preds = df[top_pred_cols].to_numpy()
preds = top_preds[:, 0]

In [None]:
correct_pred_idxs = preds == 118
incorrect_pred_idxs = preds != 118
true_positives = correct_pred_idxs.sum()
false_negatives = incorrect_pred_idxs.sum()
accuracy = true_positives/(true_positives + false_negatives)
print(f"True positives:  {true_positives}")
print(f"False negatives: {false_negatives}")
print(f"Accuracy:        {accuracy:.2%}")

### Plot prediction probabilities

#### Mixed predictions

Show profile of 101 label probabilities of each 50-mer corresponding to a k-mer read

There are about 17k reads, do not use this function for more than 50 reads at the time

In [None]:
first_seq = 0
nb_seq = 25
s = slice(first_seq, first_seq+nb_seq)

plot_label_probs(probs_per_kmer[s,:,:], top_preds[s], [118])

#### True Positive Only

In [None]:
first_seq = 1000
nb_seq = 25
s = slice(first_seq, first_seq+nb_seq)

plot_label_probs(probs_per_kmer[correct_pred_idxs][s,:,:], top_preds[correct_pred_idxs][s], [118])

#### False Positive Only

In [None]:
first_seq = 1000
nb_seq = 25
s = slice(first_seq, first_seq+nb_seq)

plot_label_probs(probs_per_kmer[incorrect_pred_idxs][s,:,:], top_preds[incorrect_pred_idxs][s], [118])

## Load result in chunks

In [None]:
p2resfile = pfs.project_root /'data/ncov_data/reads/yf/yf-mapped-results.csv'

def result_generator(p2file, chunksize=1000):
    for chunk in pd.read_csv(p2file, chunksize=chunksize, index_col=0):
        yield chunk

def get_chunk(p2file, chunknb, chunksize=1000):
    gen = result_generator(p2file, chunksize=chunksize)
    for i in range(chunknb):
        df = next(gen)
    return df

In [None]:
chunk = get_chunk(p2resfile, chunknb=4, chunksize=1000)
chunk.shape

In [None]:
probs_per_kmer = chunk[prob_cols].to_numpy().reshape(chunk.shape[0], -1, 187)
top_preds = chunk[top_pred_cols].to_numpy()
preds = top_preds[:, 0]

In [None]:
correct_pred_idxs = preds == 118
incorrect_pred_idxs = preds != 118
true_positives = correct_pred_idxs.sum()
false_negatives = incorrect_pred_idxs.sum()
accuracy = true_positives/(true_positives + false_negatives)
print(f"True positives:  {true_positives}")
print(f"False negatives: {false_negatives}")
print(f"Accuracy:        {accuracy:.2%}")

#### Mixed Predictions

In [None]:
first_seq = 0
nb_seq = 25
s = slice(first_seq, first_seq+nb_seq)

plot_label_probs(probs_per_kmer[s,:,:], top_preds[s], [118])

#### True Positive Only

In [None]:
first_seq = 0
nb_seq = 25
s = slice(first_seq, first_seq + nb_seq)

plot_label_probs(probs_per_kmer[correct_pred_idxs][s,:,:], top_preds[correct_pred_idxs][s], [118])

#### False Positive Only

In [None]:
first_seq = 0
nb_seq = 25
s = slice(first_seq, first_seq+nb_seq)

plot_label_probs(probs_per_kmer[incorrect_pred_idxs][s,:,:], top_preds[incorrect_pred_idxs][s], [118])

# Work with HDF5 Data Stores

### Create a DataFrame from an existing probability file

Create a `read_store` to read some of the probabilities

In [None]:
paths = files_in_tree(path=p2fastq.parent)
paths[4].suffix == '.hdf5'

In [None]:
p2probs = paths[4]
p2probs

In [None]:
read_store = HDFStore(path=p2probs, mode='r')
keys = read_store.keys()
keys[:8]

In [None]:
total_nb_labels = len(read_store.keys())
total_nb_labels

In [None]:
nrows = 4096
nblabels = total_nb_labels
keys = read_store.keys()
df = read_store.select(key=keys[0], start=0, stop=nrows)
for k in tqdm(keys[1:]):
    df = pd.concat([df, read_store.select(key=k, start=0, stop=nrows)], axis=1)

In [None]:
df.shape

In [None]:
p2hdf5 = Path("temp.hdf5")
if p2hdf5.exists(): p2hdf5.unlink()

nblabels = 10
nrows = 32

# Store probabilities in hdf5 format, with one label per key

with HDFStore(path=p2hdf5, mode='a') as store:

    # Replace '/' in index as it is not supported by hdf5
    df.index = [idx.replace('/','-') for idx in df.index]
    cols = df.columns

    for n in trange(nblabels):
        label_cols = [c for c in cols if c.endswith(f"ProbL{n}")]
        store.put(
            key=f'label_{n:03d}', 
            value=df.iloc[:nrows, :].loc[:,label_cols], 
            format='table', 
            append=True, 
            index=True,
            data_columns=True
        )

    print(store.keys())

In [None]:
with HDFStore(path=p2hdf5, mode='r') as store:
    print(store.get(key='/label_001').shape)

In [None]:
with HDFStore(path=p2hdf5, mode='a') as store:

    # Replace '/' in index as it is not supported by hdf5
    df.index = [idx.replace('/','-') for idx in df.index]
    cols = df.columns

    for n in trange(nblabels):
        label_cols = [c for c in cols if c.endswith(f"ProbL{n}")]
        store.put(
            key=f'label_{n:03d}', 
            value=df.iloc[100:100+nrows, :].loc[:,label_cols], 
            format='table', 
            append=True, 
            index=True,
            data_columns=True
        )

    print(store.keys())

In [None]:
with HDFStore(path=p2hdf5, mode='r') as store:
    print(store.get(key='/label_001').shape)

In [None]:
with HDFStore(path=p2hdf5, mode='a') as store:

    # Replace '/' in index as it is not supported by hdf5
    df.index = [idx.replace('/','-') for idx in df.index]
    cols = df.columns

    for n in trange(nblabels):
        label_cols = [c for c in cols if c.endswith(f"ProbL{n}")]
        store.put(
            key=f'label_{n:03d}', 
            value=df.iloc[300:300+nrows, :].loc[:,label_cols], 
            format='table', 
            append=True, 
            index=True,
            data_columns=True
        )

    print(store.keys())

In [None]:
with HDFStore(path=p2hdf5, mode='r') as store:
    print(store.get(key='/label_001').shape)