# Build inference result dataset

**Objective**: create a dataset with the results of inference on simulated reads, using a subset of (or all) the sequences as reference.

**Constraints**: cannot load the totality of the data into memory for inference because there are too many reads:
- estimated 50k reads per sequence; for the 3k sequences, this means 150 million reads to handle!

**Pipeline Idea**:

Use a large fasta file and iterate over manageable chunks. For each chunk:
- create a fasta file for the chunk (`.fa`)
- create simreads with Art Illumina (`fq`, `aln`)
- preprocess simreads into ds and info
- use model to infer taxonomy and position
- build the inference result dataset including:
    - predicted result
    - ref sequence metadata
    - position ground truth
- save partial inference result dataset as parquet

When iteration is done:
- merge all partial inference result datasets into one single dataset


**Intermediate steps**: use groups of sequences to experiment and still get some statistically relevant info

# Setup


In [None]:
from ecutilities.ipython import nb_setup, pandas_nrows_ncols
# Run the notebook setup helper from ecutilities; the text following this cell
# ("Set autoreload mode") suggests it enables IPython autoreload — confirm in ecutilities.
nb_setup()

Set autoreload mode


In [None]:
import numpy as np
import os
import pandas as pd
import tempfile
from nbdev import show_doc
from pathlib import Path
from pprint import pprint
from metagentools.art import ArtIllumina
from metagentools.cnn_virus.data import FastaFileReader, FastaFileIterator, parse_metadata_fasta_cov

# Build pipeline

**Pipeline Idea**:

Use a large fasta file and iterate over manageable chunks.

For each chunk:

- create a fasta file for the chunk (`.fa`)
- create simreads with Art Illumina (`fq`, `aln`)
- preprocess simreads into model input data and metadata
- use model to infer taxonomy and position
- build the inference result dataset including:
    - predicted result
    - ref sequence metadata
    - position ground truth
- save partial inference result dataset as parquet

When iteration is done:
- merge all partial inference result datasets into one single dataset

## Imports and paths

In [None]:
# Restrict TensorFlow's C++ log output ('3' is the most restrictive of the listed levels).
# Set here, before the cell that imports tensorflow — presumably so that it is
# already in the environment when TensorFlow initialises.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # or any {'0', '1', '2'}

In [None]:
from ecutilities.core import validate_path
from metagentools.art import ArtIllumina, _run
from metagentools.core import TextFileBaseIterator
from metagentools.cnn_virus.architecture import load_model
from metagentools.cnn_virus.data import create_infer_ds_from_fastq, strings_to_tensors, FastqFileReader, AlnFileReader
from tensorflow.data import TextLineDataset

Path to source fasta file

In [None]:
# Root directory of the coronavirus data, resolved from the notebook's working directory
p2cov_data = Path("../../../data/cov_data/").resolve()

# Sub-directory holding fasta files for groups of sequences; it must already exist
groups_subdir = 'groups_1'
p2groups = p2cov_data.joinpath(groups_subdir)
assert p2groups.is_dir()

# Reference fasta file fed to the pipeline. Smaller alternatives for experiments:
#   p2groups / 'seqs_alphacoronavirus.fa'
#   p2groups / 'seqs_porcine_deltacoronavirus.fa'
p2refs = p2cov_data.joinpath('cov_virus_sequences.fa')

## Pipeline Functions

In [None]:
def create_small_fasta(chunck):
    """Write the text of one fasta chunk to the temporary fasta file `p2fa`.

    `p2fa` is a notebook-level path defined in the "Run pipeline" cell; any
    previous content of that file is overwritten.
    """
    print(">>> Preparing small fasta file")
    p2fa.write_text(chunck)
#     with open(p2fa, 'r') as fp:
#         print(fp.readline()[:80])

In [None]:
def simreads():
    """Simulate reads from the current small fasta file with ART Illumina.

    Both the input fasta (`p2fa`) and the generated files live in the temporary
    directory `p2tdir` (notebook-level names). Returns the `ArtIllumina` wrapper
    so that the caller can retrieve the files produced by this run.
    """
    print(">>> Simulating reads")
    simulator = ArtIllumina(
        path2app='art_illumina',
        input_dir=p2tdir,
        output_dir=p2tdir,
        app_in_system_path=True,
    )
    # Simulation settings: `single` sim type, 50-long reads, fold of 10
    sim_options = dict(
        input_file=p2fa.name,
        output_seed='simreads',
        sim_type='single',
        read_length=50,
        fold=10,
        overwrite=True,
        print_output=False,
    )
    simulator.sim_reads(**sim_options)
    return simulator

In [None]:
def build_input_files(art):
    """Turn the fastq file of the latest simulation into the model's inference input.

    Returns the `(p2ds, reads_info)` pair produced by `create_infer_ds_from_fastq`,
    with the dataset file written into the temporary directory `p2tdir`.
    """
    print(">>> Preparing input file for cnn virus model")
    # The two output paths are sorted and the second one is taken as the fastq file
    # (presumably because the `.aln` name sorts before the `.fq` name — confirm).
    _, fastq_path = sorted(art.get_last_output_files())
    ds_path, info = create_infer_ds_from_fastq(
        fastq_path,
        output_dir=p2tdir,
        overwrite_ds=True,
        nsamples=None,
    )
    return ds_path, info

- https://stackoverflow.com/questions/39758094/clearing-tensorflow-gpu-memory-after-model-execution


In [None]:
# Cache of already loaded models, keyed by the path of the saved model file.
# `predict` is called once per chunk; reloading the same `.h5` file every time is
# loop-invariant work and keeps allocating new model state.
_LOADED_MODELS = {}


def predict(p2ds):
    """Run the pretrained original CNN Virus model on a text dataset file.

    Parameters
    ----------
    p2ds : path to the text file with one read per line, as produced by
        `create_infer_ds_from_fastq`.

    Returns
    -------
    numpy array with, for each read, the index of the highest score in the
    model's first output (presumably the taxonomy label — confirm against the
    model architecture).
    """
    print(f">>> Running original model in inference")
    p2saved = Path('/home/vtec/projects/bio/metagentools/data/saved/cnn_virus_original/pretrained_model.h5')

    # Load the model from disk only on the first call, then reuse it
    model = _LOADED_MODELS.get(p2saved)
    if model is None:
        model = load_model(p2saved)
        _LOADED_MODELS[p2saved] = model

    text_ds = TextLineDataset(p2ds).batch(32)
    ds = text_ds.map(strings_to_tensors)
    prob_preds = model.predict(ds, verbose=1)
    # Only the first model output is used to derive the predicted label
    lbl_preds = np.argmax(prob_preds[0], axis=1)
    return lbl_preds

In [None]:
def save_result_ds(preds, reads_info, iteration=None, output_dir=None):
    """Save the inference results of one chunk as a parquet file.

    Parameters
    ----------
    preds : predicted labels, one per read.
    reads_info : per-read metadata; its first column is used as the index and
        the remaining ones are named `refseqid`, `read_position` and
        `refseq_strand` (assumes four columns — verify against
        `create_infer_ds_from_fastq`).
    iteration : index of the current chunk, used in the file name. Defaults to
        the notebook-level loop counter `i`.
    output_dir : directory where the parquet file is written. Defaults to the
        project's `cov_results/cnn_virus` directory. Created if missing.

    Returns
    -------
    Path of the parquet file that was written.
    """
    print(f">>> Saving inference result DataFrame for this iteration")
    if iteration is None:
        # Backward compatible: rely on the loop counter of the "Run pipeline" cell
        iteration = i
    if output_dir is None:
        output_dir = Path('/home/vtec/projects/bio/metagentools/data/cov_results/cnn_virus')
    p2cov_results = Path(output_dir)
    # Fail early on a bad location instead of after the whole inference has run
    p2cov_results.mkdir(parents=True, exist_ok=True)

    # Append the predictions as the last column next to the reads metadata
    data = np.column_stack((reads_info, preds))
    cols = ['refseqid', 'read_position', 'refseq_strand', 'pred_label']
    results = pd.DataFrame(data=data[:, 1:], index=data[:, 0], columns=cols)

    # The temporary directory name makes the file name unique per pipeline run
    p2results = p2cov_results / f"results_{iteration:04d}_{p2tdir.name}.parquet"
    results.to_parquet(p2results)
    return p2results

## Run pipeline

In [None]:
# Number of reference sequences handled per iteration. The iterator is asked for
# `2 * nseqs` lines per chunk (assumes each sequence takes two lines in the fasta
# file: one header line and one sequence line — confirm for other fasta files).
nseqs = 50
it = TextFileBaseIterator(path=p2refs, nlines=2 * nseqs)

# create a temporary directory where to save intermediate files
tdir = tempfile.TemporaryDirectory(prefix='infer_', suffix=None, dir=Path().resolve())
p2tdir = Path(tdir.name)
print(p2tdir)
p2fa = p2tdir / 'small_fasta.fa'
assert p2tdir.is_dir()

# paths of the partial result files, one per iteration
infresult_paths = []

try:
    # for each iteration: small fasta -> simulated reads -> model input -> predictions -> parquet
    # NOTE: `i`, `p2tdir` and `p2fa` are read as globals by the pipeline functions.
    for i, chunck in enumerate(it):
        print(f"Iteration {i+1}")
        create_small_fasta(chunck)
        art = simreads()
        p2ds, reads_info = build_input_files(art)
        preds = predict(p2ds)
        p = save_result_ds(preds, reads_info)
        infresult_paths.append(p)
finally:
    # always remove the intermediate files, even when an iteration fails
    tdir.cleanup()

/home/vtec/projects/bio/metagentools/nbs/cnn_virus/02_inference_original_model/infer_s4gjodd8
Iteration 1
>>> Preparing small fasta file
>>> Simulating reads
Ready to operate with art: art_illumina
Input files from : /home/vtec/projects/bio/metagentools/nbs/cnn_virus/02_inference_original_model/infer_s4gjodd8
Output files to :  /home/vtec/projects/bio/metagentools/nbs/cnn_virus/02_inference_original_model/infer_s4gjodd8
>>> Preparing input file for cnn virus model
Dataset with 281,040 reads
>>> Running original model in inference
>>> Saving inference result DataFrame for this iteration
Iteration 2
>>> Preparing small fasta file
>>> Simulating reads
Ready to operate with art: art_illumina
Input files from : /home/vtec/projects/bio/metagentools/nbs/cnn_virus/02_inference_original_model/infer_s4gjodd8
Output files to :  /home/vtec/projects/bio/metagentools/nbs/cnn_virus/02_inference_original_model/infer_s4gjodd8
>>> Preparing input file for cnn virus model
Dataset with 285,447 reads
>>> R

>>> Preparing input file for cnn virus model
Dataset with 282,510 reads
>>> Running original model in inference
>>> Saving inference result DataFrame for this iteration
Iteration 16
>>> Preparing small fasta file
>>> Simulating reads
Ready to operate with art: art_illumina
Input files from : /home/vtec/projects/bio/metagentools/nbs/cnn_virus/02_inference_original_model/infer_s4gjodd8
Output files to :  /home/vtec/projects/bio/metagentools/nbs/cnn_virus/02_inference_original_model/infer_s4gjodd8
>>> Preparing input file for cnn virus model
Dataset with 288,805 reads
>>> Running original model in inference
>>> Saving inference result DataFrame for this iteration
Iteration 17
>>> Preparing small fasta file
>>> Simulating reads
Ready to operate with art: art_illumina
Input files from : /home/vtec/projects/bio/metagentools/nbs/cnn_virus/02_inference_original_model/infer_s4gjodd8
Output files to :  /home/vtec/projects/bio/metagentools/nbs/cnn_virus/02_inference_original_model/infer_s4gjodd8


>>> Preparing input file for cnn virus model
Dataset with 286,206 reads
>>> Running original model in inference
>>> Saving inference result DataFrame for this iteration
Iteration 31
>>> Preparing small fasta file
>>> Simulating reads
Ready to operate with art: art_illumina
Input files from : /home/vtec/projects/bio/metagentools/nbs/cnn_virus/02_inference_original_model/infer_s4gjodd8
Output files to :  /home/vtec/projects/bio/metagentools/nbs/cnn_virus/02_inference_original_model/infer_s4gjodd8
>>> Preparing input file for cnn virus model
Dataset with 285,650 reads
>>> Running original model in inference
>>> Saving inference result DataFrame for this iteration
Iteration 32
>>> Preparing small fasta file
>>> Simulating reads
Ready to operate with art: art_illumina
Input files from : /home/vtec/projects/bio/metagentools/nbs/cnn_virus/02_inference_original_model/infer_s4gjodd8
Output files to :  /home/vtec/projects/bio/metagentools/nbs/cnn_virus/02_inference_original_model/infer_s4gjodd8


>>> Preparing input file for cnn virus model
Dataset with 285,855 reads
>>> Running original model in inference
>>> Saving inference result DataFrame for this iteration
Iteration 46
>>> Preparing small fasta file
>>> Simulating reads
Ready to operate with art: art_illumina
Input files from : /home/vtec/projects/bio/metagentools/nbs/cnn_virus/02_inference_original_model/infer_s4gjodd8
Output files to :  /home/vtec/projects/bio/metagentools/nbs/cnn_virus/02_inference_original_model/infer_s4gjodd8
>>> Preparing input file for cnn virus model
Dataset with 288,310 reads
>>> Running original model in inference
>>> Saving inference result DataFrame for this iteration
Iteration 47
>>> Preparing small fasta file
>>> Simulating reads
Ready to operate with art: art_illumina
Input files from : /home/vtec/projects/bio/metagentools/nbs/cnn_virus/02_inference_original_model/infer_s4gjodd8
Output files to :  /home/vtec/projects/bio/metagentools/nbs/cnn_virus/02_inference_original_model/infer_s4gjodd8


>>> Preparing input file for cnn virus model
Dataset with 286,849 reads
>>> Running original model in inference
>>> Saving inference result DataFrame for this iteration
Iteration 61
>>> Preparing small fasta file
>>> Simulating reads
Ready to operate with art: art_illumina
Input files from : /home/vtec/projects/bio/metagentools/nbs/cnn_virus/02_inference_original_model/infer_s4gjodd8
Output files to :  /home/vtec/projects/bio/metagentools/nbs/cnn_virus/02_inference_original_model/infer_s4gjodd8
>>> Preparing input file for cnn virus model
Dataset with 284,466 reads
>>> Running original model in inference
>>> Saving inference result DataFrame for this iteration
Iteration 62
>>> Preparing small fasta file
>>> Simulating reads
Ready to operate with art: art_illumina
Input files from : /home/vtec/projects/bio/metagentools/nbs/cnn_virus/02_inference_original_model/infer_s4gjodd8
Output files to :  /home/vtec/projects/bio/metagentools/nbs/cnn_virus/02_inference_original_model/infer_s4gjodd8


# Others