In [1]:
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1" # Uncomment to disable GPU
from model import Model, DatasetName, load_model, remove_model
import numpy as np
import glob

# Remember the notebook's working directory so it can always be restored.
__ORIG_WD__ = os.getcwd()

# covid19_genome is imported while the working directory is ../data_collectors.
# NOTE(review): presumably the module resolves files relative to the current
# directory at import time -- confirm before replacing this with sys.path.
os.chdir(os.path.join(__ORIG_WD__, "..", "data_collectors"))
try:
    from covid19_genome import Covid19Genome
finally:
    # Restore the directory even when the import fails; otherwise every later
    # relative path (and a re-run of this cell) would start from the wrong place.
    os.chdir(__ORIG_WD__)


2023-08-28 13:34:47.779802: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-28 13:34:47.807050: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def build_dataset():
    """Group the read files of ``new-reads-4x`` by the lineage of their accession.

    Every ``*.fq`` file under ``../genome_builder/data/new-reads-4x/`` is
    assigned to a lineage through the accession id at the start of its file
    name (the text before the first ``.`` and the first ``-``). Files whose
    accession is not among the local accessions are skipped, which matches how
    the evaluation loop of this notebook treats unknown accessions.

    Returns:
        A list of ``(lineage, accession_paths)`` tuples, one per lineage that
        has at least one read file; ``accession_paths`` holds absolute paths.
    """
    # glob.glob gives no ordering guarantee; sorting keeps the dataset
    # reproducible between runs and machines.
    accessions_paths = sorted(
        os.path.abspath(accessions_path)
        for accessions_path in glob.glob("../genome_builder/data/new-reads-4x/*.fq")
    )

    # extract lineages
    covid19_genome = Covid19Genome()
    lineages = covid19_genome.getLocalLineages(1024)
    lineages.sort()

    # build accession lineage map; keys are normalised exactly like in the
    # evaluation cell so that they are comparable with ids taken from file names
    accession_lineage_map = {}
    for lineage in lineages:
        for accession in covid19_genome.getLocalAccessions(lineage):
            accession_id = accession.split("/")[-1].split(".")[0]
            accession_lineage_map[accession_id] = lineage

    # build lineage accession path map
    lineage_accession_path_map = {}
    for accession_path in accessions_paths:
        # basename instead of split('/'): the paths are OS-specific absolute paths
        accession = os.path.basename(accession_path).split('.')[0].split('-')[0]
        if accession not in accession_lineage_map:
            # A read file without a known lineage cannot be labelled.
            continue
        lineage = accession_lineage_map[accession]
        lineage_accession_path_map.setdefault(lineage, []).append(accession_path)

    return list(lineage_accession_path_map.items())

In [3]:
# model_name = f"covid19-art-1024examples"
# try:
#     model = load_model(model_name)
# except Exception:
#     dataset = build_dataset()

#     portions = {
#         DatasetName.trainset.name: 0.8,
#         DatasetName.validset.name: 0.1,
#         DatasetName.testset.name: 0.1
#     }

#     model = Model("covid19-art-1024examples")
#     model.create_datasets(dataset, portions)

In [4]:
# Reuse the persisted "covid19-new-1024examples" model when it can be loaded;
# otherwise create it and build its datasets from the local genomes.
# NOTE(review): any load failure (not only "does not exist") triggers a rebuild,
# because the exception type raised by load_model is not visible here.
try:
    model = load_model("covid19-new-1024examples")
except Exception:
    covid19_genome = Covid19Genome()
    lineages = covid19_genome.getLocalLineages(1024)
    lineages.sort()

    def get_dataset(lower, upper):
        """Return ``(lineage, accessions_path)`` pairs for ``lineages[lower:upper]``.

        A fresh list is built on every call, so calling the helper again never
        accumulates entries from a previous call.
        """
        return [
            (lineage, covid19_genome.getLocalAccessionsPath(lineage))
            for lineage in lineages[lower:upper]
        ]

    # Fractions of the data assigned to each dataset split.
    portions = {
        DatasetName.trainset.name: 0.8,
        DatasetName.validset.name: 0.1,
        DatasetName.testset.name: 0.1
    }

    # Only the first 200 lineages (in sorted order) are used.
    dataset = get_dataset(0, 200)
    model = Model("covid19-new-1024examples")
    model.create_datasets(model.get_ds_types()[0], dataset, portions)

2023-08-28 13:34:51.175096: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-08-28 13:34:51.192395: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-08-28 13:34:51.192534: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [5]:
# Hyper-parameters accepted by the first available ML model structure
# (each key is reported as 'required' or 'optional').
print(model.get_ml_model_structure_hps(model.get_ml_model_structures()[0]))
print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
# Properties of the datasets attached to the model.
print(model.get_ds_props())

{'d_model': 'required', 'd_val': 'required', 'd_key': 'required', 'd_ff': 'required', 'heads': 'required', 'dropout_rate': 'optional', 'regularizer': 'optional', 'initializer': 'optional', 'activation': 'optional', 'encoder_repeats': 'required', 'labels': 'required'}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
{'coverage': 5, 'read_length': 128, 'frag_len': 128, 'num_frags': 256}


In [6]:
# Depth of the ML model; it is passed as "encoder_repeats" when the model is added.
ml_model_depth = 1
# Coverage the model is named after; it becomes the "<coverage>x" suffix of the name.
coverage = 4

In [7]:
def get_model_name(ml_model_depth, coverage):
    """Build the ML model name ``vit.<ml_model_depth>.<coverage>x``.

    Args:
        ml_model_depth: value placed in the middle segment of the name.
        coverage: value placed before the trailing ``x``.

    Returns:
        The dot-separated model name as a string.
    """
    name_parts = ("vit", f"{ml_model_depth}", f"{coverage}x")
    return ".".join(name_parts)

# Name used below to add, train and query the ML model for the configured
# depth and coverage.
ml_model_name = get_model_name(ml_model_depth, coverage)

In [8]:
# Show which hyper-parameters the first available model structure accepts.
first_structure = model.get_ml_model_structures()[0]
print(model.get_ml_model_structure_hps(first_structure))

{'d_model': 'required', 'd_val': 'required', 'd_key': 'required', 'd_ff': 'required', 'heads': 'required', 'dropout_rate': 'optional', 'regularizer': 'optional', 'initializer': 'optional', 'activation': 'optional', 'encoder_repeats': 'required', 'labels': 'required'}


In [10]:
# Register the ML model under `ml_model_name`; `newly_added` records whether
# this cell created it (True) or it was already present (False).
newly_added = True
try:
    model.add_ml_model(ml_model_name, hps={
        "structure": model.get_ml_model_structures()[0],
        "d_model": model.get_ds_props()["frag_len"],
        "d_val": 128,
        "d_key": 128,
        "heads": 8,
        "d_ff": 1024+256,
        "labels": len(model.get_labels()),
        "activation": "relu",
        "optimizer": {
            "name": "AdamW",
            "params": {
                "learning_rate": 0.001,
            },
        },
        "encoder_repeats": ml_model_depth,
        "regularizer": {
            "name": "l2",
            "params": {
                "l2": 0.0001
            }
        },
        # NOTE(review): the structure advertises an optional 'dropout_rate'
        # hyper-parameter, not 'dropout' -- confirm this key is actually used.
        "dropout": 0.2,
    })
except Exception:
    # Treat the failure as "already exists" only when the model is really
    # listed; any other error (e.g. invalid hyper-parameters) must surface
    # instead of being reported as an existing model.
    if ml_model_name not in model.list_ml_models():
        raise
    newly_added = False
    print("Model already exists")

Model already exists


In [11]:
# Names of the ML models currently known to the model object.
models = model.list_ml_models()
print(models)

['vit.1.4x']


In [12]:
# Stop here when the ML model was created in this session, as a reminder to
# consider the (currently disabled) transfer step below before training.
# An explicit raise is used because `assert` statements are removed when
# Python runs with -O; the exception type and message are unchanged.
if newly_added:
    raise AssertionError("Please consider doing transfer learning")
# model.transfer(get_model_name(ml_model_depth - 1, coverage), ml_model_name, False)

In [None]:
# model.change_ml_hps(ml_model_name, {
# #     "regularizer": {
# #         "name": "l2",
# #         "params": {
# #             "l2": 0.00005,
# #         },
# #     },
# #     "optimizer": {
# #         "name": "AdamW",
# #         "params": {
# #             "learning_rate": 0.00001,
# #         },
# #     },
# })

In [None]:
# Optional overrides of the read-simulation settings (currently disabled):
# model.set_coverage(coverage)
# model.set_substitution_rate(0.025)
# model.set_insertion_rate(0.015)
# model.set_deletion_rate(0.015)
# Train the selected ML model, passing epochs=1000.
model.train(ml_model_name, epochs=1000)

In [None]:
# Run the ML model on the reads under new-reads-4x. The evaluation loop below
# iterates `results` and indexes it by a path-like key ending in the read file name.
# NOTE(review): the directory hard-codes "4x" while `coverage` is configured
# separately -- keep the two in sync when changing the coverage.
results = model.predict_reads(ml_model_name, "../genome_builder/data/new-reads-4x/")

In [None]:
# (Re)create the genome helper and the sorted lineage list here: the same names
# are otherwise only assigned in the dataset-creation cell when load_model fails.
covid19_genome = Covid19Genome()
lineages = covid19_genome.getLocalLineages(1024)
lineages.sort()

In [None]:
# Map each local accession id to its lineage. The id is the last "/"-separated
# component of the accession, cut at its first "."; when an id occurs under
# several lineages, the lineage visited last wins.
accession_to_lineage_map = {
    accession.split("/")[-1].split(".")[0]: lineage
    for lineage in lineages
    for accession in covid19_genome.getLocalAccessions(lineage)
}

In [None]:
import tensorflow as tf

# Compare the top-1 prediction of every reads file with the lineage of the
# accession encoded in its file name, counting hits and misses.
correct = 0
incorrect = 0
labels = model.datasets["trainset"].get_labels()
for result in results:
    # Index of the highest-scoring label for this reads file.
    top_prediction = tf.math.top_k(results[result], k=1)
    result_index = top_prediction.indices.numpy()[0][0]
    print(labels[result_index])

    # The accession id is the file name up to its first "." and first "-".
    reads_file_name = result.split("/")[-1]
    reads_accession = reads_file_name.split(".")[0].split("-")[0]

    # Reads whose accession has no known lineage are not counted at all.
    if reads_accession in accession_to_lineage_map:
        real_lineage = accession_to_lineage_map[reads_accession]
        print(f"real result: {real_lineage}")

        if labels[result_index] == real_lineage:
            correct += 1
        else:
            incorrect += 1


In [None]:
# Accuracy over the reads that could be matched to a known lineage.
evaluated = correct + incorrect
if evaluated:
    print(f"accuracy = {correct / evaluated}")
else:
    # Nothing was evaluated (no reads, or no read matched a known accession);
    # report that instead of failing with ZeroDivisionError.
    print("accuracy = n/a (no reads with a known lineage were evaluated)")

In [None]:
# Number of reads files whose predicted lineage differed from the real one.
incorrect

In [None]:
# Name of the ML model used in this notebook run.
ml_model_name