In [1]:
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1" # Uncomment to disable GPU
from model import Model, DatasetName, load_model, remove_model
import numpy as np

# Remember the notebook's working directory so it can be restored below.
__ORIG_WD__ = os.getcwd()

# covid19_genome is imported while ../data_collectors/ is the working
# directory (presumably so the module can be found and/or resolve its own
# relative paths - verify against the module).
os.chdir(f"{__ORIG_WD__}/../data_collectors/")
try:
    from covid19_genome import Covid19Genome
finally:
    # Always switch back, even when the import fails; otherwise the kernel
    # stays in data_collectors/ and re-running this cell would record the
    # wrong "original" directory.
    os.chdir(__ORIG_WD__)


2023-08-20 14:12:14.116365: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-20 14:12:14.142632: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Reuse the saved model when it can be loaded; otherwise create it and build
# its datasets from the locally available lineages.
try:
    model = load_model("covid19-reads-1024examples")
except Exception:
    # NOTE(review): presumably load_model raises when no saved model exists;
    # narrow this to the specific exception type once it is confirmed.
    covid19_genome = Covid19Genome()
    lineages = covid19_genome.getLocalLineages(1024)
    lineages.sort()

    def get_dataset(lower, upper):
        """Return (lineage, accessions path) pairs for lineages[lower:upper].

        A new list is built on every call, so calling this more than once
        never accumulates duplicate entries in a shared list.
        """
        return [
            (lineage, covid19_genome.getLocalAccessionsPath(lineage))
            for lineage in lineages[lower:upper]
        ]

    # Fraction of the data assigned to each split.
    portions = {
        DatasetName.trainset.name: 0.8,
        DatasetName.validset.name: 0.1,
        DatasetName.testset.name: 0.1
    }

    # Only the first 200 lineages (in sorted order) are used.
    dataset = get_dataset(0, 200)
    model = Model("covid19-reads-1024examples")
    model.create_datasets(dataset, portions, minhash_dataset=True)

2023-08-20 14:12:15.108095: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-08-20 14:12:15.123411: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-08-20 14:12:15.123571: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [3]:
# Experiment settings: both values are embedded in the ML model name, and
# coverage is additionally passed to model.set_coverage() before training.
coverage = 2
ml_model_depth = 4

In [None]:
def get_model_name(ml_model_depth, coverage):
    """Compose the ML model identifier ``vit.<ml_model_depth>.<coverage>x``.

    Both arguments are interpolated with their default string formatting.
    """
    name = f"vit.{ml_model_depth}.{coverage}x"
    return name

# Name under which the ML model is added, trained and used for prediction
# in the cells below.
ml_model_name = get_model_name(ml_model_depth, coverage)

In [None]:
# Try to add the ViT model; newly_added records whether this run created it
# (a later cell uses the flag to stop before training a brand-new model).
newly_added = True
try:
    # NOTE(review): ml_model_depth (used in the model name) is not passed in
    # these hyper-parameters, and encoder_repeats is hard-coded to 2 - confirm
    # whether the two are meant to be linked.
    model.add_ml_model("vit", ml_model_name, hps={
        "optimizer": {
            "name": "AdamW",
            "params": {
                "learning_rate": 0.001,
            },
        },
        "encoder_repeats": 2,
        "batch_size": 256,
        "regularizer": {
            "name": "l2",
            "params": {
                "l2": 0.0003
            }
        },
        "d_key": 128,
        "d_value": 128,
        "d_ff": 1024+256,
        "dropout": 0.2,
    })
except Exception as err:
    # Catch Exception rather than using a bare except so KeyboardInterrupt and
    # SystemExit still propagate, and show the real error so a genuine failure
    # is not mistaken for "already exists".
    newly_added = False
    print("Model already exists")
    print(f"(add_ml_model raised {type(err).__name__}: {err})")

In [None]:
# Print what model.list_ml_models() reports, to check that the ML model
# named above is present.
models = model.list_ml_models()
print(models)


In [None]:
# Deliberately stop a "Run All" right after a brand-new model was added, so
# transfer learning can be set up by hand before training starts.
# Example: model.transfer("vit.2.00001.adamw.coverage2", "vit.2.00001.adamw.coverage1", False)
if newly_added:
    # Raise explicitly instead of `assert False`: assert statements are
    # stripped when Python runs with -O, which would silently skip this stop.
    raise RuntimeError("Please consider doing transfer learning")

In [None]:
# model.change_ml_hps(ml_model_name, {
#     "regularizer": {
#         "name": "l2",
#         "params": {
#             "l2": 0.00005,
#         },
#     },
#     "optimizer": {
#         "name": "AdamW",
#         "params": {
#             "learning_rate": 0.00001,
#         },
#     },
#     # "batch_size": 320,
# })

In [9]:
# Set coverage, substitution rate and insertion rate on the model, then train
# the ML model registered as ml_model_name with epochs=1000.
# NOTE(review): units/meaning of the two rates are defined in Model (not
# visible here) - confirm before changing them.
model.set_coverage(coverage)
model.set_substitution_rate(0.01)
model.set_insertion_rate(0.005)
model.train(ml_model_name, epochs=1000)

In [None]:
# Run the ML model on the reads under the given directory. The evaluation
# cell below iterates over `results` and indexes it by the iterated key.
# NOTE(review): the directory is named reads-4x while `coverage` is set to 2
# earlier in the notebook - confirm the mismatch is intended.
results = model.predict_reads(ml_model_name, "../genome_builder/data/reads-4x/")

In [None]:
# Re-create the genome helper and the sorted lineage list here: the
# model-loading cell only defines them when load_model fails, so they may not
# exist when a saved model was loaded.
covid19_genome = Covid19Genome()
lineages = covid19_genome.getLocalLineages(1024)
lineages.sort()

In [None]:
# Map each accession id (last path component, truncated at its first ".")
# to the lineage it was listed under. If an id appears under several
# lineages, the last lineage visited wins.
accession_to_lineage_map = {}
for lineage in lineages:
    for accession in covid19_genome.getLocalAccessions(lineage):
        accession_id = accession.split("/")[-1].split(".")[0]
        accession_to_lineage_map[accession_id] = lineage

In [None]:
import tensorflow as tf

# Compare the top-1 predicted label of every entry in `results` with the
# lineage recorded for its accession, counting hits and misses.
correct = 0
incorrect = 0
labels = model.datasets["trainset"].get_labels()
for result in results:
    # Derive the accession id (last path component, truncated at its first
    # ".") BEFORE it is used for the lookup; the original read it one line
    # too early, raising NameError on the first iteration and using the
    # previous entry's id afterwards.
    reads_accession = result.split("/")[-1].split(".")[0]

    # Compute top-1 once and reuse it for both the printout and the index.
    top1 = tf.math.top_k(results[result], k=1)
    print(top1)
    result_index = top1.indices.numpy()[0][0]
    print(labels[result_index])

    expected_lineage = accession_to_lineage_map[reads_accession]
    print(f"real result: {expected_lineage}")
    if labels[result_index] == expected_lineage:
        correct += 1
    else:
        incorrect += 1


In [None]:
# Number of predictions whose top-1 label matched the expected lineage.
correct