In [None]:
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1" # Uncomment to disable GPU
from model import Model, DatasetName, load_model
import numpy as np

# Remember the notebook's working directory so it can be restored after the
# import below.
__ORIG_WD__ = os.getcwd()

# `ncbi_genome` is imported while the current directory is ../data_collectors/.
# NOTE(review): presumably this relies on the current directory being on the
# import path (and/or on the module reading files relative to it) — confirm.
os.chdir(f"{__ORIG_WD__}/../data_collectors/")
try:
    from ncbi_genome import NCBIGenome
finally:
    # Always switch back, even when the import fails; otherwise a re-run of
    # this cell would capture the wrong directory in __ORIG_WD__.
    os.chdir(__ORIG_WD__)



In [None]:
# Collect the locally available genomes: one (taxon, accessions) pair per taxon.
ncbi_genome = NCBIGenome()
taxa = ncbi_genome.getLocalTaxa()

dataset = [(taxon, ncbi_genome.getLocalAccessions(taxon)) for taxon in taxa]

# Fraction of the data assigned to each split (80% / 10% / 10%).
portions = {
    DatasetName.trainset.name: 0.8,
    DatasetName.validset.name: 0.1,
    DatasetName.testset.name: 0.1
}

# Try to create the "ncbi_bacteria" model and its datasets; if that fails for
# any reason, fall back to loading the stored model of the same name.
# The failure reason is printed so that an unexpected error (as opposed to
# "already created") does not go unnoticed.
try:
    model = Model("ncbi_bacteria")
    model.create_datasets(dataset, portions)
except Exception as exc:
    print(f"Could not create 'ncbi_bacteria' ({exc!r}); loading the stored model instead")
    model = load_model("ncbi_bacteria")


In [None]:
# Register the ML model "vit.3" of type "vit" with its hyper-parameters.
# Any failure is tolerated so the notebook can be re-run, but the actual
# exception is shown: the catch-all also covers errors unrelated to the
# model already being registered.
try:
    model.add_ml_model("vit", "vit.3", hps={
        "encoder_repeats": 3,
        "batch_size": 256,
        "regularizer": {
            "name": "l2",
            "params": {
                "l2": 0.001
            }
        }
    })
except Exception as exc:
    print(f"Model not added, assuming it already exists: {exc!r}")

In [None]:
# model.transfer(transfer_to_name="vit.3", transfer_from_name="vit.2")

In [None]:
# Switch the L2 regularizer of "vit.3" to a strength of 0.00001, then train
# that model for 500 epochs.
weaker_l2 = {"name": "l2", "params": {"l2": 0.00001}}
model.change_hps("vit.3", {"regularizer": weaker_l2})
model.train("vit.3", epochs=500)

In [None]:
# Evaluate the model repeatedly at several coverages and cache the accuracies
# in results.json, so a re-run only computes the coverages still missing.
import json

results_path = "results.json"

# Load previously computed results, if the cache file exists.
if os.path.isfile(results_path):
    with open(results_path, "r") as f:
        cached_results = json.load(f)
else:
    cached_results = {}

# JSON object keys are always strings: convert keys and values back to floats.
results = {
    float(key): [float(x) for x in values]
    for key, values in cached_results.items()
}


number_of_experiments = 100
for coverage in [0.01, 0.02, 0.03, 0.04, 0.05, 0.08, 0.1, 1, 5, 30]:
    # Always key by float, so freshly computed entries (1, 5, 30) look the same
    # as entries reloaded from JSON (1.0, 5.0, 30.0) in later cells.
    coverage_key = float(coverage)
    if coverage_key in results:
        continue
    model.set_coverage(coverage)
    accuracies = []
    for i in range(number_of_experiments):
        print(f"======> Experiment {i+1}/{number_of_experiments} for coverage {coverage} <======")
        # NOTE(review): this evaluates "vit.2" although the notebook trains
        # "vit.3" — confirm this is the intended model.
        # Element [1] of the evaluate() result is scaled to a percentage.
        accuracies.append((model.evaluate("vit.2")[1] * 100))
    # Store the list only once it is complete, then checkpoint to disk so an
    # interrupted run keeps every finished coverage.
    results[coverage_key] = accuracies
    with open(results_path, "w") as f:
        json.dump(results, f)

# write results to json file (also covers the case where nothing new was computed)
with open(results_path, "w") as f:
    json.dump(results, f)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Configure seaborn once, in a single call. `sns.set(...)` is an alias of
# `sns.set_theme(...)`, so separate calls re-apply the default style
# ("darkgrid") and silently undo an earlier style="whitegrid".
sns.set_theme(style="whitegrid", font_scale=1.5, rc={'figure.figsize': (11.7, 8.27)})

# One histogram (with KDE) of the accuracies per coverage, saved as a PNG.
for coverage, cov_results in results.items():
    fig, ax = plt.subplots()

    sns.histplot(cov_results, kde=True, stat="density", linewidth=1.2, edgecolor="black", color="g", ax=ax)

    mean = np.mean(cov_results)
    std_dev = np.std(cov_results)

    # Add a vertical line for the mean
    ax.axvline(mean, color='r', linestyle='--', label=f"mean = {mean.round(3)}")

    # Add vertical lines for one std dev away from the mean
    # (only the first one carries a legend label)
    ax.axvline(mean - std_dev, color='b', linestyle='--', label=f"std   = {std_dev.round(3)}")
    ax.axvline(mean + std_dev, color='b', linestyle='--')

    ax.set_xlabel("Accuracy")
    ax.set_ylabel("Density")
    ax.set_title(f"Accuracy Distribution of VIT Model with coverage of {coverage}x")
    ax.legend()

    fig.savefig(f"vit.2-{coverage}x.png")
    # Release the figure so figures do not accumulate across iterations.
    plt.close(fig)


In [None]:
# Plot mean accuracy vs. coverage, only for coverage <= 0.1.
# The figure is scaled up, with thicker lines and a bigger font, for readability.
# A single set_theme call is used because `sns.set(...)` is an alias of
# `sns.set_theme(...)`: separate calls would reset the style to the default.
sns.set_theme(style="whitegrid", font_scale=1.5, rc={'figure.figsize': (11.7, 8.27)})

# Take the x values from `results` itself (sorted) instead of a hard-coded list,
# so every mean is paired with its own coverage regardless of the order or
# number of entries in the dict.
low_coverages = sorted(coverage for coverage in results if coverage <= 0.1)
mean_accuracies = [np.mean(results[coverage]) for coverage in low_coverages]

fig, ax = plt.subplots()
sns.lineplot(x=low_coverages, y=mean_accuracies, marker="o", color="g", linewidth=4, markersize=10, ax=ax)
ax.set_xlabel("Coverage")
ax.set_ylabel("Mean Accuracy")
ax.set_title("Mean Accuracy vs. Coverage of VIT Model")
fig.savefig("vit.2-mean-accuracy-vs-coverage.png")
# Release the figure once it has been written to disk.
plt.close(fig)


In [15]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Build an illustrative probability vector of length 20: draw a Dirichlet
# sample (sums to 1), shrink it to a total of 0.2, then put the remaining 0.8
# on index 4 so the vector sums to 1 again with one dominant entry.
# A seeded local generator makes the saved figure reproducible.
rng = np.random.default_rng(0)
vector = rng.dirichlet(np.ones(20))
vector /= 5
vector[4] += 0.8


# Draw the vector as a bar chart.
# A single set_theme call is used because `sns.set(...)` is an alias of
# `sns.set_theme(...)`: separate calls would reset the style to the default.
sns.set_theme(style="whitegrid", font_scale=1.5, rc={'figure.figsize': (11.7, 8.27)})

fig, ax = plt.subplots()
sns.barplot(x=list(range(20)), y=vector, color="g", ax=ax)

# scale y axis to have a max of 1 in the plot
ax.set_ylim(0, 1)

ax.set_xlabel("Lineages")
ax.set_ylabel("probability")
ax.set_title("Network output")
fig.savefig("random-vector2.png")
# Release the figure once it has been written to disk.
plt.close(fig)

<Figure size 1170x827 with 0 Axes>

<Figure size 1170x827 with 0 Axes>