In [1]:
import os
import functools
from typing import *
from glob import glob
import pickle

import torch
import numpy as np
import pandas as pd

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data.dataloader import DataLoader
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split, StratifiedKFold
#from sklearn.manifold import TSNE
from MulticoreTSNE import MulticoreTSNE as TSNE
from sklearn.cluster import KMeans
from scipy.spatial import cKDTree
import matplotlib.pyplot as plt
import seaborn as sns
from yellowbrick.features import Manifold
from yellowbrick.cluster import KElbowVisualizer, InterclusterDistance, SilhouetteVisualizer
from sklearn.preprocessing import normalize
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
from altlabs.index_mapping import create_index_mapping
from altlabs.dataset import (
    noop,
    random_roll,
    SoftmaxDataset,
    limit_sequence_size,
    FactorizationDataset,
)
from altlabs.torch.data import FasterBatchSampler, NoAutoCollationDataLoader
from altlabs.utils import Pipeline
from pytorch_lightning import seed_everything
from sklearn.metrics import top_k_accuracy_score
%matplotlib inline

In [2]:
triplet_model_paths = list(sorted(glob("output/daaefeed-3f3f-43a0-b7c2-2abf04e31e72/tensorboard_logs_csv_logs/*/checkpoints/*.ckpt")))
lab_index_mapping_paths = list(sorted(glob("output/daaefeed-3f3f-43a0-b7c2-2abf04e31e72/tensorboard_logs_csv_logs/*/lab_index_mapping.pkl")))
from altlabs.model.conv1d_triplet_classifier import Conv1dTripletClassifier, ModelConfig





  self._sequences = np.array([np.array(s) for s in self._sequences])
  self._sequences = np.array([np.array(s) for s in self._sequences])


In [3]:
lab_index_mapping_paths 

['output/daaefeed-3f3f-43a0-b7c2-2abf04e31e72/tensorboard_logs_csv_logs/0_0/lab_index_mapping.pkl',
 'output/daaefeed-3f3f-43a0-b7c2-2abf04e31e72/tensorboard_logs_csv_logs/1_1/lab_index_mapping.pkl',
 'output/daaefeed-3f3f-43a0-b7c2-2abf04e31e72/tensorboard_logs_csv_logs/2_2/lab_index_mapping.pkl',
 'output/daaefeed-3f3f-43a0-b7c2-2abf04e31e72/tensorboard_logs_csv_logs/3_3/lab_index_mapping.pkl']

In [10]:
DATA_PATH = "../data/"
RESULTS_PATH = "../results/"

In [5]:
device = torch.device("cuda:0")

train_values_df = pd.read_csv(f"{DATA_PATH}train_values_grouped.csv")
format_df = pd.read_csv(f"{DATA_PATH}format.csv")
test_values_df = pd.read_csv(f"{DATA_PATH}test_values.csv")
test_set = pd.read_csv(f"{DATA_PATH}test_labels.csv")
pub_id = pd.read_csv(f"{DATA_PATH}pubsubidx.csv")

In [6]:
from datetime import datetime
time_init = datetime.now()

pub_index = pub_id[pub_id.public==True].index
private_index = pub_id[pub_id.public==False].index
display(datetime.now() - time_init)


datetime.timedelta(microseconds=4730)

In [7]:

time_init = datetime.now()
seed_everything(350)

def predict_dataset(model: Conv1dTripletClassifier, dataset: FactorizationDataset, tta_steps: int) -> np.ndarray:
    batch_sampler = FasterBatchSampler(
        dataset, 32, shuffle=False,
    )

    model.to(device)

    predictions: List[List[float]] = []
    with torch.no_grad():
        for indices in batch_sampler:
            if tta_steps > 0:
                tta_predictions = []
                for i in range(tta_steps):
                    batch = dataset[indices]
                    if isinstance(batch[0], tuple):
                        (sequences, extra_inputs, _) = batch[
                            0
                        ]  # type: (torch.Tensor, torch.Tensor, torch.Tensor)
                    else:
                        (sequences, extra_inputs) = batch
                    outputs = model.predict_lab_scores(
                        sequences.to(device), extra_inputs.to(device)
                    ).tolist()
                    tta_predictions.append(np.array(outputs))
                predictions.extend(
                    np.mean(np.array(tta_predictions), axis=0).tolist()
                )
            else:
                batch = dataset[indices]
                if isinstance(batch[0], tuple):
                    (sequences, extra_inputs, _) = batch[
                        0
                    ]  # type: (torch.Tensor, torch.Tensor, torch.Tensor)
                else:
                    (sequences, extra_inputs) = batch
                outputs = model.predict_lab_scores(
                    sequences.to(device), extra_inputs.to(device)
                ).tolist()
                predictions.extend(outputs)

    return np.array(predictions)

sequence_index_mapping = create_index_mapping(
"ATGC", include_unkown=True, include_none=False,
)
sequence_index_mapping["N"] = 0
input_columns = train_values_df.drop(columns=["sequence_id", "groups", "output"]).columns

fold_output = []
for triplet_model_path, lab_index_mapping_path in zip(triplet_model_paths, lab_index_mapping_paths):
    model = Conv1dTripletClassifier.load_from_checkpoint(triplet_model_path)
    with open(lab_index_mapping_path, "rb") as f:
        lab_index_mapping = pickle.load(f)
    dataset = FactorizationDataset(
        test_values_df,
        sequence_index_mapping,
        lab_index_mapping,
        input_columns,
        lab_column="output",
        negative_proportion=0.0,
        transform_sequence_fn=random_roll,
        test=True,
        bpe=True,
    )

    outputs = predict_dataset(model, dataset, 10)
    fold_output.append(outputs)
final_outputs = np.mean(fold_output, axis=0)

for lab in format_df.columns[1:]:
    lab_index = lab_index_mapping[lab]
    format_df[lab] = final_outputs[:, lab_index]

format_df = format_df.round(6)

display(datetime.now() - time_init)
    

datetime.timedelta(seconds=1649, microseconds=658815)

In [8]:
pub_best_sub = format_df[format_df.index.isin(pub_index)]
private_best_sub = format_df[format_df.index.isin(private_index)]
private_test = test_set[test_set.index.isin(private_index)]
public_test = test_set[test_set.index.isin(pub_index)]
private_labels = private_test.drop(columns="sequence_id").values.argmax(axis=1)
public_labels = public_test.drop(columns="sequence_id").values.argmax(axis=1)

top_10_score_private = top_k_accuracy_score(private_labels, private_best_sub.drop(columns="sequence_id").values, k=10, labels=range(0,1314))

top_1_score_private = top_k_accuracy_score(private_labels, private_best_sub.drop(columns="sequence_id").values, k=1, labels=range(0,1314))

top_10_score_public = top_k_accuracy_score(public_labels, pub_best_sub.drop(columns="sequence_id").values, k=10, labels=range(0,1314))

top_1_score_public = top_k_accuracy_score(public_labels, pub_best_sub.drop(columns="sequence_id").values, k=1, labels=range(0,1314))

top_10_score_total = top_k_accuracy_score(test_set.drop(columns="sequence_id").values.argmax(axis=1), format_df.drop(columns="sequence_id").values, k=10, labels=range(0,1314))

top_1_score_total = top_k_accuracy_score(test_set.drop(columns="sequence_id").values.argmax(axis=1), format_df.drop(columns="sequence_id").values, k=1, labels=range(0,1314))



display(f"Triplet model top-10 private score: {top_10_score_private}")

display(f"Triplet model top-1 private score: {top_1_score_private}")


display(f"Triplet model top-10 public score: {top_10_score_public}")

display(f"Triplet model top-1 public score: {top_1_score_public}")

display(f"Triplet model top-10 total score: {top_10_score_total}")

display(f"Triplet model top-1 total score: {top_1_score_total}")



'Triplet model top-10 private score: 0.9039732182186592'

'Triplet model top-1 private score: 0.7423134525592459'

'Triplet model top-10 public score: 0.918821165438714'

'Triplet model top-1 public score: 0.7815137307434695'

'Triplet model top-10 total score: 0.9098639455782312'

'Triplet model top-1 total score: 0.7578656462585034'

In [11]:
format_df.to_csv(f"{RESULTS_PATH}triplet_predictions.csv", index=False)

Model trained without UNKNOWN Category

In [12]:
triplet_model_unk_paths = list(sorted(glob("output/1d78ec4a-0968-48e2-8747-e331132efa60/tensorboard_logs_csv_logs/*/checkpoints/*.ckpt")))
lab_index_mapping_unk_paths = list(sorted(glob("output/1d78ec4a-0968-48e2-8747-e331132efa60/tensorboard_logs_csv_logs/*/lab_index_mapping.pkl")))

In [13]:
triplet_model_unk_paths

['output/1d78ec4a-0968-48e2-8747-e331132efa60/tensorboard_logs_csv_logs/0_0/checkpoints/epoch=184.ckpt',
 'output/1d78ec4a-0968-48e2-8747-e331132efa60/tensorboard_logs_csv_logs/1_1/checkpoints/epoch=195.ckpt',
 'output/1d78ec4a-0968-48e2-8747-e331132efa60/tensorboard_logs_csv_logs/2_2/checkpoints/epoch=195.ckpt',
 'output/1d78ec4a-0968-48e2-8747-e331132efa60/tensorboard_logs_csv_logs/3_3/checkpoints/epoch=189.ckpt']

In [14]:
seed_everything(350)
def predict_dataset(model: Conv1dTripletClassifier, dataset: FactorizationDataset, tta_steps: int) -> np.ndarray:
    batch_sampler = FasterBatchSampler(
        dataset, 32, shuffle=False,
    )

    model.to(device)

    predictions: List[List[float]] = []
    with torch.no_grad():
        for indices in batch_sampler:
            if tta_steps > 0:
                tta_predictions = []
                for i in range(tta_steps):
                    batch = dataset[indices]
                    if isinstance(batch[0], tuple):
                        (sequences, extra_inputs, _) = batch[
                            0
                        ]  # type: (torch.Tensor, torch.Tensor, torch.Tensor)
                    else:
                        (sequences, extra_inputs) = batch
                    outputs = model.predict_lab_scores(
                        sequences.to(device), extra_inputs.to(device)
                    ).tolist()
                    tta_predictions.append(np.array(outputs))
                predictions.extend(
                    np.mean(np.array(tta_predictions), axis=0).tolist()
                )
            else:
                batch = dataset[indices]
                if isinstance(batch[0], tuple):
                    (sequences, extra_inputs, _) = batch[
                        0
                    ]  # type: (torch.Tensor, torch.Tensor, torch.Tensor)
                else:
                    (sequences, extra_inputs) = batch
                outputs = model.predict_lab_scores(
                    sequences.to(device), extra_inputs.to(device)
                ).tolist()
                predictions.extend(outputs)

    return np.array(predictions)

sequence_index_mapping = create_index_mapping(
"ATGC", include_unkown=True, include_none=False,
)
sequence_index_mapping["N"] = 0
input_columns = train_values_df.drop(columns=["sequence_id", "groups", "output"]).columns

fold_output = []
for triplet_model_path, lab_index_mapping_path in zip(triplet_model_unk_paths, lab_index_mapping_unk_paths):
    model = Conv1dTripletClassifier.load_from_checkpoint(triplet_model_path)
    with open(lab_index_mapping_path, "rb") as f:
        lab_index_mapping = pickle.load(f)
    dataset = FactorizationDataset(
        test_values_df,
        sequence_index_mapping,
        lab_index_mapping,
        input_columns,
        lab_column="output",
        negative_proportion=0.0,
        transform_sequence_fn=random_roll,
        test=True,
        bpe=True,
    )

    outputs = predict_dataset(model, dataset, 10)
    fold_output.append(outputs)



    

In [15]:
final_outputs = np.mean(fold_output, axis=0)

for lab in format_df.columns[1:]:
    lab_index = lab_index_mapping[lab]
    format_df[lab] = final_outputs[:, lab_index]

format_df = format_df.round(6)

In [16]:
pub_best_sub = format_df[format_df.index.isin(pub_index)]
private_best_sub = format_df[format_df.index.isin(private_index)]
private_test = test_set[test_set.index.isin(private_index)]
public_test = test_set[test_set.index.isin(pub_index)]
private_labels = private_test.drop(columns="sequence_id").values.argmax(axis=1)
public_labels = public_test.drop(columns="sequence_id").values.argmax(axis=1)


unk_eng_plasmids = test_set[test_set["I7FXTVDP"] == 1.0]["sequence_id"].tolist()
test_set_no_unk = test_set[~test_set["sequence_id"].isin(unk_eng_plasmids)]
format_df_no_unk = format_df[~format_df["sequence_id"].isin(unk_eng_plasmids)]

pub_best_sub_no_unk = format_df_no_unk[format_df_no_unk.index.isin(pub_index)]
private_best_sub_no_unk = format_df_no_unk[format_df_no_unk.index.isin(private_index)]
private_test_no_unk = test_set_no_unk[test_set_no_unk.index.isin(private_index)]
public_test_no_unk = test_set_no_unk[test_set_no_unk.index.isin(pub_index)]
private_labels_no_unk = private_test_no_unk.drop(columns="sequence_id").values.argmax(axis=1)
public_labels_no_unk = public_test_no_unk.drop(columns="sequence_id").values.argmax(axis=1)



top_10_score_private_no_unk = top_k_accuracy_score(private_labels_no_unk, private_best_sub_no_unk.drop(columns="sequence_id").values, k=10, labels=range(0,1314))

top_1_score_private_no_unk = top_k_accuracy_score(private_labels_no_unk, private_best_sub_no_unk.drop(columns="sequence_id").values, k=1, labels=range(0,1314))

top_10_score_public_no_unk = top_k_accuracy_score(public_labels_no_unk, pub_best_sub_no_unk.drop(columns="sequence_id").values, k=10, labels=range(0,1314))

top_1_score_public_no_unk = top_k_accuracy_score(public_labels_no_unk, pub_best_sub_no_unk.drop(columns="sequence_id").values, k=1, labels=range(0,1314))



top_10_score_total_no_unk = top_k_accuracy_score(test_set_no_unk.drop(columns="sequence_id").values.argmax(axis=1), format_df_no_unk.drop(columns="sequence_id").values, k=10, labels=range(0,1314))
top_1_score_total_no_unk = top_k_accuracy_score(test_set_no_unk.drop(columns="sequence_id").values.argmax(axis=1), format_df_no_unk.drop(columns="sequence_id").values, k=1, labels=range(0,1314))


display(f"Triplet model top-10 private score (no UNK): {top_10_score_private_no_unk}")

display(f"Triplet model top-1 private score (no UNK): {top_1_score_private_no_unk}")


display(f"Triplet model top-10 public score (no UNK): {top_10_score_public_no_unk}")

display(f"Triplet model top-1 public score (no UNK): {top_1_score_public_no_unk}")

display(f"Triplet model top-10 total score (no UNK): {top_10_score_total_no_unk}")

display(f"Triplet model top-1 total score (no UNK): {top_1_score_total_no_unk}")


'Triplet model top-10 private score (no UNK): 0.9044306812767985'

'Triplet model top-1 private score (no UNK): 0.7824678418294426'

'Triplet model top-10 public score (no UNK): 0.9204719583205639'

'Triplet model top-1 public score (no UNK): 0.8236285626723874'

'Triplet model top-10 total score (no UNK): 0.9105810469420128'

'Triplet model top-1 total score (no UNK): 0.7982492215498502'

In [17]:
import os
RESULTS_PATH = "../results/"

if os.path.exists(f"{RESULTS_PATH}results.csv"):
    result_df = pd.read_csv(f"{RESULTS_PATH}results.csv")
else:
    result_df = pd.DataFrame([], columns=["Model", "Top 10 Score", "Top 1 Score", "Top 10 New Data Score", "Top 1 New Data Score", "Top 10 Total Score", "Top 1 Total Score"])
    

result_df = result_df.append({"Model": "Metric Learning model", 
                              "Top 10 Score": top_10_score_private, "Top 1 Score": top_1_score_private, 
                              "Top 10 New Data Score": top_10_score_public, "Top 1 New Data Score": top_1_score_public,
                             "Top 10 Total Score": top_10_score_total, "Top 1 Total Score": top_1_score_total}, ignore_index=True)

result_df = result_df.append({"Model": "Metric Learning model (no UNK)", 
                              "Top 10 Score": top_10_score_private_no_unk, "Top 1 Score": top_1_score_private_no_unk, 
                              "Top 10 New Data Score": top_10_score_public_no_unk, "Top 1 New Data Score": top_1_score_public_no_unk,
                             "Top 10 Total Score": top_10_score_total_no_unk, "Top 1 Total Score": top_1_score_total_no_unk}, ignore_index=True)

result_df.to_csv(f"{RESULTS_PATH}results.csv", index=False)