In [1]:
import os
import functools
from typing import *
from glob import glob
import pickle

import torch
import numpy as np
import pandas as pd

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data.dataloader import DataLoader
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split, StratifiedKFold
#from sklearn.manifold import TSNE
from MulticoreTSNE import MulticoreTSNE as TSNE
from sklearn.cluster import KMeans
from scipy.spatial import cKDTree
import matplotlib.pyplot as plt
import seaborn as sns
from yellowbrick.features import Manifold
from yellowbrick.cluster import KElbowVisualizer, InterclusterDistance, SilhouetteVisualizer
from sklearn.preprocessing import normalize
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
from altlabs.index_mapping import create_index_mapping
from altlabs.dataset import (
    noop,
    random_roll,
    _convert_to_indices,
    SoftmaxDataset,
    limit_sequence_size,
    get_random_piece,
    FactorizationDataset,
)
from altlabs.torch.data import FasterBatchSampler, NoAutoCollationDataLoader
from altlabs.utils import Pipeline
from pytorch_lightning import seed_everything
from sklearn.metrics import top_k_accuracy_score
%matplotlib inline

In [2]:
softmax_model_paths = list(sorted(glob("output/56836160-1c29-4909-814d-b37d77e86ffc/tensorboard_logs_csv_logs/*/checkpoints/*.ckpt")))
lab_index_mapping_paths = list(sorted(glob("output/56836160-1c29-4909-814d-b37d77e86ffc/tensorboard_logs_csv_logs/*/lab_index_mapping.pkl")))
from altlabs.model.conv1d_attn_softmax_classifier import Conv1dAttnSoftmaxClassifier, ModelConfig





  self._sequences = np.array([np.array(s) for s in self._sequences])


Softmax model score: 0.8813320412298475


  self._sequences = np.array([np.array(s) for s in self._sequences])


Softmax model score: 0.8947229319002731


In [3]:
DATA_PATH = "../data/"
device = torch.device("cuda:0")

train_values_df = pd.read_csv(f"{DATA_PATH}train_values_grouped.csv")
train_labels_df = pd.read_csv(f"{DATA_PATH}train_labels.csv")
format_df = pd.read_csv(f"{DATA_PATH}format.csv")
test_values_df = pd.read_csv(f"{DATA_PATH}test_values.csv")
test_set = pd.read_csv(f"{DATA_PATH}test_labels.csv")
pub_id = pd.read_csv(f"{DATA_PATH}pubsubidx.csv")

In [4]:
pub_index = pub_id[pub_id.public==True].index
private_index = pub_id[pub_id.public==False].index

In [7]:
transform_sequence_fn = Pipeline(
    random_roll,
    functools.partial(limit_sequence_size, limit=1000))

In [8]:
seed_everything(350)

def predict_dataset(model: Conv1dAttnSoftmaxClassifier, dataset: FactorizationDataset, tta_steps: int) -> np.ndarray:
    batch_sampler = FasterBatchSampler(
        dataset, 32, shuffle=False,
    )

    model.to(device)
    predictions: List[List[float]] = []
    with torch.no_grad():
        for indices in batch_sampler:
            if tta_steps > 0:
                tta_predictions = []
                for i in range(tta_steps):
                    batch = dataset[indices]
                    if isinstance(batch[0], tuple):
                        (sequences, extra_inputs, _) = batch[
                            0
                        ]  # type: (torch.Tensor, torch.Tensor, torch.Tensor)
                    else:
                        (sequences, extra_inputs) = batch
                    outputs = torch.nn.functional.softmax(model(
                        sequences.to(device), extra_inputs.to(device)
                    )).tolist()
                    tta_predictions.append(np.array(outputs))
                predictions.extend(
                    np.mean(np.array(tta_predictions), axis=0).tolist()
                )
            else:
                batch = dataset[indices]
                if isinstance(batch[0], tuple):
                    (sequences, extra_inputs, _) = batch[
                        0
                    ]  # type: (torch.Tensor, torch.Tensor, torch.Tensor)
                else:
                    (sequences, extra_inputs) = batch
                outputs = torch.nn.functional.softmax(model(
                    sequences.to(device), extra_inputs.to(device)
                )).tolist()
                predictions.extend(outputs)

    return np.array(predictions)

sequence_index_mapping = create_index_mapping(
    "ATGC", include_unkown=True, include_none=False,
)
sequence_index_mapping["N"] = 0
input_columns = train_values_df.drop(columns=["sequence_id", "groups", "output"]).columns
output_columns = train_labels_df.drop(columns=["sequence_id"]).columns
occurrences = np.sum(train_labels_df[output_columns].values, axis=0)
minimum_occurrences = 1
filtered_out_output_columns = output_columns[
    occurrences < minimum_occurrences
]
output_columns = output_columns.drop(filtered_out_output_columns)

fold_output = []
for softmax_model_path in softmax_model_paths:
    model = Conv1dAttnSoftmaxClassifier.load_from_checkpoint(softmax_model_path)
    
    model.model_config.positional_encoding = True
    
    dataset = SoftmaxDataset(
        test_values_df,
        sequence_index_mapping,
        input_columns,
        transform_sequence_fn=transform_sequence_fn,
        test=True,
        bpe=True,
    )
    
    outputs = predict_dataset(model, dataset, 10)
    fold_output.append(outputs)
final_outputs = np.mean(fold_output, axis=0)


df = pd.DataFrame(
    data=final_outputs, columns=output_columns, index=test_values_df["sequence_id"]
)

for column in filtered_out_output_columns:
    df[column] = 0.0
df = df[format_df.drop(columns=["sequence_id"]).columns]


pub_best_sub = df.reset_index()[df.reset_index().index.isin(pub_index)]
private_best_sub = df.reset_index()[df.reset_index().index.isin(private_index)]
private_test = test_set[test_set.index.isin(private_index)]
public_test = test_set[test_set.index.isin(pub_index)]
private_labels = private_test.drop(columns="sequence_id").values.argmax(axis=1)
public_labels = public_test.drop(columns="sequence_id").values.argmax(axis=1)

score = top_k_accuracy_score(private_labels, private_best_sub.drop(columns="sequence_id").values, k=10, labels=range(0,1314))

display(f"Softmax model score: {score}")



In [None]:
import os
RESULTS_PATH = "../results/"

if os.path.exists(f"{RESULTS_PATH}results.csv"):
    result_df = pd.read_csv(f"{RESULTS_PATH}results.csv")
else:
    result_df = pd.DataFrame([], columns=["Model", "Top 10 Score"])
    

result_df = result_df.append({"Model": "Softmax model", "Top 10 Score": score}, ignore_index=True)

result_df.to_csv(f"{RESULTS_PATH}results.csv", index=False)