# Introduction to Machine Learning for  Pharmacology
## Task: Predict Lab of Origin

### 1. Import packages

In [1]:
import os
import functools
from typing import *
from glob import glob
import pickle

import torch
import numpy as np
import pandas as pd

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data.dataloader import DataLoader
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split, StratifiedKFold
#from sklearn.manifold import TSNE
from MulticoreTSNE import MulticoreTSNE as TSNE
from sklearn.cluster import KMeans
from scipy.spatial import cKDTree
import matplotlib.pyplot as plt
import seaborn as sns
from yellowbrick.features import Manifold
from yellowbrick.cluster import KElbowVisualizer, InterclusterDistance, SilhouetteVisualizer
from sklearn.preprocessing import normalize
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
from altlabs.index_mapping import create_index_mapping
from altlabs.dataset import (
    noop,
    random_roll,
    _convert_to_indices,
    SoftmaxDataset,
    limit_sequence_size,
    get_random_piece,
    FactorizationDataset,
)
from altlabs.torch.data import FasterBatchSampler, NoAutoCollationDataLoader
from altlabs.utils import Pipeline
from pytorch_lightning import seed_everything
from sklearn.metrics import top_k_accuracy_score

from sklearn.metrics import confusion_matrix
# %matplotlib inline

### 2. Load model(s)
- CNN + softmax
- CNN + Triplet

In [2]:
softmax_model_paths = list(sorted(glob("../data/output/56836160-1c29-4909-814d-b37d77e86ffc/tensorboard_logs_csv_logs/*/checkpoints/*.ckpt")))
lab_index_mapping_paths = list(sorted(glob("../data/output/56836160-1c29-4909-814d-b37d77e86ffc/tensorboard_logs_csv_logs/*/lab_index_mapping.pkl")))
# from altlabs.model.conv1d_attn_softmax_classifier import Conv1dAttnSoftmaxClassifier, ModelConfig
# for loading the mode, we need to move .au directory to the root path of git which is capsule-3003146
from altlabs.model.conv1d_attn_softmax_classifier import Conv1dAttnSoftmaxClassifier, ModelConfig



  self._sequences = np.array([np.array(s) for s in self._sequences])


CPU times: user 21.2 s, sys: 180 ms, total: 21.4 s
Wall time: 21.5 s


### 3. Set data and results paths and load data

In [3]:
DATA_PATH = "../data/"
RESULTS_PATH = "../results/"

device = torch.device("cpu")

train_values_df = pd.read_csv(f"{DATA_PATH}train_values_grouped.csv")
train_labels_df = pd.read_csv(f"{DATA_PATH}train_labels.csv")
format_df = pd.read_csv(f"{DATA_PATH}format.csv")
test_values_df = pd.read_csv(f"{DATA_PATH}test_values.csv")
test_set = pd.read_csv(f"{DATA_PATH}test_labels.csv")
pub_id = pd.read_csv(f"{DATA_PATH}pubsubidx.csv")

pub_index = pub_id[pub_id.public==True].index
private_index = pub_id[pub_id.public==False].index

# sample the data for prediction
sample_size = 20
# sample_frac = 0.01
test_values_df_sampled = test_values_df.sample(sample_size,random_state=1)
test_set_sampled = test_set.iloc[test_values_df_sampled.index]

In [4]:
# show input
test_values_df_sampled

Unnamed: 0,sequence_id,sequence,bacterial_resistance_ampicillin,bacterial_resistance_chloramphenicol,bacterial_resistance_kanamycin,bacterial_resistance_other,bacterial_resistance_spectinomycin,copy_number_high_copy,copy_number_low_copy,copy_number_unknown,...,species_budding_yeast,species_fly,species_human,species_mouse,species_mustard_weed,species_nematode,species_other,species_rat,species_synthetic,species_zebrafish
8562,763Q6,CCTTCGGGCTTGTTAGCAGCCGGATCTCAGTGGTGGTGGTGGTGGT...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9594,H91R5,CTCTCTGGCTAACTAGAGAACCCACTGCTTACTGGCTTATCGAAAT...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7025,1LPO6,CTAAATTGTAAGCGTTAATATTTTGTTAAAATTCGCGTTAAATTTT...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2012,7D1UE,AGGTGAGCCAGTGAGTTGATTGCAGTCCAGTTACGCTGGAGTCTGA...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
9692,08XBW,GCGCCACTTCTAAATAAGCGAATTTCTTATGATTTATGATTTTTAT...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
15071,CG16U,TAATGTGAGTTAGCTCACTCATTAGGCACCCCAGGCTTTACACTTT...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18238,4YXP2,GGCAGTTCCCTACTCTCGCGTTAACGCTAGCATGGATGTTTTCCCA...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8213,7FJ8M,AATAAATTTCCTTTATTAGCCAGAAGTCAGATGCTCAAGGGGCTTC...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
620,M554J,CCGTCAGATCCGCTAGCGCTACCGGACTCAGATCTGGTACCCCTTG...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18399,MQH8S,TAGAGCTAGAAATAGCAAGTTAAAATAAGGCTAGTCCGTTATCAAC...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 4. Data preprocesse
delete labs if there are less than 1 sequence belonging to them in the training set

In [5]:
transform_sequence_fn = Pipeline(
    random_roll,
    functools.partial(limit_sequence_size, limit=1000))


In [37]:
sequence_index_mapping = create_index_mapping(
    "ATGC", include_unkown=True, include_none=False,
)
sequence_index_mapping["N"] = 0

input_columns = train_values_df.drop(columns=["sequence_id", "groups", "output"]).columns
output_columns = train_labels_df.drop(columns=["sequence_id"]).columns
occurrences = np.sum(train_labels_df[output_columns].values, axis=0)
minimum_occurrences = 1
filtered_out_output_columns = output_columns[
    occurrences < minimum_occurrences
]
output_columns = output_columns.drop(filtered_out_output_columns)

In [39]:
# show filled out labs (you can increase the minimum_occurences)
# thinking about achieving it in an interactive way
fooc = pd.DataFrame(filtered_out_output_columns)
fooc.rename(columns={0:'filtered_out_labs'}, inplace=True)
fooc

Unnamed: 0,filtered_out_labs


### 5. Inference

In [40]:
seed_everything(350)

def predict_dataset(model: Conv1dAttnSoftmaxClassifier, dataset: FactorizationDataset, tta_steps: int) -> np.ndarray:
    batch_sampler = FasterBatchSampler(
        dataset, 32, shuffle=False,
    )

    model.to(device)
    predictions: List[List[float]] = []
    with torch.no_grad():
        for indices in batch_sampler:
            if tta_steps > 0:
                tta_predictions = []
                for i in range(tta_steps):
                    batch = dataset[indices]
                    if isinstance(batch[0], tuple):
                        (sequences, extra_inputs, _) = batch[
                            0
                        ]  # type: (torch.Tensor, torch.Tensor, torch.Tensor)
                    else:
                        (sequences, extra_inputs) = batch
                    outputs = torch.nn.functional.softmax(model(
                        sequences.to(device), extra_inputs.to(device)
                    )).tolist()
                    tta_predictions.append(np.array(outputs))
                predictions.extend(
                    np.mean(np.array(tta_predictions), axis=0).tolist()
                )
            else:
                batch = dataset[indices]
                if isinstance(batch[0], tuple):
                    (sequences, extra_inputs, _) = batch[
                        0
                    ]  # type: (torch.Tensor, torch.Tensor, torch.Tensor)
                else:
                    (sequences, extra_inputs) = batch
                outputs = torch.nn.functional.softmax(model(
                    sequences.to(device), extra_inputs.to(device)
                )).tolist()
                predictions.extend(outputs)

    return np.array(predictions)

In [41]:
os.chdir('/cluster/home/yihliu/MLinPharma/capsule-3003146/data')

In [42]:
%%time

fold_output = []
for softmax_model_path in softmax_model_paths:
    model = Conv1dAttnSoftmaxClassifier.load_from_checkpoint(softmax_model_path)
    model.model_config.positional_encoding = True
    
    dataset = SoftmaxDataset(
        test_values_df_sampled, # only for the showcase, for the exact resulst replace it with test_values_df
        sequence_index_mapping,
        input_columns,
        transform_sequence_fn=transform_sequence_fn,
        test=True,
        bpe=True,
    )
    outputs = predict_dataset(model, dataset, 10)
    fold_output.append(outputs)
final_outputs = np.mean(fold_output, axis=0)


df = pd.DataFrame(
    data=final_outputs, columns=output_columns, index=test_values_df_sampled["sequence_id"]
)


for column in filtered_out_output_columns:
    df[column] = 0.0
df = df[format_df.drop(columns=["sequence_id"]).columns]

In [43]:
# generate softmax output for pub and private sub sets
df = df.reset_index()
df.index = test_values_df_sampled.index
pub_best_sub = df[df.index.isin(pub_index)]
private_best_sub = df[df.index.isin(private_index)]

In [44]:
df

Unnamed: 0,sequence_id,00Q4V31T,012VT4JK,028IO5W2,03GRNN7N,03Y3W51H,09MQV1TY,0A4AHRCT,0A9M05NC,0B9GCUVV,...,ZQNGGY33,ZSHS4VJZ,ZT1IP3T6,ZU6860XU,ZU6TVFFU,ZU75P59K,ZUI6TDWV,ZWFD8OHC,ZX06ZDZN,ZZJVE4HO
8562,763Q6,7.979069e-07,3.675521e-06,1.005988e-05,1.057913e-09,2.188058e-08,2.003484e-09,4.041081e-07,4.487104e-05,0.01564062,...,7.095188e-09,2.696536e-07,5.698084e-06,3.874287e-08,1.595496e-07,1.010311e-09,3.371859e-08,1.439377e-09,4.87208e-08,5.264684e-08
9594,H91R5,6.318677e-07,1.818296e-05,3.861178e-06,4.40578e-09,1.671208e-06,0.0001043394,5.51384e-05,6.960503e-06,3.120395e-07,...,1.186578e-07,2.978356e-06,2.222654e-06,1.647494e-06,3.371715e-06,3.072278e-09,1.336448e-07,5.312367e-09,0.0002790057,1.3136e-06
7025,1LPO6,0.001324063,1.796431e-05,1.02056e-05,1.66735e-08,8.271114e-05,1.85137e-09,1.667703e-07,7.329797e-05,7.386574e-05,...,2.809847e-06,3.824033e-06,4.11901e-07,6.303115e-08,1.007823e-05,2.838526e-09,0.0002661973,1.807503e-05,0.0001281278,4.924133e-08
2012,7D1UE,1.626151e-07,2.463091e-08,1.087462e-08,4.609434e-11,1.096995e-05,1.725652e-07,4.701513e-09,6.949667e-08,0.0007127944,...,1.073851e-08,4.728307e-06,5.706815e-08,6.395503e-10,0.0002369316,1.449478e-07,1.065736e-06,2.32028e-09,1.87529e-06,6.304308e-08
9692,08XBW,8.689943e-07,1.883846e-06,3.544023e-07,2.770014e-11,1.706118e-05,3.210669e-06,3.694428e-06,6.799489e-05,1.037957e-05,...,6.522609e-06,1.101537e-07,4.936313e-08,2.196971e-07,3.247159e-05,3.236089e-09,5.265978e-06,1.915174e-07,3.581877e-09,2.703873e-06
15071,CG16U,0.0006488553,0.002979651,0.0002338993,2.6141e-05,0.0002950619,2.689785e-07,1.130019e-07,0.0001237655,0.0001530813,...,9.963289e-05,1.215044e-05,0.0001946164,0.00071351,0.0001328416,7.727913e-08,0.0005883546,0.0002205955,0.0001709917,3.826671e-06
18238,4YXP2,3.949295e-06,3.121237e-06,4.620568e-07,2.926456e-06,2.276023e-06,7.261931e-09,1.323213e-08,1.236932e-06,9.750028e-05,...,0.0001013361,1.382394e-07,3.906809e-05,1.596386e-08,2.422226e-07,3.191004e-06,1.098435e-08,6.037899e-06,5.767361e-08,2.016064e-06
8213,7FJ8M,5.251613e-05,5.154832e-05,0.002743568,6.019946e-05,0.0001684182,8.228342e-07,2.853927e-08,0.0001049513,0.001879991,...,1.74281e-05,7.230655e-07,0.0002311679,1.029956e-06,1.893149e-07,1.277094e-08,2.825465e-06,5.380128e-05,4.22204e-06,1.447473e-07
620,M554J,3.279212e-09,3.075777e-06,2.009316e-09,0.0001076668,0.0003635616,1.076119e-07,1.189297e-08,3.461129e-09,1.597586e-05,...,2.254928e-07,3.3006e-06,0.0001539989,6.316188e-07,5.77249e-06,3.032196e-06,4.374311e-09,5.986251e-09,5.254447e-06,1.378214e-06
18399,MQH8S,9.133222e-05,1.673103e-05,4.680719e-05,4.905937e-05,5.950133e-05,4.967139e-08,2.247101e-07,0.0001163712,0.001068975,...,0.0005012733,1.043876e-06,0.0006470869,3.620759e-07,1.840537e-07,1.389077e-09,2.058312e-07,0.0008195628,3.422509e-07,1.688624e-05


### 6. Show and save results

In [None]:
# pub_best_sub = df.reset_index()[df.reset_index().index.isin(pub_index)]
# private_best_sub = df.reset_index()[df.reset_index().index.isin(private_index)]


# test_set has been encoded with one-hot encoding and give us the ground truth
private_test = test_set_sampled[test_set_sampled.index.isin(private_index)]
public_test = test_set_sampled[test_set_sampled.index.isin(pub_index)]
private_labels = private_test.drop(columns="sequence_id").values.argmax(axis=1)
public_labels = public_test.drop(columns="sequence_id").values.argmax(axis=1)

# TODO: private_best_sub and private_labels shape unmatch, pub_best_sub and public_labels shape unmatch

top_10_score_private = top_k_accuracy_score(private_labels, private_best_sub.drop(columns="sequence_id").values, k=10, labels=range(0,1314))

top_1_score_private = top_k_accuracy_score(private_labels, private_best_sub.drop(columns="sequence_id").values, k=1, labels=range(0,1314))

top_10_score_public = top_k_accuracy_score(public_labels, pub_best_sub.drop(columns="sequence_id").values, k=10, labels=range(0,1314))

top_1_score_public = top_k_accuracy_score(public_labels, pub_best_sub.drop(columns="sequence_id").values, k=1, labels=range(0,1314))

# top_10_score_total = top_k_accuracy_score(test_set.drop(columns="sequence_id").values.argmax(axis=1), df.reset_index().drop(columns="sequence_id").values, k=10, labels=range(0,1314))

# top_1_score_total = top_k_accuracy_score(test_set.drop(columns="sequence_id").values.argmax(axis=1), df.reset_index().drop(columns="sequence_id").values, k=1, labels=range(0,1314))

top_10_score_total = top_k_accuracy_score(test_set_sampled.drop(columns="sequence_id").values.argmax(axis=1), df.drop(columns="sequence_id").values, k=10, labels=range(0,1314))

top_1_score_total = top_k_accuracy_score(test_set_sampled.drop(columns="sequence_id").values.argmax(axis=1), df.drop(columns="sequence_id").values, k=1, labels=range(0,1314))


display(f"Softmax model top-10 private score: {top_10_score_private}")

display(f"Softmax model top-1 private score: {top_1_score_private}")


display(f"Softmax model top-10 public score: {top_10_score_public}")

display(f"Softmax model top-1 public score: {top_1_score_public}")

display(f"Softmax model top-10 total score: {top_10_score_total}")

display(f"Softmax model top-1 total score: {top_1_score_total}")

In [None]:
df.to_csv(f"{RESULTS_PATH}softmax_predictions.csv")

#### Visualization ideas
1. show the data (which features) + label + predicted label（done
2. one-hot results (done
3. traning, validation, test
4. network structure
5. the code structure
6. limitation?

In [None]:
# the dataset (input)
test_values_df_sampled

In [None]:
# the true label (the ground truth)
test_set_sampled

In [None]:
# Find the column with the maximum value for each row
max_columns = test_set_sampled.drop(columns='sequence_id').idxmax(axis=1)

# Create a new DataFrame combining 'sequence_id' and the column names
result = pd.DataFrame({
    'sequence_id': test_set_sampled['sequence_id'],
    'lab_of_origin': max_columns
})

# Display the result
display(result)

In [None]:
# Assuming 'df' is your DataFrame with shape 10*1315

# Exclude the 'sequence_id' column from sorting
sort_columns = [col for col in df.columns if col != 'sequence_id']

# Sort columns for each row and keep top ten columns
sorted_columns = df.apply(lambda row: sorted(zip(row[sort_columns].index, row[sort_columns]), key=lambda x: x[1], reverse=True)[:10], axis=1)
# sorted_columns = sorted_columns.reset_index().drop(columns='index')
sorted_columns = pd.DataFrame(sorted_columns)

# Create a figure and axes
fig, axes = plt.subplots(sorted_columns.shape[0], 1, figsize=(6, sorted_columns.shape[0]*3))

# Iterate over each row and create a separate plot
for idx,(index , sorted_vals) in enumerate(sorted_columns.iterrows()):
    # display(sorted_vals)
    
    sequence_id = df.loc[index, 'sequence_id']
    
    sorted_val = sorted_vals[0]
    
    columns, values = zip(*sorted_val)
    
    # Get the appropriate subplot from the axes
    ax = axes[idx]
    
    # Plot the values
    bars = ax.bar(columns, values, width=0.5)
    
    # Set the title as the sequence_id
    ax.set_title(sequence_id)
    
    # Rotate x-axis labels for better readability
    ax.set_xticklabels(columns, rotation=90)
    
    ax.set_ylabel('Probability')
    ax.grid(False)
    ax.set_ylim(0,1)
    
    # Add value annotations on top of each bar
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2, height, f'{height:.3f}', ha='center', va='bottom')
    
    
# Adjust spacing between subplots
plt.tight_layout()

# Show the plot
plt.show()



In [None]:
# confusion matrix (less than 20 samples could be a good plot to use)
# Replace these lists with your true and predicted labels
true_labels = result['lab_of_origin']
predicted_labels = [i[0][0][0] for _, i in sorted_columns.iterrows()]

# Calculate the confusion matrix
cm = confusion_matrix(true_labels, predicted_labels)

# Set labels for the matrix
labels = np.unique(true_labels)
tick_labels = [f"True {label}" for label in labels]
col_labels = [f"Pred {label}" for label in labels]

# Create the heatmap
sns.heatmap(cm, annot=True, cmap='Blues', fmt='d', xticklabels=col_labels, yticklabels=tick_labels)

plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()

In [None]:
df

In [None]:
# output / inference results
df_show = pd.DataFrame(df.iloc[0,:]).T

In [None]:
# next step: fake a sequence and see the results

### 2. CNN + (Attention) + Softmax
without UNK 

In [None]:
os.chdir('/cluster/home/yihliu/MLinPharma/capsule-3003146/code')

In [None]:
softmax_model_paths = list(sorted(glob("../data/output/4d1a64be-7826-4992-93b4-4e4beddb0c53/tensorboard_logs_csv_logs/*/checkpoints/*.ckpt")))
lab_index_mapping_paths = list(sorted(glob("../data/output/4d1a64be-7826-4992-93b4-4e4beddb0c53/tensorboard_logs_csv_logs/*/lab_index_mapping.pkl")))
from altlabs.model.conv1d_softmax_classifier import Conv1dSoftmaxClassifier, ModelConfig

In [None]:
seed_everything(350)

def predict_dataset(model: Conv1dAttnSoftmaxClassifier, dataset: FactorizationDataset, tta_steps: int) -> np.ndarray:
    batch_sampler = FasterBatchSampler(
        dataset, 32, shuffle=False,
    )

    model.to(device)
    predictions: List[List[float]] = []
    with torch.no_grad():
        for indices in batch_sampler:
            if tta_steps > 0:
                tta_predictions = []
                for i in range(tta_steps):
                    batch = dataset[indices]
                    if isinstance(batch[0], tuple):
                        (sequences, extra_inputs, _) = batch[
                            0
                        ]  # type: (torch.Tensor, torch.Tensor, torch.Tensor)
                    else:
                        (sequences, extra_inputs) = batch
                    outputs = torch.nn.functional.softmax(model(
                        sequences.to(device), extra_inputs.to(device)
                    )).tolist()
                    tta_predictions.append(np.array(outputs))
                predictions.extend(
                    np.mean(np.array(tta_predictions), axis=0).tolist()
                )
            else:
                batch = dataset[indices]
                if isinstance(batch[0], tuple):
                    (sequences, extra_inputs, _) = batch[
                        0
                    ]  # type: (torch.Tensor, torch.Tensor, torch.Tensor)
                else:
                    (sequences, extra_inputs) = batch
                outputs = torch.nn.functional.softmax(model(
                    sequences.to(device), extra_inputs.to(device)
                )).tolist()
                predictions.extend(outputs)

    return np.array(predictions)

sequence_index_mapping = create_index_mapping(
    "ATGC", include_unkown=True, include_none=False,
)
sequence_index_mapping["N"] = 0
input_columns = train_values_df.drop(columns=["sequence_id", "groups", "output"]).columns
output_columns = train_labels_df.drop(columns=["sequence_id"]).columns
occurrences = np.sum(train_labels_df[output_columns].values, axis=0)
minimum_occurrences = 2
filtered_out_output_columns = output_columns[
    occurrences < minimum_occurrences
]
output_columns = output_columns.drop(filtered_out_output_columns)


In [None]:
os.chdir('/cluster/home/yihliu/MLinPharma/capsule-3003146/data')


In [None]:
sample_size = 10
# sample_frac = 0.01
test_values_df_sampled = test_values_df.sample(sample_size,random_state=1)
fold_output = []

In [None]:
for softmax_model_path in softmax_model_paths:
    model = Conv1dSoftmaxClassifier.load_from_checkpoint(softmax_model_path)
        
    dataset = SoftmaxDataset(
        test_values_df_sampled,
        sequence_index_mapping,
        input_columns,
        transform_sequence_fn=transform_sequence_fn,
        test=True,
        bpe=True,
    )
    
    outputs = predict_dataset(model, dataset, 10)
    fold_output.append(outputs)
final_outputs = np.mean(fold_output, axis=0)


In [None]:
sequence_index_mapping["N"] = 0
input_columns = train_values_df.drop(columns=["sequence_id", "groups", "output"]).columns
output_columns = train_labels_df.drop(columns=["sequence_id"]).columns
occurrences = np.sum(train_labels_df[output_columns].values, axis=0)
minimum_occurrences = 2
filtered_out_output_columns = output_columns[
    occurrences < minimum_occurrences
]
filtered_out_output_columns = filtered_out_output_columns.append(pd.Index(["I7FXTVDP"]))
output_columns = output_columns.drop(filtered_out_output_columns)
df = pd.DataFrame(
    data=final_outputs, columns=output_columns, index=test_values_df_sampled["sequence_id"]
)

for column in filtered_out_output_columns:
    df[column] = 0.0
df = df[format_df.drop(columns=["sequence_id"]).columns]

In [None]:
test_set_sampled = test_set.iloc[test_values_df_sampled.index]
# generate softmax output for pub and private sub sets
df = df.reset_index()
df.index = test_values_df_sampled.index
pub_best_sub = df[df.index.isin(pub_index)]
private_best_sub = df[df.index.isin(private_index)]

In [None]:
# pub_best_sub = df.reset_index()[df.reset_index().index.isin(pub_index)]
# private_best_sub = df.reset_index()[df.reset_index().index.isin(private_index)]
private_test = test_set_sampled[test_set_sampled.index.isin(private_index)]
public_test = test_set_sampled[test_set_sampled.index.isin(pub_index)]
private_labels = private_test.drop(columns="sequence_id").values.argmax(axis=1)
public_labels = public_test.drop(columns="sequence_id").values.argmax(axis=1)


unk_eng_plasmids = test_set_sampled[test_set_sampled["I7FXTVDP"] == 1.0]["sequence_id"].tolist()
test_set_no_unk = test_set_sampled[~test_set_sampled["sequence_id"].isin(unk_eng_plasmids)]
# df = df.reset_index()
df_no_unk = df[~df["sequence_id"].isin(unk_eng_plasmids)]

pub_best_sub_no_unk = df_no_unk[df_no_unk.index.isin(pub_index)]
private_best_sub_no_unk = df_no_unk[df_no_unk.index.isin(private_index)]
private_test_no_unk = test_set_no_unk[test_set_no_unk.index.isin(private_index)]
public_test_no_unk = test_set_no_unk[test_set_no_unk.index.isin(pub_index)]
private_labels_no_unk = private_test_no_unk.drop(columns="sequence_id").values.argmax(axis=1)
public_labels_no_unk = public_test_no_unk.drop(columns="sequence_id").values.argmax(axis=1)



top_10_score_private_no_unk = top_k_accuracy_score(private_labels_no_unk, private_best_sub_no_unk.drop(columns="sequence_id").values, k=10, labels=range(0,1314))

top_1_score_private_no_unk = top_k_accuracy_score(private_labels_no_unk, private_best_sub_no_unk.drop(columns="sequence_id").values, k=1, labels=range(0,1314))

top_10_score_public_no_unk = top_k_accuracy_score(public_labels_no_unk, pub_best_sub_no_unk.drop(columns="sequence_id").values, k=10, labels=range(0,1314))

top_1_score_public_no_unk = top_k_accuracy_score(public_labels_no_unk, pub_best_sub_no_unk.drop(columns="sequence_id").values, k=1, labels=range(0,1314))



top_10_score_total_no_unk = top_k_accuracy_score(test_set_no_unk.drop(columns="sequence_id").values.argmax(axis=1), df_no_unk.drop(columns="sequence_id").values, k=10, labels=range(0,1314))
top_1_score_total_no_unk = top_k_accuracy_score(test_set_no_unk.drop(columns="sequence_id").values.argmax(axis=1), df_no_unk.drop(columns="sequence_id").values, k=1, labels=range(0,1314))


display(f"Softmax model top-10 private score (no UNK): {top_10_score_private_no_unk}")

display(f"Softmax model top-1 private score (no UNK): {top_1_score_private_no_unk}")


display(f"Softmax model top-10 public score (no UNK): {top_10_score_public_no_unk}")

display(f"Softmax model top-1 public score (no UNK): {top_1_score_public_no_unk}")

display(f"Softmax model top-10 total score (no UNK): {top_10_score_total_no_unk}")

display(f"Softmax model top-1 total score (no UNK): {top_1_score_total_no_unk}")


In [None]:

if os.path.exists(f"{RESULTS_PATH}results.csv"):
    result_df = pd.read_csv(f"{RESULTS_PATH}results.csv")
else:
    result_df = pd.DataFrame([], columns=["Model", "Top 10 Score", "Top 1 Score", "Top 10 New Data Score", "Top 1 New Data Score", "Top 10 Total Score", "Top 1 Total Score"])
    

result_df = result_df.append({"Model": "Softmax model", 
                              "Top 10 Score": top_10_score_private, "Top 1 Score": top_1_score_private, 
                              "Top 10 New Data Score": top_10_score_public, "Top 1 New Data Score": top_1_score_public,
                             "Top 10 Total Score": top_10_score_total, "Top 1 Total Score": top_1_score_total}, ignore_index=True)

result_df = result_df.append({"Model": "Softmax model (no UNK)", 
                              "Top 10 Score": top_10_score_private_no_unk, "Top 1 Score": top_1_score_private_no_unk, 
                              "Top 10 New Data Score": top_10_score_public_no_unk, "Top 1 New Data Score": top_1_score_public_no_unk,
                             "Top 10 Total Score": top_10_score_total_no_unk, "Top 1 Total Score": top_1_score_total_no_unk}, ignore_index=True)

result_df.to_csv(f"{RESULTS_PATH}results.csv", index=False)