## Make predictions using KNN

Load the embeddings produced by the trained ProtCNN model, average the training embeddings per family, and use KNN over those family representations to make predictions on the test dataset.

In [1]:
# Colab only: the embedding paths used in this notebook point into Google Drive
# (/content/drive/MyDrive/...), so uncomment these two lines to mount it first.
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
import argparse
import os
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
from protein_universe_annotate.data_processing import read_pfam_dataset

In [3]:
# Directory holding the embeddings saved as .npy files.
# This is the only value to change when the files live somewhere other than this Drive folder.
embeddings_dirpath = '/content/drive/MyDrive/_ruslan_project'

# Full paths to the training-set and test-set embedding arrays.
train_embeddings_path = f'{embeddings_dirpath}/training_embeddings.npy'
test_embeddings_path = f'{embeddings_dirpath}/testset_embeddings.npy'

In [4]:
# Load the training-set embeddings from disk.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects, which can
# execute code from an untrusted file. It is only required for object arrays; if this file
# holds a plain numeric array it can be dropped — TODO confirm.
train_embeddings = np.load(train_embeddings_path, allow_pickle=True)

In [5]:
# Load the test-set embeddings from disk.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects, which can
# execute code from an untrusted file. It is only required for object arrays; if this file
# holds a plain numeric array it can be dropped — TODO confirm.
test_embeddings = np.load(test_embeddings_path, allow_pickle=True)

In [8]:
# Show which dataset partitions are present in the data directory
# (path is relative to the notebook's working directory).
data_partitions_dirpath = '../data/'
available_partitions = os.listdir(data_partitions_dirpath)
print('Available dataset partitions: ', available_partitions)

Available dataset partitions:  ['test', 'train']


In [14]:
# Read the 'train' partition with the project helper read_pfam_dataset.
# Later cells rely on the result having 'sequence' and 'family_accession' columns.
# NOTE(review): presumably returns a pandas DataFrame — confirm against the helper.
train_df = read_pfam_dataset('train')

In [15]:
# Read the 'test' partition with the project helper read_pfam_dataset.
# Later cells rely on the result having 'sequence' and 'family_accession' columns.
# NOTE(review): presumably returns a pandas DataFrame — confirm against the helper.
test_df = read_pfam_dataset('test')

In [16]:
def _sequence_lengths(col):
    """Return the length of every sequence in the column, used as the sort key."""
    return [len(seq) for seq in col]


# Order both partitions by sequence length, shortest first.
# NOTE(review): presumably this reproduces the row order in which the embeddings were
# saved, since labels are later paired with embedding rows by position — confirm.
# The default sort kind is not stable, so the order among equal-length sequences is not
# guaranteed to match an order produced elsewhere; verify the alignment.
train_df = train_df.sort_values('sequence', key=_sequence_lengths)
test_df = test_df.sort_values('sequence', key=_sequence_lengths)

In [17]:
def _family_id(accession):
    """Return the part of the accession string before its first '.'."""
    return accession.split('.')[0]


# Store the family accession without anything after the first '.' as the ground-truth label.
train_df['true_label'] = train_df['family_accession'].apply(_family_id)
test_df['true_label'] = test_df['family_accession'].apply(_family_id)

In [18]:
# Number of training labels; compare with the number of embedding rows shown next.
train_df['true_label'].shape

(1086741,)

In [19]:
# Shape of the training embedding array: (number of rows, embedding dimension).
train_embeddings.shape

(1086741, 1100)

In [20]:
# Training labels as an array, in the current (length-sorted) row order of train_df.
# They are paired with train_embeddings rows by position in a later cell.
train_labels = train_df['true_label'].values

In [21]:
# Ground-truth test labels as an array, in the current (length-sorted) row order of test_df.
test_labels = test_df['true_label'].values

In [22]:
# Labels and embeddings are paired by position when grouping per family, and zip() would
# silently truncate to the shorter one, so fail loudly if the counts differ.
n_train_labels, n_train_embeddings = len(train_labels), len(train_embeddings)
assert n_train_labels == n_train_embeddings, (
    f'{n_train_labels} training labels but {n_train_embeddings} training embeddings'
)
# Keep the boolean as the cell's displayed result.
n_train_labels == n_train_embeddings

True

As described in the paper: "for ProtREP, we compute the average learned representation for each family across its training sequences, yielding a sentinel family representation."

In [23]:
# Bucket the training embeddings by family label so that a mean embedding
# can be computed per label afterwards.
label_grouped_embeddings = defaultdict(list)
for embedding, family in zip(train_embeddings, train_labels):
    label_grouped_embeddings[family].append(embedding)

In [24]:
# One representation per label: the element-wise mean over all embeddings
# that were grouped under that label.
label_learned_representation = {
    family: np.mean(family_embeddings, axis=0)
    for family, family_embeddings in label_grouped_embeddings.items()
}

In [25]:
# Number of distinct labels found in the training data.
len(label_grouped_embeddings)

17929

In [26]:
# Spot-check: the raw embeddings grouped under one family.
# Note: label_grouped_embeddings is a defaultdict, so indexing a label that is absent
# would insert an empty list for it rather than raise KeyError.
label_grouped_embeddings['PF08260']

[array([-17.643982 , -36.26096  ,  18.781805 , ..., -22.631489 ,
        -46.473953 ,  -1.1414888], dtype=float32),
 array([  1.7974054, -10.713331 ,  13.108027 , ..., -39.11196  ,
        -51.62787  ,   3.652741 ], dtype=float32),
 array([-13.460838 , -33.77893  ,  -7.4814425, ..., -24.550732 ,
        -29.308111 , -23.83563  ], dtype=float32)]

In [27]:
# Spot-check: the mean (sentinel) representation for the same family.
label_learned_representation['PF08260']

array([ -9.769138, -26.91774 ,   8.13613 , ..., -28.764727, -42.46998 ,
        -7.108126], dtype=float32)

In [28]:
# One representation per label, so this should equal the number of label groups.
len(label_learned_representation)

17929

In [29]:
# Split the label -> representation mapping into two parallel lists (same order),
# which is the form the KNN classifier is fitted on.
labels = [family for family in label_learned_representation]
representations = [label_learned_representation[family] for family in labels]

In [30]:
# Peek at the first family representation before scaling.
representations[0]

array([  4.1850615,  12.385813 , -31.41682  , ..., -10.160907 ,
       -37.876625 ,  31.235952 ], dtype=float32)

In [31]:
# Standardize features using statistics learned from the family representations only,
# then apply the same transformation to the test embeddings.
# NOTE: this cell overwrites its own inputs, so it is not idempotent — running it a second
# time scales the already-scaled arrays again. Re-run from the loading cells instead.
sc = StandardScaler()
representations = sc.fit_transform(representations)
test_embeddings = sc.transform(test_embeddings)

In [None]:
# Optional dimensionality reduction (disabled). `target_dim` is not defined in any of the
# visible cells and must be set before enabling this.
# pca = PCA(n_components=target_dim)
# representations = pca.fit_transform(representations)
# test_embeddings = pca.transform(test_embeddings)

In [32]:
# The classifier is fitted on exactly one sentinel vector per family, so every neighbour
# carries a different label. With k > 1 and uniform weights each candidate gets a single
# vote, and the predicted label is decided by tie-breaking instead of by distance.
# k = 1 assigns each test embedding the label of its nearest family sentinel.
knn = KNeighborsClassifier(n_neighbors=1, n_jobs=-1)

In [33]:
# Fit on the scaled family representations; each row's target is its family label.
knn.fit(representations, labels)

In [34]:
%%time
# Predict a family label for every (scaled) test embedding; timed because it takes minutes.
preds = knn.predict(test_embeddings)

CPU times: user 6min 40s, sys: 28.4 s, total: 7min 9s
Wall time: 5min 27s


In [35]:
# Attach the predictions as a new column (plain string key: no f-string needed).
# Assumes the rows of test_embeddings are in the same order as the rows of test_df
# — TODO confirm.
test_df['predicted_label_KNN'] = preds

In [None]:
# Save predictions
# index=True also writes the DataFrame index as the first CSV column.
# The target is an absolute path under /content/; adjust it when running elsewhere.
test_df.to_csv('/content/test_preds_KNN.csv', index=True)