In [360]:
from biodatasets import list_datasets, load_dataset
from deepchain.models import MLP
from deepchain.models.utils import (
    confusion_matrix_plot,
   # dataloader_from_numpy,
    model_evaluation_accuracy,
)
from sklearn.model_selection import train_test_split

import numpy as np
from sklearn import preprocessing

In [361]:
# load pfam dataset
pfam_dataset = load_dataset("pfam-32.0", force=True)
_, y = pfam_dataset.to_npy_arrays(input_names=["sequence"], target_names=["family_id"])

INFO: Start downloading pfam-32.0 dataset in biodatasets/.cache/pfam-32.0 ...
dataset.csv: 267MB [01:52, 2.49MB/s]                              
INFO: File pfam-32.0/dataset.csv downloaded from Google Bucket 'deepchain-datasets-public' at biodatasets/.cache/pfam-32.0/dataset.csv
description.md: 8.00kB [00:00, 21.5kB/s]
INFO: File pfam-32.0/description.md downloaded from Google Bucket 'deepchain-datasets-public' at biodatasets/.cache/pfam-32.0/description.md
info.json: 8.00kB [00:00, 21.4kB/s]
INFO: File pfam-32.0/info.json downloaded from Google Bucket 'deepchain-datasets-public' at biodatasets/.cache/pfam-32.0/info.json
sequence_protbert_mean_embeddings.npy: 781MB [04:00, 3.41MB/s]                               
INFO: File pfam-32.0/sequence_protbert_mean_embeddings.npy downloaded from Google Bucket 'deepchain-datasets-public' at biodatasets/.cache/pfam-32.0/sequence_protbert_mean_embeddings.npy


In [362]:

# get embeddings and filter on available embeddings
embeddings = pfam_dataset.get_embeddings("sequence", "protbert", "mean")
available_embeddings_len = len(embeddings)
print(f"We take only the first {available_embeddings_len} sequences as we have only their embeddings available.")
y = y[0][:available_embeddings_len]


We take only the first 200000 sequences as we have only their embeddings available.


In [363]:
# process targets
unique_classes = np.unique(y)
num_classes = len(unique_classes)
print(f"There are {num_classes} unique classes for family_id.")


There are 15415 unique classes for family_id.


In [364]:
le = preprocessing.LabelEncoder()
labels = le.fit(unique_classes)
targets = le.transform(y)
print(f"Targets: {targets.shape}, {targets}, {len(labels.classes_)} classes")


Targets: (200000,), [ 7066  3771 15329 ... 13940  1648  5782], 15415 classes


In [365]:
X_train, X_val, y_train, y_val = train_test_split(embeddings, targets, test_size=0.3)

train_dataloader = dataloader_from_numpy(X_train, y_train, batch_size=256)
val_dataloader = dataloader_from_numpy(X_val, y_val, batch_size=256)

In [366]:
next(iter(train_dataloader))[0].shape

torch.Size([256, 1024])

In [367]:
next(iter(train_dataloader))[1].shape

torch.Size([256])

In [368]:
import torch
import torch.nn.functional as F
from torch import nn

from deepchain.models.torch_model import TorchModel

from pytorch_lightning.metrics.functional import accuracy

In [369]:
class FamilyMLP(TorchModel):
    """Multi-layer perceptron model."""

    def __init__(self, input_shape: int = 768, output_shape: int = 1, **kwargs):
        super().__init__(**kwargs)
        self.output = nn.Softmax if output_shape > 1 else nn.Sigmoid
        self.loss = F.cross_entropy if output_shape > 1 else F.binary_cross_entropy
        self._model = nn.Sequential(
            nn.Linear(input_shape, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, output_shape)
        )

    def forward(self, x):
        """Defines forward pass"""
        if not isinstance(x, torch.Tensor):
            x = torch.tensor(x).float()
        return self._model(x)

    def training_step(self, batch, batch_idx):
        """training_step defined the train loop. It is independent of forward"""
        x, y = batch
        y_hat = self._model(x)
        y = y.long()
        #y = torch.unsqueeze(y, 1)
        loss = self.loss(y_hat, y)
        self.log("train_loss", loss, prog_bar=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self._model(x)
        y = y.long()
        
        loss = self.loss(y_hat, y)
        
        preds = torch.max(y_hat, dim=1)[1]
        acc = accuracy(preds, y)
        
        # Calling self.log will surface up scalars for you in TensorBoard
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', acc, prog_bar=True)
        return loss

    def save_model(self, path: str):
        """Save entire model with torch"""
        torch.save(self._model, path)

In [370]:
mlp = FamilyMLP(input_shape=X_train.shape[1], output_shape=num_classes)


In [371]:
X_train.shape[1]

1024

In [372]:
mlp


FamilyMLP(
  (_model): Sequential(
    (0): Linear(in_features=1024, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.1, inplace=False)
    (3): Linear(in_features=256, out_features=256, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.1, inplace=False)
    (6): Linear(in_features=256, out_features=15415, bias=True)
  )
)

In [None]:
mlp.fit(train_dataloader, val_dataloader, epochs=10, auto_lr_find=True, auto_scale_batch_size=True)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores

  | Name   | Type       | Params
--------------------------------------
0 | _model | Sequential | 4.3 M 
--------------------------------------
4.3 M     Trainable params
0         Non-trainable params
4.3 M     Total params
17.159    Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

In [354]:
torch.max(mlp(next(iter(train_dataloader))[0]), 1)[1].shape

torch.Size([256])

In [355]:
x, y = next(iter(train_dataloader))

In [356]:
torch.max(mlp(x), 1)[1] == y

tensor([False, False, False, False,  True,  True, False,  True,  True,  True,
         True,  True,  True, False,  True, False,  True,  True,  True,  True,
        False, False,  True,  True,  True, False,  True,  True, False,  True,
        False,  True, False,  True,  True, False,  True,  True, False, False,
         True,  True, False,  True, False,  True,  True, False, False, False,
         True,  True,  True,  True,  True,  True,  True, False,  True,  True,
        False,  True,  True, False, False,  True,  True, False,  True,  True,
        False,  True,  True, False,  True, False,  True,  True, False, False,
        False,  True,  True,  True,  True, False,  True,  True,  True,  True,
         True,  True,  True, False,  True,  True,  True,  True,  True, False,
        False,  True,  True,  True, False,  True, False,  True,  True,  True,
        False, False, False, False, False, False,  True, False,  True, False,
         True,  True,  True,  True, False,  True,  True,  True, 

# Evaluation


In [357]:
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from torch.utils.data import DataLoader, TensorDataset
from typing import Callable, List, Tuple, Union




In [358]:
def model_evaluation_accuracy(
    dataloader: DataLoader, model
) -> Tuple[np.array, np.array]:
    """
    Make prediction for test data
    Args:
        dataloader: a torch dataloader containing dataset to be evaluated
        model : a callable trained model with a predict method
    """
    prediction, truth = [], []
    for X, y in dataloader:
        y_hat = torch.max(model.predict(X), 1)[1]
        prediction += y_hat
        truth += y.detach().numpy().flatten().tolist()

    prediction, truth = np.array(prediction), np.array(truth)

    acc_score = accuracy_score(truth, prediction)
    print(f" Test :  accuracy score : {acc_score:0.2f}")

    return prediction, truth

In [359]:
prediction, truth = model_evaluation_accuracy(train_dataloader, mlp)

 Test :  accuracy score : 0.46


In [272]:
# Start tensorboard.
%load_ext tensorboard
%tensorboard --logdir lightning_logs/