In [1]:
!which deepchain


In [29]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [1]:
from biodatasets import list_datasets, load_dataset
from deepchain.models import MLP
from deepchain.models.utils import (
    confusion_matrix_plot,
    dataloader_from_numpy,
    model_evaluation_accuracy,
)
from sklearn.model_selection import train_test_split

import numpy as np
from sklearn import preprocessing

In [2]:
# load pfam dataset
pfam_dataset = load_dataset("pfam-32.0")
X, y = pfam_dataset.to_npy_arrays(input_names=["split"], target_names=["family_id"])

In [3]:
# get embeddings and filter on available embeddings
embeddings = pfam_dataset.get_embeddings("sequence", "protbert", "mean")
available_embeddings_len = len(embeddings)
print(f"We take only the first {available_embeddings_len} sequences as we have only their embeddings available.")
y = y[0][:available_embeddings_len]


We take only the first 1339083 sequences as we have only their embeddings available.


### Split data

In [4]:
import gc
gc.collect()

187

In [5]:
split = X[0]

In [6]:
emb_train = embeddings[split == 'train']
y_train = y[split == 'train']

In [7]:
emb_val = embeddings[split == 'dev']
y_val = y[split == 'dev']

In [8]:
len(emb_train)

1086741

In [9]:
len(emb_val)

126171

In [10]:
unique_classes = np.intersect1d(y_train, y_val)


In [11]:
# process targets
#unique_classes = np.unique(y)
num_classes = len(unique_classes)
print(f"There are {num_classes} unique classes for family_id.")


There are 13071 unique classes for family_id.


In [12]:
subset_classes = set(unique_classes)

In [13]:
#Train 
x_train_generator = (x for (x, y) in zip(emb_train, y_train) if y in subset_classes)
y_train_generator = (y for (x, y) in zip(emb_train, y_train) if y in subset_classes)

In [14]:
emb_train = list(x_train_generator)
y_train = list(y_train_generator)

In [15]:
len(emb_train), len(y_train)

(1064950, 1064950)

In [16]:
#Eval 
x_val_generator = (x for (x, y) in zip(emb_val, y_val) if y in subset_classes)
y_val_generator = (y for (x, y) in zip(emb_val, y_val) if y in subset_classes)
emb_val = list(x_val_generator)
y_val = list(y_val_generator)

In [17]:
len(y_val)

126171

In [18]:
le = preprocessing.LabelEncoder()
labels = le.fit(unique_classes)
targets_train = le.transform(y_train)
targets_val = le.transform(y_val)
print(f"Targets: {targets_train.shape}, {targets_train}, {len(labels.classes_)} classes")


Targets: (1064950,), [ 6015  3243 12995 ... 10421 11242   580], 13071 classes


In [19]:
from torch.utils.data import Dataset, DataLoader
import torch

In [20]:
class MyDataset(Dataset):
    def __init__(self, data, targets, transform=None):
        self.data = data
        self.targets = torch.LongTensor(targets)
        self.transform = transform

    def __getitem__(self, index):
        x = self.data[index]
        y = self.targets[index]

        return x, y

    def __len__(self):
        return len(self.data)

In [21]:
train_dataset = MyDataset(data=emb_train, targets=targets_train)
val_dataset = MyDataset(data=emb_val, targets=targets_val)

In [22]:
train_dataloader = DataLoader(train_dataset, batch_size=2048)
test_dataloader = DataLoader(val_dataset, batch_size=2048)

In [23]:
next(iter(train_dataloader))[0].shape

torch.Size([2048, 1024])

In [24]:
next(iter(train_dataloader))[1].shape

torch.Size([2048])

In [25]:
import torch
import torch.nn.functional as F
from torch import nn

from deepchain.models.torch_model import TorchModel

from pytorch_lightning.metrics.functional import accuracy

In [26]:
class FamilyMLP(TorchModel):
    """Multi-layer perceptron model."""

    def __init__(self, input_shape: int = 768, output_shape: int = 1, **kwargs):
        super().__init__(**kwargs)
        self.output = nn.Softmax if output_shape > 1 else nn.Sigmoid
        self.loss = F.cross_entropy if output_shape > 1 else F.binary_cross_entropy
        self._model = nn.Sequential(
            nn.Linear(input_shape, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, output_shape)
        )

    def forward(self, x):
        """Defines forward pass"""
        if not isinstance(x, torch.Tensor):
            x = torch.tensor(x).float()
        return self._model(x)

    def training_step(self, batch, batch_idx):
        """training_step defined the train loop. It is independent of forward"""
        x, y = batch
        y_hat = self._model(x)
        y = y.long()
        #y = torch.unsqueeze(y, 1)
        loss = self.loss(y_hat, y)
        self.log("train_loss", loss, prog_bar=True)
        return loss
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self._model(x)
        y = y.long()
        
        loss = self.loss(y_hat, y)
        
        preds = torch.max(y_hat, dim=1)[1]
        acc = accuracy(preds, y)
        
        # Calling self.log will surface up scalars for you in TensorBoard
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_acc', acc, prog_bar=True)
        return loss

    def save_model(self, path: str):
        """Save entire model with torch"""
        torch.save(self._model, path)

In [27]:
mlp = FamilyMLP(input_shape=1024, output_shape=num_classes)


In [28]:
X_train.shape[1]

NameError: name 'X_train' is not defined

In [29]:
mlp


FamilyMLP(
  (_model): Sequential(
    (0): Linear(in_features=1024, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.1, inplace=False)
    (3): Linear(in_features=256, out_features=256, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.1, inplace=False)
    (6): Linear(in_features=256, out_features=13071, bias=True)
  )
)

In [30]:
mlp._model = torch.load("checkpoint/family_model.pt")

In [31]:
mlp.fit(train_dataloader, test_dataloader, epochs=10, auto_lr_find=True, auto_scale_batch_size=True, gpus=1)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name   | Type       | Params
--------------------------------------
0 | _model | Sequential | 4.9 M 
--------------------------------------
4.9 M     Trainable params
0         Non-trainable params
4.9 M     Total params
19.744    Total estimated model params size (MB)


Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]



Epoch 0:   0%|          | 0/582 [00:00<?, ?it/s]                      



Epoch 0:  89%|████████▉ | 520/582 [00:23<00:02, 22.41it/s, loss=0.748, v_num=5, val_loss=0.729, val_acc=0.852, train_loss=0.753]
Validating: 0it [00:00, ?it/s][A
Epoch 0:  90%|████████▉ | 522/582 [00:23<00:02, 22.36it/s, loss=0.748, v_num=5, val_loss=0.729, val_acc=0.852, train_loss=0.753]
Epoch 0:  90%|█████████ | 525/582 [00:23<00:02, 22.39it/s, loss=0.748, v_num=5, val_loss=0.729, val_acc=0.852, train_loss=0.753]
Epoch 0:  91%|█████████ | 528/582 [00:23<00:02, 22.41it/s, loss=0.748, v_num=5, val_loss=0.729, val_acc=0.852, train_loss=0.753]
Epoch 0:  91%|█████████ | 531/582 [00:23<00:02, 22.44it/s, loss=0.748, v_num=5, val_loss=0.729, val_acc=0.852, train_loss=0.753]
Epoch 0:  92%|█████████▏| 534/582 [00:23<00:02, 22.46it/s, loss=0.748, v_num=5, val_loss=0.729, val_acc=0.852, train_loss=0.753]
Epoch 0:  92%|█████████▏| 537/582 [00:23<00:02, 22.48it/s, loss=0.748, v_num=5, val_loss=0.729, val_acc=0.852, train_loss=0.753]
Epoch 0:  93%|█████████▎| 540/582 [00:23<00:01, 22.51it/s, loss



In [34]:
mlp.save_model("family_model.pt")


In [33]:
!pwd

/home/jupyter/deepchain-app-pfam-32.0
Epoch 8:  22%|██▏       | 130/582 [00:23<01:20,  5.60it/s, loss=0.556, v_num=5, val_loss=0.663, val_acc=0.868, train_loss=0.547]

In [51]:
torch.max(mlp(next(iter(train_dataloader))[0]), 1)[1].shape

torch.Size([1048])

In [52]:
x, y = next(iter(train_dataloader))

In [53]:
torch.max(mlp(x), 1)[1] == y

tensor([True, True, True,  ..., True, True, True])

# Evaluation


In [54]:
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from torch.utils.data import DataLoader, TensorDataset
from typing import Callable, List, Tuple, Union




In [55]:
def model_evaluation_accuracy(
    dataloader: DataLoader, model
) -> Tuple[np.array, np.array]:
    """
    Make prediction for test data
    Args:
        dataloader: a torch dataloader containing dataset to be evaluated
        model : a callable trained model with a predict method
    """
    prediction, truth = [], []
    for X, y in dataloader:
        y_hat = torch.max(model.predict(X), 1)[1]
        prediction += y_hat
        truth += y.detach().numpy().flatten().tolist()

    prediction, truth = np.array(prediction), np.array(truth)

    acc_score = accuracy_score(truth, prediction)
    print(f" Test :  accuracy score : {acc_score:0.2f}")

    return prediction, truth

In [56]:
prediction, truth = model_evaluation_accuracy(train_dataloader, mlp)

 Test :  accuracy score : 0.83


In [60]:
prediction, truth = model_evaluation_accuracy(test_dataloader, mlp)

 Test :  accuracy score : 0.81


# Inference


In [58]:
le

LabelEncoder()

In [35]:
import joblib

joblib.dump(le, 'label_encoder.joblib')
label_encoder = joblib.load('label_encoder.joblib')
label_encoder

LabelEncoder()

In [394]:
def compute_scores(sequences: List[str]):
    """Return a list of all proteins score"""

    #x_embedding = self.transformer.compute_embeddings(sequences)["mean"]
    x_embedding = embeddings[:len(sequences)]
    
    y_hat = mlp(torch.tensor(x_embedding))
    preds = torch.max(y_hat, dim=1)[1]
    preds = preds.detach().cpu().numpy()
    
    family_preds = label_encoder.inverse_transform(preds)

    family_list = [{"family_id": family_pred} for family_pred in family_preds]

    return family_list

In [395]:
sequences = [
        "MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG",
        "KALTARQQEVFDLIRDHISQTGMPPTRAEIAQRLGFRSPNAAEEHLKALARKGVIEIVSGASRGIRLLQEE",
    ]
compute_scores(sequences)

[{'family_id': 'GMC_oxred_C'}, {'family_id': 'DUF2887'}]

In [272]:
# Start tensorboard.
%load_ext tensorboard
%tensorboard --logdir lightning_logs/