In [1]:
from DataLoading import Dataloader
import networks
import training_helper
import evaluation_helper



# Topology / Structure-based Protein Design

### Database: SCOPe

**For Discriminative Tasks:**

|            | class | fold | super-family | family | protein |
| ---        | ---   | ---  | ---          | ---    |  ---    |
| Training   | 6     | 1080 | 1820         | 4304   | 40082   |
| Validation | 6     | 771  | 1232         | 2705   | 10069   |
| Test       | 6     | 902  | 1480         | 3373   | 10737   |
| All        | 6     | 1080 | 1820         | 4304   | 60888   |

**For Generative Task**

|            | class | fold | super-family | family | protein |
| ---        | ---   | ---  | ---          | ---    |  ---    |
| Training   | 6     | 870  | 1367         | 3022   | 39979   |
| Validation | 6     | 131  | 276          | 678    | 10678   |
| Test       | 6     | 152  | 259          | 662    | 10231   |
| All        | 6     | 1080 | 1820         | 4304   | 60888   |

## Discriminative Task
**Test the node or graph-wise graph embedding models.**

### Load the data

**Use the dataloader as shown below. On the first call the dataloader downloads the processed data from the website; on subsequent calls it directly loads the already-downloaded data.**

In [2]:
# Build the train / validation / test loaders for the full SCOPe database
# (discriminative task, batches of 16).
train_set, vali_set, test_set = Dataloader(
    database='SCOPe',
    path='../Datasets/SCOPe/',
    task='Discriminative',
    batch_size=16,
)

Downloading the database...
Downloading 1wc9o9p7nyg8s95_MO-UI80qqKVoSf4L9 into ../Datasets/SCOPe/SCOPe.zip... Done.
Unzipping...Done.

Database: SCOPe
Task: Discriminative
Shuffle: True False False
training: 40082 samples
validation: 10069 samples
test: 10737 samples
Batch size: 16


**For simplicity, here we only load a small part of the dataset (database = 'SCOPe_debug'). For the challenge, please use the complete dataset (database = 'SCOPe').**

In [3]:
# Rebind the three loaders to the small 'SCOPe_debug' subset so the remaining
# cells run quickly (same path, task and batch size as the full-database call).
train_set, vali_set, test_set = Dataloader(
    database='SCOPe_debug',
    path='../Datasets/SCOPe/',
    task='Discriminative',
    batch_size=16,
)

Downloading the database...
Downloading 1BFsBdQzLiRKmc1lDOZBwREcCnfg4EiRU into ../Datasets/SCOPe/SCOPe_debug.zip... Done.
Unzipping...Done.

Database: SCOPe_debug
Task: Discriminative
Shuffle: True False False
training: 55 samples
validation: 55 samples
test: 59 samples
Batch size: 16


### Define the model (user defined)

**First, users define their own GNN and then wrap it in the container class *GNN_Container*; after that, the discriminative task can be run with our pipeline. The GNN may only take the feature matrix (max_num_of_nodes x feat_dim) and the adjacency tensor (channel_num x max_num_of_nodes x max_num_of_nodes) as inputs. In this task max_num_of_nodes = 60 and feat_dim = 11; channel_num = 5 for a heterogeneous graph and 1 for a homogeneous graph. The GNN can be either node-wise or graph-wise.**

In [8]:
### a node-wise embedding graph (the part need to be defined by users)
gnn = networks.GraphConvolNetwork(feature_dim = 11, hidden_dim = 100, embedding_dim = 20, 
                                  num_layers = 3, channel_num=5)
# This is an illustration implementation of the GCN and the inputs are not necessary, 
# but the input feature dimension must be 11.

In [12]:
### The container of the GNN. 
model_dis = networks.GNN_Container(model = gnn, embedding_dim = 20, 
                                   pooling = 'max', CUDA = False, channel_num=5)
# embedding dim x channel_num = output dimension of the defined GNN
# For node-wise GNN, "pooling" can be 'max', 'sum' or 'mean'; for graph-wise GNN "pooling" need to be set as None.

### Train the model

In [14]:
# Run the discriminative training loop with CUDA disabled.
# The keyword `heterogeous` is spelled exactly as in the original call; it is
# part of the helper's interface and must not be "corrected" here.
model, optimizer, results_all = training_helper.discriminative_train(
    train_set,               # training set
    model_dis,               # model
    num_epochs=3,            # number of epochs
    val_dataset=vali_set,    # validation set (optional)
    test_dataset=test_set,   # test set (optional)
    heterogeous=True,        # False for a homogeneous graph
    USE_CUDA=False,
)

Epoch:  1


OSError: [Errno 12] Cannot allocate memory

## Generative Task
**Test node or graph-wise graph embedding models, language embedding models or language generative models.**

### Define the model

In [3]:
import torch.nn as nn

In [4]:
# Language model: a 3-layer bidirectional LSTM (batch-first, dropout 0.1)
# with input size 21 and hidden size 64.
rnn = nn.LSTM(
    input_size=21,
    hidden_size=64,
    num_layers=3,
    dropout=0.1,
    bidirectional=True,
    batch_first=True,
)

# Wrap the LSTM in the VAE container, with CUDA disabled.
model_gen = networks.VAE_Container(language_model=rnn, CUDA=False)

AttributeError: 'GcnEncoderGraph' object has no attribute 'out_dim'

### Train the model

In [5]:
# Train the VAE container for 5 epochs and rebind the model to the result.
# NOTE(review): `train_set` is whatever the most recent Dataloader cell produced;
# the loader cells visible in this notebook use task = 'Discriminative' --
# confirm that a generative-task loader is what is intended here.
model_gen, optimizer_gen = training_helper.VAE_training(
    model_gen,
    train_set,
    Epoch_NUM=5,
)

Epoch 1:
Average-Loss: 2.9803	Average-CE: 2.9802	Average-KLD: 0.3185
Training time: 5.8487s
Epoch 2:
Average-Loss: 2.7985	Average-CE: 2.7980	Average-KLD: 0.3198
Training time: 6.9002s
Epoch 3:
Average-Loss: 2.6635	Average-CE: 2.6625	Average-KLD: 0.3260
Training time: 7.0257s
Epoch 4:
Average-Loss: 2.5690	Average-CE: 2.5676	Average-KLD: 0.3397
Training time: 6.6755s
Epoch 5:
Average-Loss: 2.4864	Average-CE: 2.4846	Average-KLD: 0.3593
Training time: 7.5626s


### Evaluation

In [4]:
import numpy as np
import torch

In [22]:
# Evaluate the generative model on the training split.
ele_all, seq_all, iden_list, ppl_list = evaluation_helper.Gen_evaluation(model_gen, train_set)

# Collapse the per-item perplexities and identities into single averages.
mean_ppl = float(torch.mean(torch.Tensor(ppl_list)))
mean_identity = np.mean(iden_list)

print('The perplexity is %.4f.' % mean_ppl)
print('The average sequence identity is %.4f.' % mean_identity)
print()
print('Examples of the generated sequences:')
# Show at most the first ten generated sequences, one per line.
for seq in seq_all[:10]:
    print(seq)

The perplexity is 13.7019.
The average sequence identity is 0.0393.

Examples of the generated sequences:
AMASRQQQRVVSVVLLLSLVVSSSSLVSVVVVSVSSLVLVVLVLQNELL
KCKSLSEDPKPYYYANYNY
LLLSSAYGENGGGHLR
NVVVSSSVSSSHSSVSSVHHVHHVSSHVHVVVVHSVNQPPAA
EKLAHHHAALINLSLSKLKLSLKKSLSKKKKKKKSSSSSSKSSLSKSA
QLQRLQQQQQRRRQRRRQLRRQRQRRRLRRRRRLQKDVDKVKKDDDKVVDVKVVVKVVVVVKVDKVVDDVDLDKQQQDDDDQDNTAE
KEKKPTTLA
AHHAAAFFPREREPAAPAPALRLLLKD
HHTEVQREREREQEEQRRERQRRQQQERQQEQREQRQQEQLA
LLLNEPSSSGGKLVTK


In [20]:
# Evaluate the generative model on the test split.
# Fix: the module imported at the top of the notebook is `evaluation_helper`
# (the same one used for the training-split evaluation); the bare name
# `evaluation` is never imported and raises NameError on a fresh kernel.
ele_all, seq_all, iden_list, ppl_list = evaluation_helper.Gen_evaluation(model_gen, test_set)

# Collapse the per-item perplexities and identities into single averages.
mean_ppl = float(torch.mean(torch.Tensor(ppl_list)))
mean_identity = np.mean(iden_list)

print('The perplexity is %.4f.' % mean_ppl)
print('The average sequence identity is %.4f.' % mean_identity)
print()
print('Examples of the generated sequences:')
# Show at most the first ten generated sequences, one per line.
for seq in seq_all[:10]:
    print(seq)

The perplexity is 13.6997.
The average sequence identity is 0.0437.

Examples of the generated sequences:
RLRKGGKLDDNLLLDNLNDNLDNNLDNNDLNDLNLNNDDNNLTKKKNHQQQHHHA
ENEANNENREERREHYYLAHKAKHHAHHAAKHAHAHKAHHHKAKKKHAHHKHKLDALKK
NPQKKQKKRPRPPPRLKKLKLLLLPPVKKPPKPK
LNLNTTCNNCDDDDCLPQSPAKHHAVAVATAVTTAAATTVAAAVAVVTVATVTTAVVTTLLLK
KKAASSSASSSKKYYYNLLTTLLLLLSTTLLLLTTSTLTSTSLTTSSTSSLLANNNKPANRN
YYYYYYYYNNKNKYKNYYNKYNKYYNYKKNYNKYYAYKKFPGGGSLSLK
TTTSSYEPPPLLGLPPPLGPGPGGPLPPGGGLLPLLLPGPGLPLLLAAKRRR
QQQAPPPLPLPLPLLCPCKRRKKMMKKAAAMAMMMMAKMMKAAKMKMKKKAMAMKKVVHPPPTETLKP
FLFFFLLLALLPPPLGAASSSSSSSK
VTTVKVKTKVTKKKKTTVKVTTKTKTTTTTTKVVKKPKKGGTFCQAAALL
