In [2]:
from DataLoading import Dataloader
import graph_networks
import model_helper
import graph_networks_gen

## Topology / Structure-based Protein Design

### Database: SCOPe

**For Discriminative Tasks:**

|            | class | fold | super-family | family | protein |
| ---        | ---   | ---  | ---          | ---    |  ---    |
| Training   | 6     | 1080 | 1820         | 4304   | 40082   |
| Validation | 6     | 771  | 1232         | 2705   | 10069   |
| Test       | 6     | 902  | 1480         | 3373   | 10737   |
| All        | 6     | 1080 | 1820         | 4304   | 60888   |

**For Generative Task:**

|            | class | fold | super-family | family | protein |
| ---        | ---   | ---  | ---          | ---    |  ---    |
| Training   | 6     | 870  | 1367         | 3022   | 39979   |
| Validation | 6     | 131  | 276          | 678    | 10678   |
| Test       | 6     | 152  | 259          | 662    | 10231   |
| All        | 6     | 1080 | 1820         | 4304   | 60888   |

### Load the data

In [2]:
# Build the training, validation and test loaders using the loader's defaults.
# Per the output printed below, this run used the SCOPe_debug database, the
# Discriminative task and a batch size of 16.
train_set, vali_set, test_set = Dataloader()
# An explicit call naming those settings, kept for reference:
#train_set, vali_set, test_set = Dataloader(database = 'SCOPe_debug',
#                                           path = '../Datasets/SCOPe/',
#                                           task = 'Discriminative',
#                                           batch_size = 16,)

The database SCOPe_debug has already been downloaded.

Database: SCOPe_debug
Task: Discriminative
Shuffle: True False False
training: 55 samples
validation: 55 samples
test: 59 samples
Batch size: 16


### Define the model

In [3]:
# Model for the discriminative task: graph-level embedding network, created with CUDA disabled.
model_dis = graph_networks.GraphLevelEmbedding(CUDA=False)

In [4]:
# Model for the generative task: VAE container, created with CUDA disabled.
model_gen = graph_networks_gen.VAE_Container(CUDA=False)

### Train the model

**Discriminative**

In [5]:
# Train the discriminative model for three epochs; the validation and test
# loaders are passed in so their accuracy is reported next to the training
# accuracy in the per-epoch log.
model, optimizer, results_all = model_helper.discriminative_train(
    train_set,
    model_dis,
    num_epochs=3,
    val_dataset=vali_set,
    test_dataset=test_set,
)

Epoch:  1
Average loss: 6.992451
Training accuracy: 0.0182
Validation accuracy: 0.0000
Test accuracy: 0.0000
Training time for Epoch 1: 0.7060 s
Total time for Epoch 1: 1.3040 s
Epoch:  2
Average loss: 6.500510
Training accuracy: 0.0182
Validation accuracy: 0.0000
Test accuracy: 0.0000
Training time for Epoch 2: 0.5852 s
Total time for Epoch 2: 1.2051 s
Epoch:  3
Average loss: 4.776617
Training accuracy: 0.0727
Validation accuracy: 0.0000
Test accuracy: 0.0000
Training time for Epoch 3: 0.6079 s
Total time for Epoch 3: 1.1639 s
Best training result: 0.0727 (epoch 3)
Best validation result: 0.0000 (epoch 0)
Best test result: 0.0000 (epoch 0)


**Generative**

In [5]:
# Train the generative model for 5 epochs (Epoch_NUM = 5); the two return
# values are bound to model_gen and optimizer_gen.
# NOTE(review): train_set was loaded with "Task: Discriminative" (see the
# loader output above), while the tables at the top list a separate split for
# the generative task -- confirm this is the intended training data.
model_gen, optimizer_gen = model_helper.VAE_training(model_gen, train_set, Epoch_NUM = 5)

Epoch 1:
Average-Loss: 2.9803	Average-CE: 2.9802	Average-KLD: 0.3185
Training time: 5.8487s
Epoch 2:
Average-Loss: 2.7985	Average-CE: 2.7980	Average-KLD: 0.3198
Training time: 6.9002s
Epoch 3:
Average-Loss: 2.6635	Average-CE: 2.6625	Average-KLD: 0.3260
Training time: 7.0257s
Epoch 4:
Average-Loss: 2.5690	Average-CE: 2.5676	Average-KLD: 0.3397
Training time: 6.6755s
Epoch 5:
Average-Loss: 2.4864	Average-CE: 2.4846	Average-KLD: 0.3593
Training time: 7.5626s


### Evaluation

In [15]:
import evaluation
import numpy as np
import torch

In [22]:
# Evaluate the generative model on the training split.
ele_all, seq_all, iden_list, ppl_list = evaluation.Gen_evaluation(
    model_gen,
    train_set,
)

# Mean of the returned perplexity values, as a plain Python float.
mean_perplexity = float(torch.mean(torch.Tensor(ppl_list)))
print('The perplexity is %.4f.' % mean_perplexity)

# Mean of the returned sequence-identity values.
mean_identity = np.mean(iden_list)
print('The average sequence identity is %.4f.' % mean_identity)

# Show at most the first ten generated sequences, one per line.
print()
print('Examples of the generated sequences:')
for generated_seq in seq_all[:10]:
    print(generated_seq)

The perplexity is 13.7019.
The average sequence identity is 0.0393.

Examples of the generated sequences:
AMASRQQQRVVSVVLLLSLVVSSSSLVSVVVVSVSSLVLVVLVLQNELL
KCKSLSEDPKPYYYANYNY
LLLSSAYGENGGGHLR
NVVVSSSVSSSHSSVSSVHHVHHVSSHVHVVVVHSVNQPPAA
EKLAHHHAALINLSLSKLKLSLKKSLSKKKKKKKSSSSSSKSSLSKSA
QLQRLQQQQQRRRQRRRQLRRQRQRRRLRRRRRLQKDVDKVKKDDDKVVDVKVVVKVVVVVKVDKVVDDVDLDKQQQDDDDQDNTAE
KEKKPTTLA
AHHAAAFFPREREPAAPAPALRLLLKD
HHTEVQREREREQEEQRRERQRRQQQERQQEQREQRQQEQLA
LLLNEPSSSGGKLVTK


In [20]:
# Evaluate the generative model on the test split.
ele_all, seq_all, iden_list, ppl_list = evaluation.Gen_evaluation(
    model_gen,
    test_set,
)

# Mean of the returned perplexity values, as a plain Python float.
mean_perplexity = float(torch.mean(torch.Tensor(ppl_list)))
print('The perplexity is %.4f.' % mean_perplexity)

# Mean of the returned sequence-identity values.
mean_identity = np.mean(iden_list)
print('The average sequence identity is %.4f.' % mean_identity)

# Show at most the first ten generated sequences, one per line.
print()
print('Examples of the generated sequences:')
for generated_seq in seq_all[:10]:
    print(generated_seq)

The perplexity is 13.6997.
The average sequence identity is 0.0437.

Examples of the generated sequences:
RLRKGGKLDDNLLLDNLNDNLDNNLDNNDLNDLNLNNDDNNLTKKKNHQQQHHHA
ENEANNENREERREHYYLAHKAKHHAHHAAKHAHAHKAHHHKAKKKHAHHKHKLDALKK
NPQKKQKKRPRPPPRLKKLKLLLLPPVKKPPKPK
LNLNTTCNNCDDDDCLPQSPAKHHAVAVATAVTTAAATTVAAAVAVVTVATVTTAVVTTLLLK
KKAASSSASSSKKYYYNLLTTLLLLLSTTLLLLTTSTLTSTSLTTSSTSSLLANNNKPANRN
YYYYYYYYNNKNKYKNYYNKYNKYYNYKKNYNKYYAYKKFPGGGSLSLK
TTTSSYEPPPLLGLPPPLGPGPGGPLPPGGGLLPLLLPGPGLPLLLAAKRRR
QQQAPPPLPLPLPLLCPCKRRKKMMKKAAAMAMMMMAKMMKAAKMKMKKKAMAMKKVVHPPPTETLKP
FLFFFLLLALLPPPLGAASSSSSSSK
VTTVKVKTKVTKKKKTTVKVTTKTKTTTTTTKVVKKPKKGGTFCQAAALL


## Antibody Design

### Database: SAbDab, AbDb, CoV3D

**For Pretraining:**

|            | cluster | pdb  | complex |
| ---        | ---     | ---  | ---     | 
| Training   | 791     | 1189 | 2083    |
| Validation | 179     | 255  | 427     |
| Test       | 169     | 312  | 511     |   

**For Finetuning:**

|            | cluster | pdb  | complex |
| ---        | ---     | ---  | ---     | 
| Training   | 103     | 136 | 237    |
| Validation | 27     | 47  | 77    |
| Test       | 22     | 32  | 62     |   

In [10]:
import networks
import training_helper
import evaluation_helper

In [4]:
# Pickle file passed to the training call and to the first evaluation call below.
data_path = '../Datasets/Antibody/LoaderData_EGCN_Pretrain_Training_small.pickle'

# CDR generator model, created with CUDA disabled.
model_anti = networks.CDR_Generator(USE_CUDA=False)

In [5]:
# Train the CDR generator on the data at data_path, with Epoch_end=3 and
# batches of two, CUDA disabled.
# Note: this rebinds `optimizer`, which the discriminative training cell
# above also assigned.
model_anti, optimizer = training_helper.model_training(
    model_anti,
    data_path,
    Epoch_end=3,
    batch_size=2,
    USE_CUDA=False,
)

Data loading...
4 clusters and 10 samples loaded.
Training...
Epoch 1:
tensor(2.9997, grad_fn=<NllLossBackward0>)
tensor(2.7646, grad_fn=<NllLossBackward0>)
Average-Loss: 2.8821  Training time: 253.0475s
Epoch 2:
tensor(2.6835, grad_fn=<NllLossBackward0>)
tensor(2.7116, grad_fn=<NllLossBackward0>)
Average-Loss: 2.6975  Training time: 250.1725s
Epoch 3:
tensor(2.7010, grad_fn=<NllLossBackward0>)
tensor(2.6060, grad_fn=<NllLossBackward0>)
Average-Loss: 2.6535  Training time: 255.3047s
Training (3 epochs and 6 iterations) completed!


In [11]:
# Evaluate the trained CDR generator on the same file it was trained on
# (data_path), in batches of two with CUDA disabled.
seq_dict, result_dict = evaluation_helper.model_evaluation(
    model_anti,
    data_path,
    batch_size=2,
    USE_CUDA=False,
)

Data loading...
10 sample loaded.
Evaluating...
Epoch 1:
CDR seq: [['WSLQSFTQ', 'CTMQAFSG', 'YRSNKRTLEMGYSTNW', 'SPRGTGTGR', '', 'VLWKAEVPTS'], ['HYSDVSLA', 'LYAQAKLS', 'NDTLGRGVSNPQFRAY', 'QIRCALGWD', '', 'CYYRDEDYYY']]
Cross_entropy: 2.6960 	Perplexity: 14.8205 	AAR: 0.1208
CDR seq: [['KFHCDYTP', '', 'HLYFLSSYYMHQDQ', 'MIQRTIYTH', 'YEG', ''], ['', '', '', 'WPMRTRGLKAVR', 'FMS', 'TKWCYYTN']]
Cross_entropy: 2.6410 	Perplexity: 14.0273 	AAR: 0.0859
CDR seq: [['', 'KRSKYSDL', 'WTLYGRNK', 'FTGAHFFNRWKT', 'LTK', 'LDRMYWFA'], ['', '', 'FQWGRTKGRIYN', 'SGQFPYIISYNI', 'MRR', 'LCWKRQRK']]
Cross_entropy: 2.6188 	Perplexity: 13.7198 	AAR: 0.0324
CDR seq: [['', '', 'RPWEFQWSQVDT', 'QYATECTSEYSC', 'SLM', 'LSAWEGIA'], ['', '', 'AENQNMTLTCNW', 'TLVRIGESWNMA', 'DTA', 'RNLNQHDN']]
Cross_entropy: 2.5933 	Perplexity: 13.3733 	AAR: 0.0312
CDR seq: [['', '', 'WDQIPSRIVMYF', 'KKTGSQYVEKQS', 'YTR', 'MTYLVYQH'], ['', '', 'RDWTWMTKFATS', 'KEENNGKVNYSV', 'NNF', 'RLGWDSKC']]
Cross_entropy: 2.5929 	Perplexity: 1

In [9]:
# Evaluate the trained CDR generator on the separate test pickle, in batches
# of two with CUDA disabled.
test_path = '../Datasets/Antibody/LoaderData_EGCN_Pretrain_Test_small.pickle'
seq_dict, result_dict = evaluation_helper.model_evaluation(
    model_anti,
    test_path,
    batch_size=2,
    USE_CUDA=False,
)

Data loading...
10 sample loaded.
Evaluating...
Epoch 1:
CDR seq: [['SNSSSKVL', 'LDSATTM', 'AYGGVLHAWLPTLYDKEDSHA', 'PRQYGNHRRCLS', '', 'ANYRCSAMC'], ['GSELTQTL', 'NQHYQRAW', 'ASTSALCDTV', 'WPKSR', 'VIM', 'HRQGQMCLGGL']]
Cross_entropy: 2.8677 	Perplexity: 17.5969 	AAR: 0.0827
CDR seq: [['GMRPWNKL', 'YYFAKDSD', 'KRWQYYWYHS', 'SATTS', 'SGN', 'VESPACSRYLN'], ['PTYRWQDKVM', 'AENKTYILR', 'MRKPDWLAKLSYVQII', 'IDLQSRN', '', 'NSLPGYSSA']]
Cross_entropy: 2.7381 	Perplexity: 15.4582 	AAR: 0.1159
CDR seq: [['NSAWVHVENG', 'SRGIKGWTS', 'PMFKTPKWRVVPSRST', 'CMMQWSW', '', 'KMVNSYMLT'], ['', 'KYKYWFHK', 'TQRLSYRGEAIIQLSQYN', 'FSDDES', 'GWN', 'NCYLLSYGYFD']]
Cross_entropy: 2.8742 	Perplexity: 17.7120 	AAR: 0.1052
CDR seq: [['', 'RTNWRMAY', 'ANVFVRCMYLRMSCYVTC', 'YTRYNS', 'DLL', 'IQLSGYTCI'], ['', 'RYQPMKGQ', 'AWYACFNMCVLLKAWQPD', 'DHAHPF', 'YMR', 'KSRHISKVT']]
Cross_entropy: 2.8385 	Perplexity: 17.0902 	AAR: 0.0380
CDR seq: [['RYGETESP', '', 'HEHMFYYWTTVLS', 'MSLYDKYRSA', '', 'SYAYKRSAA'], ['THKNSYLL',