In [1]:
from integrated import generate_training_dataset_from_csv, generate_training_dataset_from_excel, antibody_cls, antibody_cls_from_BLADE, train_rf, test_rf, train_NN, test_NN

## generate training datasets from raw data

You may need to rename columns in the Excel or CSV file to avoid errors: the column holding the binder reads must be named "Bind", and the column holding the display reads must be named "Display".  <br>
If the experiment has four gates, the CSV file needs 'Bin1', 'Bin2', 'Bin3', 'Bin4' as the read-count column names.  <br>
Datasets designed for BLADE are also supported: the dataset should have "Observations" and "Response" columns, with the parent sequence in the first row. In that case, use the antibody_cls_from_BLADE function when creating the antibody objects.

In [2]:
# ALK7: one Excel workbook per chain; each workbook is processed three times,
# with sheet_name=0, 1, 2 (presumably sheet indices -- confirm against
# generate_training_dataset_from_excel), each writing to its own output path.
name_list = ["HC", "LC"]
for name in name_list:
    excel_path = "ALK7/{}.xlsx".format(name)
    for i in range(3):
        output_path = "ALK7/{}_{}_training_datasets".format(name, i)
        generate_training_dataset_from_excel(excel_path, "AAsequence", output_path, sheet_name=i)

# CCR8: one CSV file per library.
name_list = ['35A6_HC','35A6_LC','35B6_HC','35B6_LC']
for name in name_list:
    csv_path = "CCR8/{}.csv".format(name)
    output_path = "CCR8/{}_training_datasets".format(name)
    generate_training_dataset_from_csv(csv_path, "AAsequence", output_path)

# LTBR: same CSV helper, but the second argument is "Sequence" instead of
# "AAsequence" (presumably the name of the sequence column -- confirm).
name_list = ["LTBR_HC", "LTBR_LC"]
for name in name_list:
    csv_path = "LTBR/{}.csv".format(name)
    output_path = "LTBR/{}_training_datasets".format(name)
    generate_training_dataset_from_csv(csv_path, "Sequence", output_path)

## create antibody objects from training datasets

In [3]:
# Registry of antibody objects, keyed by the names used in the train/test
# lists further down the notebook.
antibody = {}

# ALK7: three datasets per chain, stored under an "ALK7_" key prefix.
name_list = ["HC_0", "HC_1", "HC_2", "LC_0", "LC_1", "LC_2"]
for name in name_list:
    key = 'ALK7_'+name
    dataset_path = "ALK7/{}_training_datasets".format(name)
    antibody[key] = antibody_cls(dataset_path, mutation_count_column='mutation_count', name=name)
    antibody[key].generate_training_array()

# CCR8: stored under a "CCR8_" key prefix; these objects are built with
# bin_num=4 (the notebook intro describes a four-gate layout with
# Bin1..Bin4 columns).
name_list = ['35A6_HC','35A6_LC','35B6_HC','35B6_LC']
for name in name_list:
    key = 'CCR8_'+name
    dataset_path = "CCR8/{}_training_datasets".format(name)
    antibody[key] = antibody_cls(dataset_path, mutation_count_column='mutation_count', bin_num=4, name=name)
    antibody[key].generate_training_array()

# LTBR: built with the BLADE-specific constructor and keyed by the bare name
# (no extra prefix).
name_list = ["LTBR_HC", "LTBR_LC"]
for name in name_list:
    dataset_path = "LTBR/{}_training_datasets".format(name)
    antibody[name] = antibody_cls_from_BLADE(dataset_path, name=name)
    antibody[name].generate_training_array()

## train random forest or neural network using antibody objects

### create lists of antibodies

In [4]:
# Training set: every ALK7 and CCR8 entry of the `antibody` dict.
# The two LTBR entries are kept out of this list and form the test set below.
train_name_list = [
    'ALK7_HC_0', 'ALK7_HC_1', 'ALK7_HC_2',
    'ALK7_LC_0', 'ALK7_LC_1', 'ALK7_LC_2',
    'CCR8_35A6_HC', 'CCR8_35A6_LC', 'CCR8_35B6_HC', 'CCR8_35B6_LC',
]
antibody_train_list = [antibody[train_key] for train_key in train_name_list]

# Test set: the LTBR heavy- and light-chain entries.
test_name_list = ["LTBR_HC", "LTBR_LC"]
antibody_test_list = [antibody[test_key] for test_key in test_name_list]

### train random forest without hyperparameter tuning
The function uses class weights and a max depth of 20

In [5]:
# Fit a random forest on the training antibodies using train_rf's own
# settings (no hyperparameter search); the returned classifier is kept in
# `clf` and evaluated in the test cell below.
clf = train_rf(antibody_train_list)

### train random forest with hyperparameter tuning

In [6]:
# Search grid handed to train_rf: the candidate values for max_depth.
parameters = {
    'max_depth': [10, 20, 50],
}

# Same training antibodies as the untuned model, this time with tuning enabled.
clf_ht = train_rf(
    antibody_train_list,
    parameters=parameters,
    tune_hyper=True,
)

### test random forest model

In [7]:
# Evaluate the untuned forest (`clf`) on the LTBR test antibodies, which were
# not part of the training list. Note that the tuned model `clf_ht` is not
# evaluated in this cell.
test_rf(clf, antibody_test_list)

F1_score for [lower affinity, maintain or increase] class: [0.92032222 0.26607145]


### train neural network with shuffle and tune the architecture

Instead of returning a classifier, this function stores everything in 'metrics_NN/', 'tensorboard_NN/', 'Checkpoints_NN/' and 'Models/' folders. 

(The following cell uses a single-element list, `[512]`, for the node count and `num_shuffle=1` to keep the computation time short.)

In [9]:
# One training run per candidate node count (only 512 here).
for N_nodes in [512]:
    # The record name hard-codes the learning rate and batch size; keep it in
    # sync with the lr / batch_size values below if either is changed.
    run_label = '{}_lr.003_batch10000'.format(N_nodes)
    # N_nodes is the middle entry of the architecture list (presumably the
    # hidden-layer width between a 1280-wide input and 2 outputs -- confirm
    # against train_NN).
    layer_sizes = [1280, N_nodes, 2]
    train_NN(
        antibody_list=antibody_train_list,
        NN_architecture=layer_sizes,
        lr=.003,
        batch_size=10000,
        num_shuffle=1,
        record_name=run_label,
    )

epoch 70, test_F1 = 0.6684
epoch 140, test_F1 = 0.6950
epoch 210, test_F1 = 0.6971
epoch 280, test_F1 = 0.6954


### test neural network model

Load the '512_lr.003_batch10000_r1' model and test it (the "_r1" suffix is generated by the first shuffle).

In [11]:
# Score the saved network from the 512-node run on the LTBR test antibodies.
# The file name is the training cell's record_name plus "_r1" and a ".pth"
# extension, so it must be updated if that record_name changes.
test_NN('512_lr.003_batch10000_r1.pth', antibody_test_list)

F1: 0.280
Precision: 0.232
Recall: 0.351
