# Training

In [38]:
import pandas as pd
import numpy as np
import pickle
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, average_precision_score, mean_squared_error, r2_score


import xgboost as xgb

pd.set_option('display.max_columns', None)

In [39]:
class CFG:
    FOLDS = 5
    PROTEIN = ['BRD4', 'HSA', 'sEH']
    target = ['binds']
    eval_metric = 'rmse'
    # standard = True
    standard = False
    
    output_path = Path('../../models/xgb_2_dict.pickle')
    # is_output = True
    is_output = False

Load

In [40]:
df = pd.read_pickle('../../data/processed/ecfp_60000_50per.pkl')
df.head()

Unnamed: 0,id,buildingblock1_smiles,buildingblock2_smiles,buildingblock3_smiles,molecule_smiles,protein_name,binds,molecule,ecfp,protein_BRD4,protein_HSA,protein_sEH,fold,any_binds
0,0,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Br.Br.NCC1CCCN1c1cccnn1,C#CCOc1ccc(CNc2nc(NCC3CCCN3c3cccnn3)nc(N[C@@H]...,BRD4,0,<rdkit.Chem.rdchem.Mol object at 0x712c055ca390>,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,0,0,1,0
1,1,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Br.Br.NCC1CCCN1c1cccnn1,C#CCOc1ccc(CNc2nc(NCC3CCCN3c3cccnn3)nc(N[C@@H]...,HSA,0,<rdkit.Chem.rdchem.Mol object at 0x712c047f2340>,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,1,0,1,0
2,2,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Br.Br.NCC1CCCN1c1cccnn1,C#CCOc1ccc(CNc2nc(NCC3CCCN3c3cccnn3)nc(N[C@@H]...,sEH,0,<rdkit.Chem.rdchem.Mol object at 0x712c047f22f0>,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,1,4,0
3,3,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Br.NCc1cccc(Br)n1,C#CCOc1ccc(CNc2nc(NCc3cccc(Br)n3)nc(N[C@@H](CC...,BRD4,0,<rdkit.Chem.rdchem.Mol object at 0x712c25de9620>,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,0,0,2,0
4,4,C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21,C#CCOc1ccc(CN)cc1.Cl,Br.NCc1cccc(Br)n1,C#CCOc1ccc(CNc2nc(NCc3cccc(Br)n3)nc(N[C@@H](CC...,HSA,0,<rdkit.Chem.rdchem.Mol object at 0x712c047f22a0>,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,1,0,0,0


In [41]:
df_BRD4	= df[df['protein_BRD4']==1]
df_HSA	= df[df['protein_HSA']==1]
df_sEH	= df[df['protein_sEH']==1]

train_df_dict = dict(zip(CFG.PROTEIN, [df_BRD4, df_HSA, df_sEH]))

print(f'shape BRD4: {df_BRD4.shape}')
print(f'shape HSA: {df_HSA.shape}')
print(f'shape sEH: {df_sEH.shape}')

shape BRD4: (20617, 14)
shape HSA: (21913, 14)
shape sEH: (17470, 14)


# Train

In [47]:
def train_one_fold(val_fold, train_df):
    # X = pd.DataFrame([ecfp for ecfp in train_df['ecfp'].tolist()], dtype = np.uint8)
    X_train = pd.DataFrame(train_df[train_df['fold']!=val_fold]['ecfp'].apply(pd.Series), 
                     dtype=np.uint8)
    X_val = pd.DataFrame(train_df[train_df['fold']==val_fold]['ecfp'].apply(pd.Series), 
                     dtype=np.uint8)
    # display(X_train.head())
    y_train = train_df[train_df['fold']!=val_fold][CFG.target].astype(np.uint8)
    y_val = train_df[train_df['fold']==val_fold][CFG.target].astype(np.uint8)
    # display(y_val.head())
    # display(y_val.tail())
    # print(type(y_val))
    # display(y_val['binds'].value_counts())
    
    # Standardization
    if CFG.standard:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_val = scaler.transform(X_val)
        
    # display(X_train[:5])
    # return
    
    # model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    model = xgb.XGBRegressor(
        eval_metric=CFG.eval_metric
    )
    model.fit(X_train, y_train, eval_set=[(X_train, y_train)], verbose=10)
    
    # Prediction
    y_pred = model.predict(X_val)
    y_pred = np.maximum(y_pred, 0)
    display(y_pred[:20])
    
    # Evaluation
    # display(y_val)
    mse = mean_squared_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    print(f'Mean Squared Error: {mse}')
    print(f'R-squared: {r2}')
    
    return model, r2

def train_xgb():
    best_models = {}
    for protein in CFG.PROTEIN:
        print(f'protein : {protein}')
        train_df = train_df_dict[protein]
        models = []
        scores = []
        for i in range(CFG.FOLDS):
            print(f'Fold : {i}')
            XModel, map_score = train_one_fold(i, train_df)
            models.append(XModel)
            scores.append(map_score)
        best_index = scores.index(max(scores)) # if same value, choose first one
        best_models[protein] = (models[best_index], f'{CFG.eval_metric} : {scores[best_index]}')
        
    return best_models
        

In [48]:
train_one_fold(0, df_BRD4)

[0]	validation_0-rmse:0.36803
[10]	validation_0-rmse:0.07784
[20]	validation_0-rmse:0.06191
[30]	validation_0-rmse:0.05520
[40]	validation_0-rmse:0.04888
[50]	validation_0-rmse:0.04535
[60]	validation_0-rmse:0.04183
[70]	validation_0-rmse:0.03873
[80]	validation_0-rmse:0.03682
[90]	validation_0-rmse:0.03441
[99]	validation_0-rmse:0.03319


array([0.        , 0.        , 0.        , 0.00180161, 0.00445913,
       0.        , 0.        , 0.        , 0.00173588, 0.        ,
       0.00091873, 0.00199387, 0.        , 0.        , 0.02866195,
       0.        , 0.        , 0.00041463, 0.00209314, 0.0034024 ],
      dtype=float32)

Mean Squared Error: 0.0060688164085149765
R-squared: 0.9756597955401197


(XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='rmse', feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...),
 0.9756597955401197)

In [44]:
best_models = train_xgb()
display(best_models)

protein : BRD4
Fold : 0
[0]	validation_0-rmse:0.36803
[10]	validation_0-rmse:0.07784
[20]	validation_0-rmse:0.06191
[30]	validation_0-rmse:0.05520
[40]	validation_0-rmse:0.04888
[50]	validation_0-rmse:0.04535
[60]	validation_0-rmse:0.04183
[70]	validation_0-rmse:0.03873
[80]	validation_0-rmse:0.03682
[90]	validation_0-rmse:0.03441
[99]	validation_0-rmse:0.03319
Mean Squared Error: 0.0060688164085149765
R-squared: 0.9756597955401197
Fold : 1
[0]	validation_0-rmse:0.36731
[10]	validation_0-rmse:0.07553
[20]	validation_0-rmse:0.06095
[30]	validation_0-rmse:0.05380
[40]	validation_0-rmse:0.04873
[50]	validation_0-rmse:0.04446
[60]	validation_0-rmse:0.04115
[70]	validation_0-rmse:0.03830
[80]	validation_0-rmse:0.03561
[90]	validation_0-rmse:0.03339
[99]	validation_0-rmse:0.03206
Mean Squared Error: 0.006703459192067385
R-squared: 0.9731661811398014
Fold : 2
[0]	validation_0-rmse:0.36865
[10]	validation_0-rmse:0.07420
[20]	validation_0-rmse:0.06040
[30]	validation_0-rmse:0.05303
[40]	validat

{'BRD4': (XGBRegressor(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric='rmse', feature_types=None,
               gamma=None, grow_policy=None, importance_type=None,
               interaction_constraints=None, learning_rate=None, max_bin=None,
               max_cat_threshold=None, max_cat_to_onehot=None,
               max_delta_step=None, max_depth=None, max_leaves=None,
               min_child_weight=None, missing=nan, monotone_constraints=None,
               multi_strategy=None, n_estimators=None, n_jobs=None,
               num_parallel_tree=None, random_state=None, ...),
  'rmse : 0.9810802068031322'),
 'HSA': (XGBRegressor(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, device=None, early_

In [45]:
if CFG.is_output:
    with open(CFG.output_path, 'wb') as f:
        pickle.dump(best_models, f)