# IRAK4 IC50 prediction using MLP Regression 

### Import

In [1]:
import pandas as pd
import numpy as np
np.set_printoptions(threshold=np.inf, linewidth=99)
import os
import random

from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.linear_model import LinearRegression
# import xgboost
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
def pIC50_to_IC50(pic50_values):
    """Convert pIC50 values to IC50 (nM)."""
    return 10 ** (9 - pic50_values)

In [3]:
CFG = {
    'NBITS':2048,
    'SEED':42,
}

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(CFG['SEED']) # Seed 고정

### DataLoad

In [5]:
# SMILES 데이터를 분자 지문으로 변환
def smiles_to_fingerprint(smiles):
    # 먼저 SMILES 데이터로 부터 구조식 데이터를 뽑고,
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        # 구조식으로부터 2048개의 0과 1로 이루어진 분자지문 데이터를 객체를 얻는다. 
        # 이 때, 1은 어떤 분자구조의 존재를 의미하며 0과 1의 개수는 nBits에 따라 달라진다. 
        # bitInfo 에 dictionary를 할당하면 어느 인덱스에 지문이 찍혔는지 알 수 있으며,
        # radius는 단일 맵핑 특성 생성에 사용되는 원자들의 반경,
        # nBits는 분자구조 맵핑을 위한 경로들의 고유조합수를 뜻한다.
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=CFG['NBITS'])
        # 객체를 배열로 변환해서 리턴
        return np.array(fp)
    else:
        # 구조식이 데이터가 없는 경우 2048개의 0을 리턴
        return np.zeros((CFG['NBITS'],))

In [6]:
# 학습 ChEMBL 데이터 로드
chembl_data = pd.read_csv('train.csv')  # 예시 파일 이름
chembl_data.head()

Unnamed: 0,Molecule ChEMBL ID,Standard Type,Standard Relation,Standard Value,Standard Units,pChEMBL Value,Assay ChEMBL ID,Target ChEMBL ID,Target Name,Target Organism,Target Type,Document ChEMBL ID,IC50_nM,pIC50,Smiles
0,CHEMBL4443947,IC50,'=',0.022,nM,10.66,CHEMBL4361896,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4359855,0.022,10.66,CN[C@@H](C)C(=O)N[C@H](C(=O)N1C[C@@H](NC(=O)CC...
1,CHEMBL4556091,IC50,'=',0.026,nM,10.59,CHEMBL4345131,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4342485,0.026,10.59,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...
2,CHEMBL4566431,IC50,'=',0.078,nM,10.11,CHEMBL4345131,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4342485,0.078,10.11,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...
3,CHEMBL4545898,IC50,'=',0.081,nM,10.09,CHEMBL4345131,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4342485,0.081,10.09,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...
4,CHEMBL4448950,IC50,'=',0.099,nM,10.0,CHEMBL4361896,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4359855,0.099,10.0,COc1cc2c(OC[C@@H]3CCC(=O)N3)ncc(C#CCCCCCCCCCCC...


### Data Pre-processing

In [7]:
train = chembl_data[['Smiles', 'pIC50']]
train['Fingerprint'] = train['Smiles'].apply(smiles_to_fingerprint)

# 훈련데이터 X는 위 함수에서 얻은 분자지문 값, Y는 각각에 상응하는 pIC50 값으로 한다
train_x = np.stack(train['Fingerprint'].values)
train_y = train['pIC50'].values

# 학습 및 검증 데이터 분리
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.3, random_state=42)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


### Train & Validation

In [232]:
for i in range(1, 400):
    print(pow(i, 0.3))

1.0
1.2311444133449163
1.3903891703159093
1.515716566510398
1.6206565966927624
1.711769859409705
1.792789962520997
1.8660659830736148
1.9331820449317627
1.9952623149688795
2.053136413658844
2.107435899344471
2.1586538444215795
2.2071833466585673
2.2533433808426553
2.2973967099940698
2.3395626336814512
2.3800262745964407
2.4189454814875875
2.4564560522315806
2.4926757485402593
2.5277074255111027
2.5616415021458128
2.594557933960465
2.626527804403767
2.657614620905154
2.6878753795222865
2.717361446466631
2.7461192933624785
2.774191114672181
2.8016153494371836
2.82842712474619
2.854658634732502
2.8803394661274377
2.9054968792442244
2.930156051583521
2.954340289925338
2.978071215719372
3.001368927744568
3.0242521453322184
3.046738334899583
3.068843822095698
3.0905838914967028
3.1119728754884552
3.1330242337266654
3.1537506243592963
3.174163968024659
3.194275505495155
3.2140958497160383
3.233635032886787
3.2529025491464565
3.2719073933511478
3.2906580963691905
3.3091627572662095
3.327429072

In [269]:
model = MLPRegressor(hidden_layer_sizes=(500,), max_iter=10000, verbose=True,
                     activation='relu', solver='adam', batch_size='auto', alpha=1,
                     learning_rate_init=0.0001, learning_rate='adaptive', power_t=0.3,
                     early_stopping=False, validation_fraction=0.1, tol=1e-4, n_iter_no_change=20)
model.fit(train_x, train_y)
# optimizer, early-stopping     learning_rate 적용기법 및 초기기울기      train   val
# sgd, early_stopping=True      constant lr_init=0.001                  0.771, 0.678
#                               invscaling lr_init=0.005 power_t=3      0.749, 0.667 
#                               adaptive lr_init=0.001                  0.802, 0.684
# adam, early_stopping=True     0.0001                                  0.852, 0.675
# adam, early_stopping=False    0.0001                                  0.973, 0.693
# sgd,  early_stopping=False    adaptive 0.001                          0.972, 0.692                                    

Iteration 1, loss = 30.21118109
Iteration 2, loss = 25.95725338
Iteration 3, loss = 22.15580247
Iteration 4, loss = 18.70373609
Iteration 5, loss = 15.57478840
Iteration 6, loss = 12.72006052
Iteration 7, loss = 10.19262309
Iteration 8, loss = 8.00210811
Iteration 9, loss = 6.20674378
Iteration 10, loss = 4.80926720
Iteration 11, loss = 3.79256462
Iteration 12, loss = 3.09613180
Iteration 13, loss = 2.67805288
Iteration 14, loss = 2.43118647
Iteration 15, loss = 2.29646767
Iteration 16, loss = 2.21731773
Iteration 17, loss = 2.16038022
Iteration 18, loss = 2.11041811
Iteration 19, loss = 2.06417503
Iteration 20, loss = 2.01968321
Iteration 21, loss = 1.97694074
Iteration 22, loss = 1.93697273
Iteration 23, loss = 1.89943245
Iteration 24, loss = 1.86304753
Iteration 25, loss = 1.82910969
Iteration 26, loss = 1.79610513
Iteration 27, loss = 1.76373201
Iteration 28, loss = 1.73354654
Iteration 29, loss = 1.70386316
Iteration 30, loss = 1.67554314
Iteration 31, loss = 1.64804334
Iteration 

MLPRegressor(alpha=1, hidden_layer_sizes=(500,), learning_rate='adaptive',
             learning_rate_init=0.0001, max_iter=10000, n_iter_no_change=20,
             power_t=0.3, verbose=True)

In [270]:
# train 데이터로부터의 학습 모델 평가

# pIC50 값 예측
train_y_pred = model.predict(train_x)
# 정답 IC50 값과 예측된 IC50 값을 비교하여 평균 제곱 오차값 산출 
mse = mean_squared_error(pIC50_to_IC50(train_y), pIC50_to_IC50(train_y_pred))
# 평균 제곱 오차값을 평균제곱근 오차값으로 변환
rmse = np.sqrt(mse)
print(f'RMSE: {rmse}')

# # 훈련데이터와 그 예측값(pIC50)과의 상관관계(결정계수) 
print("r2_score:", r2_score(train_y, train_y_pred))


RMSE: 1202.2482909861114
r2_score: 0.9728480752952361


In [271]:
# Validation 데이터로부터의 학습 모델 평가

# pIC50 값 예측
val_y_pred = model.predict(val_x)
# 정답 IC50 값과 예측된 IC50 값을 비교하여 평균 제곱 오차값 산출 
mse = mean_squared_error(pIC50_to_IC50(val_y), pIC50_to_IC50(val_y_pred))
# 평균 제곱 오차값을 평균제곱근 오차값으로 변환
rmse = np.sqrt(mse)
print(f'RMSE: {rmse}')

# val 데이터로 결정계수 확인
print("r2_score:", r2_score(val_y, val_y_pred))

RMSE: 2145.8979177036326
r2_score: 0.6928077097057763


### Inference

In [30]:
# test 데이터로부터의 학습 모델 평가
test = pd.read_csv('./test.csv')
test['Fingerprint'] = test['Smiles'].apply(smiles_to_fingerprint)

test_x = np.stack(test['Fingerprint'].values)

# pIC50 값 예측
test_y_pred = model.predict(test_x)

np.set_printoptions(threshold=np.inf, linewidth=np.inf)
pIC50_to_IC50(test_y_pred)[:10]

array([181.96170602,  31.64219967,  10.78052721,  21.3766674 ,  25.31278906,   9.00616709,  26.86828832,  25.04378692,  35.17156285, 180.01139604])

### Submission

In [31]:
submit = pd.read_csv('./sample_submission.csv')
submit['IC50_nM'] = pIC50_to_IC50(test_y_pred)
submit.head()

Unnamed: 0,ID,IC50_nM
0,TEST_000,181.961706
1,TEST_001,31.6422
2,TEST_002,10.780527
3,TEST_003,21.376667
4,TEST_004,25.312789


In [32]:
submit.to_csv('./baseline_submit.csv', index=False)