# IRAK4 IC50 prediction using Linear Regression 

### Import

In [1]:
import pandas as pd
import numpy as np
np.set_printoptions(threshold=np.inf, linewidth=99)
import os
import random

from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
def pIC50_to_IC50(pic50_values):
    """Convert pIC50 values to IC50 (nM)."""
    return 10 ** (9 - pic50_values)

In [75]:
CFG = {
    'NBITS':8192,
    'SEED':42,
}

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(CFG['SEED']) # Seed 고정

### DataLoad

In [76]:
# SMILES 데이터를 분자 지문으로 변환
def smiles_to_fingerprint(smiles):
    # 먼저 SMILES 데이터로 부터 구조식 데이터를 뽑고,
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        # 구조식으로부터 2048개의 0과 1로 이루어진 분자지문 데이터를 객체를 얻는다. 
        # 이 때, 1은 어떤 분자구조의 존재를 의미하며 0과 1의 개수는 nBits에 따라 달라진다. 
        # bitInfo 에 dictionary를 할당하면 어느 인덱스에 지문이 찍혔는지 알 수 있으며,
        # radius는 단일 맵핑 특성 생성에 사용되는 원자들의 반경,
        # nBits는 분자구조 맵핑을 위한 경로들의 고유조합수를 뜻한다.
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=8, nBits=CFG['NBITS'])
        # 객체를 배열로 변환해서 리턴
        return np.array(fp)
    else:
        # 구조식이 데이터가 없는 경우 2048개의 0을 리턴
        return np.zeros((CFG['NBITS'],))

In [5]:
# 학습 ChEMBL 데이터 로드
chembl_data = pd.read_csv('train.csv')  # 예시 파일 이름
chembl_data.head()

Unnamed: 0,Molecule ChEMBL ID,Standard Type,Standard Relation,Standard Value,Standard Units,pChEMBL Value,Assay ChEMBL ID,Target ChEMBL ID,Target Name,Target Organism,Target Type,Document ChEMBL ID,IC50_nM,pIC50,Smiles
0,CHEMBL4443947,IC50,'=',0.022,nM,10.66,CHEMBL4361896,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4359855,0.022,10.66,CN[C@@H](C)C(=O)N[C@H](C(=O)N1C[C@@H](NC(=O)CC...
1,CHEMBL4556091,IC50,'=',0.026,nM,10.59,CHEMBL4345131,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4342485,0.026,10.59,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...
2,CHEMBL4566431,IC50,'=',0.078,nM,10.11,CHEMBL4345131,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4342485,0.078,10.11,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...
3,CHEMBL4545898,IC50,'=',0.081,nM,10.09,CHEMBL4345131,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4342485,0.081,10.09,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...
4,CHEMBL4448950,IC50,'=',0.099,nM,10.0,CHEMBL4361896,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4359855,0.099,10.0,COc1cc2c(OC[C@@H]3CCC(=O)N3)ncc(C#CCCCCCCCCCCC...


### Data Pre-processing

In [90]:
train = chembl_data[['Smiles', 'pIC50']]
train['Fingerprint'] = train['Smiles'].apply(smiles_to_fingerprint)

# 훈련데이터 X는 위 함수에서 얻은 분자지문 값, Y는 각각에 상응하는 pIC50 값으로 한다
train_x = np.stack(train['Fingerprint'].values)
train_y = train['pIC50'].values

# 학습 및 검증 데이터 분리
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


### Train & Validation

In [91]:
# 선형회귀로 모델 학습
model = LinearRegression(fit_intercept=True)
model.fit(train_x, train_y)

LinearRegression()

In [93]:
# 선형회귀로 모델 학습
model = LinearRegression(fit_intercept=False)
model.fit(train_x, train_y)

LinearRegression(fit_intercept=False)

In [94]:
# train 데이터로부터의 학습 모델 평가

# pIC50 값 예측
train_y_pred = model.predict(train_x)
# 정답 IC50 값과 예측된 IC50 값을 비교하여 평균 제곱 오차값 산출 
mse = mean_squared_error(pIC50_to_IC50(train_y), pIC50_to_IC50(train_y_pred))
# 평균 제곱 오차값을 평균제곱근 오차값으로 변환
rmse = np.sqrt(mse)

print(f'RMSE: {rmse}')

RMSE: 629.0151349359294


In [95]:
for i in range(10):
    print(pIC50_to_IC50(train_y[i]), pIC50_to_IC50(train_y_pred[i]))

4.677351412871981 4.680455127613524
645.6542290346549 715.862789173057
158.48931924611142 149.1705759342722
13.182567385564074 19.15793223778833
8317.63771102671 6115.584739617485
4897.788193684466 24470.416278862103
1202.2644346174131 1204.1718669183322
3.3113112148259076 2.815097286774781
43.65158322401656 44.61126929155627
1.9952623149688828 11.438791051413453


In [96]:
# # 훈련데이터와 그 예측값(pIC50)과의 상관관계(결정계수) 
r2_score(train_y, train_y_pred)

0.9886833432429178

In [97]:
val_y_pred = model.predict(val_x)
val_y_pred

array([ 7.24827576e+00, -8.42743817e+09,  8.45227682e+10,  1.48821059e+10,  5.51169563e+10,
        2.56748031e+10, -3.69424273e+10, -1.56638935e+11, -1.68140102e+10,  2.01368275e+10,
       -2.58205013e+10,  5.49189595e+10,  8.40505678e+10,  2.66993555e+10,  3.94692412e+10,
       -2.48400318e+09, -3.99434708e+10,  1.27214636e+11, -1.67993680e+10,  8.43876598e+10,
       -4.38957364e+11, -9.47499212e+07,  8.43838490e+10,  2.25332418e+09, -7.88722773e+10,
       -7.26796588e+10, -1.81473340e+11, -3.88901363e+10,  1.56099655e+10,  1.79063535e+10,
       -4.48637417e+10,  8.70947266e+00, -2.98863665e+11,  3.83951456e+09, -6.46716913e+10,
        8.59851074e+00, -8.59054790e+10,  7.07769120e+10,  4.89372445e+10, -1.82443085e+10,
        9.86353444e+10, -2.92887980e+10,  2.86794006e+09, -1.40747669e+10,  5.27272048e+11,
        4.14498658e+11, -1.05887696e+11,  1.63545908e+11, -2.19100038e+10,  1.02075733e+11,
        2.64167277e+10,  1.60215392e+10,  1.89989015e+11,  1.88224962e+11,  2.69

In [98]:
# Validation 데이터로부터의 학습 모델 평가

# pIC50 값 예측
val_y_pred = model.predict(val_x)
# 정답 IC50 값과 예측된 IC50 값을 비교하여 평균 제곱 오차값 산출 
mse = mean_squared_error(pIC50_to_IC50(val_y), pIC50_to_IC50(val_y_pred))
# 평균 제곱 오차값을 평균제곱근 오차값으로 변환
rmse = np.sqrt(mse)

print(f'RMSE: {rmse}')

  This is separate from the ipykernel package so we can avoid doing imports until


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [99]:
for i in range(10):
    print(pIC50_to_IC50(val_y[i]), pIC50_to_IC50(val_y_pred[i]))

537.0317963702532 56.45783799154333
151.3561248436207 inf
6456.54229034655 0.0
446.6835921509635 0.0
2.691534803926914 0.0
3.6307805477010175 0.0
213.79620895022325 inf
5.248074602497733 inf
22.90867652767775 inf
2.691534803926914 0.0


  This is separate from the ipykernel package so we can avoid doing imports until


In [100]:
# val 데이터로 결정계수 확인
r2_score(val_y, val_y_pred)

-1.2362479716062833e+22

### Inference

In [30]:
# test 데이터로부터의 학습 모델 평가
test = pd.read_csv('./test.csv')
test['Fingerprint'] = test['Smiles'].apply(smiles_to_fingerprint)

test_x = np.stack(test['Fingerprint'].values)

# pIC50 값 예측
test_y_pred = model.predict(test_x)

np.set_printoptions(threshold=np.inf, linewidth=np.inf)
pIC50_to_IC50(test_y_pred)[:10]

array([181.96170602,  31.64219967,  10.78052721,  21.3766674 ,  25.31278906,   9.00616709,  26.86828832,  25.04378692,  35.17156285, 180.01139604])

### Submission

In [31]:
submit = pd.read_csv('./sample_submission.csv')
submit['IC50_nM'] = pIC50_to_IC50(test_y_pred)
submit.head()

Unnamed: 0,ID,IC50_nM
0,TEST_000,181.961706
1,TEST_001,31.6422
2,TEST_002,10.780527
3,TEST_003,21.376667
4,TEST_004,25.312789


In [32]:
submit.to_csv('./baseline_submit.csv', index=False)