### Import

In [1]:
import pandas as pd
import numpy as np
import os
import random

from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
CFG = {
    'NBITS':2048,
    'SEED':42,
}

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(CFG['SEED']) # Seed 고정

### DataLoad

In [4]:
# SMILES 데이터를 분자 지문으로 변환
def smiles_to_fingerprint(smiles):
    # 먼저 SMILES 데이터로 부터 구조식 데이터를 뽑고,
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        # 구조식으로부터 2048개의 0과 1로 이루어진 분자지문 데이터를 객체를 얻는다. 
        # 이 때, 1은 어떤 분자구조의 존재를 의미하며 0과 1의 개수는 nBits에 따라 달라진다. 
        # bitInfo 에 dictionary를 할당하면 어느 인덱스에 지문이 찍혔는지 알 수 있으며,
        # radius는 단일 맵핑 특성 생성에 사용되는 원자들의 반경,
        # nBits는 분자구조 맵핑을 위한 경로들의 고유조합수를 뜻한다.
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=CFG['NBITS'])
        # 객체를 배열로 변환해서 리턴
        return np.array(fp)
    else:
        # 구조식이 데이터가 없는 경우 2048개의 0을 리턴
        return np.zeros((CFG['NBITS'],))

In [5]:
# 학습 ChEMBL 데이터 로드
chembl_data = pd.read_csv('train.csv')  # 예시 파일 이름
chembl_data.head()

Unnamed: 0,Molecule ChEMBL ID,Standard Type,Standard Relation,Standard Value,Standard Units,pChEMBL Value,Assay ChEMBL ID,Target ChEMBL ID,Target Name,Target Organism,Target Type,Document ChEMBL ID,IC50_nM,pIC50,Smiles
0,CHEMBL4443947,IC50,'=',0.022,nM,10.66,CHEMBL4361896,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4359855,0.022,10.66,CN[C@@H](C)C(=O)N[C@H](C(=O)N1C[C@@H](NC(=O)CC...
1,CHEMBL4556091,IC50,'=',0.026,nM,10.59,CHEMBL4345131,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4342485,0.026,10.59,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...
2,CHEMBL4566431,IC50,'=',0.078,nM,10.11,CHEMBL4345131,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4342485,0.078,10.11,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...
3,CHEMBL4545898,IC50,'=',0.081,nM,10.09,CHEMBL4345131,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4342485,0.081,10.09,CC(C)(O)[C@H](F)CN1Cc2cc(NC(=O)c3cnn4cccnc34)c...
4,CHEMBL4448950,IC50,'=',0.099,nM,10.0,CHEMBL4361896,CHEMBL3778,Interleukin-1 receptor-associated kinase 4,Homo sapiens,SINGLE PROTEIN,CHEMBL4359855,0.099,10.0,COc1cc2c(OC[C@@H]3CCC(=O)N3)ncc(C#CCCCCCCCCCCC...


### Data Pre-processing

In [6]:
train = chembl_data[['Smiles', 'pIC50']]
train['Fingerprint'] = train['Smiles'].apply(smiles_to_fingerprint)

# 훈련데이터 X는 위 함수에서 얻은 분자지문 값, Y는 각각에 상응하는 pIC50 값으로 한다
train_x = np.stack(train['Fingerprint'].values)
train_y = train['pIC50'].values

# 학습 및 검증 데이터 분리
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.3, random_state=42)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


### Train & Validation

In [7]:
# 랜덤 포레스트 모델 학습
model = RandomForestRegressor(random_state=CFG['SEED'])
model.fit(train_x, train_y)

RandomForestRegressor(random_state=42)

In [None]:
def pIC50_to_IC50(pic50_values):
    """Convert pIC50 values to IC50 (nM)."""
    return 10 ** (9 - pic50_values)

In [9]:
# train 데이터로부터의 학습 모델 평가

# pIC50 값 예측
train_y_pred = model.predict(train_x)
# 정답 IC50 값과 예측된 IC50 값을 비교하여 평균 제곱 오차값 산출 
mse = mean_squared_error(pIC50_to_IC50(train_y), pIC50_to_IC50(train_y_pred))
# 평균 제곱 오차값을 평균제곱근 오차값으로 변환
rmse = np.sqrt(mse)

print(f'RMSE: {rmse}')

RMSE: 1988.0913536648682


In [23]:
for i in range(10):
    print(pIC50_to_IC50(train_y[i]), pIC50_to_IC50(train_y_pred[i]))

10.964781961431852 8.574326166361718
25118.864315095823 9120.108393558985
16.98243652461746 10.006910141682656
776.2471166286912 870.6628217330684
549.5408738576248 386.81205463305645
8.128305161640995 7.138385536016661
138.03842646028838 162.01926835835351
1698.243652461746 948.6368698664769
4.073802778041122 6.19726403928236
72.44359600749905 173.67607671948562


In [21]:
# # 훈련데이터와 그 예측값(pIC50)과의 상관관계(결정계수) 
r2_score(train_y, train_y_pred)

0.9512988797129875

In [22]:
# 훈련데이터와 그 예측값을 IC50으로 변환한 값과의 상관관계
r2_score(train_y, pIC50_to_IC50(train_y_pred))

-900742.5423154405

In [8]:
# Validation 데이터로부터의 학습 모델 평가

# pIC50 값 예측
val_y_pred = model.predict(val_x)
# 정답 IC50 값과 예측된 IC50 값을 비교하여 평균 제곱 오차값 산출 
mse = mean_squared_error(pIC50_to_IC50(val_y), pIC50_to_IC50(val_y_pred))
# 평균 제곱 오차값을 평균제곱근 오차값으로 변환
rmse = np.sqrt(mse)

print(f'RMSE: {rmse}')

RMSE: 2169.5781089857264


In [14]:
for i in range(10):
    print(pIC50_to_IC50(val_y[i]), pIC50_to_IC50(val_y_pred[i]))

537.0317963702532 66.9789497413166
151.3561248436207 271.606400480799
6456.54229034655 1868.529677255851
446.6835921509635 341.66461281382243
2.691534803926914 4.434044016334473
3.6307805477010175 3.5571321513458205
213.79620895022325 97.22993602579264
5.248074602497733 3.0563282803658725
22.90867652767775 15.319573219471161
2.691534803926914 7.38584178120802
3.019951720402019 4.058790794129028
524.8074602497722 216.88690598094155
169.8243652461746 284.11881849832326
10.0 21.31571966716918
3715.3522909717276 1852.2516912529723
112.2018454301963 6.660395212984137
1.0 4.322947885417576
5.888436553555884 2.114300276881429
3.6307805477010175 9.637365572954586
1.584893192461111 3.755490490948153
1412.5375446227554 505.47537135979894
1.2882495516931323 2.6193890037214715
19.952623149688787 20.606299132699796
7.079457843841373 4.000368510461267
15.848931924611142 48.46185122401814
19.054607179632484 35.15604405282955
2.1877616239495516 2.693394692610774
3.019951720402019 7.698391619294387
223

In [17]:
# val 데이터로 결정계수 확인
r2_score(val_y, val_y_pred)

0.7001332789691126

### Inference

In [30]:
# test 데이터로부터의 학습 모델 평가
test = pd.read_csv('./test.csv')
test['Fingerprint'] = test['Smiles'].apply(smiles_to_fingerprint)

test_x = np.stack(test['Fingerprint'].values)

# pIC50 값 예측
test_y_pred = model.predict(test_x)

np.set_printoptions(threshold=np.inf, linewidth=np.inf)
pIC50_to_IC50(test_y_pred)[:10]

array([181.96170602,  31.64219967,  10.78052721,  21.3766674 ,  25.31278906,   9.00616709,  26.86828832,  25.04378692,  35.17156285, 180.01139604])

### Submission

In [31]:
submit = pd.read_csv('./sample_submission.csv')
submit['IC50_nM'] = pIC50_to_IC50(test_y_pred)
submit.head()

Unnamed: 0,ID,IC50_nM
0,TEST_000,181.961706
1,TEST_001,31.6422
2,TEST_002,10.780527
3,TEST_003,21.376667
4,TEST_004,25.312789


In [32]:
submit.to_csv('./baseline_submit.csv', index=False)