In [1]:
!pip install deepchem

Collecting deepchem
  Downloading deepchem-2.7.1-py3-none-any.whl (693 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m693.2/693.2 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Collecting scipy<1.9 (from deepchem)
  Downloading scipy-1.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (42.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rdkit (from deepchem)
  Downloading rdkit-2023.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.7/29.7 MB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scipy, rdkit, deepchem
  Attempting uninstall: scipy
    Found existing installation: scipy 1.11.2
    Uninstalling scipy-1.11.2:
      Successfully uninstalled scipy-1.11.2
Successfully installed deepchem-2.7.1 rdkit-2023.3.3 scipy-1.8.1


In [2]:
# Mount Google Drive under /content/drive (DATA_PATH points inside this mount).
from google.colab import drive

drive.mount(mountpoint="/content/drive")

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import warnings
%config InlineBackend.figure_format = 'retina'
warnings.filterwarnings("ignore")
%matplotlib inline

from tqdm import tqdm
from sklearn.model_selection import train_test_split
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

In [4]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Descriptors
from rdkit import DataStructs
import tensorflow as tf

import deepchem as dc
from deepchem.feat.mol_graphs import ConvMol
from deepchem.models.layers import GraphConv, GraphPool, GraphGather
from deepchem.models.graph_models import GraphConvModel
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

from deepchem.metrics import to_one_hot
from deepchem.utils.data_utils import load_from_disk



In [5]:
# Use seaborn's white-grid theme for every figure in this notebook.
sns.set_style(style='whitegrid')

In [6]:
# Notebook-wide configuration.
SEED = 42
DATA_PATH = "/content/drive/MyDrive/project/data/"

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

# FPs 데이터 불러오기

In [None]:
# Load the train and test CSVs from DATA_PATH, then preview the test frame.
train_df, test_df = (
    pd.read_csv(DATA_PATH + file_name)
    for file_name in ('train_fps_mol.csv', 'test_fps_mol.csv')
)
test_df

Unnamed: 0,id,SMILES,AlogP,Molecular_Weight,Num_H_Acceptors,Num_H_Donors,Num_RotatableBonds,LogD,Molecular_PolarSurfaceArea,fps,mol
0,TEST_000,CC(C)Nc1ccnc(N2CCN(Cc3cccs3)C(CCO)C2)n1,2.641,361.505,4,2,7,2.635,92.76,[0. 1. 0. ... 0. 0. 0.],<rdkit.Chem.rdchem.Mol object at 0x7999b89f1f50>
1,TEST_001,COc1cc(=O)n(-c2ccccc2)cc1C(=O)N1CCC2(CC1)OCCO2,0.585,370.399,5,0,3,0.585,68.31,[0. 0. 0. ... 0. 0. 0.],<rdkit.Chem.rdchem.Mol object at 0x7999b89f1fc0>
2,TEST_002,Cc1cccc(NC(=N)/N=c2\nc(O)c(Cc3ccccc3)c(C)[nH]2)c1,4.276,347.414,4,4,5,4.290,92.86,[0. 0. 0. ... 0. 0. 0.],<rdkit.Chem.rdchem.Mol object at 0x7999b89f1d20>
3,TEST_003,O=C(c1nc2ncccn2n1)N1CCCn2cc(-c3ccccc3)nc21,1.795,345.358,5,0,2,1.795,81.21,[0. 0. 0. ... 0. 0. 0.],<rdkit.Chem.rdchem.Mol object at 0x7999b89f2030>
4,TEST_004,CCN1CCN(C(=O)c2cc3c(=O)n4cc(C)ccc4nc3n2C)CC1,1.219,353.418,4,0,2,0.169,61.15,[0. 0. 0. ... 0. 0. 0.],<rdkit.Chem.rdchem.Mol object at 0x7999b89f1cb0>
...,...,...,...,...,...,...,...,...,...,...,...
478,TEST_478,CCc1noc(CC)c1CC(=O)NCC1(CC)CCCCC1,4.207,306.443,2,1,7,4.207,55.13,[0. 0. 1. ... 0. 0. 0.],<rdkit.Chem.rdchem.Mol object at 0x7999b88174c0>
479,TEST_479,CC(=O)N1CCC2(CC1)OC(=O)C(C)=C2C(=O)N1CCN(C)CC1,-0.608,335.398,5,0,1,-1.736,70.16,[0. 0. 0. ... 0. 0. 0.],<rdkit.Chem.rdchem.Mol object at 0x7999b8817530>
480,TEST_480,CC(C)NC(=O)CN1C(=O)c2ccccc2N2C(=O)c3ccccc3C12,1.792,349.383,3,1,3,1.792,69.72,[0. 1. 0. ... 0. 0. 0.],<rdkit.Chem.rdchem.Mol object at 0x7999b88175a0>
481,TEST_481,Cn1cc(Br)c(=O)c(NC(=O)c2ccc(O)cc2F)c1,0.790,341.132,3,2,2,0.423,69.64,[0. 0. 0. ... 0. 0. 0.],<rdkit.Chem.rdchem.Mol object at 0x7999b8817610>


In [None]:
# Total number of missing cells in the train and test frames, as a pair.
tuple(frame.isnull().sum().sum() for frame in (train_df, test_df))

(0, 0)

In [None]:
# Column subsets: the SMILES string plus one task column each.
mlm_column, hlm_column = (['SMILES', task] for task in ('MLM', 'HLM'))

In [None]:
# One two-column frame per task, cut from the training table.
hlm_df = train_df[hlm_column]
mlm_df = train_df[mlm_column]

In [None]:
# Write both per-task frames to DATA_PATH so the DeepChem CSV loader can read them.
mlm_df.to_csv(DATA_PATH + 'mlm_df.csv', encoding="utf-8-sig", index=False)
hlm_df.to_csv(DATA_PATH + 'hlm_df.csv', encoding="utf-8-sig", index=False)

In [None]:
# mlm_df.to_csv(path + "mlm_df.csv", index=False, encoding="utf-8-sig")
# hlm_df.to_csv(path + "hlm_df.csv", index=False, encoding="utf-8-sig")

In [None]:
# Shared helpers used by both the MLM and HLM pipelines below:
# a random train/valid/test splitter and the ConvMol featurizer.
splitter = dc.splits.RandomSplitter()
featurizer = dc.feat.ConvMolFeaturizer()

# MLM

In [None]:
# CSV loader for the MLM task; features are built from the SMILES column via `featurizer`.
loader_m = dc.data.CSVLoader(
    tasks=["MLM"],
    feature_field="SMILES",
    featurizer=featurizer,
)

In [None]:
# Build the MLM dataset from mlm_df.csv and show its summary.
dataset_m = loader_m.create_dataset(DATA_PATH + 'mlm_df.csv')
dataset_m

<DiskDataset X.shape: (3486,), y.shape: (3486, 1), w.shape: (3486, 1), task_names: ['MLM']>

In [None]:
# Split the MLM dataset into train/valid/test.
# Pass the notebook-wide SEED so the random split is the same on every run.
train_dataset_m, valid_dataset_m, test_dataset_m = splitter.train_valid_test_split(dataset_m, seed=SEED)

In [None]:
# Graph-convolution regressor for the single MLM task, with model directory ./mlm.
# No random seed is passed to the model.
model_m = GraphConvModel(
    n_tasks=1,
    mode='regression',
    batch_size=50,
    dropout=0.3,
    batch_normalize=True,
    model_dir="./mlm",
)

In [None]:
# Train the MLM model one epoch per fit() call, keeping the value each call returns.
losses_m = []
for epoch in tqdm(range(1500)):
    losses_m.append(model_m.fit(train_dataset_m, nb_epoch=1))

100%|██████████| 1500/1500 [39:30<00:00,  1.58s/it]


In [None]:
# Score the MLM model on the validation split with mean squared error.
mse_metric = dc.metrics.Metric(dc.metrics.mean_squared_error, mode="regression")
valid_m_evaluation = model_m.evaluate(valid_dataset_m, [mse_metric])

In [None]:
# Show the validation metrics returned by model_m.evaluate.
valid_m_evaluation

{'mean_squared_error': 1225.0469368676036}

In [None]:
# Convert the validation MSE of the MLM model to RMSE and print it.
valid_m_rmse = np.sqrt(valid_m_evaluation['mean_squared_error'])
print("RMSE :", valid_m_rmse)

RMSE : 35.00067052025723


In [None]:
def show_reg_result(y_test, y_pred, N=50):
    """Print R2, MAE, RMSE and the maximum absolute error of a regression.

    Args:
        y_test: Ground-truth target values (array-like).
        y_pred: Predicted values (array-like) holding the same number of
            elements as ``y_test``.
        N: Unused; kept so existing calls that pass it keep working.

    Returns:
        None. The four metrics are printed, each rounded to 4 decimals.
    """
    y_true = np.asarray(y_test)
    y_hat = np.asarray(y_pred)
    # Align e.g. (n, 1) targets with (n,) predictions. Without this, the
    # subtraction below would broadcast to an (n, n) matrix and report a
    # wrong maximum error.
    if y_true.shape != y_hat.shape and y_true.size == y_hat.size:
        y_hat = y_hat.reshape(y_true.shape)

    mae = mean_absolute_error(y_true, y_hat)
    # Take the square root of the MSE instead of passing squared=False, which
    # newer scikit-learn releases no longer accept. For a single target column
    # the value is the same.
    rmse = np.sqrt(mean_squared_error(y_true, y_hat))
    R2 = r2_score(y_true, y_hat)
    max_err = np.abs(y_true - y_hat).max()

    print('R2:', round(R2, 4))
    print('MAE:', round(mae, 4))
    print('RMSE:', round(rmse, 4))
    print('Max error:', round(max_err, 4))

In [None]:
# Predict on the held-out MLM test split and print the regression metrics.
y_pred = model_m.predict(test_dataset_m)
show_reg_result(y_test=test_dataset_m.y, y_pred=y_pred)

R2: -0.1008
MAE: 28.5176
RMSE: 38.2942
Max error: 121.385


- HLM에도 같은 절차 적용

In [None]:
# Build the HLM dataset from the SMILES column of hlm_df.csv.
loader_h = dc.data.CSVLoader(tasks=["HLM"], feature_field="SMILES", featurizer=featurizer)
dataset_h = loader_h.create_dataset(f'{DATA_PATH}hlm_df.csv')
# Pass the notebook-wide SEED so the random split is the same on every run.
train_dataset_h, valid_dataset_h, test_dataset_h = splitter.train_valid_test_split(dataset_h, seed=SEED)

In [None]:
# Graph-convolution regressor for the single HLM task, with model directory ./hlm.
# NOTE(review): unlike model_m, no dropout / batch_normalize arguments are
# passed here — confirm this difference is intentional.
model_h = GraphConvModel(
    n_tasks=1,
    mode='regression',
    batch_size=50,
    model_dir="./hlm",
)

In [None]:
# Train the HLM model one epoch per fit() call, keeping the value each call returns.
losses_h = []
for epoch in tqdm(range(1500)):
    losses_h.append(model_h.fit(train_dataset_h, nb_epoch=1))

100%|██████████| 1500/1500 [38:13<00:00,  1.53s/it]


In [None]:
# Score the HLM model on the validation split with mean squared error and show the result.
mse_metric = dc.metrics.Metric(dc.metrics.mean_squared_error, mode="regression")
valid_h_evaluation = model_h.evaluate(valid_dataset_h, [mse_metric])
valid_h_evaluation

{'mean_squared_error': 1427.533377104419}

In [None]:
# Convert the validation MSE of the HLM model to RMSE and print it.
valid_h_rmse = np.sqrt(valid_h_evaluation['mean_squared_error'])
print("RMSE :", valid_h_rmse)

RMSE : 37.7827126753019


In [None]:
# Predict on the held-out HLM test split and print the regression metrics.
y_pred = model_h.predict(test_dataset_h)
show_reg_result(y_test=test_dataset_h.y, y_pred=y_pred)

R2: 0.0699
MAE: 28.0601
RMSE: 34.7629
Max error: 99.5163


In [None]:
# Inspect the HLM test split produced by the splitter.
test_dataset_h

<DiskDataset X.shape: (349,), y.shape: (349, 1), w.shape: (349, 1), ids: ['O=C1N=C(O)N=C(O)C1C1c2ccccc2Oc2nc(O)nc(O)c21'
 'Cc1c(CN2C[C@H]3CC[C@@H](C2)NC3)oc2ccccc12'
 'CC(=O)N(c1ccccc1)C(c1ccccc1)c1nc2ccccc2nc1O' ...
 'Cc1cc(C)c2c(c1)C(NC(=O)c1ncoc1C1CCCO1)CCO2'
 'c1ccc(-c2ccc(-c3cn4c5ccccc5nc4n3CCN3CCCCC3)cc2)cc1'
 'Cc1nc(C(=O)NC(C)C(C)(C)C)c2n1CCCC2'], task_names: ['HLM']>

# Test df로 예측 값 얻기

In [None]:
# List the columns of test_df before featurizing its SMILES column.
test_df.columns

Index(['id', 'SMILES', 'AlogP', 'Molecular_Weight', 'Num_H_Acceptors',
       'Num_H_Donors', 'Num_RotatableBonds', 'LogD',
       'Molecular_PolarSurfaceArea', 'fps', 'mol'],
      dtype='object')

In [None]:
# Featurize the SMILES strings of test_df with the same featurizer used for training.
test_smiles = test_df['SMILES']
test_features = featurizer.featurize(test_smiles)

In [None]:
# Display the HLM test split again (the same object shown in an earlier cell).
test_dataset_h

<DiskDataset X.shape: (349,), y.shape: (349, 1), w.shape: (349, 1), ids: ['O=C1N=C(O)N=C(O)C1C1c2ccccc2Oc2nc(O)nc(O)c21'
 'Cc1c(CN2C[C@H]3CC[C@@H](C2)NC3)oc2ccccc12'
 'CC(=O)N(c1ccccc1)C(c1ccccc1)c1nc2ccccc2nc1O' ...
 'Cc1cc(C)c2c(c1)C(NC(=O)c1ncoc1C1CCCO1)CCO2'
 'c1ccc(-c2ccc(-c3cn4c5ccccc5nc4n3CCN3CCCCC3)cc2)cc1'
 'Cc1nc(C(=O)NC(C)C(C)(C)C)c2n1CCCC2'], task_names: ['HLM']>

In [None]:
# Wrap the featurized test molecules in a label-free NumpyDataset and show its summary.
test_set = dc.data.NumpyDataset(X=test_features)
test_set

<NumpyDataset X.shape: (483,), y.shape: (483, 1), w.shape: (483, 1), ids: [0 1 2 ... 480 481 482], task_names: [0]>

In [None]:
# Call restore() on both models before predicting on test_set.
for trained_model in (model_h, model_m):
    trained_model.restore()

In [None]:
# Predict both targets for test_set (MLM model first, then HLM model).
mlm_pred, hlm_pred = (
    trained_model.predict(test_set) for trained_model in (model_m, model_h)
)

In [None]:
# Re-check the columns of test_df (repeats an earlier cell).
test_df.columns

Index(['id', 'SMILES', 'AlogP', 'Molecular_Weight', 'Num_H_Acceptors',
       'Num_H_Donors', 'Num_RotatableBonds', 'LogD',
       'Molecular_PolarSurfaceArea', 'fps', 'mol'],
      dtype='object')

# 제출

In [7]:
# Load the sample submission template from DATA_PATH and display it.
submission = pd.read_csv(DATA_PATH + "sample_submission.csv")
submission

Unnamed: 0,id,MLM,HLM
0,TEST_000,0,0
1,TEST_001,0,0
2,TEST_002,0,0
3,TEST_003,0,0
4,TEST_004,0,0
...,...,...,...
478,TEST_478,0,0
479,TEST_479,0,0
480,TEST_480,0,0
481,TEST_481,0,0


In [None]:
# Row counts of the template column and of both prediction arrays, as a tuple.
tuple(len(values) for values in (submission['MLM'], mlm_pred, hlm_pred))

(483, 483, 483)

In [None]:
# Fill the template with the GCN predictions.
# The length check in the previous cell shows one prediction per template row.
# Flatten each prediction array to 1-D explicitly instead of relying on pandas
# to accept a single-column 2-D array as a column value.
submission['HLM'] = np.ravel(hlm_pred)
submission["MLM"] = np.ravel(mlm_pred)

In [None]:
# Display the submission frame after the MLM and HLM columns were filled in.
submission

Unnamed: 0,id,MLM,HLM
0,TEST_000,14.927991,46.035515
1,TEST_001,60.989841,66.637039
2,TEST_002,69.178925,75.743164
3,TEST_003,18.849379,28.377901
4,TEST_004,71.641701,61.353733
...,...,...,...
478,TEST_478,0.670486,28.354101
479,TEST_479,76.926971,70.899216
480,TEST_480,23.379583,44.759228
481,TEST_481,-4.392597,43.242184


In [None]:
# Save the GCN submission under DATA_PATH (read back later for the ensembles).
submission.to_csv(DATA_PATH + "dl_sub_2.csv", index=False)

In [None]:
# Also save a copy of the GCN submission in the current working directory.
submission.to_csv(path_or_buf="dl_sub_2.csv", index=False)

## ml + dl(fp) + dl(gcn) 앙상블^^

In [7]:
# Load the three saved submissions that will be averaged.
ml, dl_fp, dl_gcn = (
    pd.read_csv(DATA_PATH + file_name)
    for file_name in ("ml_sub_1.csv", "dl_sub_1.csv", "dl_sub_2.csv")
)

In [10]:
# Equal-weight average of the three submissions, column by column (overwrites `submission`).
for task in ('MLM', 'HLM'):
    submission[task] = (ml[task] + dl_fp[task] + dl_gcn[task]) / 3
submission

Unnamed: 0,id,MLM,HLM
0,TEST_000,17.142199,43.767036
1,TEST_001,63.737011,78.834957
2,TEST_002,51.755102,49.006690
3,TEST_003,33.164269,48.630055
4,TEST_004,61.153022,71.050074
...,...,...,...
478,TEST_478,21.125220,46.324422
479,TEST_479,76.849845,79.408265
480,TEST_480,29.439063,52.679173
481,TEST_481,27.480459,53.146905


In [11]:
# Save the three-model ensemble under DATA_PATH.
submission.to_csv(DATA_PATH + "ensemble_sub_2.csv", index=False)

In [12]:
# Also save a copy of the three-model ensemble in the current working directory.
submission.to_csv(path_or_buf="ensemble_sub_2.csv", index=False)

## ml + dl(gcn) 앙상블^^

In [8]:
# Load the two saved submissions (ml and GCN) that will be averaged.
ml, dl_gcn = (
    pd.read_csv(DATA_PATH + file_name)
    for file_name in ("ml_sub_1.csv", "dl_sub_2.csv")
)

In [9]:
# Equal-weight average of the ml and GCN submissions, column by column (overwrites `submission`).
for task in ('MLM', 'HLM'):
    submission[task] = (ml[task] + dl_gcn[task]) / 2
submission

Unnamed: 0,id,MLM,HLM
0,TEST_000,22.155626,45.285369
1,TEST_001,57.527684,72.159159
2,TEST_002,50.767822,59.986973
3,TEST_003,37.637232,50.764082
4,TEST_004,69.536586,68.578567
...,...,...,...
478,TEST_478,5.710756,26.711074
479,TEST_479,75.249400,77.274710
480,TEST_480,32.765815,56.442234
481,TEST_481,28.996720,54.906366


In [10]:
# Save the ml + GCN ensemble under DATA_PATH.
submission.to_csv(DATA_PATH + "ensemble_sub_3.csv", index=False)

In [11]:
# Also save a copy of the ml + GCN ensemble in the current working directory.
submission.to_csv(path_or_buf="ensemble_sub_3.csv", index=False)

## dl(fp) + dl(gcn) 앙상블^^

In [12]:
# Load the two saved deep-learning submissions (fingerprint and GCN) that will be averaged.
dl_fp, dl_gcn = (
    pd.read_csv(DATA_PATH + file_name)
    for file_name in ("dl_sub_1.csv", "dl_sub_2.csv")
)

In [13]:
# Equal-weight average of the fingerprint and GCN submissions, column by column (overwrites `submission`).
for task in ('MLM', 'HLM'):
    submission[task] = (dl_fp[task] + dl_gcn[task]) / 2
submission

Unnamed: 0,id,MLM,HLM
0,TEST_000,11.021668,43.382942
1,TEST_001,68.572751,79.411797
2,TEST_002,61.454293,51.394644
3,TEST_003,21.533861,36.369950
4,TEST_004,58.013797,68.673410
...,...,...,...
478,TEST_478,26.312316,56.952609
479,TEST_479,78.488853,77.287296
480,TEST_480,23.082571,44.956141
481,TEST_481,10.027670,46.435084


In [14]:
# Save the fingerprint + GCN ensemble under DATA_PATH.
submission.to_csv(DATA_PATH + "ensemble_sub_4.csv", index=False)

In [15]:
# Also save a copy of the fingerprint + GCN ensemble in the current working directory.
submission.to_csv(path_or_buf="ensemble_sub_4.csv", index=False)