In [4]:
import pandas as pd
import sqlalchemy

# MySQL 데이터베이스 연결
engine = sqlalchemy.create_engine('mysql+pymysql://root:123@127.0.0.1:3307/my_database')

In [5]:
query = """
SELECT * FROM train;
"""
train_df = pd.read_sql(query, engine)

In [6]:
query = """
SELECT * FROM test;
"""
test_df = pd.read_sql(query, engine)

In [7]:
query = """
SELECT * FROM train_processed;
"""
train_processed = pd.read_sql(query, engine)

In [8]:
query = """
SELECT * FROM test_processed;
"""
test_processed = pd.read_sql(query, engine)

In [9]:
train_processed.columns

Index(['癤풫olecule_ChEMBL_ID', 'Standard_Type', 'Standard_Relation',
       'Standard_Value', 'Standard_Units', 'pChEMBL_Value', 'Assay_ChEMBL_ID',
       'Target_ChEMBL_ID', 'Target_Name', 'Target_Organism', 'Target_Type',
       'Document_ChEMBL_ID', 'IC50_nM', 'pIC50', 'Smiles', 'Atom_Info',
       'Bond_Info', '3D_Conformer', 'MolWt', 'MolLogP', 'NumHDonors',
       'NumHAcceptors', 'TPSA', 'Atom_Mean', 'Atom_Max', 'Atom_Min',
       'Atom_Len', 'Bond_Mean', 'Bond_Max', 'Bond_Min', 'Bond_Len',
       'Conformer_Mean', 'Conformer_Max', 'Conformer_Min', 'Conformer_Len'],
      dtype='object')

In [10]:
test_processed.columns

Index(['ID', 'Smiles', 'Atom_Info', 'Bond_Info', '3D_Conformer', 'MolWt',
       'MolLogP', 'NumHDonors', 'NumHAcceptors', 'TPSA', 'Atom_Mean',
       'Atom_Max', 'Atom_Min', 'Atom_Len', 'Bond_Mean', 'Bond_Max', 'Bond_Min',
       'Bond_Len', 'Conformer_Mean', 'Conformer_Max', 'Conformer_Min',
       'Conformer_Len'],
      dtype='object')

### Hugging Face에서 ChemBERTa 모델을 불러와서 사용

In [11]:
from rdkit import Chem
from sklearn.model_selection import train_test_split

In [12]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

In [13]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification

# ChemBERTa의 사전 학습된 토크나이저와 모델을 로드
tokenizer = RobertaTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
model = RobertaForSequenceClassification.from_pretrained("seyonec/ChemBERTa-zinc-base-v1", num_labels=1)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at seyonec/ChemBERTa-zinc-base-v1 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
# SMILES 데이터를 리스트로 준비
smiles_list = train_df['Smiles'].tolist()
labels = train_df['IC50_nM'].values  # 타겟 변수

In [38]:
from sklearn.model_selection import train_test_split

# 데이터 분할
smiles_train, smiles_val, labels_train, labels_val = train_test_split(smiles_list, labels, test_size=0.2, random_state=42)

In [39]:
# 훈련 데이터 토큰화 및 인코딩
train_inputs = tokenizer(smiles_train, padding=True, truncation=True, max_length=512, return_tensors="pt")

# 검증 데이터 토큰화 및 인코딩
val_inputs = tokenizer(smiles_val, padding=True, truncation=True, max_length=512, return_tensors="pt")

In [40]:
import torch

# 입력 데이터와 타겟 변수를 Dataset으로 변환
class SMILESDataset(torch.utils.data.Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = torch.tensor(labels, dtype=torch.float32).unsqueeze(1)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        input_ids = self.inputs["input_ids"][idx]
        attention_mask = self.inputs["attention_mask"][idx]
        label = self.labels[idx]
        return {"input_ids": input_ids, "attention_mask": attention_mask, "label": label}

# 훈련 및 검증 데이터셋 생성
train_dataset = SMILESDataset(train_inputs, labels_train)
val_dataset = SMILESDataset(val_inputs, labels_val)

In [41]:
import os

# 디렉터리 생성
os.makedirs('./results', exist_ok=True)
os.makedirs('./logs', exist_ok=True)

In [42]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="/absolute/path/to/results",  # 절대 경로 사용
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="/absolute/path/to/logs",
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [43]:
# Trainer 설정 및 학습
trainer = Trainer(
    model=model,                 # 학습시킬 모델
    args=training_args,          # 학습 인자 설정
    train_dataset=dataset,       # 훈련 데이터셋
)

In [44]:
# W&B 비활성화
os.environ["WANDB_DISABLED"] = "true"

# 모델 학습
trainer.train()

  0%|          | 0/366 [00:00<?, ?it/s]

{'train_runtime': 1318.3915, 'train_samples_per_second': 4.442, 'train_steps_per_second': 0.278, 'train_loss': 7355827.060109289, 'epoch': 3.0}


TrainOutput(global_step=366, training_loss=7355827.060109289, metrics={'train_runtime': 1318.3915, 'train_samples_per_second': 4.442, 'train_steps_per_second': 0.278, 'train_loss': 7355827.060109289, 'epoch': 3.0})

In [45]:
# 검증 데이터에 대한 예측
outputs = trainer.predict(val_dataset)

  0%|          | 0/25 [00:00<?, ?it/s]

In [49]:
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

# 성능 평가
predictions = outputs.predictions.squeeze()
labels = outputs.label_ids
rmse = np.sqrt(mean_squared_error(labels, predictions))
r2 = r2_score(labels, predictions)

print(f"RMSE: {rmse}")
print(f"R²: {r2}")

RMSE: 2160.0966796875
R²: -0.08873001355895127


### GCN Model

In [52]:
def mol_to_graph_enhanced(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    
    # 노드 (원자) 특성 추출
    atom_features = []
    for atom in mol.GetAtoms():
        features = []
        # 원자 번호
        features.append(atom.GetAtomicNum())
        # 원자의 화학적 환경 (예: 수소 결합 공여자 수, 수용자 수)
        features.append(atom.GetTotalNumHs())
        features.append(atom.GetTotalNumHs())
        # 기타 원자 특성 추가 가능
        atom_features.append(features)
    
    x = torch.tensor(atom_features, dtype=torch.float)  # [num_atoms, num_features]
    
    # 엣지 (결합) 정보 추출
    edge_index = []
    edge_attr = []
    for bond in mol.GetBonds():
        i = bond.GetBeginAtomIdx()
        j = bond.GetEndAtomIdx()
        edge_index.append([i, j])
        edge_index.append([j, i])  # 양방향
    
    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()  # [2, num_edges]
    
    # 결합 유형 (예: 단일, 이중, 삼중, 아로마틱)
    bond_type = []
    bond_type_mapping = {
        Chem.BondType.SINGLE: 1,
        Chem.BondType.DOUBLE: 2,
        Chem.BondType.TRIPLE: 3,
        Chem.BondType.AROMATIC: 4
    }
    for bond in mol.GetBonds():
        bond_type.append(bond_type_mapping.get(bond.GetBondType(), 0))
        bond_type.append(bond_type_mapping.get(bond.GetBondType(), 0))
    
    edge_attr = torch.tensor(bond_type, dtype=torch.float).unsqueeze(1)  # [num_edges, 1]
    
    # 타겟 변수 (IC50)
    y = torch.tensor([train_df.loc[train_df['Smiles'] == smiles, 'IC50_nM'].values[0]], dtype=torch.float)
    
    data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)
    return data

# 모든 SMILES 문자열을 그래프로 변환 (향상된 특성 사용)
graphs = [mol_to_graph_enhanced(smiles) for smiles in train_df['Smiles']]

# 유효한 그래프만 필터링
graphs = [graph for graph in graphs if graph is not None]

print(graphs[0])


Data(x=[72, 3], edge_index=[2, 156], edge_attr=[156, 1], y=[1])


In [53]:
from torch_geometric.loader import DataLoader

# 데이터 분할 (훈련/검증)
from sklearn.model_selection import train_test_split

train_graphs, val_graphs = train_test_split(graphs, test_size=0.2, random_state=42)

# DataLoader 생성
train_loader = DataLoader(train_graphs, batch_size=16, shuffle=True)
val_loader = DataLoader(val_graphs, batch_size=16, shuffle=False)

In [54]:
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool

class GCN(nn.Module):
    def __init__(self, num_node_features, hidden_channels):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.lin = nn.Linear(hidden_channels, 1)  # 회귀 문제이므로 출력 차원 1

    def forward(self, x, edge_index, batch):
        # 첫 번째 GCN 레이어
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.2, training=self.training)
        
        # 두 번째 GCN 레이어
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.2, training=self.training)
        
        # 그래프 단위로 특징을 집계 (Global Mean Pooling)
        x = global_mean_pool(x, batch)  # [num_graphs, hidden_channels]
        
        # 선형 레이어
        x = self.lin(x)
        return x


In [55]:
# 노드 특성의 수가 3이라고 가정 (원자 번호, 수소 결합 공여자 수, 수용자 수)
num_node_features = train_graphs[0].x.shape[1]
hidden_channels = 64

# GCN 모델 초기화
model = GCN(num_node_features=num_node_features, hidden_channels=hidden_channels)

In [56]:
import torch.optim as optim

# 손실 함수 (회귀 문제이므로 MSELoss 사용)
criterion = nn.MSELoss()

# 옵티마이저 (Adam 사용)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)

In [57]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

def train():
    model.train()
    total_loss = 0
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        out = model(data.x, data.edge_index, data.batch)
        loss = criterion(out, data.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * data.num_graphs
    return total_loss / len(train_loader.dataset)

def evaluate(loader):
    model.eval()
    total_loss = 0
    preds = []
    trues = []
    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            out = model(data.x, data.edge_index, data.batch)
            loss = criterion(out, data.y)
            total_loss += loss.item() * data.num_graphs
            preds.append(out.cpu())
            trues.append(data.y.cpu())
    preds = torch.cat(preds).numpy()
    trues = torch.cat(trues).numpy()
    rmse = np.sqrt(mean_squared_error(trues, preds))
    r2 = r2_score(trues, preds)
    return total_loss / len(loader.dataset), rmse, r2

# 학습 및 평가
num_epochs = 50
best_r2 = -np.inf

for epoch in range(1, num_epochs + 1):
    loss = train()
    val_loss, val_rmse, val_r2 = evaluate(val_loader)
    if val_r2 > best_r2:
        best_r2 = val_r2
        torch.save(model.state_dict(), 'best_model.pth')
    print(f'Epoch: {epoch:03d}, Train Loss: {loss:.4f}, Val Loss: {val_loss:.4f}, Val RMSE: {val_rmse:.4f}, Val R²: {val_r2:.4f}')


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 001, Train Loss: 8052501.2876, Val Loss: 4677384.9647, Val RMSE: 2162.7478, Val R²: -0.0914


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 002, Train Loss: 7973034.1835, Val Loss: 4528226.5360, Val RMSE: 2128.2148, Val R²: -0.0568


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 003, Train Loss: 7790097.5425, Val Loss: 4352454.5898, Val RMSE: 2086.9612, Val R²: -0.0163


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 004, Train Loss: 7678575.6992, Val Loss: 4301725.4527, Val RMSE: 2075.0259, Val R²: -0.0047


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 005, Train Loss: 7645386.1194, Val Loss: 4287216.4974, Val RMSE: 2071.6919, Val R²: -0.0014


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 006, Train Loss: 7641061.3274, Val Loss: 4285867.8107, Val RMSE: 2071.4092, Val R²: -0.0012


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 007, Train Loss: 7641738.4371, Val Loss: 4285455.0665, Val RMSE: 2071.3525, Val R²: -0.0011


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 008, Train Loss: 7640379.9013, Val Loss: 4285693.3542, Val RMSE: 2071.4360, Val R²: -0.0012


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 009, Train Loss: 7640760.2151, Val Loss: 4285600.9514, Val RMSE: 2071.4011, Val R²: -0.0012


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 010, Train Loss: 7638992.5844, Val Loss: 4285511.1023, Val RMSE: 2071.3684, Val R²: -0.0011


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 011, Train Loss: 7641475.3463, Val Loss: 4285711.9885, Val RMSE: 2071.4268, Val R²: -0.0012


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 012, Train Loss: 7638617.9401, Val Loss: 4285468.1841, Val RMSE: 2071.3223, Val R²: -0.0011


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 013, Train Loss: 7639894.8180, Val Loss: 4285441.9246, Val RMSE: 2071.3235, Val R²: -0.0011


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 014, Train Loss: 7638946.4491, Val Loss: 4285583.0153, Val RMSE: 2071.3762, Val R²: -0.0011


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 015, Train Loss: 7639379.2412, Val Loss: 4285483.4974, Val RMSE: 2071.3171, Val R²: -0.0011


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 016, Train Loss: 7638997.7718, Val Loss: 4285717.9923, Val RMSE: 2071.4177, Val R²: -0.0012


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 017, Train Loss: 7643121.5580, Val Loss: 4285625.3696, Val RMSE: 2071.3855, Val R²: -0.0011


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 018, Train Loss: 7643376.6046, Val Loss: 4285559.9629, Val RMSE: 2071.3650, Val R²: -0.0011


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 019, Train Loss: 7641346.3023, Val Loss: 4285438.3504, Val RMSE: 2071.3140, Val R²: -0.0011


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 020, Train Loss: 7640658.7643, Val Loss: 4285459.3261, Val RMSE: 2071.3257, Val R²: -0.0011


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 021, Train Loss: 7641873.8449, Val Loss: 4285518.8440, Val RMSE: 2071.3093, Val R²: -0.0011


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 022, Train Loss: 7640758.3430, Val Loss: 4285506.8402, Val RMSE: 2071.3345, Val R²: -0.0011


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 023, Train Loss: 7639081.2939, Val Loss: 4285432.5652, Val RMSE: 2071.2993, Val R²: -0.0011


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 024, Train Loss: 7643155.2422, Val Loss: 4285438.7852, Val RMSE: 2071.2913, Val R²: -0.0011


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 025, Train Loss: 7640523.8197, Val Loss: 4285520.8299, Val RMSE: 2071.3298, Val R²: -0.0011


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 026, Train Loss: 7639538.2051, Val Loss: 4285493.2391, Val RMSE: 2071.2808, Val R²: -0.0010


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 027, Train Loss: 7643407.3721, Val Loss: 4285438.5575, Val RMSE: 2071.2715, Val R²: -0.0010


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 028, Train Loss: 7639552.6039, Val Loss: 4285444.7276, Val RMSE: 2071.2859, Val R²: -0.0010


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 029, Train Loss: 7640307.4568, Val Loss: 4285421.5575, Val RMSE: 2071.2632, Val R²: -0.0010


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 030, Train Loss: 7639336.9853, Val Loss: 4285420.9629, Val RMSE: 2071.2627, Val R²: -0.0010


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 031, Train Loss: 7639674.6158, Val Loss: 4285437.1880, Val RMSE: 2071.2571, Val R²: -0.0010


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 032, Train Loss: 7641140.5676, Val Loss: 4285432.6292, Val RMSE: 2071.2703, Val R²: -0.0010


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 033, Train Loss: 7641027.8495, Val Loss: 4285415.1841, Val RMSE: 2071.2539, Val R²: -0.0010


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 034, Train Loss: 7640781.0416, Val Loss: 4285421.2852, Val RMSE: 2071.2461, Val R²: -0.0010


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 035, Train Loss: 7642242.1057, Val Loss: 4285859.2596, Val RMSE: 2071.3901, Val R²: -0.0011


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 036, Train Loss: 7639136.7724, Val Loss: 4285577.4872, Val RMSE: 2071.3020, Val R²: -0.0011


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 037, Train Loss: 7639077.9134, Val Loss: 4285472.4348, Val RMSE: 2071.2661, Val R²: -0.0010


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 038, Train Loss: 7640526.9043, Val Loss: 4285456.0985, Val RMSE: 2071.2310, Val R²: -0.0010


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 039, Train Loss: 7641078.8394, Val Loss: 4285455.8529, Val RMSE: 2071.2505, Val R²: -0.0010


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 040, Train Loss: 7640910.6081, Val Loss: 4285402.6407, Val RMSE: 2071.2180, Val R²: -0.0010


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 041, Train Loss: 7640083.1131, Val Loss: 4285402.2123, Val RMSE: 2071.2139, Val R²: -0.0010


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 042, Train Loss: 7640121.9123, Val Loss: 4285566.4207, Val RMSE: 2071.2214, Val R²: -0.0010


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 043, Train Loss: 7639492.9891, Val Loss: 4285402.2251, Val RMSE: 2071.1995, Val R²: -0.0010


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 044, Train Loss: 7639648.5724, Val Loss: 4285394.3696, Val RMSE: 2071.1985, Val R²: -0.0010


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 045, Train Loss: 7641984.8653, Val Loss: 4285393.5601, Val RMSE: 2071.1909, Val R²: -0.0010


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 046, Train Loss: 7640817.8042, Val Loss: 4285390.5294, Val RMSE: 2071.1890, Val R²: -0.0010


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 047, Train Loss: 7638275.5005, Val Loss: 4285444.2276, Val RMSE: 2071.1843, Val R²: -0.0009


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 048, Train Loss: 7642158.0948, Val Loss: 4285391.3836, Val RMSE: 2071.1833, Val R²: -0.0009


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 049, Train Loss: 7638927.3632, Val Loss: 4285467.6368, Val RMSE: 2071.2122, Val R²: -0.0010


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 050, Train Loss: 7642132.2116, Val Loss: 4285423.2660, Val RMSE: 2071.1672, Val R²: -0.0009


In [58]:
# 최적 모델 로드
model.load_state_dict(torch.load('best_model.pth'))

# 최종 검증 데이터에 대한 평가
val_loss, val_rmse, val_r2 = evaluate(val_loader)
print(f'Final Val Loss: {val_loss:.4f}, Val RMSE: {val_rmse:.4f}, Val R²: {val_r2:.4f}')

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Final Val Loss: 4285423.2660, Val RMSE: 2071.1672, Val R²: -0.0009


### Smiles 를 분자 서술자 전처리하여 진행

In [59]:
from rdkit.Chem import Descriptors

# SMILES 문자열을 분자 서술자로 변환하는 함수
def smiles_to_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        # RDKit의 주요 서술자 계산 (대표적인 서술자 몇 가지 예시)
        descriptors = [
            Descriptors.MolWt(mol),               # 분자량
            Descriptors.MolLogP(mol),             # 로그P
            Descriptors.NumHDonors(mol),          # 수소 결합 공여자 수
            Descriptors.NumHAcceptors(mol),       # 수소 결합 수용자 수
            Descriptors.TPSA(mol)                 # 극성 표면적
        ]
        return np.array(descriptors)
    else:
        return np.zeros(5)  # 서술자 개수에 맞춰 0으로 채운 벡터 반환

In [60]:
# 모든 SMILES에 대해 smiles_to_descriptors 적용
descriptors = np.array([smiles_to_descriptors(smiles) for smiles in train_df['Smiles']])

# 결과를 새로운 열로 추가 (각 서술자마다 열 추가)
train_df['MolWt'] = descriptors[:, 0]        # 분자량
train_df['MolLogP'] = descriptors[:, 1]      # 로그P
train_df['NumHDonors'] = descriptors[:, 2]   # 수소 결합 공여자 수
train_df['NumHAcceptors'] = descriptors[:, 3]  # 수소 결합 수용자 수
train_df['TPSA'] = descriptors[:, 4]         # 극성 표면적

train_df['TPSA'].head()

0    250.87
1    106.31
2    115.54
3    106.31
4    215.17
Name: TPSA, dtype: float64

In [62]:
# 기존 atom_info_features, bond_info_features, conformer_features와 결합
X = np.concatenate([
    descriptors,           
], axis=1)

# 타겟 변수 y 설정
y = train_df['IC50_nM']

In [63]:
# 훈련-검증 데이터 분할 (80% 훈련, 20% 검증)
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

### 피드포워드 신경망 모델

In [72]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class FeedforwardNN(nn.Module):
    def __init__(self, input_size, hidden_sizes, output_size):
        super(FeedforwardNN, self).__init__()
        # 은닉층 정의
        self.fc1 = nn.Linear(input_size, hidden_sizes[0])
        self.fc2 = nn.Linear(hidden_sizes[0], hidden_sizes[1])
        # 출력층 정의
        self.fc3 = nn.Linear(hidden_sizes[1], output_size)
        # 드롭아웃 정의 (과적합 방지)
        self.dropout = nn.Dropout(p=0.5)
    
    def forward(self, x):
        # 첫 번째 은닉층
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout(x)
        # 두 번째 은닉층
        x = self.fc2(x)
        x = F.relu(x)
        x = self.dropout(x)
        # 출력층
        x = self.fc3(x)
        return x


In [73]:
# 하이퍼파라미터 설정
input_size = X_train.shape[1]  # 피처 수
hidden_sizes = [64, 32]         # 은닉층 크기
output_size = 1                 # 출력 크기 (회귀 문제)

# 모델 초기화
model = FeedforwardNN(input_size, hidden_sizes, output_size)

print(model)

FeedforwardNN(
  (fc1): Linear(in_features=5, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


### 손실 함수 및 옵티마이저 정의

In [74]:
# 손실 함수 (회귀 문제이므로 MSELoss 사용)
criterion = nn.MSELoss()

# 옵티마이저 (Adam 사용)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [82]:
# NumPy 배열로 변환 (Series인 경우)
y_train = y_train.to_numpy() if isinstance(y_train, pd.Series) else y_train
y_val = y_val.to_numpy() if isinstance(y_val, pd.Series) else y_val

# PyTorch 텐서로 변환
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32).view(-1, 1)

In [83]:
# 텐서 데이터셋 및 DataLoader 생성
from torch.utils.data import TensorDataset, DataLoader

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)


In [84]:
# 학습 및 검증 루프
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [85]:
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs):
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for X_batch, y_batch in train_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)

            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * X_batch.size(0)
        
        epoch_loss = running_loss / len(train_loader.dataset)
        
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch = X_batch.to(device)
                y_batch = y_batch.to(device)
                
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)
                val_loss += loss.item() * X_batch.size(0)
        
        val_loss = val_loss / len(val_loader.dataset)
        
        print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {epoch_loss:.4f} - Val Loss: {val_loss:.4f}")


In [86]:
# 모델 학습
num_epochs = 50
train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs)

Epoch 1/50 - Train Loss: 7925651.2450 - Val Loss: 4450786.7452
Epoch 2/50 - Train Loss: 7830906.6173 - Val Loss: 4399340.2238
Epoch 3/50 - Train Loss: 7783428.1847 - Val Loss: 4389596.4488
Epoch 4/50 - Train Loss: 7840738.1481 - Val Loss: 4395032.5013
Epoch 5/50 - Train Loss: 7813262.8467 - Val Loss: 4389461.6963
Epoch 6/50 - Train Loss: 7786782.1098 - Val Loss: 4384179.3900
Epoch 7/50 - Train Loss: 7787118.5329 - Val Loss: 4385342.9827
Epoch 8/50 - Train Loss: 7812674.4049 - Val Loss: 4390235.1125
Epoch 9/50 - Train Loss: 7792932.6922 - Val Loss: 4392020.7084
Epoch 10/50 - Train Loss: 7782021.7943 - Val Loss: 4388384.0550
Epoch 11/50 - Train Loss: 7813551.4380 - Val Loss: 4390181.2142
Epoch 12/50 - Train Loss: 7790224.9891 - Val Loss: 4389594.0371
Epoch 13/50 - Train Loss: 7805955.3318 - Val Loss: 4391493.2327
Epoch 14/50 - Train Loss: 7778825.5045 - Val Loss: 4380262.6886
Epoch 15/50 - Train Loss: 7793876.9278 - Val Loss: 4385200.9668
Epoch 16/50 - Train Loss: 7799348.3003 - Val Loss

### 모델 평가

In [87]:
from sklearn.metrics import mean_squared_error, r2_score

# 모델 평가 함수 정의
def evaluate_model(model, val_loader, device):
    model.eval()  # 평가 모드로 전환
    all_preds = []
    all_trues = []
    
    with torch.no_grad():  # 그래디언트 계산 비활성화
        for X_batch, y_batch in val_loader:
            X_batch = X_batch.to(device)
            y_batch = y_batch.to(device)
            
            outputs = model(X_batch)
            all_preds.append(outputs.cpu().numpy())
            all_trues.append(y_batch.cpu().numpy())
    
    # 모든 배치의 예측값과 실제값을 하나의 배열로 합침
    preds = np.vstack(all_preds)
    trues = np.vstack(all_trues)
    
    # 성능 지표 계산
    rmse = np.sqrt(mean_squared_error(trues, preds))
    r2 = r2_score(trues, preds)
    
    print(f"Validation RMSE: {rmse:.4f}")
    print(f"Validation R²: {r2:.4f}")
    
    return rmse, r2

# 평가 수행
rmse, r2 = evaluate_model(model, val_loader, device)

Validation RMSE: 2092.8210
Validation R²: -0.0220
