In [48]:
import torch
import torch.nn as nn
from transformers import RobertaModel, RobertaTokenizer, get_linear_schedule_with_warmup
from torch.optim import AdamW,Adam
from torch.utils.data import Dataset, DataLoader,random_split
from tqdm import tqdm
import pandas as pd
import numpy as np
import random
import os
from sklearn.model_selection import train_test_split
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
import wandb

def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)

set_seed(42)


def count_parameters_per_layer(model):
    param_counts = {}
    for name, module in model.named_modules():
        param_counts[name] = sum(p.numel() for p in module.parameters() if p.requires_grad)
    return param_counts

# RMSE 손실 함수 정의
def rmse_loss(y_pred, y_true):
    mse = torch.nn.MSELoss()(y_pred, y_true)
    return torch.sqrt(mse)

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("CUDA is available. Using GPU!")
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU!")
    

CUDA is available. Using GPU!


DataLoader

In [19]:
features = ['AlogP','Molecular_Weight','Num_H_Acceptors','Num_H_Donors','Num_RotatableBonds','LogD','Molecular_PolarSurfaceArea']
print(len(features))

7


In [36]:
class SMILESDataset(Dataset):
    def __init__(self, df, max_length, tokenizer):
        self.df = df
        self.tokenizer = tokenizer.encode_plus
        self.max_length = max_length
        
        # 데이터를 텐서로 변환
        self.features_tensors = []
        self.mlm_labels = []
        self.hlm_labels = []
        self.input_ids_list = []
        self.attention_masks_list = []
        
        features = ['AlogP','Molecular_Weight','Num_H_Acceptors','Num_H_Donors','Num_RotatableBonds','LogD','Molecular_PolarSurfaceArea']
        
        features = ['AlogP','Molecular_Weight','Num_H_Acceptors','Num_H_Donors','Num_RotatableBonds','LogD','Molecular_PolarSurfaceArea']

        for _, row in df.iterrows():
            # 데이터 타입을 float로 변환
            row_values = row[features].astype(float).values
            features_tensor = torch.tensor(row_values, dtype=torch.float)
            mlm_label = torch.tensor(row['MLM'], dtype=torch.float)
            hlm_label = torch.tensor(row['HLM'], dtype=torch.float)
            inputs = self.tokenizer(
                row['SMILES'],
                None,
                add_special_tokens=True,
                max_length=self.max_length,
                padding='max_length',
                return_token_type_ids=True,
                truncation=True
            )
            
            input_ids = torch.tensor(inputs["input_ids"], dtype=torch.long)
            attention_mask = torch.tensor(inputs["attention_mask"], dtype=torch.long)
            
            self.features_tensors.append(features_tensor)
            self.mlm_labels.append(mlm_label)
            self.hlm_labels.append(hlm_label)
            self.input_ids_list.append(input_ids)
            self.attention_masks_list.append(attention_mask)
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        return {"input_ids": self.input_ids_list[index], "attention_mask": self.attention_masks_list[index], 'features_tensor': self.features_tensors[index]}, self.mlm_labels[index], self.hlm_labels[index]


Model

In [54]:
import torch.nn as nn
from transformers import RobertaModel

class AttentionBasedRegressor(nn.Module):
    def __init__(self, pretrained_model_name, feature_dim):
        super(AttentionBasedRegressor, self).__init__()
        
        self.encoder = RobertaModel.from_pretrained(pretrained_model_name, output_attentions=True)
        
        self.dropout = nn.Dropout(0.4)
        # Attention score의 가중 평균을 사용하는 리니어
        self.linear_attention = nn.Linear(self.encoder.config.hidden_size, self.encoder.config.hidden_size)
        # 추가적인 특징을 학습하는 리니어
        self.linear_features = nn.Linear(feature_dim,feature_dim)
        self.linear_feat_to_regressor = nn.Linear(feature_dim, self.encoder.config.hidden_size)
        self.relu = nn.LeakyReLU()
        
        # 두 출력을 결합하여 최종 출력을 생성하는 리니어
        self.regressor = nn.Linear(self.encoder.config.hidden_size * 2, 1)
        

    def forward(self, input_ids, attention_mask=None, features=None):
        outputs = self.encoder(input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state
        attention_scores = outputs.attentions[-1]
        attention_weights = attention_scores.mean(dim=1)
        attention_weights_avg = attention_weights.mean(dim=-1)
        weighted_avg = torch.sum(sequence_output * attention_weights_avg.unsqueeze(-1), dim=1)

        out_attention = self.relu(self.linear_attention(weighted_avg))
        
        out_features = self.relu(self.linear_features(features))
        out_features = self.relu(self.linear_features(out_features))
        out_features = self.relu(self.linear_feat_to_regressor(out_features))
        
        combined = torch.cat((out_attention, out_features), dim=1)
        combined = self.dropout(out_features)
        
        prediction = self.regressor(out_features)
        return prediction

Parameter Size Check

In [7]:
model_version = 'seyonec/PubChem10M_SMILES_BPE_450k'
model = AttentionBasedRegressor(model_version,7).to(device)
layer_params = count_parameters_per_layer(model)

print("decoding_layers:",sum(1 for _ in model.modules()) - sum(1 for _ in model.encoder.modules()))
print("decoding_parameters:",sum(p.numel() for p in model.parameters()) - sum(p.numel() for p in model.encoder.parameters()))
# for layer_name, param_count in layer_params.items():
#     print(f"{layer_name}: {param_count} parameters")

decoding_layers: 6
decoding_parameters: 598273


Train

In [8]:
# train_df = pd.read_csv('./origin_data/train.csv')
# test_df = pd.read_csv('./origin_data/test.csv')
# train_max = train_df['SMILES'].astype(str).apply(len).max()
# test_max = test_df['SMILES'].astype(str).apply(len).max()
# print(train_max)
# print(test_max)

174
96


In [56]:
torch.cuda.empty_cache()
MAX_LEN = 180
epochs = 3
batch_size = 32
lr = 1e-4

# wandb.init(    
#            project='drugformer',
#     # track hyperparameters and run metadata
#     config={
#     "learning_rate": lr,
#     "architecture": "Transformer-Linear",
#     "dataset": "Custom",
#     "epochs": epochs,
#     "batch_size": batch_size
#     }
# )
train_df = pd.read_csv('./origin_data/train.csv').drop(columns=['id'])
# 데이터를 학습 및 검증 세트로 분할
train_df, valid_df = train_test_split(train_df, test_size=0.2, random_state=42)
model_version = 'seyonec/PubChem10M_SMILES_BPE_450k'
tokenizer = RobertaTokenizer.from_pretrained(model_version)
# 학습 및 검증 데이터로더 생성
train_dataset = SMILESDataset(train_df, MAX_LEN, tokenizer)
valid_dataset = SMILESDataset(valid_df, MAX_LEN, tokenizer)
feat_dim = 7
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

mlm_model = AttentionBasedRegressor(model_version,feat_dim).to(device)
hlm_model = AttentionBasedRegressor(model_version,feat_dim).to(device)
mlm_optimizer = Adam(mlm_model.parameters(), lr=lr)
hlm_optimizer = Adam(hlm_model.parameters(), lr=lr)


criterion = rmse_loss
scheduler = get_linear_schedule_with_warmup(mlm_optimizer, num_warmup_steps=5, num_training_steps=len(train_dataloader) * epochs)

for epoch in tqdm(range(epochs),desc="Training"):
    model.train()
    for batch in train_dataloader:
        inputs, mlm_label, hlm_label = batch
        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)
        features_tensor = inputs["features_tensor"].to(device)
        mlm_label, hlm_label = mlm_label.to(device), hlm_label.to(device)
        mlm_optimizer.zero_grad()        

        mlm_pred = model(input_ids=input_ids, attention_mask=attention_mask, features=features_tensor)[0]

        #rmse계산
        train_loss_mlm = criterion(mlm_pred.squeeze(), mlm_label)
        #train_loss_hlm = criterion(hlm_pred.squeeze(), hlm_label)
        #train_total_loss = train_loss_mlm*0.5 + train_loss_hlm*0.5
        train_loss_mlm.backward()#역전파
        mlm_optimizer.step()
        scheduler.step()  # 학습률 스케줄러 업데이트
    
    # 검증 부분
    model.eval()
    valid_loss_mlm = 0
    valid_loss_hlm = 0
    with torch.no_grad():
        for batch in valid_dataloader:
            inputs, mlm_label, hlm_label = batch
            input_ids = inputs["input_ids"].to(device)
            attention_mask = inputs["attention_mask"].to(device)
            features_tensor = inputs["features_tensor"].to(device)
            mlm_label, hlm_label = mlm_label.to(device), hlm_label.to(device)    
                    
            mlm_pred = model(input_ids=input_ids, attention_mask=attention_mask, features=features_tensor)[0]
            loss_mlm = criterion(mlm_pred.squeeze(), mlm_label)
            #loss_hlm = criterion(hlm_pred.squeeze(), hlm_label)
            #valid_total_loss = loss_hlm*0.5 + loss_mlm*0.5
        
    print(f"Epoch {epoch+1}/{epochs}, Train MLM Loss: {train_loss_mlm}, Valid MLM Loss: {loss_mlm}")
    #wandb.log({'Train Total Loss': train_total_loss, 'Valid Total Loss': valid_total_loss,'Train MLM Loss': train_loss_mlm, 'Train HLM Loss': train_loss_hlm, 'Valid MLM Loss': loss_mlm, 'Valid HLM Loss': loss_hlm})
#wandb.finish()

Training:  33%|███▎      | 1/3 [00:11<00:23, 11.54s/it]

Epoch 1/3, Train MLM Loss: nan, Valid MLM Loss: nan


Training:  67%|██████▋   | 2/3 [00:23<00:11, 11.59s/it]

Epoch 2/3, Train MLM Loss: nan, Valid MLM Loss: nan


Training: 100%|██████████| 3/3 [00:34<00:00, 11.57s/it]

Epoch 3/3, Train MLM Loss: nan, Valid MLM Loss: nan





Prediction

Model Save

In [None]:
torch.save(model.state_dict(), f'./transformer_model/{NAME}.pth')

불러오기

In [None]:
# 저장된 파라미터 로드
# model.load_state_dict(torch.load('model_parameters.pth'))