In [17]:
import torch
import torch.nn as nn
from transformers import RobertaModel, RobertaTokenizer, get_linear_schedule_with_warmup
from torch.optim import AdamW,Adam
from torch.utils.data import Dataset, DataLoader,random_split
from tqdm import tqdm
import pandas as pd
import numpy as np
import random
import os
from sklearn.model_selection import train_test_split
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
import wandb

def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)

set_seed(42)


def count_parameters_per_layer(model):
    param_counts = {}
    for name, module in model.named_modules():
        param_counts[name] = sum(p.numel() for p in module.parameters() if p.requires_grad)
    return param_counts

# RMSE 손실 함수 정의
def rmse_loss(y_pred, y_true):
    mse = torch.nn.MSELoss()(y_pred, y_true)
    return torch.sqrt(mse)

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("CUDA is available. Using GPU!")
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU!")
    

CUDA is available. Using GPU!


DataLoader

In [3]:
#텐서로 변환, SMILES토큰화
class SMILESDataset(Dataset):
    #변수선언
    def __init__(self,smile_list,mlm_labels,hlm_labels,max_length,tokenizer):
        self.smile_list = smile_list.tolist()
        self.mlm_labels = mlm_labels.tolist()
        self.hlm_labels = hlm_labels.tolist()
        self.tokenizer = tokenizer.encode_plus
        self.max_length = max_length
    #smiles의 길이 출력 (편의)
    def __len__(self):
        return len(self.smile_list)
    #attention변환 및 tensor변환
    def __getitem__(self, index):
        mlm_label = torch.tensor(self.mlm_labels[index], dtype=torch.float)
        hlm_label = torch.tensor(self.hlm_labels[index], dtype=torch.float)
        inputs = self.tokenizer(
                    self.smile_list[index],
                    None,
                    add_special_tokens=True,
                    max_length=self.max_length,  # 최대 길이 설정
                    padding='max_length',  # 패딩 옵션 추가 SMILES마다 길이가 다른걸 맞춰줌
                    return_token_type_ids=True,
                    truncation=True
                    )
        del inputs['token_type_ids']  # token_type_ids를 제거
        input_ids = torch.tensor(inputs["input_ids"], dtype=torch.long)
        attention_mask = torch.tensor(inputs["attention_mask"], dtype=torch.long)
    
        return {"input_ids": input_ids, "attention_mask": attention_mask}, mlm_label, hlm_label

Model

In [15]:
class AttentionBasedRegressor(nn.Module):
    def __init__(self, pretrained_model_name):
        super(AttentionBasedRegressor, self).__init__()
        self.encoder = RobertaModel.from_pretrained(pretrained_model_name, output_attentions=True)
        
        self.dropout = nn.Dropout(0.4)
        self.fc1 = nn.Linear(self.encoder.config.hidden_size, self.encoder.config.hidden_size)
        self.relu = nn.ReLU()
        
        # 최종 예측을 위한 Linear 레이어
        self.regressor = nn.Linear(self.encoder.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.encoder(input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state
        attention_scores = outputs.attentions[-1]
        attention_weights = attention_scores.mean(dim=1)
        attention_weights_avg = attention_weights.mean(dim=-1)
        weighted_avg = torch.sum(sequence_output * attention_weights_avg.unsqueeze(-1), dim=1)

        x = self.fc1(weighted_avg)
        x = self.relu(x)
        
        x = self.fc1(x)
        x = self.relu(x)
        
        x = self.dropout(x)
        mlm_prediction = self.regressor(x)
        hlm_prediction = self.regressor(x)
        return mlm_prediction, hlm_prediction


Parameter Size Check

In [5]:
model_version = 'seyonec/PubChem10M_SMILES_BPE_450k'
model = AttentionBasedRegressor(model_version).to(device)
layer_params = count_parameters_per_layer(model)

print("decoding_layers:",sum(1 for _ in model.modules()) - sum(1 for _ in model.encoder.modules()))
print("decoding_parameters:",sum(p.numel() for p in model.parameters()) - sum(p.numel() for p in model.encoder.parameters()))
# for layer_name, param_count in layer_params.items():
#     print(f"{layer_name}: {param_count} parameters")

decoding_layers: 5
decoding_parameters: 591361


Train

In [8]:
train_df = pd.read_csv('./origin_data/train.csv')
test_df = pd.read_csv('./origin_data/test.csv')
train_max = train_df['SMILES'].astype(str).apply(len).max()
test_max = test_df['SMILES'].astype(str).apply(len).max()
print(train_max)
print(test_max)

174
96


In [21]:
torch.cuda.empty_cache()
MAX_LEN = 178
epochs = 30
batch_size = 32
lr = 1e-5

wandb.init(    
           project='drugformer',
    # track hyperparameters and run metadata
    config={
    "learning_rate": lr,
    "architecture": "Transformer",
    "dataset": "Custom",
    "epochs": epochs,
    "batch_size": batch_size
    }
)
train_df = pd.read_csv('./origin_data/train.csv')
# 데이터를 학습 및 검증 세트로 분할
train_df, valid_df = train_test_split(train_df, test_size=0.1, random_state=42)





model_version = 'seyonec/PubChem10M_SMILES_BPE_450k'
tokenizer = RobertaTokenizer.from_pretrained(model_version)
# 학습 및 검증 데이터로더 생성
train_dataset = SMILESDataset(train_df['SMILES'], train_df['MLM'], train_df['HLM'], MAX_LEN, tokenizer)
valid_dataset = SMILESDataset(valid_df['SMILES'], valid_df['MLM'], valid_df['HLM'], MAX_LEN, tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

model = AttentionBasedRegressor(model_version).to(device)
optimizer = Adam(model.parameters(), lr=lr)
criterion = rmse_loss
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=5, num_training_steps=len(train_dataloader) * epochs)

for epoch in tqdm(range(epochs),desc="Training"):
    model.train()
    for batch in train_dataloader:
        inputs, mlm_label, hlm_label = batch
        inputs = {key: value.to(device) for key, value in inputs.items()}
        mlm_label, hlm_label = mlm_label.to(device), hlm_label.to(device)
        optimizer.zero_grad()
        mlm_pred, hlm_pred = model(**inputs)
        #rmse계산
        train_loss_mlm = criterion(mlm_pred.squeeze(), mlm_label)
        train_loss_hlm = criterion(hlm_pred.squeeze(), hlm_label)
        train_total_loss = train_loss_mlm*0.5 + train_loss_hlm*0.5
        train_total_loss.backward()#역전파
        optimizer.step()
        scheduler.step()  # 학습률 스케줄러 업데이트
    # 검증 부분
    model.eval()
    valid_loss_mlm = 0
    valid_loss_hlm = 0
    with torch.no_grad():
        for batch in valid_dataloader:
            inputs, mlm_label, hlm_label = batch
            inputs = {key: value.to(device) for key, value in inputs.items()}
            mlm_label, hlm_label = mlm_label.to(device), hlm_label.to(device)
            
            mlm_pred, hlm_pred = model(**inputs)
            loss_mlm = criterion(mlm_pred.squeeze(), mlm_label)
            loss_hlm = criterion(hlm_pred.squeeze(), hlm_label)
            valid_total_loss = loss_hlm*0.5 + loss_mlm*0.5
        
    print(f"Epoch {epoch+1}/{epochs}, Valid MLM Loss: {loss_mlm}, Valid HLM Loss: {loss_hlm}")
    wandb.log({'Train Total Loss': train_total_loss, 'Valid Total Loss': valid_total_loss,'Train MLM Loss': train_loss_mlm, 'Train HLM Loss': train_loss_hlm, 'Valid MLM Loss': loss_mlm, 'Valid HLM Loss': loss_hlm})

wandb.finish()

Training:   3%|▎         | 1/30 [00:14<07:02, 14.58s/it]

Epoch 1/30, Valid MLM Loss: 54.88081359863281, Valid HLM Loss: 72.0864486694336


Training:   7%|▋         | 2/30 [00:29<06:45, 14.50s/it]

Epoch 2/30, Valid MLM Loss: 46.8592643737793, Valid HLM Loss: 62.30195617675781


Training:  10%|█         | 3/30 [00:43<06:27, 14.34s/it]

Epoch 3/30, Valid MLM Loss: 38.367889404296875, Valid HLM Loss: 49.345314025878906


Training:  13%|█▎        | 4/30 [00:57<06:09, 14.20s/it]

Epoch 4/30, Valid MLM Loss: 36.09564208984375, Valid HLM Loss: 43.07697296142578


Training:  17%|█▋        | 5/30 [01:11<05:54, 14.18s/it]

Epoch 5/30, Valid MLM Loss: 35.92513656616211, Valid HLM Loss: 42.03936004638672


Training:  20%|██        | 6/30 [01:25<05:41, 14.23s/it]

Epoch 6/30, Valid MLM Loss: 36.08451843261719, Valid HLM Loss: 43.11637878417969


Training:  23%|██▎       | 7/30 [01:39<05:24, 14.12s/it]

Epoch 7/30, Valid MLM Loss: 35.822052001953125, Valid HLM Loss: 42.92634963989258


Training:  27%|██▋       | 8/30 [01:53<05:09, 14.06s/it]

Epoch 8/30, Valid MLM Loss: 32.71649169921875, Valid HLM Loss: 43.061283111572266


Training:  30%|███       | 9/30 [02:07<04:54, 14.03s/it]

Epoch 9/30, Valid MLM Loss: 31.195390701293945, Valid HLM Loss: 37.285362243652344


Training:  33%|███▎      | 10/30 [02:21<04:40, 14.01s/it]

Epoch 10/30, Valid MLM Loss: 31.056629180908203, Valid HLM Loss: 40.094947814941406


Training:  37%|███▋      | 11/30 [02:35<04:25, 13.99s/it]

Epoch 11/30, Valid MLM Loss: 32.01328659057617, Valid HLM Loss: 36.203372955322266


Training:  40%|████      | 12/30 [02:49<04:11, 13.99s/it]

Epoch 12/30, Valid MLM Loss: 32.34675979614258, Valid HLM Loss: 37.10889434814453


Training:  43%|████▎     | 13/30 [03:03<03:59, 14.11s/it]

Epoch 13/30, Valid MLM Loss: 32.1978874206543, Valid HLM Loss: 35.8953971862793


Training:  47%|████▋     | 14/30 [03:18<03:47, 14.23s/it]

Epoch 14/30, Valid MLM Loss: 33.25883102416992, Valid HLM Loss: 33.86742401123047


Training:  50%|█████     | 15/30 [03:32<03:32, 14.14s/it]

Epoch 15/30, Valid MLM Loss: 33.814002990722656, Valid HLM Loss: 35.366905212402344


Training:  53%|█████▎    | 16/30 [03:46<03:17, 14.12s/it]

Epoch 16/30, Valid MLM Loss: 34.78998565673828, Valid HLM Loss: 36.78506088256836


Training:  57%|█████▋    | 17/30 [04:00<03:05, 14.27s/it]

Epoch 17/30, Valid MLM Loss: 34.76375961303711, Valid HLM Loss: 32.880706787109375


Training:  60%|██████    | 18/30 [04:14<02:50, 14.21s/it]

Epoch 18/30, Valid MLM Loss: 35.174530029296875, Valid HLM Loss: 31.21741485595703


Training:  63%|██████▎   | 19/30 [04:28<02:35, 14.15s/it]

Epoch 19/30, Valid MLM Loss: 34.95778274536133, Valid HLM Loss: 33.11790084838867


Training:  67%|██████▋   | 20/30 [04:42<02:21, 14.10s/it]

Epoch 20/30, Valid MLM Loss: 35.343441009521484, Valid HLM Loss: 32.01578903198242


Training:  70%|███████   | 21/30 [04:56<02:06, 14.08s/it]

Epoch 21/30, Valid MLM Loss: 35.56621170043945, Valid HLM Loss: 33.6020393371582


Training:  73%|███████▎  | 22/30 [05:10<01:52, 14.05s/it]

Epoch 22/30, Valid MLM Loss: 35.75987243652344, Valid HLM Loss: 34.228302001953125


Training:  77%|███████▋  | 23/30 [05:25<01:38, 14.11s/it]

Epoch 23/30, Valid MLM Loss: 35.79689025878906, Valid HLM Loss: 32.507293701171875


Training:  80%|████████  | 24/30 [05:39<01:24, 14.04s/it]

Epoch 24/30, Valid MLM Loss: 35.87950134277344, Valid HLM Loss: 32.21288299560547


Training:  83%|████████▎ | 25/30 [05:53<01:10, 14.11s/it]

Epoch 25/30, Valid MLM Loss: 36.125179290771484, Valid HLM Loss: 31.331857681274414


Training:  87%|████████▋ | 26/30 [06:07<00:56, 14.16s/it]

Epoch 26/30, Valid MLM Loss: 36.308502197265625, Valid HLM Loss: 31.397075653076172


Training:  90%|█████████ | 27/30 [06:21<00:42, 14.11s/it]

Epoch 27/30, Valid MLM Loss: 36.75664138793945, Valid HLM Loss: 31.78180694580078


Training:  93%|█████████▎| 28/30 [06:35<00:28, 14.19s/it]

Epoch 28/30, Valid MLM Loss: 36.848209381103516, Valid HLM Loss: 32.01435470581055


Training:  97%|█████████▋| 29/30 [06:49<00:14, 14.08s/it]

Epoch 29/30, Valid MLM Loss: 36.809303283691406, Valid HLM Loss: 31.71328353881836


Training: 100%|██████████| 30/30 [07:03<00:00, 14.12s/it]

Epoch 30/30, Valid MLM Loss: 36.83705139160156, Valid HLM Loss: 31.648778915405273





0,1
Train HLM Loss,█▄▅▄▃▅▄▄▄▃▃▂▂▄▃▂▂▄▃▄▃▁▁▃▃▁▂▂▃▃
Train MLM Loss,█▅▄▄▄▄▃▅▄▃▄▃▃▃▂▂▂▄▃▃▃▂▃▂▁▂▃▂▂▂
Train Total Loss,█▄▄▄▃▄▄▅▃▃▃▂▃▄▂▂▂▄▃▃▃▁▂▂▂▁▃▂▂▂
Valid HLM Loss,█▆▄▃▃▃▃▃▂▃▂▂▂▁▂▂▁▁▁▁▁▂▁▁▁▁▁▁▁▁
Valid MLM Loss,█▆▃▂▂▂▂▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▃▃▃▃▃
Valid Total Loss,█▆▃▂▂▂▂▂▁▂▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
Train HLM Loss,27.98411
Train MLM Loss,19.56586
Train Total Loss,23.77498
Valid HLM Loss,31.64878
Valid MLM Loss,36.83705
Valid Total Loss,34.24292


Prediction

In [None]:
#제출물 이름
NAME = "transformer_attention_feat_linear_relu_batch64"
# 1. 모델을 평가 모드로 전환
model.eval()

# 2. 예측을 위한 입력 데이터 준비
test_df = pd.read_csv('./origin_data/test.csv')
test_smiles = test_df['SMILES'].tolist()
encoded_inputs = tokenizer(test_smiles, return_tensors="pt", max_length=MAX_LEN,padding=True, truncation=True)

input_ids = encoded_inputs["input_ids"].to(device)
attention_mask = encoded_inputs["attention_mask"].to(device)

# 3. 예측 수행
with torch.no_grad():
    mlm_preds, hlm_preds = model(input_ids, attention_mask=attention_mask)

# 4. 제출
mlm_predictions = mlm_preds.cpu().numpy()
hlm_predictions = hlm_preds.cpu().numpy()

submission = pd.read_csv('./origin_data/sample_submission.csv')
submission['MLM'] = mlm_predictions
submission['HLM'] = hlm_predictions
submission.to_csv(f'./submission/{NAME}.csv',index=False)

Model Save

In [None]:
torch.save(model.state_dict(), f'./transformer_model/{NAME}.pth')

불러오기

In [None]:
# 저장된 파라미터 로드
# model.load_state_dict(torch.load('model_parameters.pth'))