In [17]:
import torch
import torch.nn as nn
from transformers import RobertaModel, RobertaTokenizer, get_linear_schedule_with_warmup
from torch.optim import AdamW,Adam
from torch.utils.data import Dataset, DataLoader,random_split
from tqdm import tqdm
import pandas as pd
import numpy as np
import random
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
import wandb

def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)

set_seed(42)


def count_parameters_per_layer(model):
    param_counts = {}
    for name, module in model.named_modules():
        param_counts[name] = sum(p.numel() for p in module.parameters() if p.requires_grad)
    return param_counts

# RMSE 손실 함수 정의
def rmse_loss(y_pred, y_true):
    mse = torch.nn.MSELoss()(y_pred, y_true)
    return torch.sqrt(mse)

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("CUDA is available. Using GPU!")
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU!")
    

CUDA is available. Using GPU!


DataLoader

In [2]:
#텐서로 변환, SMILES토큰화
class SMILESDataset(Dataset):
    #변수선언
    def __init__(self,smile_list,mlm_labels,hlm_labels,max_length,tokenizer):
        self.smile_list = smile_list.tolist()
        self.mlm_labels = mlm_labels.tolist()
        self.hlm_labels = hlm_labels.tolist()
        self.tokenizer = tokenizer.encode_plus
        self.max_length = max_length
    #smiles의 길이 출력 (편의)
    def __len__(self):
        return len(self.smile_list)
    #attention변환 및 tensor변환
    def __getitem__(self, index):
        mlm_label = torch.tensor(self.mlm_labels[index], dtype=torch.float)
        hlm_label = torch.tensor(self.hlm_labels[index], dtype=torch.float)
        inputs = self.tokenizer(
                    self.smile_list[index],
                    None,
                    add_special_tokens=True,
                    max_length=self.max_length,  # 최대 길이 설정
                    padding='max_length',  # 패딩 옵션 추가 SMILES마다 길이가 다른걸 맞춰줌
                    return_token_type_ids=True,
                    truncation=True
                    )
        del inputs['token_type_ids']  # token_type_ids를 제거
        input_ids = torch.tensor(inputs["input_ids"], dtype=torch.long)
        attention_mask = torch.tensor(inputs["attention_mask"], dtype=torch.long)
    
        return {"input_ids": input_ids, "attention_mask": attention_mask}, mlm_label, hlm_label

Model

In [3]:
class AttentionBasedRegressor(nn.Module):
    def __init__(self, pretrained_model_name):
        super(AttentionBasedRegressor, self).__init__()
        self.encoder = RobertaModel.from_pretrained(pretrained_model_name, output_attentions=True)
        
        self.dropout = nn.Dropout(0.1)
        self.fc1 = nn.Linear(self.encoder.config.hidden_size, self.encoder.config.hidden_size)
        self.relu = nn.ReLU()
        
        # 최종 예측을 위한 Linear 레이어
        self.regressor = nn.Linear(self.encoder.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask=None):
        outputs = self.encoder(input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state
        attention_scores = outputs.attentions[-1]
        attention_weights = attention_scores.mean(dim=1)
        attention_weights_avg = attention_weights.mean(dim=-1)
        weighted_avg = torch.sum(sequence_output * attention_weights_avg.unsqueeze(-1), dim=1)


        x = self.fc1(weighted_avg)
        x = self.relu(x)
        x = self.dropout(x)
        
        mlm_prediction = self.regressor(x)
        hlm_prediction = self.regressor(x)
        return mlm_prediction, hlm_prediction


Parameter Size Check

In [4]:
model_version = 'seyonec/PubChem10M_SMILES_BPE_450k'
model = AttentionBasedRegressor(model_version).to(device)
layer_params = count_parameters_per_layer(model)

print("decoding_layers:",sum(1 for _ in model.modules()) - sum(1 for _ in model.encoder.modules()))
print("decoding_parameters:",sum(p.numel() for p in model.parameters()) - sum(p.numel() for p in model.encoder.parameters()))
# for layer_name, param_count in layer_params.items():
#     print(f"{layer_name}: {param_count} parameters")

decoding_layers: 5
decoding_parameters: 591361


Train

In [5]:
train_df = pd.read_csv('./origin_data/train.csv')
test_df = pd.read_csv('./origin_data/test.csv')
train_max = train_df['SMILES'].astype(str).apply(len).max()
test_max = test_df['SMILES'].astype(str).apply(len).max()
print(train_max)
print(test_max)

174
96


In [16]:
torch.cuda.empty_cache()
MAX_LEN = 178
epochs = 10
batch_size = 32
lr = 1e-5
wandb.init(project='drugformer')
wandb.init(
    # set the wandb project where this run will be logged
    project="my-awesome-project",
    
    # track hyperparameters and run metadata
    config={
    "learning_rate": lr,
    "architecture": "Transformer",
    "dataset": "Custom",
    "epochs": epochs,
    "batch_size": batch_size
    }
)
train_df = pd.read_csv('./origin_data/train.csv')
# 데이터셋 분할
train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size])


smiles_list_train = train_df['SMILES']
mlm_labels = train_df['MLM']
hlm_labels = train_df['HLM']



model_version = 'seyonec/PubChem10M_SMILES_BPE_450k'
tokenizer = RobertaTokenizer.from_pretrained(model_version)
dataset = SMILESDataset(smiles_list_train, mlm_labels, hlm_labels, MAX_LEN, tokenizer)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
model = AttentionBasedRegressor(model_version).to(device)
optimizer = Adam(model.parameters(), lr=lr)
criterion = rmse_loss
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=10, num_training_steps=len(dataloader) * epochs)

for epoch in tqdm(range(epochs),desc="Training"):
    for batch in dataloader:
        inputs, mlm_label, hlm_label = batch
        inputs = {key: value.to(device) for key, value in inputs.items()}
        mlm_label, hlm_label = mlm_label.to(device), hlm_label.to(device)
        optimizer.zero_grad()
        mlm_pred, hlm_pred = model(**inputs)
        #rmse계산
        loss_mlm = criterion(mlm_pred.squeeze(), mlm_label)
        loss_hlm = criterion(hlm_pred.squeeze(), hlm_label)
        total_loss = loss_mlm*0.5 + loss_hlm*0.5
        total_loss.backward()#역전파
        optimizer.step()
        scheduler.step()  # 학습률 스케줄러 업데이트
        wandb.log({'Total Loss': total_loss/len(dataloader) ,'MLM_Loss':loss_mlm.item()/len(dataloader) ,'HLM Loss':loss_hlm.item()/len(dataloader),'epoch':epoch})
    print(f"Epoch {epoch+1}/{epochs}, Total Loss: {total_loss/len(dataloader)} MLM_Loss:{loss_mlm.item()/len(dataloader)} HLM Loss:{loss_hlm.item()/len(dataloader)}")
wandb.finish()

Training:  10%|█         | 1/10 [00:14<02:10, 14.49s/it]

Epoch 1/10, Total Loss: 0.43960899114608765 MLM_Loss:0.39234660755504264 HLM Loss:0.4868714072487571


Training:  20%|██        | 2/10 [00:28<01:54, 14.28s/it]

Epoch 2/10, Total Loss: 0.3818061351776123 MLM_Loss:0.2852050261064009 HLM Loss:0.47840728759765627


Training:  30%|███       | 3/10 [00:42<01:39, 14.21s/it]

Epoch 3/10, Total Loss: 0.36632204055786133 MLM_Loss:0.2918754924427379 HLM Loss:0.4407685713334517


Training:  40%|████      | 4/10 [00:57<01:25, 14.32s/it]

Epoch 4/10, Total Loss: 0.3844561278820038 MLM_Loss:0.37461190657182175 HLM Loss:0.39430039145729756


Training:  50%|█████     | 5/10 [01:11<01:11, 14.32s/it]

Epoch 5/10, Total Loss: 0.33576780557632446 MLM_Loss:0.3372519406405362 HLM Loss:0.33428369001908737


Training:  60%|██████    | 6/10 [01:26<00:57, 14.36s/it]

Epoch 6/10, Total Loss: 0.40697982907295227 MLM_Loss:0.40199168812144886 HLM Loss:0.4119680578058416


Training:  70%|███████   | 7/10 [01:40<00:42, 14.33s/it]

Epoch 7/10, Total Loss: 0.36764761805534363 MLM_Loss:0.383680551702326 HLM Loss:0.351614691994407


Training:  80%|████████  | 8/10 [01:54<00:28, 14.30s/it]

Epoch 8/10, Total Loss: 0.2719874083995819 MLM_Loss:0.25629914023659445 HLM Loss:0.28767573616721415


Training:  90%|█████████ | 9/10 [02:08<00:14, 14.29s/it]

Epoch 9/10, Total Loss: 0.3407610356807709 MLM_Loss:0.3253228967840021 HLM Loss:0.35619919516823506


Training: 100%|██████████| 10/10 [02:23<00:00, 14.31s/it]

Epoch 10/10, Total Loss: 0.3600318133831024 MLM_Loss:0.34064781882546163 HLM Loss:0.37941585887562146





0,1
HLM Loss,▇▇█▆▆▄▅▇▆▅▄▇▆▄▄▅▃▄▃▄▃▅▄▃▂▃▄▃▃▄▃▂▁▃▃▃▃▂▃▃
MLM_Loss,▆▇█▆▆▆▅▆▆▄▄▇▅▃▅▅▄▄▂▅▂▄▃▄▂▃▄▄▃▃▃▂▁▄▃▃▄▃▂▂
Total Loss,▇▇█▆▆▅▅▇▆▅▄▇▆▄▄▅▄▄▃▅▂▄▄▃▂▃▄▃▃▃▃▂▁▄▃▃▄▃▃▃
epoch,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▆▇▇▇████

0,1
HLM Loss,0.37942
MLM_Loss,0.34065
Total Loss,0.36003
epoch,9.0


Prediction

In [None]:
#제출물 이름
NAME = "transformer_attention_feat_linear_relu_batch64"
# 1. 모델을 평가 모드로 전환
model.eval()

# 2. 예측을 위한 입력 데이터 준비
test_df = pd.read_csv('./origin_data/test.csv')
test_smiles = test_df['SMILES'].tolist()
encoded_inputs = tokenizer(test_smiles, return_tensors="pt", max_length=MAX_LEN,padding=True, truncation=True)

input_ids = encoded_inputs["input_ids"].to(device)
attention_mask = encoded_inputs["attention_mask"].to(device)

# 3. 예측 수행
with torch.no_grad():
    mlm_preds, hlm_preds = model(input_ids, attention_mask=attention_mask)

# 4. 제출
mlm_predictions = mlm_preds.cpu().numpy()
hlm_predictions = hlm_preds.cpu().numpy()

submission = pd.read_csv('./origin_data/sample_submission.csv')
submission['MLM'] = mlm_predictions
submission['HLM'] = hlm_predictions
submission.to_csv(f'./submission/{NAME}.csv',index=False)

Model Save

In [None]:
torch.save(model, f'./transformer_model/{NAME}.pth')

불러오기

In [None]:
# model = torch.load('full_model.pth')
# model.eval()  # 모델을 평가 모드로 설정 (필요한 경우)