In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForMaskedLM


In [9]:
# 데이터 불러오기
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# BERT 토크나이저 초기화

tokenizer = AutoTokenizer.from_pretrained("mrm8488/chEMBL_smiles_v1")
model = AutoModelForMaskedLM.from_pretrained("mrm8488/chEMBL_smiles_v1")

Some weights of the model checkpoint at mrm8488/chEMBL_smiles_v1 were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
class SMILESDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        smiles = str(self.data.loc[index, 'SMILES'])
        targets = self.data.loc[index, 'MLM']
        
        inputs = self.tokenizer.encode_plus(
            smiles,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(targets, dtype=torch.float)
        }

# 파라미터 설정
MAX_LEN = 150
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4

# 데이터 분할
train, val = train_test_split(train_data, test_size=0.1)

# 데이터셋 및 데이터로더 생성
train_dataset = SMILESDataset(train, tokenizer, MAX_LEN)
train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)

val_dataset = SMILESDataset(val, tokenizer, MAX_LEN)
val_loader = DataLoader(val_dataset, batch_size=VALID_BATCH_SIZE)


In [12]:
class TransformerRegressor(nn.Module):
    def __init__(self):
        super(TransformerRegressor, self).__init__()
        self.bert = BertModel.from_pretrained('/mrm8488/chEMBL26_smiles_v1')
        self.drop = nn.Dropout(0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, 1)
        
    def forward(self, ids, mask):
        _, pooled_output = self.bert(ids, attention_mask=mask, return_dict=False)
        output = self.drop(pooled_output)
        return self.out(output)


In [13]:
# 모델, 손실 함수, 최적화 알고리즘 초기화
model = TransformerRegressor().to(device)
loss_fn = nn.MSELoss().to(device)
optimizer = optim.AdamW(model.parameters(), lr=1e-5)

# 학습
EPOCHS = 3

for epoch in range(EPOCHS):
    model.train()
    for _, data in enumerate(train_loader):
        ids = data['ids'].to(device)
        mask = data['mask'].to(device)
        targets = data['targets'].to(device)

        outputs = model(ids, mask).squeeze()
        loss = loss_fn(outputs, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # 검증
    model.eval()
    final_outputs = []
    final_targets = []
    with torch.no_grad():
        for _, data in enumerate(val_loader):
            ids = data['ids'].to(device)
            mask = data['mask'].to(device)
            targets = data['targets'].to(device)
            outputs = model(ids, mask).squeeze()
            final_outputs.extend(outputs.tolist())
            final_targets.extend(targets.tolist())

    rmse = np.sqrt(mean_squared_error(final_targets, final_outputs))
    print(f"Epoch {epoch + 1}/{EPOCHS} - Validation RMSE: {rmse:.4f}")


HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/mrm8488/chEMBL26_smiles_v1'. Use `repo_type` argument if needed.

In [15]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import mean_squared_error
import torch.nn as nn

# 데이터 불러오기
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# 토크나이저와 모델 로딩
tokenizer = BertTokenizer.from_pretrained('mrm8488/chEMBL26_smiles_v1')
model = BertForSequenceClassification.from_pretrained('mrm8488/chEMBL26_smiles_v1', num_labels=1)

# 데이터 토크나이징
train_encodings = tokenizer(list(train_data['SMILES']), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(list(test_data['SMILES']), truncation=True, padding=True, max_length=512)

# DataLoader 준비
train_dataset = torch.utils.data.TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], torch.tensor(train_data['MLM'].values))
test_dataset = torch.utils.data.TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], torch.tensor(test_data['MLM'].values))

# 학습 설정
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=10,  # More epochs
    per_device_train_batch_size=16,  # Bigger batch size
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=50,
    eval_steps=50,
    evaluation_strategy="steps"
)

optimizer = AdamW(model.parameters(), lr=3e-4, correct_bias=False)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=500, num_training_steps=len(train_dataset)*10)
criterion = nn.MSELoss()

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    optimizers=(optimizer, scheduler),
    compute_metrics=lambda p: {'rmse': mean_squared_error(p.label_ids, p.predictions)}
)

# 모델 학습
trainer.train()

# MLM, HLM 예측
train_predictions = trainer.predict(train_dataset)
test_predictions_MLM = trainer.predict(test_dataset)

# HLM 학습 및 예측
train_dataset_HLM = torch.utils.data.TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], torch.tensor(train_data['HLM'].values))
test_dataset_HLM = torch.utils.data.TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], torch.tensor(test_data['HLM'].values))

trainer.train_dataset = train_dataset_HLM
trainer.eval_dataset = test_dataset_HLM

trainer.train()

test_predictions_HLM = trainer.predict(test_dataset_HLM)


OSError: mrm8488/chEMBL26_smiles_v1 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`