In [None]:
# Mount Google Drive at /content/drive; ROOT_DIR below points inside this mount.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the pretrained ChemBERTa tokenizer and model from a single checkpoint name.
from transformers import AutoTokenizer, AutoModelForSequenceClassification

CHECKPOINT = "seyonec/ChemBERTa-zinc-base-v1"

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
# num_labels=1: one output per molecule, trained below against float pIC50 labels.
model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT, num_labels=1)

In [None]:
!pip install rdkit

In [None]:
import pandas as pd
import numpy as np
import os
import random

from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
import torch
from torch.utils.data import Dataset

In [None]:
# Notebook-wide configuration values.
CFG = dict(
    NBITS=2048,  # not referenced by any of the visible cells
    SEED=42,     # handed to seed_everything() right after its definition
)

def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed applied to Python's ``random`` module, the
            ``PYTHONHASHSEED`` environment variable, NumPy's global RNG and
            PyTorch (CPU and, when present, all CUDA devices).
    """
    random.seed(seed)
    # Only affects hash randomisation of interpreters started after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    # The model is trained with PyTorch, so its RNGs must be seeded as well;
    # otherwise dropout and data shuffling are not covered by this function.
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # safe to call when CUDA is unavailable
seed_everything(CFG['SEED']) # Fix the random seed

In [None]:
# Directory (on the mounted Drive) holding train.csv, test.csv and sample_submission.csv.
ROOT_DIR = '/content/drive/MyDrive/JUMP_AI_2024/Open/'

In [None]:
# Read the train and test CSV files from ROOT_DIR; the bare `train` displays the frame.
train, test = (
    pd.read_csv(os.path.join(ROOT_DIR, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)
train

In [None]:
def tokenize_smiles(smiles):
    """Tokenise SMILES input with the module-level ``tokenizer``.

    Args:
        smiles: The SMILES text handed straight to the tokenizer.

    Returns:
        The tokenizer's output with PyTorch tensors (``return_tensors='pt'``),
        truncated to at most 128 tokens.
    """
    encoding = tokenizer(
        smiles,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors='pt',
    )
    return encoding

In [None]:
# Tokenise every SMILES string on its own: one encoding per molecule.
train_token = list(map(tokenize_smiles, train['Smiles']))
test_token = list(map(tokenize_smiles, test['Smiles']))

In [None]:
# Regression target: the pIC50 column of the training table.
y = train['pIC50'].values

In [None]:
# Hold out 20% of the tokenised training molecules for validation, seeded from
# the shared config rather than a second hard-coded 42.
# NOTE: this rebinds `train_token` to the 80% subset, so the cell is not
# idempotent — re-run the tokenisation cell before running this one again.
train_token, val_token, y_train, y_val = train_test_split(
    train_token, y, test_size=0.2, random_state=CFG['SEED']
)

In [None]:
class SMILESDataset(Dataset):
    """Dataset over per-molecule token encodings with optional regression targets.

    Args:
        tokens: Indexable collection of mappings from field name to a tensor
            whose leading dimension is squeezed away on access.
        targets: Optional indexable collection of numeric labels aligned with
            ``tokens``; when omitted, items carry no ``'labels'`` entry.
    """

    def __init__(self, tokens, targets=None):
        self.targets = targets
        self.tokens = tokens

    def __len__(self):
        """Number of encoded molecules."""
        return len(self.tokens)

    def __getitem__(self, idx):
        """Return the fields of item ``idx``, plus ``'labels'`` when targets exist."""
        sample = {}
        for name, tensor in self.tokens[idx].items():
            # Drop the leading axis of size 1 added by per-molecule tokenisation.
            sample[name] = tensor.squeeze(0)
        if self.targets is None:
            return sample
        sample['labels'] = torch.tensor(self.targets[idx], dtype=torch.float)
        return sample

In [None]:
# Wrap the splits for the Trainer; the test set is built without targets,
# so its items carry no 'labels' entry.
train_dataset = SMILESDataset(train_token, y_train)
val_dataset = SMILESDataset(val_token, y_val)
test_dataset = SMILESDataset(test_token)

In [None]:
# Collator built from the tokenizer; handed to the Trainer as `data_collator`
# so variable-length encodings can be batched together.
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
# Fine-tuning hyperparameters; evaluation on the validation split runs once per epoch.
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01,
    # Take the seed from the notebook-wide config instead of leaving it implicit,
    # so a single setting controls the whole run.
    seed=CFG['SEED'],
)

# Trainer wiring: model, arguments, train/validation datasets and the padding collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)

In [None]:
# Run fine-tuning with the arguments configured above.
trainer.train()

In [None]:
# Run inference on the unlabeled test dataset.
predictions = trainer.predict(test_dataset)

In [None]:
def pIC50_to_IC50_nM(pIC50_values):
    """Convert pIC50 value(s) to IC50 expressed in nanomolar.

    Args:
        pIC50_values: A number or array of pIC50 values.

    Returns:
        ``10 ** (-pIC50) * 1e9`` computed element-wise, i.e. the molar IC50
        scaled to nM.
    """
    molar = 10 ** (-pIC50_values)
    return molar * 1e9

# The model was created with a single output, so the raw predictions are
# expected to come back as a column of shape (n_samples, 1). Flatten to 1-D so
# the converted values line up with a single DataFrame column.
predicted_pIC50 = predictions.predictions.reshape(-1)
predicted_IC50_nM = pIC50_to_IC50_nM(predicted_pIC50)

In [None]:
# Fill the sample submission's IC50_nM column with the converted predictions.
# NOTE(review): assumes sample_submission.csv rows are in the same order as test.csv — confirm.
submit = pd.read_csv(os.path.join(ROOT_DIR, 'sample_submission.csv'))
submit['IC50_nM'] = predicted_IC50_nM
submit.head()

In [None]:
# Write the submission without the index column. The path is relative to the
# current working directory, not ROOT_DIR.
submit.to_csv('ChemBERTa_submit.csv', index=False)