In [4]:
# from transformers import AutoModelWithLMHead, AutoTokenizer, pipeline
import pandas as pd
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModel
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

tqdm.pandas()

In [5]:
# model = AutoModelWithLMHead.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
# tokenizer = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
# fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)

tokenizer = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
model = AutoModel.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")

Some weights of the model checkpoint at seyonec/ChemBERTa-zinc-base-v1 were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
def embed_bert_cls(text, model, tokenizer):
    t = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**{k: v.to(model.device) for k, v in t.items()})
    embeddings = model_output.last_hidden_state[:, 0, :]
    embeddings = torch.nn.functional.normalize(embeddings)
    return embeddings[0].cpu().numpy()

In [7]:
df = pd.read_csv("../data/raw/train.csv", index_col=0)

In [8]:
X = df['Smiles'].progress_apply(lambda smiles: embed_bert_cls(smiles, model, tokenizer))

100%|██████████| 5557/5557 [04:57<00:00, 18.68it/s]


In [9]:
SEED = 1234
TEST_SIZE = 0.2

X_train, X_test, y_train, y_test = train_test_split(
    X,
    df["Active"].astype(int),
    test_size=TEST_SIZE,
    random_state=SEED,
)

In [23]:
import numpy as np

In [26]:
X_train = np.asarray(X_train.to_list())
X_test = np.asarray(X_test.to_list())
y_train = np.asarray(y_train.to_list()) 
y_test = np.asarray(y_test.to_list())

In [12]:
model = CatBoostClassifier(
    auto_class_weights="SqrtBalanced",
    iterations=3000,
    eval_metric="F1",
)

In [27]:
model.fit(
    X_train,
    y_train,
    use_best_model=True,
    eval_set=(X_test, y_test),
)

Learning rate set to 0.028404
0:	learn: 0.0439472	test: 0.0605836	best: 0.0605836 (0)	total: 448ms	remaining: 22m 24s
1:	learn: 0.0000000	test: 0.0000000	best: 0.0605836 (0)	total: 753ms	remaining: 18m 48s
2:	learn: 0.0000000	test: 0.0000000	best: 0.0605836 (0)	total: 1.06s	remaining: 17m 35s
3:	learn: 0.0000000	test: 0.0000000	best: 0.0605836 (0)	total: 1.32s	remaining: 16m 30s
4:	learn: 0.0000000	test: 0.0000000	best: 0.0605836 (0)	total: 1.57s	remaining: 15m 40s
5:	learn: 0.0000000	test: 0.0000000	best: 0.0605836 (0)	total: 1.83s	remaining: 15m 12s
6:	learn: 0.0000000	test: 0.0000000	best: 0.0605836 (0)	total: 2.07s	remaining: 14m 43s
7:	learn: 0.0000000	test: 0.0000000	best: 0.0605836 (0)	total: 2.34s	remaining: 14m 36s
8:	learn: 0.0000000	test: 0.0000000	best: 0.0605836 (0)	total: 2.59s	remaining: 14m 22s
9:	learn: 0.0000000	test: 0.0000000	best: 0.0605836 (0)	total: 2.84s	remaining: 14m 10s
10:	learn: 0.0000000	test: 0.0000000	best: 0.0605836 (0)	total: 3.08s	remaining: 13m 57s
1

KeyboardInterrupt: 