In [10]:
from IPython.display import clear_output as clr
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoModelForMaskedLM, AutoTokenizer

In [11]:
# Load the ChemBERTa tokenizer and masked-LM checkpoint. Both names are
# module-level globals that featurize_ChemBERTa below reads directly.
tokenizer = AutoTokenizer.from_pretrained("DeepChem/ChemBERTa-77M-MTR")

# .eval() puts the module in inference mode and returns the module itself.
chemberta = AutoModelForMaskedLM.from_pretrained("DeepChem/ChemBERTa-77M-MTR").eval()
def featurize_ChemBERTa(smiles_list, padding=True):
    """Turn SMILES strings into fixed-width ChemBERTa feature vectors.

    Every string is tokenized and run through the module-level ``chemberta``
    model on its own (a batch of one) under ``torch.no_grad()``. Two poolings
    of the model's first output, ``model_output[0]``, are collected:

    * "cls":  the vector at sequence position 0;
    * "mean": the average of the vectors at the positions the tokenizer's
      attention mask marks with 1, so pad positions (for example with
      ``padding="max_length"``) do not dilute the average.

    NOTE(review): ``chemberta`` is created with ``AutoModelForMaskedLM``, so
    ``model_output[0]`` is presumably the masked-LM head's per-token logits
    rather than encoder hidden states -- confirm that this is the intended
    feature before relying on it.

    Args:
        smiles_list: Iterable of SMILES strings (list, pandas Series, ...).
        padding: Forwarded unchanged to the tokenizer's ``padding`` argument.
            NOTE(review): with a single string per call, ``True`` and
            ``False`` presumably tokenize identically -- confirm before
            treating the two settings as distinct feature sets.

    Returns:
        Tuple ``(cls, mean)`` of float32 NumPy arrays, one row per input
        string. The row width is the last dimension of the model output
        instead of a hard-coded 600, so other checkpoints work unchanged.
        For empty input both arrays have shape
        ``(0, chemberta.config.vocab_size)``.
    """
    cls_rows = []
    mean_rows = []

    with torch.no_grad():
        for smiles in tqdm(smiles_list):
            encoded_input = tokenizer(smiles, return_tensors="pt", padding=padding, truncation=True)
            # First model output, batch element 0 -> (sequence, width).
            token_vectors = chemberta(**encoded_input)[0][0].float()

            cls_rows.append(token_vectors[0])

            # Average over real tokens only; positions with mask 0 are padding.
            mask = encoded_input.get("attention_mask")
            if mask is not None:
                token_vectors = token_vectors[mask[0].bool()]
            mean_rows.append(token_vectors.mean(dim=0))

    if not cls_rows:
        # No forward pass happened, so the width cannot be read from an output.
        # For a masked-LM head the output width is the vocabulary size.
        width = chemberta.config.vocab_size
        return (
            np.zeros((0, width), dtype=np.float32),
            np.zeros((0, width), dtype=np.float32),
        )

    return torch.stack(cls_rows).numpy(), torch.stack(mean_rows).numpy()
# Wipe everything this cell has printed so far (presumably the model-loading
# messages -- confirm); clr is IPython's clear_output.
clr()

In [12]:
# Sanity check of the featurization on two molecules, printing the tensor
# shapes at every step. `chemberta` and `tokenizer` are the objects loaded in
# the previous cell; they are reused here instead of being loaded a second time.
smiles_list = ['Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1','CN1CCC[C@H]1c2cccnc2']
chemberta.eval()

# One (1, width) row per molecule; the width comes from the model output
# rather than a hard-coded 600.
cls_rows = []
mean_rows = []

with torch.no_grad():
    for smiles in tqdm(smiles_list):
        print(smiles)
        encoded_input = tokenizer(smiles, return_tensors="pt",padding=False,truncation=True)
        print(encoded_input['input_ids'].shape)
        model_output = chemberta(**encoded_input)
        print(model_output[0].shape)

        # Vector at sequence position 0.
        cls_rows.append(model_output[0][:, 0, :])
        # Average over the sequence axis.
        mean_rows.append(torch.mean(model_output[0], 1))

embeddings_cls = torch.cat(cls_rows)
embeddings_mean = torch.cat(mean_rows)

print(embeddings_cls.numpy().shape)
print(embeddings_mean.numpy().shape)

100%|██████████| 2/2 [00:00<00:00, 126.95it/s]

Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1
torch.Size([1, 39])
torch.Size([1, 39, 600])
CN1CCC[C@H]1c2cccnc2
torch.Size([1, 18])
torch.Size([1, 18, 600])
(2, 600)
(2, 600)





In [13]:
# Read de_train.parquet from the Kaggle input directory into a DataFrame.
fn = '/kaggle/input/open-problems-single-cell-perturbations/de_train.parquet'
df_de_train = pd.read_parquet(path=fn)

In [14]:
# Identifying columns of the training table.
feat_train = df_de_train[['cell_type', 'sm_name', 'SMILES']]

# Lookup from compound name to SMILES, built from the first row seen for each
# compound name.
sm_name2smiles = (
    feat_train
    .drop_duplicates(subset='sm_name')
    .set_index('sm_name')['SMILES']
    .to_dict()
)

In [15]:
# Read id_map.csv from the Kaggle input directory into a DataFrame.
fn = '/kaggle/input/open-problems-single-cell-perturbations/id_map.csv'
df_id_map = pd.read_csv(filepath_or_buffer=fn)

In [16]:
# Add a SMILES column to id_map by looking each row's compound name up in
# sm_name2smiles; a name missing from the map raises KeyError.
df_id_map['SMILES'] = [
    sm_name2smiles[compound]
    for compound in df_id_map['sm_name'].tolist()
]

In [17]:
# Featurize the SMILES column of both tables, first with the tokenizer's
# padding switched on ...
train_cls_pad_true, train_mean_pad_true = featurize_ChemBERTa(df_de_train['SMILES'], padding=True)
test_cls_pad_true, test_mean_pad_true = featurize_ChemBERTa(df_id_map['SMILES'], padding=True)

# ... then with it switched off.
train_cls_pad_false, train_mean_pad_false = featurize_ChemBERTa(df_de_train['SMILES'], padding=False)
test_cls_pad_false, test_mean_pad_false = featurize_ChemBERTa(df_id_map['SMILES'], padding=False)

100%|██████████| 614/614 [00:05<00:00, 103.65it/s]
100%|██████████| 255/255 [00:02<00:00, 112.27it/s]
100%|██████████| 614/614 [00:05<00:00, 111.03it/s]
100%|██████████| 255/255 [00:02<00:00, 113.13it/s]


In [18]:
# Output file -> array, written to the working directory in this order.
# NOTE(review): the sm_name arrays are presumably object-dtype strings, which
# np.save pickles -- readers would then need allow_pickle=True; verify.
arrays_to_save = {
    'train_sm_name.npy': df_de_train.sm_name.values,
    'train_ChemBERTa_v2_77MTR_cls_pad_True.npy': train_cls_pad_true,
    'train_ChemBERTa_v2_77MTR_mean_pad_True.npy': train_mean_pad_true,
    'train_ChemBERTa_v2_77MTR_cls_pad_False.npy': train_cls_pad_false,
    'train_ChemBERTa_v2_77MTR_mean_pad_False.npy': train_mean_pad_false,
    'test_sm_name.npy': df_id_map.sm_name.values,
    'test_ChemBERTa_v2_77MTR_cls_pad_True.npy': test_cls_pad_true,
    'test_ChemBERTa_v2_77MTR_mean_pad_True.npy': test_mean_pad_true,
    'test_ChemBERTa_v2_77MTR_cls_pad_False.npy': test_cls_pad_false,
    'test_ChemBERTa_v2_77MTR_mean_pad_False.npy': test_mean_pad_false,
}

for npy_name, payload in arrays_to_save.items():
    np.save(npy_name, payload)