In [5]:
import os

os.chdir('/home/yz979/code/kaggle-perturbation/')
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

train_path = 'data/de_train.parquet'
submit_path = 'data/id_map.parquet'

In [53]:
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoModelForMaskedLM, AutoTokenizer
from sklearn.decomposition import TruncatedSVD

chemberta = AutoModelForMaskedLM.from_pretrained("DeepChem/ChemBERTa-77M-MTR")
tokenizer = AutoTokenizer.from_pretrained("DeepChem/ChemBERTa-77M-MTR")

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MTR and are newly initialized: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
de_train = pd.read_parquet(train_path)
de_train.head()

Unnamed: 0,cell_type,sm_name,sm_lincs_id,SMILES,control,A1BG,A1BG-AS1,A2M,A2M-AS1,A2MP1,...,ZUP1,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1
0,NK cells,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.10472,-0.077524,-1.625596,-0.144545,0.143555,...,-0.227781,-0.010752,-0.023881,0.674536,-0.453068,0.005164,-0.094959,0.034127,0.221377,0.368755
1,T cells CD4+,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.915953,-0.88438,0.371834,-0.081677,-0.498266,...,-0.494985,-0.303419,0.304955,-0.333905,-0.315516,-0.369626,-0.095079,0.70478,1.096702,-0.869887
2,T cells CD8+,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,-0.387721,-0.305378,0.567777,0.303895,-0.022653,...,-0.119422,-0.033608,-0.153123,0.183597,-0.555678,-1.494789,-0.21355,0.415768,0.078439,-0.259365
3,T regulatory cells,Clotrimazole,LSM-5341,Clc1ccccc1C(c1ccccc1)(c1ccccc1)n1ccnc1,False,0.232893,0.129029,0.336897,0.486946,0.767661,...,0.451679,0.704643,0.015468,-0.103868,0.865027,0.189114,0.2247,-0.048233,0.216139,-0.085024
4,NK cells,Mometasone Furoate,LSM-3349,C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C...,False,4.290652,-0.063864,-0.017443,-0.541154,0.570982,...,0.758474,0.510762,0.607401,-0.123059,0.214366,0.487838,-0.819775,0.112365,-0.122193,0.676629


In [58]:
smiles = de_train['SMILES'].unique()

# generate embeddings for all smiles
embeddings = []
for smile in tqdm(smiles):
    encoded_input = tokenizer(smile, return_tensors="pt", padding=True, truncation=True)
    model_output = chemberta(**encoded_input).logits
    model_output = model_output.squeeze(0)[0].detach()
    embeddings.append(model_output)
embeddings = torch.stack(embeddings)
print(embeddings, embeddings.shape)


# pca
embeddings = embeddings.numpy()
svd = TruncatedSVD(n_components=128, n_iter=10, random_state=42)
svd.fit(embeddings)
embeddings = svd.transform(embeddings)

embeddings = torch.from_numpy(embeddings)
print(embeddings, embeddings.shape)
torch.save(embeddings, 'models/chemberta_embeddings.pt')

100%|██████████| 146/146 [00:01<00:00, 137.87it/s]

tensor([[ 0.3904,  0.0000, -0.1212,  ..., -0.8146,  0.3121, -0.1362],
        [-0.1405,  0.0000, -0.5193,  ..., -0.0138,  0.4988,  0.0556],
        [-0.0302,  0.0000,  0.4371,  ..., -0.2585,  0.1269,  0.5169],
        ...,
        [ 0.2469,  0.0000,  0.2397,  ...,  0.0526,  0.2168,  0.4422],
        [-0.0281,  0.0000,  0.1978,  ...,  0.1043,  0.3731,  0.4968],
        [ 0.0778,  0.0000, -0.0916,  ...,  0.1080, -0.2845,  0.1520]]) torch.Size([146, 600])
tensor([[ 2.1149e+01,  1.5123e-01,  9.5433e+00,  ...,  1.1539e-01,
         -9.3018e-02, -1.5144e-01],
        [-1.1044e+01,  4.4376e+00, -9.0124e+00,  ..., -2.0512e-02,
          3.3142e-03,  6.4464e-02],
        [ 2.3638e+01, -2.4699e+01,  1.0322e+01,  ...,  9.8469e-02,
         -7.6552e-02, -2.3233e-02],
        ...,
        [ 8.0757e+00, -2.1312e+01,  1.9476e+00,  ..., -9.0321e-02,
          1.4035e-01,  5.7635e-02],
        [ 4.9114e+00,  3.0020e+01,  2.3561e+01,  ...,  5.4698e-02,
          2.8922e-02, -1.1128e-02],
        [-4.600




In [54]:
embeddings = torch.load('models/geneformer_embeddings.pt').squeeze(0)

# pca
embeddings = embeddings.detach().numpy()
svd = TruncatedSVD(n_components=128, n_iter=10, random_state=42)
svd.fit(embeddings)
embeddings = svd.transform(embeddings)

embeddings = torch.from_numpy(embeddings)
embeddings = embeddings.unsqueeze(0)
torch.save(embeddings, 'models/geneformer_embeddings_.pt')