In [1]:
# Load model directly
from transformers import RobertaTokenizer, RobertaModel, RobertaConfig
import torch
import torch.onnx

# Both the tokenizer and the encoder are loaded from the same pretrained
# checkpoint, so the identifier is named once and reused.
checkpoint_name = "seyonec/ChemBERTa-zinc-base-v1"
tokenizer = RobertaTokenizer.from_pretrained(checkpoint_name)
model = RobertaModel.from_pretrained(checkpoint_name)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the configuration object that belongs to the same checkpoint as the
# tokenizer and model.
# NOTE(review): `config` is not referenced by any other cell visible here.
config = RobertaConfig.from_pretrained(
    pretrained_model_name_or_path="seyonec/ChemBERTa-zinc-base-v1"
)


In [3]:
# Two example molecules given as SMILES strings.
smiles_batch = [
    "Cc1ncc2c(n1)CN(c1nc(C(=O)N[C@H](C)c3ccc(C(F)(F)F)cc3)cc(=O)[nH]1)C2",
    "CC(C)CN(Cc1cc(Cl)c2c(c1)OCCCO2)C(=O)C(C)CNCc1cccc2[nH]ncc12",
]

# padding=True pads the shorter sequence so both rows have the same length;
# return_tensors='pt' yields PyTorch tensors (input_ids and attention_mask).
tokens = tokenizer(smiles_batch, padding=True, return_tensors='pt')
tokens

{'input_ids': tensor([[  0, 267,  21, 298,  22,  71,  12,  82,  21,  13, 286,  12,  71,  21,
         270,  12,  39, 263,  51,  13,  50,  63,  39,  36,  44, 265,  39,  13,
          71,  23, 264,  12,  39,  12,  42, 281,  42,  13,  42,  13, 261,  23,
          13, 261, 263,  51, 272, 290,  65,  21,  13,  39,  22,   2],
        [  0, 262,  12,  39,  13, 286,  12, 267,  21, 261,  12, 278,  13,  71,
          22,  71,  12,  71,  21,  13, 420,  22,  13,  39, 263,  51,  13,  39,
          12,  39,  13, 463,  21, 276,  22,  63, 290,  65, 298, 304,   2,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,


In [6]:
# Run the tokenized batch through the encoder. Gradient tracking is switched
# off because this is inference only.
with torch.no_grad():
    outputs = model(**tokens)

# Per-token hidden states of the final layer; `outputs` is kept around
# because a later cell inspects it as a whole.
embeddings = outputs.last_hidden_state
embeddings

tensor([[[ 1.2583, -0.7915, -0.0595,  ..., -1.3572,  0.4673,  0.8433],
         [ 1.2943, -0.5931, -0.7251,  ...,  0.9675,  0.5271, -0.6003],
         [ 0.0241,  0.5063,  0.2055,  ...,  1.1754, -0.0095, -1.0407],
         ...,
         [ 0.8738, -0.1259, -0.2457,  ...,  0.8148, -1.4340,  2.1456],
         [ 0.7482, -0.7841, -1.3179,  ...,  0.4312,  0.6104,  1.5554],
         [ 2.8033,  0.1085, -0.7306,  ..., -0.4948, -0.6792,  1.0854]],

        [[ 1.8713,  0.7965, -0.4507,  ..., -0.4820, -0.7786,  1.2478],
         [ 1.5011, -0.7081, -0.4598,  ...,  0.7532,  0.4367, -0.7854],
         [ 3.1207, -0.1093, -0.2326,  ...,  0.5469, -0.1305, -0.0347],
         ...,
         [ 1.0350,  0.9091, -0.9591,  ..., -0.8376, -0.6618,  1.4139],
         [ 1.0350,  0.9091, -0.9591,  ..., -0.8376, -0.6618,  1.4139],
         [ 1.0350,  0.9091, -0.9591,  ..., -0.8376, -0.6618,  1.4139]]])

In [5]:
# The last hidden state (the first field of the model output shown below) is
# the feature vector we're interested in.
# A bare `len(outputs[0])` used to sit here, but a notebook cell only
# displays its final expression, so that value was computed and silently
# discarded. It has been dropped; the cell still displays the full output.
outputs

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 1.2583, -0.7915, -0.0595,  ..., -1.3572,  0.4673,  0.8433],
         [ 1.2943, -0.5931, -0.7251,  ...,  0.9675,  0.5271, -0.6003],
         [ 0.0241,  0.5063,  0.2055,  ...,  1.1754, -0.0095, -1.0407],
         ...,
         [ 0.8738, -0.1259, -0.2457,  ...,  0.8148, -1.4340,  2.1456],
         [ 0.7482, -0.7841, -1.3179,  ...,  0.4312,  0.6104,  1.5554],
         [ 2.8033,  0.1085, -0.7306,  ..., -0.4948, -0.6792,  1.0854]],

        [[ 1.8713,  0.7965, -0.4507,  ..., -0.4820, -0.7786,  1.2478],
         [ 1.5011, -0.7081, -0.4598,  ...,  0.7532,  0.4367, -0.7854],
         [ 3.1207, -0.1093, -0.2326,  ...,  0.5469, -0.1305, -0.0347],
         ...,
         [ 1.0350,  0.9091, -0.9591,  ..., -0.8376, -0.6618,  1.4139],
         [ 1.0350,  0.9091, -0.9591,  ..., -0.8376, -0.6618,  1.4139],
         [ 1.0350,  0.9091, -0.9591,  ..., -0.8376, -0.6618,  1.4139]]]), pooler_output=tensor([[ 0.2971, -0.8034, -0.4948,  .

In [None]:
# Export the encoder to ONNX, using the tokenized example batch as the
# tracing input.
#
# Without `dynamic_axes` the exported graph is specialised to the exact shape
# of the example batch (here 2 sequences of one fixed padded length), so it
# would reject any other batch size or sequence length at inference time.
# Marking the batch and sequence dimensions as dynamic keeps the same input
# and output names while making the graph usable for arbitrary batches.
#
# NOTE(review): only `input_ids` is exported, so no attention mask reaches the
# model inside the ONNX graph and padding tokens are treated as real tokens.
# Feed unpadded sequences, or export `attention_mask` as a second input if
# padded batches must be supported -- confirm with the downstream consumer.
# NOTE(review): the model returns more than one output, but only the first is
# given an explicit name here, as in the original cell.
torch.onnx.export(
    model,
    args=(tokens['input_ids'],),
    f="molecule_embedding.onnx",
    input_names=['input_ids'],
    output_names=['molecule_embeddings'],
    dynamic_axes={
        'input_ids': {0: 'batch', 1: 'sequence'},
        'molecule_embeddings': {0: 'batch', 1: 'sequence'},
    },
    opset_version=11,
)

In [12]:
from pathlib import Path

import pandas as pd

# Dump the token IDs of the example batch to CSV: one row per molecule, one
# column per token position, no index column and no header row.
token_ids = tokens['input_ids'].numpy()  # Convert to numpy array
df = pd.DataFrame(token_ids)

# Build the path from components instead of a backslash-separated literal.
# The old r'Data\molecule\example.csv' only addressed nested directories on
# Windows; elsewhere it named a single file with backslashes in its name.
output_path = Path("Data", "molecule", "example.csv")

# Create the target directory if needed so the write does not fail on a
# fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)

# Kept as a plain string, as before (on Windows it is the same value).
file_path = str(output_path)

df.to_csv(file_path, index=False, header=False)
