In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd

# Load the CSV file
df = pd.read_csv('/data/servilla/DT_HGNN/data/Dusty_data/chebi_name_smiles_definition_183.csv')

# Load KV-PLM pre-trained model for SMILES + domain knowledge
tokenizer = AutoTokenizer.from_pretrained('seyonec/ChemBERTa-zinc-base-v1')
kvplm_model = AutoModel.from_pretrained('seyonec/ChemBERTa-zinc-base-v1')

# Tokenizing SMILES strings from the CSV file
smiles_list = df['SMILES'].tolist()
encoded_input = tokenizer(smiles_list, padding=True, truncation=True, return_tensors='pt', max_length=128)
input_ids = encoded_input['input_ids']
attention_mask = encoded_input['attention_mask']

# Forward pass through the KV-PLM model
with torch.no_grad():
    outputs = kvplm_model(input_ids=input_ids, attention_mask=attention_mask)
    smiles_embeddings = outputs.last_hidden_state[:, 0, :]  # CLS token embedding

# Convert the embeddings to a DataFrame
df_embeddings = pd.DataFrame(smiles_embeddings.numpy())

# Add the 'ChEBI ID' column to the embeddings DataFrame
df_embeddings['ChEBI ID'] = df['ChEBI ID'].values

# Set 'ChEBI ID' as the index
df_embeddings.set_index('ChEBI ID', inplace=True)

# Save embeddings to a CSV file
df_embeddings.to_csv('/data/servilla/DT_HGNN/data/Test/ChemBerta_SMILES_emb_183.csv')

print("Embeddings saved with 'ChEBI ID' as the index.")




Embeddings saved with 'ChEBI ID' as the index.


In [3]:
import pandas as pd
import torch
from transformers import AutoModel, AutoTokenizer

# Load the CSV file
csv_file_path = '/data/servilla/DT_HGNN/data/Dusty_data/chebi_name_smiles_definition_183.csv'
df = pd.read_csv(csv_file_path, index_col='ChEBI ID')

# Initialize ChemBERTa model for SMILES and SciBERT model for domain knowledge
chemberta_model = AutoModel.from_pretrained('seyonec/ChemBERTa-zinc-base-v1')
chemberta_tokenizer = AutoTokenizer.from_pretrained('seyonec/ChemBERTa-zinc-base-v1')

scibert_model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
scibert_tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')

def get_chemberta_embedding(smiles):
    inputs = chemberta_tokenizer(smiles, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = chemberta_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

def get_scibert_embedding(definition):
    inputs = scibert_tokenizer(definition, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = scibert_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

# Process each row and compute embeddings
embeddings = []
for index, row in df.iterrows():
    smiles = row['SMILES']
    definition = row['Definition']
    
    # Get embeddings
    chemberta_emb = get_chemberta_embedding(smiles)
    scibert_emb = get_scibert_embedding(definition)
    
    # Concatenate embeddings (intra-fusion)
    fused_embedding = torch.cat((torch.tensor(chemberta_emb), torch.tensor(scibert_emb)), dim=0)
    
    # Add to the list
    embeddings.append(fused_embedding.numpy())

# Convert embeddings to DataFrame and save as CSV
embeddings_df = pd.DataFrame(embeddings, index=df.index)
output_csv_file_path = '/data/servilla/DT_HGNN/data/Embeddings/test_conc.csv'
embeddings_df.to_csv(output_csv_file_path)

print(f"Embeddings saved to {output_csv_file_path}")


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Embeddings saved to /data/servilla/DT_HGNN/data/Embeddings/test_conc.csv
