# This scripts is for testing t5 model encoder and decoder seperatly

In [2]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load model and tokenizer
model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Ensure the model is in evaluation mode
model.eval()


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [3]:
# Input text
input_text = "Translate English to French: The house is wonderful."
inputs = tokenizer(input_text, return_tensors="pt")
print('tokenized input: ', inputs)
print('input shape: ', inputs['input_ids'].shape)
# Forward pass through the encoder
with torch.no_grad():
    encoder_outputs = model.encoder(input_ids=inputs.input_ids, 
                                    attention_mask=inputs.attention_mask)

# The encoder outputs object includes last hidden state
encoder_last_hidden_state = encoder_outputs.last_hidden_state
print("Encoder output shape:", encoder_last_hidden_state.shape)


tokenized input:  {'input_ids': tensor([[30355,    15,  1566,    12,  2379,    10,    37,   629,    19,  1627,
             5,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
input shape:  torch.Size([1, 12])
Encoder output shape: torch.Size([1, 12, 512])


In [4]:
# Forward pass through the decoder
decoder_input_ids = tokenizer("<pad>", return_tensors="pt").input_ids  # Dummy input for the decoder

with torch.no_grad():
    # You may need to adjust decoder inputs depending on specific requirements
    decoder_outputs = model.decoder(input_ids=decoder_input_ids, 
                                    encoder_hidden_states=encoder_outputs.last_hidden_state)

# Generate text from decoder output
generated_ids = model.generate(encoder_outputs=encoder_outputs)
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print("Generated text:", generated_text)


Generated text: La maison est merveilleuse.




## Molecules

In [8]:
from transformers import AutoTokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("laituan245/molt5-large-smiles2caption", model_max_length=512)
model = T5ForConditionalGeneration.from_pretrained('laituan245/molt5-large-smiles2caption')

# Put the model in evaluation mode
model.eval()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


T5ForConditionalGeneration(
  (shared): Embedding(32128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=1024, out_features=2816, bias=False)
              (wi_1): Linear(in_features=1024, out_features=2816, bias=False)
       

In [9]:
# Example SMILES for a molecule (e.g., Caffeine)
input_smiles = "CN1C=NC2=C1C(=O)N(C(=O)N2C)C"

# Tokenize the SMILES input
inputs = tokenizer("Caption this molecule: " + input_smiles, return_tensors="pt")
print('tokenized input: ', inputs)
print('input shape: ', inputs['input_ids'].shape)

# Encoder part: get latent representations
with torch.no_grad():
    encoder_outputs = model.encoder(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask)

    
print("Encoder output shape:", encoder_last_hidden_state.shape)

tokenized input:  {'input_ids': tensor([[ 4000,  1575,    48,     3, 23098,    10,     3, 10077,   536,   254,
          2423,  8137,   357,  2423,   254,   536,   254,   599,  2423,   667,
            61,   567,   599,   254,   599,  2423,   667,    61,   567,   357,
           254,    61,   254,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
input shape:  torch.Size([1, 34])
Encoder output shape: torch.Size([1, 12, 512])


In [10]:
# Generate output using the model
generated_ids = model.generate(input_ids=inputs['input_ids'],
                               attention_mask=inputs['attention_mask'],
                               max_length=50,  # Set a suitable max length for captions
                               num_beams=5,  # Use beam search for better quality
                               early_stopping=True)

# Decode generated ids back to text
generated_caption = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print("Generated Caption:", generated_caption)


Generated Caption: The molecule is a trimethylxanthine in which the three methyl groups are located at positions 1, 3, and 7. A purine alkaloid that occurs naturally in tea and coffee. It has a role as a
