In [1]:
!pip -q install sentence-transformers

from sentence_transformers import SentenceTransformer
from transformers import T5ForConditionalGeneration, T5Tokenizer
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn
import torch.optim as optim
import torch

import prettytable
from prettytable import PrettyTable

pd.set_option('display.max_colwidth', None)

In [2]:
model_t5 = SentenceTransformer('sentence-transformers/gtr-t5-base')

def sharpened_cosine_similarity(vec1, vec2, exponent=3):
    cosine_similarity = torch.nn.functional.cosine_similarity(vec1, vec2, dim=0)
    return cosine_similarity ** exponent

def compare_phrases(test_phrase, phrases):
    print(f"{test_phrase}")
    table = PrettyTable(align = "l", max_table_width = 80, hrules = prettytable.ALL, vrules = prettytable.ALL)
    
    score_column_title = "  T5 "
    table.field_names = [f" --- Comparison Prompt (best score to worst) --- ", score_column_title]
    table.sortby = score_column_title
    table.reversesort = True

    test_embedding = model_t5.encode(test_phrase, convert_to_tensor=True, show_progress_bar=False)
    print(test_embedding, test_embedding.size())
    output = model_t5.decode(test_embedding)
    print(output)
    
    for phrase in phrases:
        compare_embedding = model_t5.encode(phrase, convert_to_tensor=True, show_progress_bar=False)
        score = sharpened_cosine_similarity(test_embedding, compare_embedding).item()
        table.add_row([phrase, f"   {format(score, '.2f')}    "])

    print(table)
    
    return

modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.86k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/219M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

2_Dense/pytorch_model.bin:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

2_Dense/model.safetensors:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

In [3]:
# Initialize the tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5ForConditionalGeneration.from_pretrained('t5-base')

# Define the input sentence and task
input_sentence = "The cat sat on the mat."
task = "translate English to French: "

# Combine the task and input sentence
input_text = task + input_sentence

# Encode the input text
input_ids = tokenizer.encode(input_text, return_tensors='pt')
print(input_ids)

# Generate the output
output_ids = model.generate(input_ids)

# Decode the output IDs to get the output sentence
output_sentence = tokenizer.decode(output_ids[0])

print(output_sentence)



spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tensor([[13959,  1566,    12,  2379,    10,    37,  1712,     3,     7,   144,
            30,     8,  6928,     5,     1]])




<pad> Le chat s'est assis sur le tapis.</s>


In [4]:
tokenizer = T5Tokenizer.from_pretrained('t5-base')
input_sentence = "Rewrite the essay with a main character that is a sentient dog"
tokens = tokenizer.tokenize(input_sentence)

model = T5ForConditionalGeneration.from_pretrained('t5-base')
input_ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor([input_ids])  # Convert to tensor
embeddings = model.get_input_embeddings()(input_ids)

encoder_outputs = model.get_encoder()(inputs_embeds=embeddings)
decoder_outputs = model.get_decoder()(inputs_embeds=encoder_outputs[0])

output_ids = torch.argmax(decoder_outputs[0], dim=-1)
print(output_ids)
output_tokens = tokenizer.convert_ids_to_tokens(output_ids[0])
output_sentence = tokenizer.convert_tokens_to_string(output_tokens)

print(output_sentence)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


tensor([[ 54,  54,  54,  54,  54,  54,  54,  54,  54,  54, 219,  54,  54,  54,
          54,  54]])
can can can can can can can can can can auf can can can can can


In [5]:
# actual_prompt = "Rewrite the essay with a main character that is a sentient dog"

# predicted_prompts = [
#     "Add a paragraph explaining that the website is a simulation created by a sentient computer named Nova.",
#     "Compose a paragraph detailing how Nova, a sentient computer, dog the test."
# #     "Rethink the text to include a self-aware computer.",
# #     "Recreate the text with a sentient computer playing a major role.",
# #     "Rewrite the essay with a main character that is a dog.",
# #     "Reword the writting with an updated main character.",
# #     "Rewrite the essay with a character from Star Wars."
# ]

# compare_phrases(actual_prompt, predicted_prompts)

In [6]:
# Define the MLP model
class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, x):
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        return x

# Hyperparameters
input_dim = 200
hidden_dim = 128
batch_size = 32

# Initialize the model
model = MLP(input_dim, hidden_dim)

# Define the loss function and optimizer
criterion = nn.MSELoss(reduction='none')
optimizer = optim.Adam(model.parameters(), lr=0.001)

# # Training loop
# for epoch in range(100):
#     # Generate sample data with varying target sizes
#     input_data = torch.randn(batch_size, input_dim)
#     target_sizes = torch.randint(low=1, high=40, size=(batch_size,))
#     target_data = [torch.randn(size) for size in target_sizes]

#     # Pack the target data into a padded tensor
#     target_tensor = nn.utils.rnn.pad_sequence(target_data, batch_first=True)

#     # Forward pass
#     output = model(input_data)
#     loss = criterion(output, target_tensor).mean()

#     # Backward pass and optimization
#     optimizer.zero_grad()
#     loss.backward()
#     optimizer.step()

#     if (epoch+1) % 10 == 0:
#         print(f'Epoch [{epoch+1}/100], Loss: {loss.item():.4f}')
