In [1]:
import torch
import torch.nn.functional as F
import random
from torch.optim import Adam
from transformers import T5ForConditionalGeneration, T5Tokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# One tokenizer is shared by every model below
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Three independent copies of the same checkpoint, each moved to `device`:
# the two translators and the model used to embed text for the reward.
english_to_lingo_model, lingo_to_english_model, embedding_model = (
    T5ForConditionalGeneration.from_pretrained("t5-small").to(device)
    for _ in range(3)
)

In [None]:
# Define the training data: the non-empty lines of the corpus, grouped into chunks.
# A chunk keeps absorbing lines until it holds at least this many characters.
CHUNK_MIN_CHARS = 500

with open("./gutenstuff/train/10.txt", "r") as f:
    # Strip surrounding whitespace and drop lines that end up empty
    lines = [stripped for stripped in (raw.strip() for raw in f) if stripped]

# Start from an empty list so the first line is added exactly once (seeding the
# list with lines[0] and then looping over every line duplicated it) and so an
# empty file simply yields no chunks instead of raising IndexError.
english_texts = []
for line in lines:
    if english_texts and len(english_texts[-1]) < CHUNK_MIN_CHARS:
        english_texts[-1] += "\n" + line
    else:
        english_texts.append(line)

In [None]:
def get_random(arr, count):
    """Return up to ``count`` elements of ``arr`` drawn at distinct random positions.

    Args:
        arr: An indexable sequence with a length.
        count: Number of elements wanted. Values larger than ``len(arr)`` are
            clamped to ``len(arr)``; zero or negative values yield ``[]``.

    Returns:
        A new list of elements of ``arr`` in random order. No position is picked
        twice (equal values may still repeat if ``arr`` itself has duplicates).
    """
    # Clamp instead of retrying forever: rejection sampling can never finish
    # when more distinct positions are requested than exist.
    count = min(count, len(arr))
    if count <= 0:
        return []
    # random.sample draws distinct indices without the quadratic membership scan
    return [arr[index] for index in random.sample(range(len(arr)), count)]

In [19]:
# Define the environment that rewards Lingo translations
class LingoTranslationEnvironment:
    """Scores an English -> Lingo -> English round trip.

    The reward is the cosine similarity between the embedding of the original
    English tokens and the embedding of the back-translated text, minus a
    penalty proportional to the length of the Lingo token sequence.
    """

    def __init__(self, tokenizer, english_to_lingo_model, lingo_to_english_model, embedding_model,
                 length_penalty=0.1):
        """Store the collaborators used to compute rewards.

        Args:
            tokenizer: Callable tokenizer used to encode the back-translated text.
            english_to_lingo_model: Stored for reference; not used by this class.
            lingo_to_english_model: Stored for reference; not used by this class.
            embedding_model: Encoder-decoder model whose encoder output is
                mean-pooled into a text embedding.
            length_penalty: Amount subtracted from the reward per Lingo token
                position (defaults to the previously hard-coded 0.1).
        """
        self.tokenizer = tokenizer
        self.english_to_lingo_model = english_to_lingo_model
        self.lingo_to_english_model = lingo_to_english_model
        self.embedding_model = embedding_model
        self.length_penalty = length_penalty

    def get_reward(self, english_tokens, lingo_tokens, back_to_english_text):
        """Compute the scalar reward for one rollout.

        Args:
            english_tokens: Already-tokenized original text; a mapping with
                ``"input_ids"`` and, optionally, ``"attention_mask"``.
            lingo_tokens: 2-D tensor of generated Lingo token ids; the size of
                its second dimension is what gets penalized.
            back_to_english_text: Decoded back-translation (a string).

        Returns:
            The reward as a Python float.
        """
        # Convert texts to embeddings
        english_embedding = self._get_text_embedding(english_tokens, False)
        back_to_english_embedding = self._get_text_embedding(back_to_english_text, True)

        # Compute similarity between the original English text and back-to-English translation
        similarity = F.cosine_similarity(
            english_embedding.unsqueeze(0),
            back_to_english_embedding.unsqueeze(0),
        )

        # Reward faithful round trips, penalize long Lingo sequences
        reward = similarity - self.length_penalty * lingo_tokens.size()[1]

        return reward.item()

    def _get_text_embedding(self, inputs, tokenize):
        """Embed ``inputs`` as the mask-aware mean of the encoder's last hidden state.

        Args:
            inputs: Raw text when ``tokenize`` is true, otherwise a mapping with
                ``"input_ids"`` and, optionally, ``"attention_mask"``.
            tokenize: Whether ``inputs`` must first be run through the tokenizer.

        Returns:
            The pooled embedding with the leading batch dimension removed
            (callers pass a single sequence).
        """
        # Follow the embedding model's own device instead of a notebook global
        model_device = next(self.embedding_model.parameters()).device

        if tokenize:
            inputs = self.tokenizer(inputs, return_tensors="pt", padding=True, truncation=True)

        input_ids = inputs["input_ids"].to(model_device)
        if "attention_mask" in inputs:
            attention_mask = inputs["attention_mask"].to(model_device)
        else:
            attention_mask = torch.ones_like(input_ids)

        # Only the encoder is needed for an embedding, and the reward is consumed
        # as a plain float, so skip the decoder pass and the autograd graph.
        with torch.no_grad():
            encoder = self.embedding_model.get_encoder()
            hidden_states = encoder(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state

        # Average over real tokens only so padding cannot dilute the embedding
        mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
        pooled = (hidden_states * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)
        return pooled.squeeze(0)  # remove the batch dimension


In [20]:
# Build the reward environment from the tokenizer and the three models loaded above
env = LingoTranslationEnvironment(
    tokenizer=tokenizer,
    english_to_lingo_model=english_to_lingo_model,
    lingo_to_english_model=lingo_to_english_model,
    embedding_model=embedding_model,
)

In [21]:
# Train the agent with a policy-gradient update.
# NOTE: the function keeps its original name for callers, but what it performs is
# a REINFORCE-style update with a mean-reward baseline, not clipped PPO.
def _sequence_log_prob(model, input_ids, attention_mask, sampled_ids):
    """Return the summed log-probability ``model`` assigns to ``sampled_ids``.

    ``sampled_ids`` is a generated sequence whose first position is the decoder
    start token, so positions ``[:-1]`` are fed to the decoder and positions
    ``[1:]`` are the tokens being scored.
    """
    logits = model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=sampled_ids[:, :-1],
    ).logits
    log_probs = F.log_softmax(logits, dim=-1)
    targets = sampled_ids[:, 1:].unsqueeze(-1)
    return log_probs.gather(-1, targets).squeeze(-1).sum()


def train_agent_ppo(english_texts, num_epochs, num_rollouts, optimizer, texts_per_epoch=20):
    """Optimize both translators to maximize the environment's reward.

    Each epoch samples texts, draws ``num_rollouts`` stochastic
    English -> Lingo -> English round trips per text, scores them with
    ``env.get_reward``, and applies one optimizer step on the loss
    ``-(reward - baseline) * log_prob(sampled tokens)`` averaged over rollouts.

    Args:
        english_texts: Non-empty list of training strings.
        num_epochs: Number of optimizer steps to take.
        num_rollouts: Sampled round trips per selected text (at least 1).
        optimizer: Optimizer over the parameters of both translation models.
        texts_per_epoch: Texts sampled per epoch; clamped to the number of
            available texts (defaults to the previously hard-coded 20).

    Raises:
        ValueError: If there are no texts, or a count argument is below 1.
    """
    if not english_texts:
        raise ValueError("english_texts must contain at least one text")
    if num_rollouts < 1:
        raise ValueError("num_rollouts must be at least 1")
    if texts_per_epoch < 1:
        raise ValueError("texts_per_epoch must be at least 1")

    inputs = [
        tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
        for text in english_texts
    ]
    # Never ask for more distinct texts than exist
    sample_size = min(texts_per_epoch, len(inputs))
    pad_id = tokenizer.pad_token_id

    for epoch in range(num_epochs):
        rollouts = []
        for entry in get_random(inputs, sample_size):
            for _ in range(num_rollouts):
                with torch.no_grad():
                    # Sample (rather than greedy-decode) so the policy explores;
                    # top_k=0 samples from the full distribution so the
                    # log-probabilities computed below match the sampler.
                    lingo_translation = english_to_lingo_model.generate(
                        input_ids=entry["input_ids"],
                        attention_mask=entry["attention_mask"],
                        do_sample=True,
                        top_k=0,
                    )
                    lingo_mask = lingo_translation.ne(pad_id).long()

                    # Generate back-to-English translation
                    back_to_english_tokens = lingo_to_english_model.generate(
                        input_ids=lingo_translation,
                        attention_mask=lingo_mask,
                        do_sample=True,
                        top_k=0,
                    )
                    back_to_english_text = tokenizer.decode(back_to_english_tokens[0], skip_special_tokens=True)

                # Calculate the reward for this rollout
                reward = env.get_reward(entry, lingo_translation, back_to_english_text)
                rollouts.append((entry, lingo_translation, lingo_mask, back_to_english_tokens, reward))

        # Compute the average reward
        rewards = [rollout[-1] for rollout in rollouts]
        avg_reward = sum(rewards) / len(rewards)
        print(f"Epoch {epoch + 1}/{num_epochs}: Average reward = {avg_reward}")

        # With a single rollout the mean baseline would cancel the only signal
        baseline = avg_reward if len(rollouts) > 1 else 0.0

        # The loss must be a function of the model outputs: a tensor built from
        # the float reward alone carries no gradient to any parameter.
        optimizer.zero_grad()
        for entry, lingo_translation, lingo_mask, back_to_english_tokens, reward in rollouts:
            if lingo_translation.size(1) < 2 or back_to_english_tokens.size(1) < 2:
                continue  # nothing was generated beyond the start token
            log_prob = _sequence_log_prob(
                english_to_lingo_model, entry["input_ids"], entry["attention_mask"], lingo_translation
            ) + _sequence_log_prob(
                lingo_to_english_model, lingo_translation, lingo_mask, back_to_english_tokens
            )
            loss = -(reward - baseline) * log_prob / len(rollouts)
            # Backpropagate per rollout so only one graph is alive at a time;
            # gradients accumulate in .grad until the step below.
            loss.backward()
        optimizer.step()

In [23]:
# One Adam optimizer covering the parameters of both translation models
optimizer = Adam(
    [*english_to_lingo_model.parameters(), *lingo_to_english_model.parameters()],
    lr=5e-5,
)

# Training schedule, then launch the run
num_epochs = 500
num_rollouts = 1
train_agent_ppo(
    english_texts=english_texts,
    num_epochs=num_epochs,
    num_rollouts=num_rollouts,
    optimizer=optimizer,
)

KeyboardInterrupt: 