In [1]:
import textgrid
import os
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, DataCollatorWithPadding, AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [32]:
# Function to force character-level tokenization
def format_input(text):
    # return " ".join(text)  # Converts "hello world" → "h e l l o   w o r l d"
    return f"grapheme to phoneme: {" ".join(text)}"

# Custom dataset class
class G2PDataset(Dataset):
    def __init__(self, file_path, max_length=512):
        self.data = pd.read_csv(file_path)
        self.max_length = max_length

        self.data = self.data[self.data["text"].apply(lambda x: len(tokenizer(format_input(x))["input_ids"]) <= self.max_length)]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        grapheme_text = self.data.iloc[idx]["text"]
        phoneme_text = self.data.iloc[idx]["phonemes"]

        # Force character-level tokenization
        formatted_input = format_input(grapheme_text)
        # formatted_input = f"grapheme to phoneme: {force_character_tokenization(grapheme_text)}"
        # formatted_input = f"grapheme to phoneme: {grapheme_text}"

        return formatted_input, phoneme_text

# Collate function for dynamic padding
def collate_fn(batch):
    inputs, targets = zip(*batch)

    # Tokenize with dynamic padding (longest in batch)
    input_enc = tokenizer(list(inputs), padding=True, return_tensors="pt", truncation=False)
    target_enc = tokenizer(list(targets), padding=True, return_tensors="pt", truncation=False)

    return {
        "input_ids": input_enc.input_ids,
        "attention_mask": input_enc.attention_mask,
        "labels": target_enc.input_ids,
    }

# Load dataset and dataloader
train_dataset = G2PDataset("dev-clean.csv")
val_dataset = G2PDataset("test-clean.csv")

train_loader = DataLoader(train_dataset, batch_size=24, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=24, collate_fn=collate_fn)

In [33]:
# Load T5 model
device = torch.device("mps" if torch.mps.is_available() else "cpu")
# device = torch.device("cpu")
model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)
model.gradient_checkpointing_enable()

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training function
def train_model(model, train_loader, val_loader, epochs=3):
    model.train()
    training_losses = []
    val_losses = []

    for epoch in range(epochs):
        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", leave=True)
        for batch in progress_bar:
            optimizer.zero_grad()

            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            progress_bar.set_postfix(loss=loss.item())
            del input_ids, attention_mask, labels
            torch.mps.empty_cache()

        avg_train_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}, Loss: {avg_train_loss:.4f}")

        training_losses.append(avg_train_loss)

        # Validation
        avg_val_loss = validate_model(model, val_loader)
        val_losses.append(avg_val_loss)

        torch.mps.empty_cache()

    return training_losses, val_losses


# Validation function
def validate_model(model, val_loader):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            total_loss += outputs.loss.item()

    avg_val_loss = total_loss / len(val_loader)
    print(f"Validation Loss: {avg_val_loss:.4f}")
    return avg_val_loss



In [5]:
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

In [34]:
training_losses, val_losses = train_model(model, train_loader, val_loader, 1)

Epoch 1/1: 100%|██████████| 111/111 [13:37<00:00,  7.36s/it, loss=1.33] 


Epoch 1, Loss: 2.0759
Validation Loss: 1.0528


In [7]:
# model.save_pretrained("g2p_t5_model")
# tokenizer.save_pretrained("g2p_t5_model")

In [41]:
# input = "even in middle age they were still comely and the old grey haired women at their cottage doors had a dignity not to say majesty of their own"
device = "mps"
input = "i expressed by signs my admiration and pleasure to my guides and they were greatly pleased"
# input = "hello"
input_text = f"translate to german: {input}"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)

model.eval()
with torch.no_grad():
    output_ids = model.generate(input_ids, use_cache=True)

phoneme_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
phoneme_output

'ich habe meine Bewunderung und Freude an meine Führer geäußert und sie'