# Train a paraphraser on the mined data

In [2]:
import pandas as pd

In [5]:
# Load the mined sentence pairs from a tab-separated file; the `source` and
# `target` columns of this frame are tokenized further down.
xydf = pd.read_csv('../data/xydf.tsv', sep='\t', encoding='utf-8')

# Prepare datasets

In [6]:
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5TokenizerFast,
    get_linear_schedule_with_warmup
)
import torch

In [7]:
# Hub id used only to load the tokenizer; the model weights are loaded later
# from `checkpoint_name`.
model_name = "ceshine/t5-paraphrase-paws-msrp-opinosis"

In [8]:
# Fast T5 tokenizer reused for dataset preparation, batch padding and the generation demo.
tokenizer = T5TokenizerFast.from_pretrained(model_name)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Hold out 300 pairs for evaluation. A fixed random_state keeps the split
# identical across kernel restarts, so validation losses stay comparable
# between runs and the held-out rows never leak into a later training run.
df_train, df_test = train_test_split(xydf, test_size=300, random_state=42)
print(df_train.shape[0], df_test.shape[0])

577477 300


In [11]:
%%time

# Tokenize, in order: train sources, train targets, held-out sources, held-out targets.
# No padding is applied here; over-long texts are truncated by the tokenizer.
x1, y1, x2, y2 = (
    tokenizer(column.tolist(), truncation=True)
    for column in (df_train.source, df_train.target, df_test.source, df_test.target)
)

CPU times: total: 1min 18s
Wall time: 14.6 s


In [12]:
class PairsDataset(torch.utils.data.Dataset):
    """
    Map-style dataset pairing tokenized source texts with tokenized targets.

    `x` and `y` are mappings from field name to a per-example sequence; both must
    provide 'input_ids' and `y` must also provide 'attention_mask'.
    """

    def __init__(self, x, y):
        """Keep references to the source (`x`) and target (`y`) encodings."""
        self.x, self.y = x, y

    @property
    def n(self):
        """Number of examples, measured on the source 'input_ids'."""
        return len(self.x['input_ids'])

    def __len__(self):
        return self.n

    def __getitem__(self, idx):
        """
        Return one unpadded example as a dict: every field of `x` at position `idx`,
        plus the target mask as 'decoder_attention_mask' and the target ids as 'labels'.
        """
        assert idx < len(self.x['input_ids'])
        example = {}
        for name, column in self.x.items():
            example[name] = column[idx]
        example.update(
            decoder_attention_mask=self.y['attention_mask'][idx],
            labels=self.y['input_ids'][idx],
        )
        return example
    
# Wrap the tokenized train and held-out encodings, then display both sizes.
train_dataset, test_dataset = PairsDataset(x1, y1), PairsDataset(x2, y2)
len(train_dataset), len(test_dataset)

(577477, 300)

In [13]:
from torch.utils.data import Dataset, DataLoader

In [14]:
# Training batches are reshuffled each epoch and an incomplete last batch is dropped.
train_dataloader = DataLoader(train_dataset, batch_size=4, drop_last=True, shuffle=True, num_workers=1)
# Evaluation should be deterministic and cover every held-out example, so the
# evaluation loader neither shuffles nor drops a trailing partial batch.
# NOTE(review): examples are tokenized without padding, so iterating either loader
# directly likely needs a padding collate_fn — confirm before using them.
test_dataloader = DataLoader(test_dataset, batch_size=4, drop_last=False, shuffle=False, num_workers=1)

# Fine-tune T5

In [15]:
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)
import torch

In [16]:
# Hub checkpoint that provides the initial model weights for fine-tuning.
checkpoint_name = 'SkolkovoInstitute/t5-paraphrase-paws-msrp-opinosis-paranmt'

In [17]:
# Load the pretrained sequence-to-sequence model that is fine-tuned below.
model = T5ForConditionalGeneration.from_pretrained(checkpoint_name)

In [18]:
# Use the first GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs (slowly) instead of failing on a machine without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model.to(device);

In [19]:
import transformers

In [20]:
from transformers import Trainer, TrainingArguments
from transformers.file_utils import cached_property
from typing import Tuple

class TrAr(TrainingArguments):
    """
    TrainingArguments variant whose device setup returns the notebook-level `device`.

    NOTE(review): nothing visible in this notebook instantiates this class — the
    training cell builds a plain `TrainingArguments`. Confirm whether it is still needed.
    """

    # Presumably overrides the parent's private device-selection hook so the Trainer
    # uses `device`; verify against the installed transformers version.
    @cached_property
    def _setup_devices(self):
        return device

In [21]:
from typing import List, Dict, Union
import torch

class DataCollatorWithPadding:
    """
    Pad a list of tokenized sequence-to-sequence examples into one batch of tensors.

    Encoder fields ('input_ids', 'attention_mask') are padded to the longest source
    sequence in the batch. When targets are present, 'labels' and
    'decoder_attention_mask' are padded to the longest target sequence, and every
    padded label position is replaced by `label_pad_token_id` so that the loss
    ignores padding instead of training the model to predict pad tokens.
    """

    def __init__(self, tokenizer, label_pad_token_id: int = -100):
        """
        Args:
            tokenizer: Object exposing `pad(encoded_inputs, padding=True)`; used to pad
                both the encoder inputs and the target sequences.
            label_pad_token_id: Value written into padded label positions. The default
                of -100 is the index ignored by PyTorch's cross-entropy loss.
        """
        self.tokenizer = tokenizer
        self.label_pad_token_id = label_pad_token_id

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """
        Collate per-example feature dicts into a padded batch.

        Args:
            features: One dict per example with 'input_ids' and 'attention_mask', and
                optionally 'labels' (target token ids) and 'decoder_attention_mask'.

        Returns:
            A dict of tensors. 'input_ids' and 'attention_mask' are always present.
            If the examples carry 'labels', the result also holds 'labels' (padding
            replaced by `label_pad_token_id`) and 'decoder_attention_mask'.
        """
        # Pad the encoder side; fields the tokenizer does not know (the targets)
        # pass through unpadded and are handled separately below.
        batch = self.tokenizer.pad(
            features,
            padding=True,  # pad to the longest source sequence in the batch
        )

        # Targets are optional: inference-only batches have nothing more to pad.
        if 'labels' in batch:
            target_ids = batch['labels']
            if 'decoder_attention_mask' in batch:
                target_mask = batch['decoder_attention_mask']
            else:
                # No mask supplied: every provided target token is a real token.
                target_mask = [[1] * len(ids) for ids in target_ids]

            # Reuse the tokenizer's padding logic by presenting the targets as inputs.
            ybatch = self.tokenizer.pad(
                {'input_ids': target_ids, 'attention_mask': target_mask},
                padding=True,  # pad to the longest target sequence in the batch
            )

            # Positions where the padded mask is 0 are excluded from the loss.
            batch['labels'] = [
                [tok if keep else self.label_pad_token_id for tok, keep in zip(ids, mask)]
                for ids, mask in zip(ybatch['input_ids'], ybatch['attention_mask'])
            ]
            batch['decoder_attention_mask'] = ybatch['attention_mask']

        # Every field is now rectangular, so it can be converted to a tensor.
        return {k: torch.tensor(v) for k, v in batch.items()}


In [31]:
# Directory used both as the Trainer output directory and as the final save location.
save_name = 'models/t5-cechine-nmt-mined-detox1'

In [27]:
training_args = TrainingArguments(
    # Checkpointing: where checkpoints go, how often, and how many are kept.
    output_dir=save_name,
    overwrite_output_dir=True,
    save_steps=5000,
    save_total_limit=1,
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=3e-5,
    warmup_steps=300,
    weight_decay=0,
    # Batch sizes: 4 examples per device step, gradients accumulated over 4 steps.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=8,
    # Logging and step-based evaluation, both every 100 steps.
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy='steps',
    eval_steps=100,
)

In [24]:
# Collator that pads each batch to its own longest sequence; handed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [28]:
# Assemble the Trainer: configuration first, then the model with its tokenizer
# and collator, then the train / held-out datasets.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [29]:
import gc
# Run a garbage-collection pass and release PyTorch's cached CUDA memory before training.
gc.collect()
torch.cuda.empty_cache();

In [30]:
# Run fine-tuning with the arguments configured above.
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Persist the model through the Trainer into the `save_name` directory.
trainer.save_model(save_name)

In [None]:
# Evaluate on the held-out set that was passed to the Trainer as `eval_dataset`.
trainer.evaluate()

In [None]:
# Switch the model to evaluation mode before generating text.
model.eval();

In [33]:
# Make this cell self-contained: force evaluation mode (dropout off) so beam search
# is deterministic even when the separate `model.eval()` cell has not been run.
model.eval()
# Beam-search the ten best rewrites of one example sentence and print each of them.
inputs = tokenizer('The internal policy of the fucking Trump is stupid.', return_tensors='pt')
inputs = {k: v.to(device) for k, v in inputs.items()}
for t in model.generate(**inputs, num_return_sequences=10, do_sample=False, num_beams=10):
    print(tokenizer.decode(t, skip_special_tokens=True))



the president's internal policy is nonsense.
the internal policy of Trump is nonsense.
the Trump administration's internal policy is crazy.
the president's internal policy is bad.
the internal policy of Trump is bad.
the president's internal policy is wrong.
the Trump administration's internal policy is bad.
the Trump administration's internal policy is wrong.
the Trumpian's internal policy is nonsense.
the Trump administration's internal policy is a bad idea.


In [32]:
# Save the fine-tuned weights together with the tokenizer, so the `save_name`
# directory can be loaded on its own (the tokenizer comes from a different hub
# repository than the model weights).
model.save_pretrained(save_name)
tokenizer.save_pretrained(save_name);