## Reversed words

### data loading

In [32]:
import pandas as pd
import numpy as np
from datasets import Dataset

In [10]:
input_words = ['apple', 'ball', 'car', 'dog', 'alex', 'white', 'zebra', 'monkey', 'fight', 'joker', 'van', 'queen', 'sharp']
target_words = ['elppa', 'llab', 'rac', 'god', 'xela', 'etihw', 'arbez', 'yeknom', 'thgif', 'rekoj', 'nav', 'neeuq', 'prahs']

In [33]:
df = pd.DataFrame({"inputs" : input_words, "target" : target_words})
dataset = Dataset.from_pandas(df)

In [40]:
dataset

Dataset({
    features: ['inputs', 'target'],
    num_rows: 13
})

### tokenizers

In [37]:
from transformers import T5Tokenizer

In [38]:
model = 't5-small'
tokenizer = T5Tokenizer.from_pretrained(model)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [42]:
def preprocessor(example) : 
    model_inputs = tokenizer(
        example['inputs'],
        max_length = 16,
        padding = "max_length",
        truncation = True
    )

    labels = tokenizer(
        example['target'],
        max_length = 16,
        padding = 'max_length',
        truncation = True
    )

    model_inputs['labels'] = labels['input_ids']

    return model_inputs

In [44]:
tokenized_dataset = dataset.map(preprocessor, batched = False)
print(tokenized_dataset[0])

Map: 100%|██████████| 13/13 [00:00<00:00, 1137.12 examples/s]

{'inputs': 'apple', 'target': 'elppa', 'input_ids': [8947, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [3, 15, 40, 1572, 9, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}





### train with a trainer model

In [46]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments
model = T5ForConditionalGeneration.from_pretrained(model)

In [48]:
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    num_train_epochs=50,
    weight_decay=0.01,
    save_strategy="no",
    logging_dir="./logs",
    logging_steps=10,
)

In [49]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

trainer.train()



Step,Training Loss
10,7.8556
20,2.9303
30,2.5732
40,2.1516
50,1.719
60,1.4389
70,1.1819
80,1.006
90,1.034
100,0.8674


TrainOutput(global_step=100, training_loss=2.275792303085327, metrics={'train_runtime': 50.349, 'train_samples_per_second': 12.91, 'train_steps_per_second': 1.986, 'total_flos': 2749130342400.0, 'train_loss': 2.275792303085327, 'epoch': 50.0})

### testing for the new input words

In [50]:
def predict(word) : 
    inputs = tokenizer(word, return_tensors = 'pt')
    output_ids = model.generate(**inputs, max_length = 16)
    return tokenizer.decode(output_ids[0], skip_special_tokens = True)

In [52]:
predict("apple")

'aigus eti tihi'

In [53]:
predict('dog')

'el'