## Reverse the words

### input and target data

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = {
    "input_words" : ['apple', 'ball', 'car', 'dog', 'alex', 'white', 'zebra', 'monkey', 'fight', 'joker', 'van', 'queen'],
    "target_words" : ['elppa', 'llab', 'rac', 'god', 'xela', 'etihw', 'arbez', 'yeknom', 'thgif', 'rekoj', 'nav', 'neeuq']
}

### load a pre trained model

In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
dataset = Dataset.from_dict(data)

In [5]:
model_name = 't5-small'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

### tokenize

In [6]:
def preprocessor(data) : 
    model_inputs = tokenizer(
        data['input_words'], padding = 'max_length', truncation = True, max_length = 10
    )

    with tokenizer.as_target_tokenizer() : 
        labels = tokenizer(
            data['target_words'], padding = 'max_length', truncation = True, max_length = 10
        )

    model_inputs['labels'] = labels['input_ids']

    return model_inputs

In [7]:
tokenize_dataset = dataset.map(preprocessor, batched = False)

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

Map: 100%|██████████| 12/12 [00:00<00:00, 763.47 examples/s]


### training the model

In [22]:
training_args = TrainingArguments(
    output_dir = './results',
    per_device_train_batch_size = 2,
    num_train_epochs = 10,
    logging_dir = './logs',
    logging_steps = 1,
    save_strategy = "no"
)

In [23]:
trainer = Trainer(
    model = model, 
    args = training_args,
    train_dataset = tokenize_dataset
)

In [24]:
trainer.train()



Step,Training Loss
1,0.8774
2,0.6979
3,0.4938
4,0.2433
5,0.6949
6,0.328
7,0.3938
8,0.925
9,0.5004
10,0.5457


TrainOutput(global_step=60, training_loss=0.5131156225999196, metrics={'train_runtime': 19.6389, 'train_samples_per_second': 6.11, 'train_steps_per_second': 3.055, 'total_flos': 317207347200.0, 'train_loss': 0.5131156225999196, 'epoch': 10.0})

### testing on new words

In [25]:
def predict(word):
    inputs = tokenizer(word, return_tensors="pt")
    outputs = model.generate(**inputs, max_length=10)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [26]:
print(predict("dog"))

elfa


In [27]:
print(predict("apple"))

elma


In [None]:
print(tokenizer.tokenize('dog'))

['▁dog']
