In [30]:
!nvidia-smi

Sat Oct 14 15:25:20 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   73C    P0    35W /  70W |   1065MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [31]:
!pip install datasets transformers sacrebleu



In [32]:
import os
import sys
import transformers
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer ,TFAutoModelForSeq2SeqLM ,DataCollatorForSeq2Seq , AdamWeightDecay

In [33]:
model = 'google/flan-t5-small'

In [34]:
raw_dataset = load_dataset('findnitai/english-to-hinglish')

In [35]:
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 189102
    })
})

In [36]:
tokenizer = AutoTokenizer.from_pretrained(model)

In [37]:
tokenizer("Hello People how are you all ?")

{'input_ids': [8774, 2449, 149, 33, 25, 66, 3, 58, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [38]:
max_input_length = 128
max_target_length = 128
source_lang = 'en'
target_lang = 'hi_ng'

In [39]:
def preprocess_function(samples):
  inputs = [text[source_lang] for text in samples['translation']]
  targets = [text[target_lang] for text in samples['translation']]
  model_inputs = tokenizer(inputs , max_length= max_input_length ,truncation=True)
  with tokenizer.as_target_tokenizer():
    labels = tokenizer(targets , max_length= max_target_length ,truncation=True)
  model_inputs['labels'] = labels['input_ids']
  return model_inputs

In [40]:
preprocess_function(raw_dataset['train'][:2])



{'input_ids': [[363, 31, 7, 8, 564, 13, 8, 1974, 1], [2018, 6, 8, 3, 14369, 35, 11395, 2604, 19, 248, 68, 8, 10531, 6800, 2604, 1330, 3, 9, 385, 731, 3, 9, 1974, 13, 48, 463, 5, 3, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[814, 3, 1258, 3, 3781, 9, 3, 29, 9, 265, 4244, 23, 1], [3, 13363, 8323, 6, 6819, 9, 3, 107, 76, 9, 11395, 2604, 954, 1024, 152, 4244, 23, 6, 90, 2917, 10531, 6800, 2604, 19, 4740, 9, 208, 14748, 3, 1050, 15, 3, 18118, 51, 142, 3, 189, 32, 26, 9, 6511, 50, 5497, 9, 4244, 23, 5, 1]]}

In [41]:
tokenize_datasets = raw_dataset.map(preprocess_function ,batched=True)

In [42]:
model = TFAutoModelForSeq2SeqLM.from_pretrained(model)

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [43]:
batch_size=16
learning_rate = 2e-5
weight_decay = 0.01
num_train_epochs = 100

In [44]:
data_collator = DataCollatorForSeq2Seq(tokenizer , model=model ,return_tensors='tf')

In [45]:
generate_data_collator = DataCollatorForSeq2Seq(tokenizer , model=model ,return_tensors='tf',pad_to_multiple_of=2)

In [46]:
tokenize_datasets

DatasetDict({
    train: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 189102
    })
})

In [47]:
train_dataset = model.prepare_tf_dataset(
    tokenize_datasets['train'],
    batch_size=batch_size,
    shuffle=False,
    collate_fn=data_collator
)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [48]:
optimizer = AdamWeightDecay(learning_rate=learning_rate , weight_decay_rate=weight_decay)

In [49]:
model.compile(optimizer=optimizer)

In [51]:
model.fit(train_dataset,epochs=1)



<keras.src.callbacks.History at 0x7ee8e0892200>

In [53]:
model.save_pretrained('tf_model.h5')

In [54]:
# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = TFAutoModelForSeq2SeqLM.from_pretrained('tf_model.h5')

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at tf_model.h5.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [57]:
input_text = 'Hey!! tell me About India'
tokenized = tokenizer([input_text],return_tensors='np')
out = model.generate(**tokenized ,max_length=128)
print(out)

tf.Tensor(
[[   0 9459 1603 4159  354   88 1547    3 1050    3   40   23   63   15
     3   63    9    9   26 1227 5595   15    1]], shape=(1, 23), dtype=int32)


In [58]:
with tokenizer.as_target_tokenizer():
  print(tokenizer.decode(out[0],skip_special_tokens=True))

Hey!! Mujhe India ke liye yaad dilaye
