# Translation using T5 model

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!pip install transformers[sentencepiece]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [10]:
import pandas as pd
import re
import string
import torch
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

## Load the data
### https://www.kaggle.com/datasets/dhruvildave/en-fr-translation-dataset

In [5]:
df = pd.read_csv('/content/sample_data/EN-FR.csv')
df

Unnamed: 0.1,Unnamed: 0,en,fr
0,0,Changing Lives | Changing Society | How It Wor...,Il a transformé notre vie | Il a transformé la...
1,1,Site map,Plan du site
2,2,Feedback,Rétroaction
3,3,Credits,Crédits
4,4,Français,English
...,...,...,...
99995,99995,"• Karen Bron, Acting Director, Innovations, An...","◦ Karen Bron, Directrice par intérim, Directio..."
99996,99996,[ Previous | Table of Contents | Next ],[ Page précédente | Table des matières | Page ...
99997,99997,◦ Implementation of section 41 of the Official...,◦ Mise en œuvre de l'article 41 de la Loi sur ...
99998,99998,To bring the communities together and make the...,Assurer un rapprochement des communautés et un...


## Data preprocessing

In [6]:
# converting every letter to lower case
df['en'] = df['en'].apply(lambda x: str(x).lower())
df['fr'] = df['fr'].apply(lambda x: str(x).lower())

In [7]:
# removing apostrophe from the sentences
df['en'] = df['en'].apply(lambda x: re.sub("'","",x))
df['fr'] = df['fr'].apply(lambda x: re.sub("'","",x))

In [8]:
exclude = set(string.punctuation)
# removing all the punctuations
df['en'] = df['en'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
df['fr'] = df['fr'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

In [9]:
# removing digits from the sentences
digit = str.maketrans('','',string.digits)
df['en'] = df['en'].apply(lambda x: x.translate(digit))
df['fr'] = df['fr'].apply(lambda x: x.translate(digit))

## using pretrained model and finetuning it on our dataset

In [11]:
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-fr")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-fr").to('cuda')

Downloading (…)okenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]



Downloading pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

### optimizer

In [12]:
optimizer = torch.optim.AdamW(model.parameters(),lr=0.0001)

### Defining Model

In [13]:
def model_train():
    model.train()
    losses = 0
    X = df['en']
    y = df['fr']
    max_epochs = 15
    n_batches = 32
    for epoch in tqdm(range(max_epochs)):
        for i in tqdm(range(n_batches)):
            # making batches 
            local_X, local_y = X[i*n_batches:(i+1)*n_batches,], y[i*n_batches:(i+1)*n_batches,]
            # preparing the data according to the model input
            batch = tokenizer.prepare_seq2seq_batch(list(local_X),list(local_y),return_tensors='pt').to('cuda')
            output = model(**batch)
            # loss can be taken directly from the model output
            loss = output.loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses = losses+loss
    average = losses/len(df)
    print('Loss: ' + str(average) )
    
    return model

In [14]:
model = model_train()

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

Loss: tensor(0.0010, device='cuda:0', grad_fn=<DivBackward0>)


In [17]:
a = model.generate(**tokenizer.prepare_seq2seq_batch(['Hello , what is your name?'],return_tensors='pt').to('cuda'))
tokenizer.batch_decode(a)

['<pad> Bonjour, quel est votre nom?</s>']

In [16]:
torch.save(model , 'translation_model.pkl')