In this noetbook we go over fine tuning Helsinki-NLP/opus-mt-en-fr for translating French to English. The dataset used here is publically available on kaggle: https://www.kaggle.com/dhruvildave/en-fr-translation-dataset.

# Downloading and Importing libraries

In [2]:
! pip install datasets transformers sacrebleu torch sentencepiece transformers[sentencepiece]

Collecting datasets
  Downloading datasets-1.14.0-py3-none-any.whl (290 kB)
[K     |████████████████████████████████| 290 kB 5.1 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.12.2-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 37.8 MB/s 
[?25hCollecting sacrebleu
  Downloading sacrebleu-2.0.0-py3-none-any.whl (90 kB)
[K     |████████████████████████████████| 90 kB 10.2 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 40.2 MB/s 
Collecting huggingface-hub<0.1.0,>=0.0.19
  Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 4.7 MB/s 
[?25hCollecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 64.7 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2021.10

In [3]:
import transformers
print(transformers.__version__)

4.12.2


In [4]:
import pandas as pd
import re
import string
import torch
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

# Dataset downloading and preprocessing

In [5]:
from google.colab import files

files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"surabhigovil","key":"a2383170fa9bf2b8de65b2bf4b3f8363"}'}

In [6]:
! mkdir ~/.kaggle

! cp kaggle.json ~/.kaggle/

In [7]:
! chmod 600 ~/.kaggle/kaggle.json

In [8]:
! kaggle datasets download -d dhruvildave/en-fr-translation-dataset --force

Downloading en-fr-translation-dataset.zip to /content
100% 2.54G/2.54G [00:18<00:00, 130MB/s]
100% 2.54G/2.54G [00:18<00:00, 147MB/s]


In [9]:
! mkdir train

In [10]:
! unzip en-fr-translation-dataset.zip -d train

Archive:  en-fr-translation-dataset.zip
  inflating: train/en-fr.csv         


In [11]:
#using only 200000 rows
r_rows = 200000
df = pd.read_csv('/content/train/en-fr.csv' , nrows = r_rows)

In [12]:
df.head()

Unnamed: 0,en,fr
0,Changing Lives | Changing Society | How It Wor...,Il a transformé notre vie | Il a transformé la...
1,Site map,Plan du site
2,Feedback,Rétroaction
3,Credits,Crédits
4,Français,English


## Preprocessing data

In [13]:
# converting every letter to lower case
df['fr'] = df['fr'].apply(lambda x: str(x).lower())
df['en'] = df['en'].apply(lambda x: str(x).lower())
# removing apostrophe from the sentences
df['fr'] = df['fr'].apply(lambda x: re.sub("'","",x))
df['en'] = df['en'].apply(lambda x: re.sub("'","",x))
exclude = set(string.punctuation)
# removing all the punctuations
df['fr'] = df['fr'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
df['en'] = df['en'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
# removing digits from the sentences
digit = str.maketrans('','',string.digits)
df['fr'] = df['fr'].apply(lambda x: x.translate(digit))
df['en'] = df['en'].apply(lambda x: x.translate(digit))

In [14]:
# using pretrained model and then finetunig it on our dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-fr")
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-fr").to('cuda')

Downloading:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.26k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/784k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.28M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/287M [00:00<?, ?B/s]

In [23]:
def model_train(source_lang, target_lang):
    model.train()
    losses = 0
    X = df[source_lang]
    y = df[target_lang]
    max_epochs = 15
    n_batches = 32
    for epoch in tqdm(range(max_epochs)):
        for i in tqdm(range(n_batches)):
            # making batches 
            local_X, local_y = X[i*n_batches:(i+1)*n_batches,], y[i*n_batches:(i+1)*n_batches,]
            # preparing the data according to the model input
            batch = tokenizer.prepare_seq2seq_batch(list(local_X),list(local_y),return_tensors='pt').to('cuda')
            output = model(**batch)
            # loss can be taken directly from the model output
            loss = output.loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            losses = losses+loss
            wandb.log({"epoch": epoch, "loss": losses.cpu().data.numpy().argmax()})
            print('Loss: ' + str(losses/len(df)) )
    average = losses/len(df)
    print('Loss: ' + str(average) )
    
    return model

In [16]:
! pip install wandb

Collecting wandb
  Downloading wandb-0.12.6-py2.py3-none-any.whl (1.7 MB)
[?25l[K     |▏                               | 10 kB 25.7 MB/s eta 0:00:01[K     |▍                               | 20 kB 27.8 MB/s eta 0:00:01[K     |▋                               | 30 kB 11.5 MB/s eta 0:00:01[K     |▉                               | 40 kB 9.5 MB/s eta 0:00:01[K     |█                               | 51 kB 5.2 MB/s eta 0:00:01[K     |█▏                              | 61 kB 5.8 MB/s eta 0:00:01[K     |█▍                              | 71 kB 5.6 MB/s eta 0:00:01[K     |█▋                              | 81 kB 6.2 MB/s eta 0:00:01[K     |█▊                              | 92 kB 4.8 MB/s eta 0:00:01[K     |██                              | 102 kB 5.1 MB/s eta 0:00:01[K     |██▏                             | 112 kB 5.1 MB/s eta 0:00:01[K     |██▍                             | 122 kB 5.1 MB/s eta 0:00:01[K     |██▌                             | 133 kB 5.1 MB/s eta 0:00:01[

In [17]:
import wandb
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [24]:
wandb.init(project="translation-transformer", entity="surabhigovil1")

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
loss,"tensor(0.0005, devic..."


In [21]:
optimizer = torch.optim.AdamW(model.parameters(),lr=0.0001)

In [25]:
model = model_train('en', 'fr')

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Loss: tensor(1.6929e-07, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.8956e-07, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(3.8365e-07, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(4.3781e-07, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(5.4348e-07, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(6.5775e-07, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(8.2822e-07, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(9.3920e-07, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.0570e-06, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.2050e-06, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.3344e-06, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.5271e-06, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.7383e-06, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.8362e-06, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.9328e-06, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tens

  0%|          | 0/32 [00:00<?, ?it/s]

Loss: tensor(3.9322e-06, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(4.0268e-06, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(4.1211e-06, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(4.1725e-06, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(4.3541e-06, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(4.4157e-06, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(4.5104e-06, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(4.6239e-06, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(4.7226e-06, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(4.8958e-06, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(4.9904e-06, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(5.1837e-06, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(5.3707e-06, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(5.4992e-06, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(5.5930e-06, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tens

  0%|          | 0/32 [00:00<?, ?it/s]

Loss: tensor(7.2407e-06, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(7.3307e-06, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(7.4024e-06, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(7.4449e-06, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(7.7333e-06, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(7.7938e-06, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(7.9140e-06, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(7.9842e-06, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(8.0721e-06, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(8.1896e-06, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(8.2799e-06, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(8.4151e-06, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(8.5383e-06, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(8.6628e-06, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(8.7637e-06, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tens

  0%|          | 0/32 [00:00<?, ?it/s]

Loss: tensor(1.0149e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.0238e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.0321e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.0368e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.0479e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.0524e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.0621e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.0689e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.0767e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.0874e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.0989e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.1098e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.1184e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.1258e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.1329e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tens

  0%|          | 0/32 [00:00<?, ?it/s]

Loss: tensor(1.2592e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.2680e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.2733e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.2764e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.2803e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.2838e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.2904e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.2956e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.3012e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.3102e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.3164e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.3251e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.3326e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.3389e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.3446e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tens

  0%|          | 0/32 [00:00<?, ?it/s]

Loss: tensor(1.4532e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.4591e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.4653e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.4707e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.4737e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.4765e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.4832e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.4903e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.4963e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.5059e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.5113e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.5169e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.5228e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.5299e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.5381e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tens

  0%|          | 0/32 [00:00<?, ?it/s]

Loss: tensor(1.6312e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.6369e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.6401e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.6456e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.6540e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.6567e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.6613e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.6682e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.6779e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.6883e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.6951e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.7011e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.7054e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.7114e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.7160e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tens

  0%|          | 0/32 [00:00<?, ?it/s]

Loss: tensor(1.8034e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.8079e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.8136e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.8183e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.8230e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.8416e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.8460e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.8506e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.8560e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.8668e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.8706e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.8774e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.8822e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.8873e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(1.8932e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tens

  0%|          | 0/32 [00:00<?, ?it/s]

Loss: tensor(1.9967e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.0007e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.0060e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.0082e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.0221e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.0266e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.0322e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.0373e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.0403e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.0479e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.0526e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.0587e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.0637e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.0679e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.0740e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tens

  0%|          | 0/32 [00:00<?, ?it/s]

Loss: tensor(2.1630e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.1658e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.1687e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.1700e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.1772e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.1800e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.1895e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.1942e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.1975e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.2060e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.2105e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.2140e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.2177e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.2210e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.2244e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tens

  0%|          | 0/32 [00:00<?, ?it/s]

Loss: tensor(2.3053e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.3111e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.3143e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.3172e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.3230e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.3249e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.3284e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.3330e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.3363e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.3431e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.3458e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.3540e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.3577e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.3636e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.3665e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tens

  0%|          | 0/32 [00:00<?, ?it/s]

Loss: tensor(2.4368e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.4399e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.4425e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.4445e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.4479e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.4508e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.4563e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.4606e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.4647e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.4703e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.4739e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.4771e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.4801e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.4851e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.4915e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tens

  0%|          | 0/32 [00:00<?, ?it/s]

Loss: tensor(2.5730e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.5758e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.5816e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.5869e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.5904e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.5920e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.5994e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.6042e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.6094e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.6138e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.6160e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.6215e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.6259e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.6288e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.6344e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tens

  0%|          | 0/32 [00:00<?, ?it/s]

Loss: tensor(2.7249e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.7273e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.7319e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.7404e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.7431e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.7455e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.7499e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.7554e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.7668e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.7734e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.7761e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.7799e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.7846e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.7880e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.7936e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tens

  0%|          | 0/32 [00:00<?, ?it/s]

Loss: tensor(2.8887e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.8929e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.8991e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.9023e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.9084e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.9109e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.9146e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.9197e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.9246e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.9318e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.9367e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.9421e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.9472e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.9507e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tensor(2.9558e-05, device='cuda:0', grad_fn=<DivBackward0>)
Loss: tens

In [60]:
a = model.generate(**tokenizer.prepare_seq2seq_batch(['Nice weather'],return_tensors='pt').to('cuda'))
tokenizer.batch_decode(a)

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



['<pad> Le ciel est beau']

In [18]:
torch.save(model , 'model.pkl')

In [None]:
# Then later:
model = torch.load('/content/model.pkl')

In [26]:
def translate_to_english(English):
  a = model.generate(**tokenizer.prepare_seq2seq_batch([English],return_tensors='pt').to('cuda'))
  text = tokenizer.batch_decode(a)
  text = str(text)
  text = re.sub("<pad> ","",text)
  text = re.sub("'","",text)
  text = text.replace("[", "")
  text = text.replace("]", "")
  return text

In [27]:
import gradio as gr


gr.Interface(translate_to_english,
    [
        gr.inputs.Textbox(lines=7, label="English")
    ],
    gr.outputs.Textbox(label="Translated text"),
    title="Translate to English",
    ).launch()

Colab notebook detected. To show errors in colab notebook, set `debug=True` in `launch()`
This share link will expire in 72 hours. If you need a permanent link, visit: https://gradio.app/introducing-hosted
Running on External URL: https://48019.gradio.app


(<Flask 'gradio.networking'>,
 'http://127.0.0.1:7861/',
 'https://48019.gradio.app')

In [79]:
 wandb.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,▁▁▁▂▃▃▃▃▄▄▅▅▅▅▆▇▇▇▇█▁▁▁▂▃▃▃▃▄▅▅▅▅▆▆▇▇▇▇█
loss,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,14
loss,0


In [18]:
! pip install datasets transformers sacrebleu



In [46]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook, tnrange
from sklearn.utils import shuffle

In [None]:
movieDf = pd.read_csv('drive/MyDrive/MoviePlotsModels/data/wiki_movie_plots_deduped.csv')

In [47]:
!pip install sentencepiece
!pip install transformers
!pip install torch
!pip install rich[jupyter]

Collecting rich[jupyter]
  Downloading rich-10.12.0-py3-none-any.whl (212 kB)
[K     |████████████████████████████████| 212 kB 5.1 MB/s 
Collecting commonmark<0.10.0,>=0.9.0
  Downloading commonmark-0.9.1-py2.py3-none-any.whl (51 kB)
[K     |████████████████████████████████| 51 kB 9.0 MB/s 
Installing collected packages: commonmark, rich
Successfully installed commonmark-0.9.1 rich-10.12.0


In [48]:
import os
import re
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook, tnrange
from sklearn.utils import shuffle
import pickle
import math
## use if working on jupyter notebook or colab
from IPython.display import clear_output

In [49]:
import numpy as np
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration
from rich.table import Column, Table
from rich import box
from rich.console import Console
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [None]:
class T5Dataset(Dataset):
  def __init__(self, tokenizer, data, source_len, target_len):
    super(T5Dataset, self).__init__()
    self.tokenizer = tokenizer
    self.source_len = source_len
    self.target_len = target_len
    self.data = data
  def __len__(self):
    return len(self.data)
  def __getitem__(self, index):
    source_seq = self.data[index]['source']
    target_seq = self.data[index]['target']
    source = self.tokenizer.batch_encode_plus(
        [source_seq],
        max_length = self.source_len,
        pad_to_max_length = True,
        truncation = True,
        padding = "max_length",
        return_tensors = "pt"
    )
    target = self.tokenizer.batch_encode_plus(
        [target_seq],
        max_length = self.target_len,
        pad_to_max_length = True,
        truncation = True,
        padding = "max_length",
        return_tensors = "pt"
    )
    source_ids = source["input_ids"].squeeze()
    source_mask = source["attention_mask"].squeeze()
    target_ids = target["input_ids"].squeeze()
    target_mask = target["attention_mask"].squeeze()
    return {
        "source_ids": source_ids,
        "source_mask": source_mask,
        "target_ids": target_ids,
        "target_mask": target_mask
    }

In [None]:
def train(epoch, tokenizer, model, device, loader, optimizer):
    model.train()
    total_loss = 0
    total_counts = 0
    for _, data in enumerate(tqdm_notebook(loader, desc = "Train DL")):
        y = data["target_ids"].to(device, dtype = torch.long)
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data["source_ids"].to(device, dtype = torch.long)
        mask = data["source_mask"].to(device, dtype = torch.long)
        optimizer.zero_grad()
        outputs = model(
            input_ids = ids, attention_mask = mask, decoder_input_ids = y_ids, labels = lm_labels
        )
        loss = outputs[0]
        total_counts += 1
        total_loss += loss.item()
        
        loss.backward()
        optimizer.step()
    
    return total_loss/total_counts

In [None]:
def validate(epoch, tokenizer, model, device, loader):
    model.eval()
    total_loss = 0
    total_counts = 0
    with torch.no_grad():
        for _, data in enumerate(tqdm_notebook(loader, desc = "Valid DL")):
            y = data["target_ids"].to(device, dtype = torch.long)
            y_ids = y[:, :-1].contiguous()
            lm_labels = y[:, 1:].clone().detach()
            lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
            ids = data["source_ids"].to(device, dtype = torch.long)
            mask = data["source_mask"].to(device, dtype = torch.long)
            outputs = model(
            input_ids = ids, attention_mask = mask, decoder_input_ids = y_ids, labels = lm_labels
            )
            loss = outputs[0]
            total_loss += loss.item()
            total_counts += 1
    return total_loss / total_counts

In [None]:
def trainer(

    torch.manual_seed(model_params["SEED"])
    torch.cuda.manual_seed(model_params["SEED"])
    np.random.seed(model_params["SEED"])
    torch.backends.cudnn.deterministic = True
    
    console.log(f'''Model: Loading {model_params['MODEL']}.....''')
    tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])
    model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
    model = model.to(device)
    console.log(f"[DATA]: READING DATA.......")
    optimizer = torch.optim.Adam(
        params=model.parameters(), lr=model_params["LEARNING_RATE"]
    )
    # Training loop
    console.log(f"[Initiating Fine Tuning]...\\n")
    ## model save path
    path = os.path.join(output_dir, "model_files")
    
    console.log("Starting with Random Selection")
    ## random selection
    prev_loss = []
    for randomSelection in tnrange(model_params["RANDOM_TRAIN_STEPS"], desc = 'Random Selection'):
        
        copyData = data.copy()
        copyData = shuffle(copyData)
      
        train_size = 0.75
        random_permuts = np.random.permutation(len(copyData))
        train_nums = round(len(random_permuts) * train_size)
        train_dataset = [copyData[i] for i in random_permuts[:train_nums]]
        valid_dataset = [copyData[i] for i in random_permuts[train_nums:]] 
        training_set = T5Dataset(
            tokenizer, train_dataset, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"]
        )
        val_set = T5Dataset(
            tokenizer, valid_dataset, model_params["MAX_SOURCE_TEXT_LENGTH"], model_params["MAX_TARGET_TEXT_LENGTH"]
        )
        train_params = {
            "batch_size": model_params["TRAIN_BATCH_SIZE"],
            "shuffle": True,
            "num_workers": 0
        }
        val_params = {
            "batch_size": model_params["VALID_BATCH_SIZE"],
            "shuffle": False,
            "num_workers": 0
        }
        
        train_dl = DataLoader(training_set, **train_params)
        
        ## training 
        console.log(f'[MODEL TRAINING]')
        clear_output(wait = True)
        for epoch in tnrange(model_params["TRAIN_EPOCHS"], desc = "Training"):
            
            total_loss = train(epoch, tokenizer, model, device, train_dl, optimizer)
            training_logger.add_row(str(randomSelection), str(epoch), str(total_loss))
            console.log(training_logger)
            if epoch == 0:
              console.log(f"Saving Model at epoch: {epoch} with total loss: {total_loss}")
              model.save_pretrained(os.path.join(output_dir, "model_files_initial"))
              tokenizer.save_pretrained(os.path.join(output_dir, "model_files_initial"))
            if epoch > 0:
                if min(prev_loss) > total_loss:
                    console.log(f"Saving Model at epoch: {epoch} with total loss: {total_loss}")
                    model.save_pretrained(path)
                    tokenizer.save_pretrained(path)
            prev_loss.append(total_loss)
        del train_dl, training_set
        ## validation
        valid_dl = DataLoader(val_set, **val_params)
        console.log(f'[MODEL VALIDATION]')
        for epoch in tnrange(model_params["VAL_EPOCHS"], desc = "Validation"):
            val_loss = validate(epoch, tokenizer, model, device, valid_dl)
            
            valid_loggger.add_row(str(randomSelection), str(val_loss))
            console.log(valid_loggger)
        console.save_text(os.path.join(output_dir, f"logs-random-{randomSelection}.txt"))
        console.log(f"[VALIDATAION DONE]")     
        del valid_dl, val_set

Final

In [51]:
from datasets import load_dataset, load_metric

metric = load_metric("sacrebleu")

Downloading:   0%|          | 0.00/2.37k [00:00<?, ?B/s]

In [58]:
model_checkpoint='Helsinki-NLP/opus-mt-en-fr'

In [59]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [55]:
tokenizer("Hello, this one sentence!")

{'input_ids': [10537, 2, 67, 151, 5776, 145, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [56]:
tokenizer(["Hello, this one sentence!", "This is another sentence."])

{'input_ids': [[10537, 2, 67, 151, 5776, 145, 0], [160, 32, 1036, 5776, 3, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]]}

In [60]:
if model_checkpoint in ["t5-small", "t5-base", "t5-larg", "t5-3b", "t5-11b"]:
    prefix = "translate English to Romanian: "
else:
    prefix = ""

In [61]:
max_input_length = 128
max_target_length = 128
source_lang = "en"
target_lang = "fr"

def preprocess_function(examples):
    inputs = [prefix + ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs