In [1]:
!pip install datasets
!pip install transformers
!pip install accelerate -U
!pip install sentencepiece

Collecting datasets
  Downloading datasets-2.14.0-py3-none-any.whl (492 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.2/492.2 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.16.4-py3-none-a

In [2]:
from datasets import load_dataset, load_metric
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import TrainingArguments, Trainer
from transformers import pipeline
import json
import os
from google.colab import drive
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
import torch
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [3]:
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
models_path = '/content/gdrive/My Drive/models/'
model_name = 'seq2seq'
model_path = os.path.join(models_path,model_name)
print(model_path)

/content/gdrive/My Drive/models/seq2seq


In [5]:
data = load_dataset('kde4',lang1='en',lang2='fr')

Downloading builder script:   0%|          | 0.00/4.25k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/8.45k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.05M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/210173 [00:00<?, ? examples/s]

In [6]:
small = data['train'].shuffle(seed=42).select(range(1_000))
split = small.train_test_split(seed=42)

In [7]:
checkpoint = 'Helsinki-NLP/opus-mt-en-fr'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]



In [9]:
en = split['train'][5]['translation']['en']
fr = split['train'][5]['translation']['fr']
en,fr

('You can either pick a file or enter its name in the Location: box.',
 'Vous pouvez soit choisir un fichier soit saisir son nom dans la zone de texte Emplacement.')

In [11]:
inputs = tokenizer(en)
inputs

{'input_ids': [213, 115, 1828, 8437, 15, 1437, 57, 3307, 96, 1129, 18, 4, 4577, 37, 5311, 3, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [12]:
with tokenizer.as_target_tokenizer():
  targets = tokenizer(fr)
targets



{'input_ids': [344, 1069, 345, 4094, 34, 2428, 345, 9315, 113, 689, 31, 8, 1283, 5, 1470, 21708, 3, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [14]:
tokenizer.convert_ids_to_tokens(targets['input_ids'])

['▁Vous',
 '▁pouvez',
 '▁soit',
 '▁choisir',
 '▁un',
 '▁fichier',
 '▁soit',
 '▁saisir',
 '▁son',
 '▁nom',
 '▁dans',
 '▁la',
 '▁zone',
 '▁de',
 '▁texte',
 '▁Emplacement',
 '.',
 '</s>']

In [19]:
max_input_len = 128
max_target_len = 128

In [26]:
def tokenize_fn(batch):
  inputs = [x['en'] for x in batch['translation']]
  targets = [x['fr'] for x in batch['translation']]
  tokenized_inputs = tokenizer(inputs,max_length=max_input_len,truncation=True)
  with tokenizer.as_target_tokenizer():
    tokenized_targets = tokenizer(targets,max_length=max_target_len,truncation=True)
  tokenized_inputs['labels'] = tokenized_targets['input_ids']
  return tokenized_inputs

In [27]:
tokenized_datasets = split.map(
    tokenize_fn,
    batched=True,
    remove_columns=split['train'].column_names
)

Map:   0%|          | 0/750 [00:00<?, ? examples/s]



Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [22]:
from transformers import AutoModelForSeq2SeqLM
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [24]:
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer,model)

In [29]:
batch = data_collator([tokenized_datasets['train'][i] for i in range(1,3)])

In [33]:
batch['decoder_input_ids']

tensor([[59513,   526,  3261,     0, 59513, 59513, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513],
        [59513, 34023,     5,  3948,     5,   372,   402, 38492,   350,   823,
            95, 13439,     2, 25985,    14,     6,  3996,     3]])

In [34]:
tokenizer.convert_ids_to_tokens(batch['decoder_input_ids'][0])

['<pad>',
 '▁K',
 'DE',
 '</s>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>']

In [35]:
tokenizer.convert_ids_to_tokens(batch['labels'][0])

['▁K',
 'DE',
 '</s>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>']

In [36]:
!pip install sacrebleu bert-score

Collecting sacrebleu
  Downloading sacrebleu-2.3.1-py3-none-any.whl (118 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.9/118.9 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-2.7.0-py2.py3-none-any.whl (15 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: portalocker, colorama, sacrebleu, bert-score
Successfully installed bert-score-0.3.13 colorama-0.4.6 portalocker-2.7.0 sacrebleu-2.3.1


In [37]:
from datasets import load_metric

In [39]:
bleu_metric = load_metric('sacrebleu')

Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

In [43]:
bert_metric = load_metric('bertscore')
bert_metric.compute(
    predictions=['I like cats'],
    references=[['I love cats']],
    lang='en'
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'precision': [0.9803369045257568],
 'recall': [0.9803369045257568],
 'f1': [0.9803369045257568],
 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.31.0)'}

In [44]:
def compute_metrics(preds_and_labels):
  preds,labels = preds_and_labels
  decoded_preds = tokenizer.batch_decode(preds,skip_special_tokens=True)
  labels = np.where(labels != -100,labels,tokenizer.pad_token_id)
  decoded_labels = tokenizer.batch_decode(labels,skip_special_tokens=True)
  decoded_preds = [pred.strip() for pred in decoded_preds]
  decoded_labels = [[label.strip() for label in decoded_labels]]
  bleu = bleu_metric.compute(predictions=decoded_preds,references=decoded_labels)
  bert = bert_metric.compute(predictions=decoded_preds,references=decoded_labels,lang='fr')
  return {"bleu":bleu['score'],'bert_score':np.mean(bert_score['f1'])}

In [48]:
from transformers import Seq2SeqTrainingArguments
training_args = Seq2SeqTrainingArguments(
    model_path,
    evaluation_strategy='no',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True
)

In [49]:
from transformers import Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model,
    training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [54]:
trainer.evaluate()

ValueError: ignored

In [None]:
trainer.save_model(model_path)

In [None]:
from transformers import pipeline
translator = pipeline('translation')