In [2]:
import numpy as np
import pandas as pd
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch


In [3]:
from accelerate import Accelerator
accelerator=Accelerator()
device=accelerator.device
device

device(type='cuda')

Loading dataset

In [5]:
check_point='Helsinki-NLP/opus-mt-en-fr'
data=load_dataset('kde4', lang1='en', lang2='fr')

In [7]:
data=data['train'].train_test_split(test_size=0.1)
data

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 189155
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 21018
    })
})

In [8]:
data['train'][1]['translation']

{'en': 'How can i send good debug or crash reports?',
 'fr': 'Comment puis -je envoyer de bons rapports de débogue ou de plantage & #160;?'}

Tokenization

In [9]:
tokenizer=AutoTokenizer.from_pretrained(check_point, return_tensors='pt')



In [12]:
#Tokenization
max_length=128
def token_func(text):
    en_text=[sentence['en'] for sentence in text['translation']]
    fr_text=[sentence['fr'] for sentence in text['translation']]
    text_token=tokenizer(
        en_text, 
        text_target=fr_text,
        max_length=128,
        truncation=True
        )
    return text_token

In [13]:
data_ecd=data.map(token_func, batched=True)

Map:   0%|          | 0/189155 [00:00<?, ? examples/s]

Map:   0%|          | 0/21018 [00:00<?, ? examples/s]

In [14]:
data_ecd

DatasetDict({
    train: Dataset({
        features: ['id', 'translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 189155
    })
    test: Dataset({
        features: ['id', 'translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 21018
    })
})

In [15]:
data_input=data_ecd.remove_columns(['translation', 'id'])
#data_input=data_input.with_format('torch')

In [16]:
data_input

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 189155
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21018
    })
})

Model

In [17]:
#Loading model
model=AutoModelForSeq2SeqLM.from_pretrained(check_point).to(device)

In [18]:
print(model)

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(59514, 512, padding_idx=59513)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(59514, 512, padding_idx=59513)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

In [19]:
#Using Lora to fine-tune model
from peft import LoraConfig, get_peft_model

In [20]:
#Configuration 
peft_config=LoraConfig(
    r=8,
    task_type='SEQ_2_SEQ_LM',
    inference_mode=False,    
    target_modules=['k_proj', 'v_proj', 'q_proj', 'out_proj']
)

In [22]:
peft_model=get_peft_model(model, peft_config).to(device)
peft_model.print_trainable_parameters()

trainable params: 589,824 || all params: 75,723,776 || trainable%: 0.7789


In [23]:
#Data collator
from transformers import DataCollatorForSeq2Seq
data_collator=DataCollatorForSeq2Seq(tokenizer, peft_model)

Metric

In [24]:
import evaluate
metric=evaluate.load('sacrebleu')

In [25]:
def compute_metrics(predict):
    preds, labels=predict
    preds_dcd=tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels=np.where(labels !=-100, labels, tokenizer.pad_token_id)
    labels_dcd=tokenizer.batch_decode(labels, skip_special_tokens=True)
    print(len(labels_dcd))
    print(len(preds_dcd))
    preds_dcd=[w.strip() for w in preds_dcd]
    labels_dcd=[[w.strip()] for w in labels_dcd]
    print(len(labels_dcd))
    print(len(preds_dcd))
    result=metric.compute(predictions=preds_dcd, references=labels_dcd)
    return{'Blue score': result['score']}



Training

In [26]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

In [27]:
batch_size=8
train_args=Seq2SeqTrainingArguments(
    output_dir='translate_Helsiki',
    num_train_epochs=3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    eval_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch'   
)

In [None]:
trainer=Seq2SeqTrainer(
    peft_model,
    train_args,
    train_dataset=data_input['train'],
    eval_dataset=data_input['test'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

  trainer=Seq2SeqTrainer(


In [29]:
from safetensors.torch import load_model, save_model

In [30]:
trainer.train()

  0%|          | 0/70935 [00:00<?, ?it/s]

{'loss': 1.4861, 'grad_norm': 1.5121797323226929, 'learning_rate': 3.3333333333333335e-05, 'epoch': 1.0}


  0%|          | 0/2628 [00:00<?, ?it/s]

{'eval_loss': 1.2718814611434937, 'eval_runtime': 39.6491, 'eval_samples_per_second': 530.1, 'eval_steps_per_second': 66.281, 'epoch': 1.0}
{'loss': 1.3741, 'grad_norm': 3.527168035507202, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}


  0%|          | 0/2628 [00:00<?, ?it/s]

{'eval_loss': 1.229673147201538, 'eval_runtime': 39.1097, 'eval_samples_per_second': 537.411, 'eval_steps_per_second': 67.196, 'epoch': 2.0}
{'loss': 1.3427, 'grad_norm': 3.540693998336792, 'learning_rate': 0.0, 'epoch': 3.0}


  0%|          | 0/2628 [00:00<?, ?it/s]

{'eval_loss': 1.2161872386932373, 'eval_runtime': 39.7111, 'eval_samples_per_second': 529.273, 'eval_steps_per_second': 66.178, 'epoch': 3.0}
{'train_runtime': 2855.7013, 'train_samples_per_second': 198.713, 'train_steps_per_second': 24.84, 'train_loss': 1.4009786137837457, 'epoch': 3.0}


TrainOutput(global_step=70935, training_loss=1.4009786137837457, metrics={'train_runtime': 2855.7013, 'train_samples_per_second': 198.713, 'train_steps_per_second': 24.84, 'total_flos': 6124534889840640.0, 'train_loss': 1.4009786137837457, 'epoch': 3.0})

Saving and loading model

In [32]:
peft_model.save_pretrained('./translation_peft')

In [4]:
from transformers import pipeline

In [5]:
pipe=pipeline('translation', model='translation_peft', device=device)



In [6]:
pipe('we watching a film')

[{'translation_text': 'nous regardons un film'}]