# Install the required libraries

In [None]:
!pip install transformers datasets sentencepiece bert-score sacrebleu sacremoses

# Load the KDE4 English–Vietnamese (en-vi) dataset

In [2]:
from datasets import load_dataset
# Load the English-Vietnamese (en-vi) language pair of the KDE4 corpus.
# The recorded output shows a single 'train' split with 'id' and 'translation' columns.
data = load_dataset('kde4', lang1='en', lang2='vi')
data



  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 42782
    })
})

In [3]:
# Shuffle the full training split with a fixed seed, then keep the
# first 5000 rows as a small working subset.
small = (
    data['train']
    .shuffle(seed=42)
    .select(range(5000))
)



In [4]:
# Split the 5000-row subset into train and test parts with a fixed seed.
# No test_size is passed; the recorded output shows a 3750/1250 split.
ds = small.train_test_split(seed=42)
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 3750
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 1250
    })
})

In [5]:
# Peek at one training example to see the record layout:
# {'id': ..., 'translation': {'en': ..., 'vi': ...}}
ds['train'][0]

{'id': '20684', 'translation': {'en': 'Allylisothiocyanate', 'vi': 'Name'}}

In [6]:
import numpy as np

# All {'en': ..., 'vi': ...} pairs of the training split
train = ds['train']['translation']
# 95th percentile of the English source lengths; used later as the
# tokenizer max_length for the inputs.
# NOTE(review): len() on a str counts characters, whereas the tokenizer's
# max_length is presumably measured in tokens — confirm this cap is intended.
input_lens = [len(tr['en']) for tr in train]
max_input_len = int(np.percentile(input_lens,95))
max_input_len

103

In [7]:
# 95th percentile of the Vietnamese target lengths; used later as max_length
# for the labels and for generation during evaluation.
# NOTE(review): these are character counts (len() on a str), while max_length
# is presumably measured in tokens — confirm this cap is intended.
target_lens = [len(tr['vi']) for tr in train]
max_target_len = int(np.percentile(target_lens,95))
max_target_len

93

In [8]:
from transformers import AutoTokenizer

# Load the tokenizer from the pretrained checkpoint; the same checkpoint name
# is reused below to load the model.
checkpoint = 'Helsinki-NLP/opus-mt-en-vi'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [9]:
#Define tokenizer function
def tokenizer_fn(batch):
  """Tokenize a batch of translation pairs for seq2seq training.

  Args:
    batch: mapping with a 'translation' entry holding a list of
      {'en': ..., 'vi': ...} dicts.

  Returns:
    The tokenizer output for the English texts (truncated to max_input_len),
    extended with a 'labels' entry containing the token ids of the
    Vietnamese texts (truncated to max_target_len).
  """
  # Separate every pair into its source (en) and target (vi) text
  en_texts, vi_texts = [], []
  for pair in batch['translation']:
    en_texts.append(pair['en'])
    vi_texts.append(pair['vi'])

  # Source side: encoded as regular model inputs
  model_inputs = tokenizer(en_texts, max_length=max_input_len, truncation=True)

  # Target side: encoded through text_target; only the ids are kept
  label_ids = tokenizer(
      text_target=vi_texts,
      max_length=max_target_len,
      truncation=True,
  )['input_ids']

  # The transformers training loop expects the targets under 'labels'
  model_inputs['labels'] = label_ids
  return model_inputs

In [10]:
# Tokenize both splits with tokenizer_fn, processing the examples in batches
tokenized_ds = ds.map(
    tokenizer_fn,
    batched=True,
    # Drop the original columns ('id', 'translation') so only the tokenized fields remain
    remove_columns=ds['train'].column_names
)

Map:   0%|          | 0/3750 [00:00<?, ? examples/s]

Map:   0%|          | 0/1250 [00:00<?, ? examples/s]

In [11]:
# Confirm that both splits now hold only input_ids, attention_mask and labels
tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3750
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1250
    })
})

In [12]:
from transformers import AutoModelForSeq2SeqLM

# Load the pretrained seq2seq model from the same checkpoint as the tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [13]:
from transformers import DataCollatorForSeq2Seq

# Build the seq2seq data collator that the trainer uses to assemble batches
# (see the DataCollatorForSeq2Seq documentation for its padding behaviour)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [14]:
from datasets import load_metric
# Load BERTScore and smoke-test it on one Vietnamese prediction/reference pair.
# NOTE(review): the recorded output echoes this load_metric line as a warning;
# load_metric is deprecated in `datasets` — consider migrating to the
# `evaluate` package (which would have to be added to the install cell).
bert_metric = load_metric('bertscore')
bert_metric.compute(
    predictions=['Tôi đang học về dịch máy'],
    references=[['Tôi đã học dịch máy']],
    lang='vi'
)

  bert_metric = load_metric('bertscore')


{'precision': [0.9314209222793579],
 'recall': [0.9498319625854492],
 'f1': [0.940536379814148],
 'hashcode': 'bert-base-multilingual-cased_L9_no-idf_version=0.3.12(hug_trans=4.26.1)'}

In [15]:
# Load SacreBLEU and smoke-test it on one prediction/reference pair.
# References are passed as a list of lists (one list of references per prediction);
# the 'score' field of the result is the value reported later as BLEU.
bleu_metric = load_metric('sacrebleu')
bleu_metric.compute(
    predictions=['Tôi đang học về dịch máy'],
    references=[['Tôi đã học dịch máy']]
)

{'score': 19.304869754804482,
 'counts': [4, 1, 0, 0],
 'totals': [6, 5, 4, 3],
 'precisions': [66.66666666666667, 20.0, 12.5, 8.333333333333334],
 'bp': 1.0,
 'sys_len': 6,
 'ref_len': 5}

In [16]:
#Define compute_metrics function for evaluation
def compute_metrics(preds_and_labels):
  """Score generated translations with SacreBLEU and BERTScore.

  Args:
    preds_and_labels: ``(preds, labels)`` pair of token-id arrays as handed
      over by the trainer. ``preds`` may also be a tuple whose first element
      holds the generated token ids.

  Returns:
    dict with ``'bleu'`` (the SacreBLEU score) and ``'bert_score'``
    (mean BERTScore F1 over all examples).
  """
  preds, labels = preds_and_labels

  # Some models return a tuple (generated ids, extra outputs); keep only the ids
  if isinstance(preds, tuple):
    preds = preds[0]

  # -100 is an ignore/padding marker, not a real token id, and cannot be decoded.
  # It can appear in the predictions as well as in the labels (e.g. when batches
  # of different lengths are concatenated), so replace it with pad_token_id in both.
  preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
  labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

  # Convert token ids back to text, dropping special tokens such as padding
  decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  # Trim surrounding whitespace; each reference is wrapped in its own list
  # because the metrics accept several references per prediction
  decoded_preds = [pred.strip() for pred in decoded_preds]
  decoded_labels = [[label.strip()] for label in decoded_labels]

  bleu_score = bleu_metric.compute(
      predictions=decoded_preds,
      references=decoded_labels
  )

  bert_score = bert_metric.compute(
      predictions=decoded_preds,
      references=decoded_labels,
      lang='vi'
  )
  return {
      'bleu': bleu_score['score'],
      # BERTScore returns one F1 per example; report their average
      'bert_score': np.mean(bert_score['f1'])
  }

In [17]:
from transformers import Seq2SeqTrainingArguments

# Directory that receives the per-epoch checkpoints
# (presumably a mounted Google Drive folder in Colab — verify the mount exists)
checkpoint_path = '/content/drive/MyDrive/udemy_course/saved_model/translation/checkpoint/'

training_args = Seq2SeqTrainingArguments(
    output_dir=checkpoint_path,
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes per device
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    # Checkpointing: save once per epoch and keep at most three checkpoints
    save_strategy='epoch',
    save_total_limit=3,
    # No evaluation during training; evaluation is triggered manually instead
    evaluation_strategy='no',
    # When evaluating, generate predictions instead of feeding the true targets to the decoder
    predict_with_generate=True,
    # Train with float16 instead of float32
    fp16=True,
)

In [18]:
from transformers import Seq2SeqTrainer

# Wire the model, hyper-parameters, tokenized splits, collator and metric
# callback together into a single trainer object.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Using cuda_amp half precision backend


In [19]:
# Baseline: evaluate the pretrained model on the test split before any fine-tuning.
# max_length is passed so generation is capped at max_target_len.
trainer.evaluate(max_length=max_target_len)

***** Running Evaluation *****
  Num examples = 1250
  Batch size = 64
Generate config GenerationConfig {
  "bad_words_ids": [
    [
      53684
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 53684,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 4,
  "pad_token_id": 53684,
  "transformers_version": "4.26.1"
}



Generate config GenerationConfig {
  "bad_words_ids": [
    [
      53684
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 53684,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 4,
  "pad_token_id": 53684,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bad_words_ids": [
    [
      53684
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 53684,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 4,
  "pad_token_id": 53684,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bad_words_ids": [
    [
      53684
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 53684,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 4,
  "pad_token_id": 53684,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bad_words_ids": [
    [
      53684
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_t

{'eval_loss': 4.187745094299316,
 'eval_bleu': 6.05495734150977,
 'eval_bert_score': 0.6856014505863189,
 'eval_runtime': 73.147,
 'eval_samples_per_second': 17.089,
 'eval_steps_per_second': 0.273}

In [20]:
# Fine-tune on the training split with the arguments configured in training_args
trainer.train()

***** Running training *****
  Num examples = 3750
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 354
  Number of trainable parameters = 71625216


Step,Training Loss


Saving model checkpoint to /content/drive/MyDrive/udemy_course/saved_model/translation/checkpoint/checkpoint-118
Configuration saved in /content/drive/MyDrive/udemy_course/saved_model/translation/checkpoint/checkpoint-118/config.json
Configuration saved in /content/drive/MyDrive/udemy_course/saved_model/translation/checkpoint/checkpoint-118/generation_config.json
Model weights saved in /content/drive/MyDrive/udemy_course/saved_model/translation/checkpoint/checkpoint-118/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/udemy_course/saved_model/translation/checkpoint/checkpoint-118/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/udemy_course/saved_model/translation/checkpoint/checkpoint-118/special_tokens_map.json
Deleting older checkpoint [/content/drive/MyDrive/udemy_course/saved_model/translation/checkpoint/checkpoint-24] due to args.save_total_limit
Saving model checkpoint to /content/drive/MyDrive/udemy_course/saved_model/translation/

TrainOutput(global_step=354, training_loss=1.8706511589093398, metrics={'train_runtime': 71.0274, 'train_samples_per_second': 158.39, 'train_steps_per_second': 4.984, 'total_flos': 155465846489088.0, 'train_loss': 1.8706511589093398, 'epoch': 3.0})

In [21]:
# Evaluate again after fine-tuning. In the recorded run eval_loss fell from ~4.19 to ~1.46,
# BLEU rose from ~6.05 to ~48.65 and BERTScore F1 from ~0.69 to ~0.86.
trainer.evaluate(max_length=max_target_len)

***** Running Evaluation *****
  Num examples = 1250
  Batch size = 64
Generate config GenerationConfig {
  "bad_words_ids": [
    [
      53684
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 53684,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 4,
  "pad_token_id": 53684,
  "transformers_version": "4.26.1"
}



Generate config GenerationConfig {
  "bad_words_ids": [
    [
      53684
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 53684,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 4,
  "pad_token_id": 53684,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bad_words_ids": [
    [
      53684
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 53684,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 4,
  "pad_token_id": 53684,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bad_words_ids": [
    [
      53684
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 53684,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 4,
  "pad_token_id": 53684,
  "transformers_version": "4.26.1"
}

Generate config GenerationConfig {
  "bad_words_ids": [
    [
      53684
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_t

{'eval_loss': 1.4573534727096558,
 'eval_bleu': 48.64683606181376,
 'eval_bert_score': 0.8613185623168945,
 'eval_runtime': 49.0241,
 'eval_samples_per_second': 25.498,
 'eval_steps_per_second': 0.408,
 'epoch': 3.0}

In [22]:
# Save the fine-tuned model to the parent directory of the training checkpoints;
# the recorded log shows the config, weights and tokenizer files being written there.
save_path = '/content/drive/MyDrive/udemy_course/saved_model/translation/'
trainer.save_model(save_path)

Saving model checkpoint to /content/drive/MyDrive/udemy_course/saved_model/translation/
Configuration saved in /content/drive/MyDrive/udemy_course/saved_model/translation/config.json
Configuration saved in /content/drive/MyDrive/udemy_course/saved_model/translation/generation_config.json
Model weights saved in /content/drive/MyDrive/udemy_course/saved_model/translation/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/udemy_course/saved_model/translation/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/udemy_course/saved_model/translation/special_tokens_map.json


In [3]:
from transformers import pipeline
# Load the fine-tuned model from save_path into a translation pipeline.
# save_path is defined again here so this cell does not rely on the training cells
# (the execution counter restarts at In [3], so this presumably ran in a fresh session).
save_path = '/content/drive/MyDrive/udemy_course/saved_model/translation/'
translator = pipeline('translation', model=save_path)

In [4]:
# Try a short phrase; note that the recorded output starts with a stray '♪' symbol
translator("Always remember us this way")

[{'translation_text': '♪ Hãy luôn nhớ đến chúng con theo cách này'}]

In [5]:
# Lower-case input without punctuation
translator("you are the reason")

[{'translation_text': 'bạn là lý do'}]

In [6]:
# Another short phrase; the recorded output is wrapped in '♪' symbols
translator("waiting for love")

[{'translation_text': '♪ chờ đợi tình yêu ♪'}]

In [8]:
# A short question with punctuation
translator('Do you understand?')

[{'translation_text': 'Hiểu chưa?'}]

In [9]:
# A slightly longer question
translator("Can you speak english?")

[{'translation_text': 'Anh nói được tiếng Anh không?'}]