<a href="https://colab.research.google.com/github/sarath-nookala/SLT/blob/main/Slt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount the user's Google Drive inside the Colab VM at /content/drive (Colab-only API).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Clone the project repository into the current working directory (shell escape).
# NOTE(review): the next cell changes into drive/MyDrive/SLT, so this clone only feeds the rest of
# the notebook when it is run from the matching Drive folder — confirm the intended clone location.
!git clone https://github.com/sarath-nookala/SLT.git

Cloning into 'SLT'...
remote: Enumerating objects: 155, done.[K
remote: Counting objects: 100% (155/155), done.[K
remote: Compressing objects: 100% (106/106), done.[K
remote: Total 155 (delta 47), reused 143 (delta 38), pack-reused 0[K
Receiving objects: 100% (155/155), 14.17 MiB | 14.54 MiB/s, done.
Resolving deltas: 100% (47/47), done.


In [2]:
# Work from the repository checkout on Drive. The absolute path keeps this cell re-runnable:
# a relative path only resolves while the working directory is still /content.
%cd /content/drive/MyDrive/SLT

/content/drive/MyDrive/SLT


In [3]:
# Switch the repository working tree to the `mixed-finetuning` branch.
!git checkout mixed-finetuning

M	data/dev_data.json
M	data/test_data.json
M	data/train_data.json
M	result/all_results.json
M	result/config.json
M	result/generated_predictions.txt
M	result/predict_results.json
M	result/runs/Nov07_04-59-38_2b4c697760fa/events.out.tfevents.1667797216.2b4c697760fa.450.0
M	result/special_tokens_map.json
M	result/tokenizer.json
M	result/tokenizer_config.json
M	result/train_results.json
M	result/training_args.bin
M	utils/jsonize.py
Already on 'mixed-finetuning'
Your branch is up to date with 'origin/mixed-finetuning'.


In [4]:
# Install the project's dependencies with the %pip magic so they are installed into the
# environment of the running kernel (a `!pip` shell call may target a different interpreter).
%pip install -r requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Optional

import datasets
import numpy as np
from datasets import load_dataset

import evaluate
import transformers
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    HfArgumentParser,
    M2M100Tokenizer,
    MBart50Tokenizer,
    MBart50TokenizerFast,
    MBartTokenizer,
    MBartTokenizerFast,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    default_data_collator,
    set_seed,
)

from transformers.models.mbart.modeling_mbart import MBartModel, MBartForConditionalGeneration
from transformers import MBartConfig, AutoConfig, AutoModelForSeq2SeqLM

from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version



Convert both the NCSLGR and ASLG datasets to JSON with `utils/jsonize.py`

In [6]:
# Run the repository's jsonize script on the NCSLGR dataset for the train, dev and test modes,
# passing `gloss.asl` as --src and `en` as --tgt.
!python utils/jsonize.py --dataset ncslgr --mode train dev test --src gloss.asl --tgt en

In [7]:
# Run the repository's jsonize script on the ASLG dataset for the train, dev and test modes,
# passing `gloss.asl` as --src and `en` as --tgt.
!python utils/jsonize.py --dataset aslg --mode train dev test --src gloss.asl --tgt en

In [8]:
# Logger named after the current module.
logger = logging.getLogger(__name__)

# Tokenizer classes that are multilingual and therefore need `src_lang` / `tgt_lang` attributes.
MULTILINGUAL_TOKENIZERS = [
    MBartTokenizer,
    MBartTokenizerFast,
    MBart50Tokenizer,
    MBart50TokenizerFast,
    M2M100Tokenizer,
]


@dataclass
class ModelArguments:
    """
    Options selecting the pretrained model, config and tokenizer that are fine-tuned.

    Each field carries a ``help`` entry in its metadata; only ``model_name_or_path`` has no default.
    """

    # Required: local path or hub identifier of the pretrained model.
    model_name_or_path: str = field(
        metadata=dict(help="Path to pretrained model or model identifier from huggingface.co/models"),
    )

    # Optional overrides for the config / tokenizer source and the download cache location.
    config_name: Optional[str] = field(
        default=None,
        metadata=dict(help="Pretrained config name or path if not the same as model_name"),
    )
    tokenizer_name: Optional[str] = field(
        default=None,
        metadata=dict(help="Pretrained tokenizer name or path if not the same as model_name"),
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata=dict(help="Where to store the pretrained models downloaded from huggingface.co"),
    )

    # Tokenizer implementation, model revision and authentication switches.
    use_fast_tokenizer: bool = field(
        default=True,
        metadata=dict(
            help="Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."
        ),
    )
    model_revision: str = field(
        default="main",
        metadata=dict(
            help="The specific model version to use (can be a branch name, tag name or commit id)."
        ),
    )
    use_auth_token: bool = field(
        default=False,
        metadata=dict(
            help=(
                "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
                "with private models)."
            )
        ),
    )


@dataclass
class DataTrainingArguments:
    """
    Options describing the data fed to the model for training, evaluation and prediction.

    Every field has a default and a ``help`` entry in its metadata. ``__post_init__`` validates the
    combination of values and fills in ``val_max_target_length`` when it is not given.
    """

    # --- Language pair and custom tokenizer size ---
    source_lang: str = field(default=None, metadata=dict(help="Source language id for translation."))
    target_lang: str = field(default=None, metadata=dict(help="Target language id for translation."))
    vocab_size: int = field(default=None, metadata=dict(help="vocab size for training custom tokenizer"))

    # --- Data sources ---
    dataset_name: Optional[str] = field(
        default=None,
        metadata=dict(help="The name of the dataset to use (via the datasets library)."),
    )
    dataset_config_name: Optional[str] = field(
        default=None,
        metadata=dict(help="The configuration name of the dataset to use (via the datasets library)."),
    )
    train_file: Optional[str] = field(
        default=None,
        metadata=dict(help="The input training data file (a jsonlines)."),
    )
    validation_file: Optional[str] = field(
        default=None,
        metadata=dict(
            help="An optional input evaluation data file to evaluate the metrics (sacrebleu) on a jsonlines file."
        ),
    )
    test_file: Optional[str] = field(
        default=None,
        metadata=dict(
            help="An optional input test data file to evaluate the metrics (sacrebleu) on a jsonlines file."
        ),
    )

    # --- Preprocessing ---
    overwrite_cache: bool = field(
        default=False,
        metadata=dict(help="Overwrite the cached training and evaluation sets"),
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata=dict(help="The number of processes to use for the preprocessing."),
    )

    # --- Sequence lengths and padding ---
    max_source_length: Optional[int] = field(
        default=1024,
        metadata=dict(
            help=(
                "The maximum total input sequence length after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        ),
    )
    max_target_length: Optional[int] = field(
        default=128,
        metadata=dict(
            help=(
                "The maximum total sequence length for target text after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        ),
    )
    val_max_target_length: Optional[int] = field(
        default=None,
        metadata=dict(
            help=(
                "The maximum total sequence length for validation target text after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded. Will default to `max_target_length`."
                "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used "
                "during ``evaluate`` and ``predict``."
            )
        ),
    )
    pad_to_max_length: bool = field(
        default=False,
        metadata=dict(
            help=(
                "Whether to pad all samples to model maximum sentence length. "
                "If False, will pad the samples dynamically when batching to the maximum length in the batch. More "
                "efficient on GPU but very bad for TPU."
            )
        ),
    )

    # --- Optional sub-sampling of each split ---
    max_train_samples: Optional[int] = field(
        default=None,
        metadata=dict(
            help=(
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        ),
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata=dict(
            help=(
                "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
                "value if set."
            )
        ),
    )
    max_predict_samples: Optional[int] = field(
        default=None,
        metadata=dict(
            help=(
                "For debugging purposes or quicker training, truncate the number of prediction examples to this "
                "value if set."
            )
        ),
    )

    # --- Generation and loss options ---
    num_beams: Optional[int] = field(
        default=None,
        metadata=dict(
            help=(
                "Number of beams to use for evaluation. This argument will be passed to ``model.generate``, "
                "which is used during ``evaluate`` and ``predict``."
            )
        ),
    )
    ignore_pad_token_for_loss: bool = field(
        default=True,
        metadata=dict(
            help="Whether to ignore the tokens corresponding to padded labels in the loss computation or not."
        ),
    )
    source_prefix: Optional[str] = field(
        default=None,
        metadata=dict(help="A prefix to add before every source text (useful for T5 models)."),
    )
    forced_bos_token: Optional[str] = field(
        default=None,
        metadata=dict(
            help=(
                "The token to force as the first generated token after the :obj:`decoder_start_token_id`.Useful for"
                " multilingual models like :doc:`mBART <../model_doc/mbart>` where the first generated token needs to"
                " be the target language token.(Usually it is the target language token)"
            )
        ),
    )

    def __post_init__(self):
        """Validate the argument combination and default ``val_max_target_length``.

        Raises:
            ValueError: if no dataset name / train file / validation file is given, or if either
                language id is missing.
            AssertionError: if ``train_file`` or ``validation_file`` does not end in ``json``/``jsonl``.
        """
        data_sources = (self.dataset_name, self.train_file, self.validation_file)
        if all(source is None for source in data_sources):
            raise ValueError("Need either a dataset name or a training/validation file.")
        if not (self.source_lang is not None and self.target_lang is not None):
            raise ValueError("Need to specify the source language and the target language.")

        # Both extensions are accepted because many jsonlines files are actually named *.json.
        # Only the train and validation files are checked; the test file is not.
        for arg_name in ("train_file", "validation_file"):
            path = getattr(self, arg_name)
            if path is None:
                continue
            suffix = path.rsplit(".", 1)[-1]
            assert suffix in ("json", "jsonl"), f"`{arg_name}` should be a jsonlines file."

        # Fall back to the training target length when no validation length was supplied.
        if self.val_max_target_length is None:
            self.val_max_target_length = self.max_target_length

# One argument parser covering the model, data and Seq2SeqTrainingArguments dataclasses.
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))


In [9]:
# Parse a fixed, in-notebook argument list into the three argument dataclasses.
# `--output_dir` is given exactly once: argparse keeps only the last occurrence of a repeated
# option, so a second, conflicting value would be silently ignored and only mislead the reader.
# Note: `num_train_epochs` is overridden again in the cell that builds the trainer.
model_args, data_args, training_args = parser.parse_args_into_dataclasses([
    # Data files
    "--train_file", "./data/aslg_train_data.json",
    "--validation_file", "./data/aslg_dev_data.json",
    "--test_file", "./data/aslg_test_data.json",
    # Model and language pair
    "--model_name_or_path", "facebook/mbart-large-cc25",
    "--source_lang", "en",
    "--target_lang", "gloss.asl",
    # Tokenization
    "--vocab_size", "20000",
    "--max_source_length", "50",
    "--max_target_length", "50",
    "--ignore_pad_token_for_loss", "True",
    # Output and generation
    "--output_dir", "./result",
    "--generation_max_length", "50",
    "--generation_num_beams", "1",
    "--predict_with_generate", "True",
    # Optimisation
    "--per_device_train_batch_size", "8",
    "--per_device_eval_batch_size", "8",
    "--num_train_epochs", "50",
    "--learning_rate", "1e-5",
    # Which phases to run, checkpointing and logging
    "--do_train", "True",
    "--do_eval", "True",
    "--do_predict", "True",
    "--save_strategy", "no",
    "--logging_steps", "100",
])

In [10]:
extension = "json"


def _load_json_split(data_file):
    """Load one local JSON data file with `load_dataset` and return its "train" split."""
    return load_dataset(extension, data_files=data_file, split="train")


# ASLG corpus: train / dev / test files
aslg_train_dataset = _load_json_split("./data/aslg_train_data.json")
aslg_validation_dataset = _load_json_split("./data/aslg_dev_data.json")
aslg_test_dataset = _load_json_split("./data/aslg_test_data.json")

# NCSLGR corpus: train / dev / test files
ncsl_train_dataset = _load_json_split("./data/ncslgr_train_data.json")
ncsl_validation_dataset = _load_json_split("./data/ncslgr_dev_data.json")
ncsl_test_dataset = _load_json_split("./data/ncslgr_test_data.json")



Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-9a18443046577038/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-9a18443046577038/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.




Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-ebf9084ff4dcc340/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-ebf9084ff4dcc340/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.




Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-94bf0082a45a13e0/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-94bf0082a45a13e0/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.




Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-e723fceab94b14ed/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-e723fceab94b14ed/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.




Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-c519dce7c3c881b3/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-c519dce7c3c881b3/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.




Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-c1a36b6b4f0737f4/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-c1a36b6b4f0737f4/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


In [11]:
# Concatenate the raw ASLG and NCSLGR training splits so the tokenizer is trained on text from both.
# Note: the name `combined_train_dataset` is rebound further down to the concatenation of the
# tokenized datasets; this raw version only serves as the tokenizer-training corpus.
from datasets import concatenate_datasets
combined_train_dataset = concatenate_datasets([aslg_train_dataset, ncsl_train_dataset])

In [12]:
def get_training_corpus():
    """Return a lazy iterator over the tokenizer-training text, 1000 dataset rows at a time.

    Each item is a flat list holding every value of every "translation" dict in one slice of
    `combined_train_dataset`.
    """

    def texts_in_slice(start):
        # Flatten the translation dicts of rows [start, start + 1000) into one list of their values.
        texts = []
        for pair in combined_train_dataset[start: start + 1000]["translation"]:
            texts.extend(pair.values())
        return texts

    slice_starts = range(0, len(combined_train_dataset), 1000)
    return (texts_in_slice(start) for start in slice_starts)


corpus = get_training_corpus()

# train custom tokenizer 

In [13]:
# `MBartTokenizer` is already imported in the notebook's import cell; only `MBartTokenizerFast` is used here.
from transformers import MBartTokenizer
# Load the pretrained fast mBART tokenizer (en_XX -> ro_RO); the next cell trains a new tokenizer from it.
old_tokenizer = MBartTokenizerFast.from_pretrained("facebook/mbart-large-en-ro", src_lang = "en_XX", tgt_lang = "ro_RO")

In [14]:
# Train a new tokenizer from `old_tokenizer` on the combined corpus. `data_args.vocab_size` is the
# requested vocabulary size, and "gl_EN", "<ood>" and "<id>" are added as extra special tokens.
tokenizer = old_tokenizer.train_new_from_iterator(corpus, data_args.vocab_size, new_special_tokens = ["gl_EN", "<ood>", "<id>"])
# Use "en" as the target-language code and register the id of the "en" token in the language-code map.
# NOTE(review): `data_args.target_lang` is "gloss.asl" and "gl_EN" is added above as a special token,
# yet the target code set here is "en" — confirm this is intended.
tokenizer.tgt_lang = "en"
tokenizer.lang_code_to_id["en"] = tokenizer.convert_tokens_to_ids("en")

In [15]:
# Sanity check: encode one NCSLGR gloss sentence as a target and print the tokens it maps to.
tgt_text = ncsl_train_dataset[0]["translation"]["gloss.asl"]
print(tgt_text)

labels = tokenizer(
    text_target=tgt_text,
    max_length=10,
    padding="max_length",
    return_tensors="pt",
)["input_ids"]
first_sequence = list(labels[0])
print(tokenizer.convert_ids_to_tokens(first_sequence))


fs-JOHN TELL:i fs-MARY IX-1p BUY HOUSE .
['▁fs-JOHN', '▁TELL:', 'i', '▁fs-MARY', '▁IX-1p', '▁BUY', '▁HOUSE', '▁', '.', '</s>', 'en']


In [16]:
# No padding while tokenizing; the preprocessing functions pass this straight to the tokenizer.
padding = False

# Text prepended to every source sentence (empty when no prefix was configured).
if data_args.source_prefix is None:
    prefix = ""
else:
    prefix = data_args.source_prefix

def preprocess_function_ood(examples):
    """Tokenize a batch of translation pairs and tag every sequence with the `<ood>` token.

    Args:
        examples: batch with a "translation" list of dicts keyed by language id.

    Returns:
        The tokenizer output for the (prefixed) source texts, with the `<ood>` id inserted just
        before the last token of every input sequence (attention masks grow by one), plus a
        "labels" entry holding the target ids tagged in the same way.
    """
    pairs = examples["translation"]
    sources = [prefix + pair[data_args.source_lang] for pair in pairs]
    targets = [pair[data_args.target_lang] for pair in pairs]

    # One-element list with the id of the tag token; the same for every sequence.
    tag_ids = tokenizer.convert_tokens_to_ids(["<ood>"])

    def with_tag(token_ids):
        # Keep the final token last and slip the tag in right before it.
        return token_ids[:-1] + tag_ids + [token_ids[-1]]

    model_inputs = tokenizer(sources, max_length=data_args.max_source_length, padding=padding, truncation=True)
    for position, token_ids in enumerate(model_inputs["input_ids"]):
        model_inputs["input_ids"][position] = with_tag(token_ids)
        model_inputs["attention_mask"][position].append(1)

    # Targets are tokenized through the `text_target` keyword argument.
    labels = tokenizer(text_target=targets, max_length=data_args.max_target_length, padding=padding, truncation=True)
    for position, token_ids in enumerate(labels["input_ids"]):
        labels["input_ids"][position] = with_tag(token_ids)

    # When sequences were padded here, swap pad ids in the labels for -100 so the loss ignores them.
    mask_padding = padding == "max_length" and data_args.ignore_pad_token_for_loss
    if mask_padding:
        pad_id = tokenizer.pad_token_id
        labels["input_ids"] = [
            [-100 if token == pad_id else token for token in sequence] for sequence in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Column names of the raw ASLG training split; passed as `remove_columns` when the datasets are mapped.
column_names = aslg_train_dataset.column_names

In [17]:
def _tokenize_aslg_split(dataset, split_label):
    """Apply `preprocess_function_ood` to one ASLG split, dropping the raw columns."""
    return dataset.map(
        preprocess_function_ood,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=not data_args.overwrite_cache,
        desc=f"Running tokenizer on {split_label} dataset",
    )


# Note: the train and test names are rebound from the raw to the tokenized dataset.
aslg_train_dataset = _tokenize_aslg_split(aslg_train_dataset, "train")
aslg_eval_dataset = _tokenize_aslg_split(aslg_validation_dataset, "eval")
aslg_test_dataset = _tokenize_aslg_split(aslg_test_dataset, "test")

Running tokenizer on train dataset:   0%|          | 0/83 [00:00<?, ?ba/s]

Running tokenizer on eval dataset:   0%|          | 0/4 [00:00<?, ?ba/s]

Running tokenizer on test dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

In [18]:
def preprocess_function_id(examples):
    """Tokenize a batch of translation pairs and tag every sequence with the `<id>` token.

    Args:
        examples: batch with a "translation" list of dicts keyed by language id.

    Returns:
        The tokenizer output for the (prefixed) source texts, with the `<id>` id inserted just
        before the last token of every input sequence (attention masks grow by one), plus a
        "labels" entry holding the target ids tagged in the same way.
    """
    # One-element list with the id of the tag token; the same for every sequence.
    id_tag = tokenizer.convert_tokens_to_ids(["<id>"])

    def insert_tag(sequence):
        # The last token stays last; the tag goes immediately in front of it.
        return sequence[:-1] + id_tag + [sequence[-1]]

    translations = examples["translation"]
    source_texts = [prefix + item[data_args.source_lang] for item in translations]
    target_texts = [item[data_args.target_lang] for item in translations]

    model_inputs = tokenizer(
        source_texts,
        max_length=data_args.max_source_length,
        padding=padding,
        truncation=True,
    )
    for row, sequence in enumerate(model_inputs["input_ids"]):
        model_inputs["input_ids"][row] = insert_tag(sequence)
        model_inputs["attention_mask"][row].append(1)

    # Targets are tokenized through the `text_target` keyword argument.
    labels = tokenizer(
        text_target=target_texts,
        max_length=data_args.max_target_length,
        padding=padding,
        truncation=True,
    )
    for row, sequence in enumerate(labels["input_ids"]):
        labels["input_ids"][row] = insert_tag(sequence)

    # When sequences were padded here, swap pad ids in the labels for -100 so the loss ignores them.
    if padding == "max_length" and data_args.ignore_pad_token_for_loss:
        pad_id = tokenizer.pad_token_id
        labels["input_ids"] = [
            [-100 if token == pad_id else token for token in sequence] for sequence in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [19]:
# Options shared by the three NCSLGR tokenization passes; only the progress-bar label differs.
ncsl_map_options = dict(
    batched=True,
    num_proc=data_args.preprocessing_num_workers,
    remove_columns=column_names,
    load_from_cache_file=not data_args.overwrite_cache,
)

# Note: the train and test names are rebound from the raw to the tokenized dataset.
ncsl_train_dataset = ncsl_train_dataset.map(
    preprocess_function_id, desc="Running tokenizer on train dataset", **ncsl_map_options
)
ncsl_eval_dataset = ncsl_validation_dataset.map(
    preprocess_function_id, desc="Running tokenizer on eval dataset", **ncsl_map_options
)
ncsl_test_dataset = ncsl_test_dataset.map(
    preprocess_function_id, desc="Running tokenizer on test dataset", **ncsl_map_options
)

Running tokenizer on train dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on eval dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

Running tokenizer on test dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

In [20]:
# Merge the tokenized ASLG and NCSLGR datasets split by split (train / eval / test).
# This is the place to set the proportion in which the two datasets are combined.
combined_train_dataset, combined_eval_dataset, combined_test_dataset = [
    concatenate_datasets([aslg_split, ncsl_split])
    for aslg_split, ncsl_split in (
        (aslg_train_dataset, ncsl_train_dataset),
        (aslg_eval_dataset, ncsl_eval_dataset),
        (aslg_test_dataset, ncsl_test_dataset),
    )
]

In [21]:
# Decode the first combined training example back to tokens to check where the tag and
# language-code tokens end up, then display the raw example.
first_example = combined_train_dataset[0]

ids = list(first_example["input_ids"])
print(tokenizer.convert_ids_to_tokens(ids))

label_ids = list(first_example["labels"])
print(tokenizer.convert_ids_to_tokens(label_ids))

first_example


['▁membership', '▁of', '▁parliament', '▁see', '▁minutes', '</s>', '<ood>', 'en_XX']
['▁membership', '▁parliament', '▁see', '▁minute', '</s>', '<ood>', 'en']


{'input_ids': [922, 51, 96, 106, 184, 2, 31, 7],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [922, 96, 106, 153, 2, 31, 1221]}

In [22]:
# set_seed(training_args.seed)

# EMBEDDING_DIM = 512
# SCALE_DOWN_FACTOR = 4

# config = MBartConfig()

# config.d_model = EMBEDDING_DIM
# config.vocab_size = data_args.vocab_size
# config.encoder_attention_heads //= SCALE_DOWN_FACTOR
# config.encoder_ffn_dim //= SCALE_DOWN_FACTOR
# config.encoder_layers //= SCALE_DOWN_FACTOR
# config.decoder_attention_heads //= SCALE_DOWN_FACTOR
# config.decoder_ffn_dim //= SCALE_DOWN_FACTOR
# config.decoder_layers //= SCALE_DOWN_FACTOR
# print(config)

In [23]:
# Hub / cache options shared by the config and the model loaders.
hub_options = dict(
    cache_dir=model_args.cache_dir,
    revision=model_args.model_revision,
    use_auth_token=True if model_args.use_auth_token else None,
)

# Use the explicit config name when one was given, otherwise the model's own config.
config = AutoConfig.from_pretrained(
    model_args.config_name or model_args.model_name_or_path,
    **hub_options,
)

model = AutoModelForSeq2SeqLM.from_pretrained(
    model_args.model_name_or_path,
    from_tf=".ckpt" in model_args.model_name_or_path,
    config=config,
    **hub_options,
)

# Resize the embedding matrix to the size of the newly trained tokenizer.
# NOTE(review): the tokenizer was retrained, so the kept pretrained embedding rows presumably no
# longer correspond to the same tokens — confirm this is acceptable for fine-tuning.
model.resize_token_embeddings(len(tokenizer))


Embedding(19669, 1024)

In [24]:
# Start decoding from the id of the custom "gl_EN" special token.
model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids("gl_EN")

In [25]:
# Labels are padded with -100 when padding should be ignored by the loss, otherwise with the pad id.
if data_args.ignore_pad_token_for_loss:
    label_pad_token_id = -100
else:
    label_pad_token_id = tokenizer.pad_token_id

# Pad batches to a multiple of 8 only when fp16 training is enabled.
collator_options = dict(
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8 if training_args.fp16 else None,
)
data_collator = DataCollatorForSeq2Seq(tokenizer, **collator_options)


In [26]:
# load metrics

# evaluate.list_evaluation_modules()

# Load the three metrics used by `compute_metrics`: sacreBLEU, BLEU and ROUGE.
sacrebleu = evaluate.load("sacrebleu")
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")

def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded texts and wrap each label in its own list.

    Args:
        preds: decoded prediction strings.
        labels: decoded reference strings.

    Returns:
        A `(preds, labels)` tuple: the stripped predictions as a flat list, and the stripped
        labels as a list of single-element lists (one reference per prediction).
    """
    stripped_preds = []
    for text in preds:
        stripped_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return stripped_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Decode predictions and labels and score them with sacreBLEU, BLEU-1..4 and ROUGE.

    Args:
        eval_preds: `(predictions, labels)` pair of token-id arrays. When `predictions` is a
            tuple, only its first element is used.

    Returns:
        dict with the keys "sacrebleu", "bleu_1" .. "bleu_4", "rouge" (each the raw result of the
        corresponding metric's `compute`) and "gen_len" (mean number of non-pad tokens per
        prediction).
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id

    # Predictions gathered across batches can be padded with -100, which cannot be decoded;
    # map those positions back to the pad id before decoding.
    preds = np.where(preds != -100, preds, pad_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    if data_args.ignore_pad_token_for_loss:
        # Labels use -100 for ignored positions; replace it for the same reason.
        labels = np.where(labels != -100, labels, pad_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Strip whitespace and wrap every reference in a list (one reference per prediction).
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = {}
    result["sacrebleu"] = sacrebleu.compute(predictions=decoded_preds, references=decoded_labels)
    # BLEU with maximum n-gram orders 1 to 4.
    for order in range(1, 5):
        result[f"bleu_{order}"] = bleu.compute(
            predictions=decoded_preds, references=decoded_labels, max_order=order
        )
    result["rouge"] = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    # Average generated length, counting every token that is not padding.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result



Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [28]:
# Train for a fixed number of epochs: max_steps is reset to -1 and the epoch count set here
# replaces the value parsed from the argument list.
training_args.max_steps = -1
training_args.num_train_epochs = 20

# Each dataset (and the metric hook) is attached only when the corresponding flag is enabled.
trainer_options = dict(
    model=model,
    args=training_args,
    train_dataset=combined_train_dataset if training_args.do_train else None,
    eval_dataset=combined_eval_dataset if training_args.do_eval else None,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics if training_args.predict_with_generate else None,
)

# Initialize our Trainer
trainer = Seq2SeqTrainer(**trainer_options)


In [29]:
# Run fine-tuning and keep the returned training output for the reporting and saving cells below.
train_result = trainer.train()

***** Running training *****
  Num examples = 83710
  Num Epochs = 20
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 209280
  Number of trainable parameters = 374965248
You're using a MBartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


KeyboardInterrupt: ignored

In [None]:
# Show the training output (global step, training loss and the metrics dict).
print(train_result)

TrainOutput(global_step=2760, training_loss=3.4731022157530855, metrics={'train_runtime': 1041.6884, 'train_samples_per_second': 21.12, 'train_steps_per_second': 2.65, 'total_flos': 1398179641098240.0, 'train_loss': 3.4731022157530855, 'epoch': 20.0})


In [None]:
# Save the model through the trainer (the recorded output shows it written to ./result),
# then save the metrics of the training run under the "train" split name.
trainer.save_model()  
trainer.save_metrics("train", train_result.metrics)


Saving model checkpoint to ./result
Configuration saved in ./result/config.json
Model weights saved in ./result/pytorch_model.bin
tokenizer config file saved in ./result/tokenizer_config.json
Special tokens file saved in ./result/special_tokens_map.json


In [None]:
# Generate predictions for the combined (ASLG + NCSLGR) test split — the same mixture the trainer
# was given for training and evaluation. This notebook defines no plain `test_dataset`, so
# referring to one would raise NameError on a fresh kernel.
predict_results = trainer.predict(
    combined_test_dataset,
    metric_key_prefix="predict",
    max_length=training_args.generation_max_length,
    num_beams=training_args.generation_num_beams,
)

trainer.save_metrics("predict", predict_results.metrics)

if training_args.predict_with_generate:
    # Generated ids can be padded with -100, which cannot be decoded; map it back to the pad id.
    generated_ids = np.where(
        predict_results.predictions != -100, predict_results.predictions, tokenizer.pad_token_id
    )
    predictions = tokenizer.batch_decode(
        generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )
    predictions = [pred.strip() for pred in predictions]

    # Write one decoded prediction per line next to the other training outputs.
    output_prediction_file = os.path.join(training_args.output_dir, "generated_predictions.txt")
    with open(output_prediction_file, "w", encoding="utf-8") as writer:
        writer.write("\n".join(predictions))


***** Running Prediction *****
  Num examples = 725
  Batch size = 8


In [None]:
# Show the full prediction output: generated token ids, label ids and the "predict_*" metrics.
print(predict_results)


PredictionOutput(predictions=array([[  30, 1202,   40, ...,    1,    1,    1],
       [  30, 1083,   33, ...,    1,    1,    1],
       [  30, 1083,   33, ...,    1,    1,    1],
       ...,
       [  30,   33, 3647, ...,    1,    1,    1],
       [  30, 1202,   40, ...,    1,    1,    1],
       [  30,   33, 6701, ...,    1,    1,    1]]), label_ids=array([[ 3119, 17514,  4720, ...,  -100,  -100,  -100],
       [ 1083,  4051,  4492, ...,  -100,  -100,  -100],
       [ 1083,  6557,  8534, ...,    33,     2,    30],
       ...,
       [  352,   941,    33, ...,     1,     1,     1],
       [   33,   678,    38, ...,     1,     1,     1],
       [   33,    40,    38, ...,     1,     1,     1]]), metrics={'predict_loss': 4.366248607635498, 'predict_sacrebleu': {'score': 6.678097128839471, 'counts': [2337, 711, 337, 166], 'totals': [7008, 6283, 5559, 4836], 'precisions': [33.34760273972603, 11.316250198949547, 6.062241410325598, 3.4325889164598844], 'bp': 0.7094098031488547, 'sys_len': 700