In [1]:
import pandas as pd
from datasets import Dataset, DatasetDict, load_from_disk
import torchvision.transforms as transforms
import json
import os
import re
import datasets
import numpy as np
from transformers import VisionEncoderDecoderModel, AutoTokenizer, ViTModel, ViTImageProcessor, ViTFeatureExtractor, AutoModelForCausalLM, AutoTokenizer
import wandb
from PIL import Image
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.cuda.is_available()

True

In [3]:
# raw_train_dataset = load_from_disk("processed_train")
# raw_val_dataset = load_from_disk("processed_val")

In [4]:
# train_dataset = raw_val_dataset.select(range(1000))
# val_dataset = raw_val_dataset.select(range(100,1125))

# ds = DatasetDict({'train':train_dataset, 'val':val_dataset})
# ds

In [5]:
tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side='left')
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')

# ViT =  ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
model = AutoModelForCausalLM.from_pretrained("gpt2", max_length=256,)



In [6]:
# model.config.decoder_start_token_id = tokenizer.cls_token_id
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

# model.config.vocab_size = model.config.decoder.vocab_size

model.config.eos_token_id = tokenizer.eos_token_id
# model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.max_length = 256
model.generation_config.max_length = 256
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.num_beams = 4

model.generation_config.pad_token_id = model.generation_config.eos_token_id

model.generation_config

GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "max_length": 256,
  "pad_token_id": 50256
}

In [7]:

def tokenization_fn(captions, max_target_length):
    labels = tokenizer(captions, 
                      padding="max_length", 
                      max_length=max_target_length, truncation=True).input_ids

    return labels

def feature_extraction_fn(images):
    encoder_inputs = feature_extractor(images=images, return_tensors="pt")
    return encoder_inputs.pixel_values

def preprocess_fn(examples, max_target_length, check_image = True):
    """Run tokenization + image feature extraction"""
    image_paths = examples['raw_image']
    captions = examples['caption']
    
    model_inputs = {}
    model_inputs['labels'] = tokenization_fn(captions, max_target_length)
    model_inputs['input_ids'] = model_inputs['labels']
    model_inputs['pixel_values'] = feature_extraction_fn(image_paths)

    return model_inputs

In [8]:
# processed_dataset = ds.map(
#     function=preprocess_fn,
#     batched=True,
#     fn_kwargs={"max_target_length": 128},
#     remove_columns=ds['train'].column_names
# )

# processed_dataset['val']

processed_dataset = load_from_disk('processed_dataset')

In [9]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    generation_max_length=256,
    generation_config=model.generation_config,
    evaluation_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    output_dir="./image-captioning-output",
)


In [10]:
import evaluate
metric = evaluate.load("rouge")

In [11]:
import nltk

nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
    preds, labels = eval_preds

    preds = [pred[0].tolist() for pred in preds]

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    assert len(decoded_preds) == len(decoded_labels)

    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    return result

In [12]:
from transformers import default_data_collator, DataCollatorForSeq2Seq
# data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=feature_extractor,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=processed_dataset['train'],
    eval_dataset=processed_dataset['val'],
    data_collator=default_data_collator,
)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [15]:
wandb.init(project='LLM_Project_ablation')

trainer.train()


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mvaradnev[0m ([33mvarnevnlp2023[0m). Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/88716 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.334, 'grad_norm': 1.51202392578125, 'learning_rate': 4.9718201902700756e-05, 'epoch': 0.02}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2897, 'grad_norm': 0.694603443145752, 'learning_rate': 4.943640380540151e-05, 'epoch': 0.03}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2752, 'grad_norm': 0.6402120590209961, 'learning_rate': 4.915460570810226e-05, 'epoch': 0.05}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2692, 'grad_norm': 0.652636706829071, 'learning_rate': 4.887280761080301e-05, 'epoch': 0.07}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2702, 'grad_norm': 0.6179815530776978, 'learning_rate': 4.8591009513503764e-05, 'epoch': 0.08}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2631, 'grad_norm': 0.6116506457328796, 'learning_rate': 4.830921141620452e-05, 'epoch': 0.1}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2613, 'grad_norm': 0.5959474444389343, 'learning_rate': 4.802741331890527e-05, 'epoch': 0.12}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2625, 'grad_norm': 0.5918656587600708, 'learning_rate': 4.7745615221606025e-05, 'epoch': 0.14}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2623, 'grad_norm': 0.6003735661506653, 'learning_rate': 4.746381712430678e-05, 'epoch': 0.15}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2635, 'grad_norm': 0.47293856739997864, 'learning_rate': 4.718201902700753e-05, 'epoch': 0.17}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2584, 'grad_norm': 0.47861066460609436, 'learning_rate': 4.6900220929708286e-05, 'epoch': 0.19}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2569, 'grad_norm': 0.6251765489578247, 'learning_rate': 4.661842283240904e-05, 'epoch': 0.2}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2601, 'grad_norm': 0.580597460269928, 'learning_rate': 4.633662473510979e-05, 'epoch': 0.22}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.254, 'grad_norm': 0.3853512704372406, 'learning_rate': 4.605482663781055e-05, 'epoch': 0.24}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2554, 'grad_norm': 0.5651569962501526, 'learning_rate': 4.57730285405113e-05, 'epoch': 0.25}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.254, 'grad_norm': 0.5476236343383789, 'learning_rate': 4.5491230443212054e-05, 'epoch': 0.27}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2513, 'grad_norm': 0.4329461455345154, 'learning_rate': 4.52094323459128e-05, 'epoch': 0.29}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2528, 'grad_norm': 0.435062438249588, 'learning_rate': 4.4927634248613555e-05, 'epoch': 0.3}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2504, 'grad_norm': 0.4322252869606018, 'learning_rate': 4.464583615131431e-05, 'epoch': 0.32}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2518, 'grad_norm': 0.45583376288414, 'learning_rate': 4.436403805401506e-05, 'epoch': 0.34}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2481, 'grad_norm': 0.43249574303627014, 'learning_rate': 4.4082239956715816e-05, 'epoch': 0.36}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2497, 'grad_norm': 0.39746347069740295, 'learning_rate': 4.380044185941657e-05, 'epoch': 0.37}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2483, 'grad_norm': 0.4532255232334137, 'learning_rate': 4.351864376211732e-05, 'epoch': 0.39}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.246, 'grad_norm': 0.384235680103302, 'learning_rate': 4.323684566481807e-05, 'epoch': 0.41}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2454, 'grad_norm': 0.40964746475219727, 'learning_rate': 4.2955047567518824e-05, 'epoch': 0.42}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2522, 'grad_norm': 0.37288370728492737, 'learning_rate': 4.267324947021958e-05, 'epoch': 0.44}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2491, 'grad_norm': 0.3800268769264221, 'learning_rate': 4.239145137292033e-05, 'epoch': 0.46}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2476, 'grad_norm': 0.3932889699935913, 'learning_rate': 4.2109653275621085e-05, 'epoch': 0.47}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2439, 'grad_norm': 0.44564661383628845, 'learning_rate': 4.182785517832184e-05, 'epoch': 0.49}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2435, 'grad_norm': 0.4757470190525055, 'learning_rate': 4.154605708102259e-05, 'epoch': 0.51}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2462, 'grad_norm': 0.30427345633506775, 'learning_rate': 4.126425898372334e-05, 'epoch': 0.52}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2453, 'grad_norm': 0.4990825355052948, 'learning_rate': 4.098246088642409e-05, 'epoch': 0.54}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2489, 'grad_norm': 0.42568185925483704, 'learning_rate': 4.0700662789124846e-05, 'epoch': 0.56}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2456, 'grad_norm': 0.39760255813598633, 'learning_rate': 4.04188646918256e-05, 'epoch': 0.57}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2505, 'grad_norm': 0.45960167050361633, 'learning_rate': 4.013706659452636e-05, 'epoch': 0.59}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2415, 'grad_norm': 0.33797264099121094, 'learning_rate': 3.9855268497227114e-05, 'epoch': 0.61}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2474, 'grad_norm': 0.3995760381221771, 'learning_rate': 3.957347039992786e-05, 'epoch': 0.63}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2403, 'grad_norm': 0.46203839778900146, 'learning_rate': 3.9291672302628614e-05, 'epoch': 0.64}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.244, 'grad_norm': 0.4022079408168793, 'learning_rate': 3.900987420532937e-05, 'epoch': 0.66}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2417, 'grad_norm': 0.4208087623119354, 'learning_rate': 3.872807610803012e-05, 'epoch': 0.68}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2445, 'grad_norm': 0.43617910146713257, 'learning_rate': 3.8446278010730875e-05, 'epoch': 0.69}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2454, 'grad_norm': 0.44304871559143066, 'learning_rate': 3.816447991343163e-05, 'epoch': 0.71}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2407, 'grad_norm': 0.40547314286231995, 'learning_rate': 3.788268181613238e-05, 'epoch': 0.73}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2437, 'grad_norm': 0.4034676253795624, 'learning_rate': 3.760088371883313e-05, 'epoch': 0.74}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2438, 'grad_norm': 0.46169233322143555, 'learning_rate': 3.731908562153388e-05, 'epoch': 0.76}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2406, 'grad_norm': 0.3211038112640381, 'learning_rate': 3.703728752423464e-05, 'epoch': 0.78}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.244, 'grad_norm': 0.46166688203811646, 'learning_rate': 3.675548942693539e-05, 'epoch': 0.79}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2469, 'grad_norm': 0.3119868040084839, 'learning_rate': 3.6473691329636144e-05, 'epoch': 0.81}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2432, 'grad_norm': 0.32344362139701843, 'learning_rate': 3.61918932323369e-05, 'epoch': 0.83}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2425, 'grad_norm': 0.3683955669403076, 'learning_rate': 3.591009513503765e-05, 'epoch': 0.85}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2454, 'grad_norm': 0.3412821292877197, 'learning_rate': 3.56282970377384e-05, 'epoch': 0.86}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2401, 'grad_norm': 0.3269411623477936, 'learning_rate': 3.534649894043915e-05, 'epoch': 0.88}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2388, 'grad_norm': 0.4371444880962372, 'learning_rate': 3.5064700843139906e-05, 'epoch': 0.9}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2426, 'grad_norm': 0.4036060869693756, 'learning_rate': 3.478290274584066e-05, 'epoch': 0.91}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2382, 'grad_norm': 0.4016161262989044, 'learning_rate': 3.450110464854141e-05, 'epoch': 0.93}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2411, 'grad_norm': 0.4025871455669403, 'learning_rate': 3.421930655124217e-05, 'epoch': 0.95}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2423, 'grad_norm': 0.4140893518924713, 'learning_rate': 3.393750845394292e-05, 'epoch': 0.96}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2424, 'grad_norm': 0.3390989899635315, 'learning_rate': 3.3655710356643674e-05, 'epoch': 0.98}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2366, 'grad_norm': 0.32718178629875183, 'learning_rate': 3.337391225934443e-05, 'epoch': 1.0}


 33%|███▎      | 29571/88716 [30:53<59:03, 16.69it/s]  A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For c

{'eval_loss': 0.23463328182697296, 'eval_rouge1': 0.17395024220231126, 'eval_rouge2': 0.0, 'eval_rougeL': 0.17388894913056713, 'eval_rougeLsum': 0.17393498375680405, 'eval_runtime': 32.0789, 'eval_samples_per_second': 155.865, 'eval_steps_per_second': 38.966, 'epoch': 1.0}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2242, 'grad_norm': 0.41730934381484985, 'learning_rate': 3.309211416204518e-05, 'epoch': 1.01}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.229, 'grad_norm': 0.3640475273132324, 'learning_rate': 3.2810316064745935e-05, 'epoch': 1.03}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2288, 'grad_norm': 0.374369740486145, 'learning_rate': 3.252851796744669e-05, 'epoch': 1.05}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2234, 'grad_norm': 0.4093916714191437, 'learning_rate': 3.224671987014744e-05, 'epoch': 1.07}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2272, 'grad_norm': 0.4027348458766937, 'learning_rate': 3.196492177284819e-05, 'epoch': 1.08}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2247, 'grad_norm': 0.37167224287986755, 'learning_rate': 3.168312367554894e-05, 'epoch': 1.1}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2304, 'grad_norm': 0.37992429733276367, 'learning_rate': 3.14013255782497e-05, 'epoch': 1.12}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2236, 'grad_norm': 0.4939647614955902, 'learning_rate': 3.111952748095045e-05, 'epoch': 1.13}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2239, 'grad_norm': 0.44794124364852905, 'learning_rate': 3.0837729383651204e-05, 'epoch': 1.15}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2302, 'grad_norm': 0.34549984335899353, 'learning_rate': 3.055593128635196e-05, 'epoch': 1.17}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2258, 'grad_norm': 0.37099456787109375, 'learning_rate': 3.027413318905271e-05, 'epoch': 1.18}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2292, 'grad_norm': 0.44849416613578796, 'learning_rate': 2.9992335091753458e-05, 'epoch': 1.2}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2244, 'grad_norm': 0.39254194498062134, 'learning_rate': 2.9710536994454212e-05, 'epoch': 1.22}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2279, 'grad_norm': 0.37666431069374084, 'learning_rate': 2.9428738897154965e-05, 'epoch': 1.23}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2242, 'grad_norm': 0.42245468497276306, 'learning_rate': 2.9146940799855723e-05, 'epoch': 1.25}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2284, 'grad_norm': 0.41021034121513367, 'learning_rate': 2.8865142702556476e-05, 'epoch': 1.27}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2272, 'grad_norm': 0.33301031589508057, 'learning_rate': 2.858334460525723e-05, 'epoch': 1.28}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2282, 'grad_norm': 0.3104897141456604, 'learning_rate': 2.8301546507957983e-05, 'epoch': 1.3}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2251, 'grad_norm': 0.43520528078079224, 'learning_rate': 2.801974841065873e-05, 'epoch': 1.32}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2268, 'grad_norm': 0.37354084849357605, 'learning_rate': 2.7737950313359484e-05, 'epoch': 1.34}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2255, 'grad_norm': 0.35817667841911316, 'learning_rate': 2.7456152216060238e-05, 'epoch': 1.35}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.23, 'grad_norm': 0.3587605655193329, 'learning_rate': 2.717435411876099e-05, 'epoch': 1.37}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2261, 'grad_norm': 0.47656288743019104, 'learning_rate': 2.6892556021461745e-05, 'epoch': 1.39}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2252, 'grad_norm': 0.36775675415992737, 'learning_rate': 2.66107579241625e-05, 'epoch': 1.4}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2311, 'grad_norm': 0.40742090344429016, 'learning_rate': 2.632895982686325e-05, 'epoch': 1.42}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2256, 'grad_norm': 0.3408295512199402, 'learning_rate': 2.6047161729564003e-05, 'epoch': 1.44}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2255, 'grad_norm': 0.4044894874095917, 'learning_rate': 2.5765363632264756e-05, 'epoch': 1.45}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2292, 'grad_norm': 0.43481603264808655, 'learning_rate': 2.548356553496551e-05, 'epoch': 1.47}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2244, 'grad_norm': 0.349351704120636, 'learning_rate': 2.5201767437666264e-05, 'epoch': 1.49}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2283, 'grad_norm': 0.5784035921096802, 'learning_rate': 2.4919969340367014e-05, 'epoch': 1.5}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2284, 'grad_norm': 0.36434733867645264, 'learning_rate': 2.4638171243067768e-05, 'epoch': 1.52}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2259, 'grad_norm': 0.4888034462928772, 'learning_rate': 2.435637314576852e-05, 'epoch': 1.54}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2245, 'grad_norm': 0.485443115234375, 'learning_rate': 2.4074575048469275e-05, 'epoch': 1.56}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2262, 'grad_norm': 0.4019465744495392, 'learning_rate': 2.3792776951170025e-05, 'epoch': 1.57}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2307, 'grad_norm': 0.4564876854419708, 'learning_rate': 2.351097885387078e-05, 'epoch': 1.59}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2274, 'grad_norm': 0.4021449387073517, 'learning_rate': 2.3229180756571532e-05, 'epoch': 1.61}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2226, 'grad_norm': 0.3300606906414032, 'learning_rate': 2.2947382659272286e-05, 'epoch': 1.62}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2219, 'grad_norm': 0.46909040212631226, 'learning_rate': 2.266558456197304e-05, 'epoch': 1.64}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2276, 'grad_norm': 0.3690221607685089, 'learning_rate': 2.2383786464673793e-05, 'epoch': 1.66}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2245, 'grad_norm': 0.5551469922065735, 'learning_rate': 2.2101988367374547e-05, 'epoch': 1.67}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2247, 'grad_norm': 0.40276092290878296, 'learning_rate': 2.1820190270075297e-05, 'epoch': 1.69}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2254, 'grad_norm': 0.3916257917881012, 'learning_rate': 2.153839217277605e-05, 'epoch': 1.71}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2286, 'grad_norm': 0.37971121072769165, 'learning_rate': 2.1256594075476805e-05, 'epoch': 1.72}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2262, 'grad_norm': 0.41936662793159485, 'learning_rate': 2.0974795978177555e-05, 'epoch': 1.74}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2245, 'grad_norm': 0.4087435305118561, 'learning_rate': 2.069299788087831e-05, 'epoch': 1.76}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.227, 'grad_norm': 0.5023881196975708, 'learning_rate': 2.0411199783579062e-05, 'epoch': 1.78}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2279, 'grad_norm': 0.33792251348495483, 'learning_rate': 2.0129401686279813e-05, 'epoch': 1.79}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2238, 'grad_norm': 0.4308655560016632, 'learning_rate': 1.9847603588980566e-05, 'epoch': 1.81}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2258, 'grad_norm': 0.46775364875793457, 'learning_rate': 1.9565805491681323e-05, 'epoch': 1.83}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2234, 'grad_norm': 0.39716625213623047, 'learning_rate': 1.9284007394382077e-05, 'epoch': 1.84}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2258, 'grad_norm': 0.35690054297447205, 'learning_rate': 1.9002209297082827e-05, 'epoch': 1.86}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2226, 'grad_norm': 0.41278406977653503, 'learning_rate': 1.872041119978358e-05, 'epoch': 1.88}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2228, 'grad_norm': 0.44593700766563416, 'learning_rate': 1.8438613102484335e-05, 'epoch': 1.89}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2238, 'grad_norm': 0.5021536350250244, 'learning_rate': 1.8156815005185085e-05, 'epoch': 1.91}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.226, 'grad_norm': 0.49372398853302, 'learning_rate': 1.787501690788584e-05, 'epoch': 1.93}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2255, 'grad_norm': 0.5439066290855408, 'learning_rate': 1.7593218810586592e-05, 'epoch': 1.94}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.229, 'grad_norm': 0.3468192219734192, 'learning_rate': 1.7311420713287342e-05, 'epoch': 1.96}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2258, 'grad_norm': 0.2945147752761841, 'learning_rate': 1.7029622615988096e-05, 'epoch': 1.98}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2217, 'grad_norm': 0.503533124923706, 'learning_rate': 1.674782451868885e-05, 'epoch': 2.0}


 67%|██████▋   | 59143/88716 [1:03:39<31:38, 15.58it/s]  A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For

{'eval_loss': 0.23028716444969177, 'eval_rouge1': 0.17395024220231126, 'eval_rouge2': 0.0, 'eval_rougeL': 0.17388894913056713, 'eval_rougeLsum': 0.17393498375680405, 'eval_runtime': 39.5919, 'eval_samples_per_second': 126.288, 'eval_steps_per_second': 31.572, 'epoch': 2.0}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2183, 'grad_norm': 0.5056673288345337, 'learning_rate': 1.6466026421389603e-05, 'epoch': 2.01}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2126, 'grad_norm': 0.48728707432746887, 'learning_rate': 1.6184228324090357e-05, 'epoch': 2.03}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2126, 'grad_norm': 0.5253824591636658, 'learning_rate': 1.590243022679111e-05, 'epoch': 2.05}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2128, 'grad_norm': 0.3702908456325531, 'learning_rate': 1.5620632129491864e-05, 'epoch': 2.06}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2134, 'grad_norm': 0.42876601219177246, 'learning_rate': 1.5338834032192615e-05, 'epoch': 2.08}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2159, 'grad_norm': 0.5121318697929382, 'learning_rate': 1.5057035934893368e-05, 'epoch': 2.1}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2116, 'grad_norm': 0.41245612502098083, 'learning_rate': 1.4775237837594122e-05, 'epoch': 2.11}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2129, 'grad_norm': 0.3997842073440552, 'learning_rate': 1.4493439740294872e-05, 'epoch': 2.13}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2157, 'grad_norm': 0.49955976009368896, 'learning_rate': 1.4211641642995626e-05, 'epoch': 2.15}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2114, 'grad_norm': 0.4798412322998047, 'learning_rate': 1.3929843545696381e-05, 'epoch': 2.16}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2146, 'grad_norm': 0.45423975586891174, 'learning_rate': 1.3648045448397135e-05, 'epoch': 2.18}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2139, 'grad_norm': 0.4256735146045685, 'learning_rate': 1.3366247351097885e-05, 'epoch': 2.2}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2134, 'grad_norm': 0.44102588295936584, 'learning_rate': 1.3084449253798639e-05, 'epoch': 2.21}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2127, 'grad_norm': 0.4008071720600128, 'learning_rate': 1.2802651156499393e-05, 'epoch': 2.23}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2134, 'grad_norm': 0.4376196563243866, 'learning_rate': 1.2520853059200144e-05, 'epoch': 2.25}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2153, 'grad_norm': 0.41455167531967163, 'learning_rate': 1.2239054961900898e-05, 'epoch': 2.27}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.213, 'grad_norm': 0.3527013957500458, 'learning_rate': 1.1957256864601652e-05, 'epoch': 2.28}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2139, 'grad_norm': 0.3855540454387665, 'learning_rate': 1.1675458767302404e-05, 'epoch': 2.3}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2116, 'grad_norm': 0.38335245847702026, 'learning_rate': 1.1393660670003156e-05, 'epoch': 2.32}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2106, 'grad_norm': 0.47380331158638, 'learning_rate': 1.111186257270391e-05, 'epoch': 2.33}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2125, 'grad_norm': 0.42736461758613586, 'learning_rate': 1.0830064475404663e-05, 'epoch': 2.35}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2124, 'grad_norm': 0.34354838728904724, 'learning_rate': 1.0548266378105417e-05, 'epoch': 2.37}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2121, 'grad_norm': 0.4898243546485901, 'learning_rate': 1.0266468280806169e-05, 'epoch': 2.38}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.214, 'grad_norm': 0.3731328547000885, 'learning_rate': 9.98467018350692e-06, 'epoch': 2.4}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2096, 'grad_norm': 0.3766745328903198, 'learning_rate': 9.702872086207674e-06, 'epoch': 2.42}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.208, 'grad_norm': 0.5006780028343201, 'learning_rate': 9.421073988908426e-06, 'epoch': 2.43}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2167, 'grad_norm': 0.6591842770576477, 'learning_rate': 9.139275891609182e-06, 'epoch': 2.45}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2142, 'grad_norm': 0.6788774728775024, 'learning_rate': 8.857477794309934e-06, 'epoch': 2.47}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2116, 'grad_norm': 0.3398102819919586, 'learning_rate': 8.575679697010686e-06, 'epoch': 2.49}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2119, 'grad_norm': 0.4023865759372711, 'learning_rate': 8.29388159971144e-06, 'epoch': 2.5}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2137, 'grad_norm': 0.34918487071990967, 'learning_rate': 8.012083502412191e-06, 'epoch': 2.52}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2148, 'grad_norm': 0.3837569057941437, 'learning_rate': 7.730285405112945e-06, 'epoch': 2.54}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2133, 'grad_norm': 0.40819448232650757, 'learning_rate': 7.448487307813698e-06, 'epoch': 2.55}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2129, 'grad_norm': 0.510757327079773, 'learning_rate': 7.1666892105144505e-06, 'epoch': 2.57}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2143, 'grad_norm': 0.43042635917663574, 'learning_rate': 6.884891113215204e-06, 'epoch': 2.59}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2138, 'grad_norm': 0.38393980264663696, 'learning_rate': 6.603093015915956e-06, 'epoch': 2.6}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2102, 'grad_norm': 0.31430312991142273, 'learning_rate': 6.321294918616711e-06, 'epoch': 2.62}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2119, 'grad_norm': 0.3349197506904602, 'learning_rate': 6.039496821317463e-06, 'epoch': 2.64}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2151, 'grad_norm': 0.5691472887992859, 'learning_rate': 5.757698724018215e-06, 'epoch': 2.65}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2121, 'grad_norm': 0.4123014509677887, 'learning_rate': 5.475900626718969e-06, 'epoch': 2.67}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2117, 'grad_norm': 0.42759546637535095, 'learning_rate': 5.194102529419722e-06, 'epoch': 2.69}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2153, 'grad_norm': 0.4149130880832672, 'learning_rate': 4.912304432120475e-06, 'epoch': 2.71}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2137, 'grad_norm': 0.3950582444667816, 'learning_rate': 4.6305063348212275e-06, 'epoch': 2.72}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2106, 'grad_norm': 0.37931862473487854, 'learning_rate': 4.34870823752198e-06, 'epoch': 2.74}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2127, 'grad_norm': 0.4859687387943268, 'learning_rate': 4.066910140222734e-06, 'epoch': 2.76}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2122, 'grad_norm': 0.49090462923049927, 'learning_rate': 3.785112042923487e-06, 'epoch': 2.77}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2116, 'grad_norm': 0.41118666529655457, 'learning_rate': 3.503313945624239e-06, 'epoch': 2.79}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.214, 'grad_norm': 0.3906489610671997, 'learning_rate': 3.221515848324992e-06, 'epoch': 2.81}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.21, 'grad_norm': 0.41768211126327515, 'learning_rate': 2.9397177510257453e-06, 'epoch': 2.82}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2105, 'grad_norm': 0.5797364711761475, 'learning_rate': 2.6579196537264985e-06, 'epoch': 2.84}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.213, 'grad_norm': 0.4997751712799072, 'learning_rate': 2.376121556427251e-06, 'epoch': 2.86}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2117, 'grad_norm': 0.41847527027130127, 'learning_rate': 2.094323459128004e-06, 'epoch': 2.87}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2137, 'grad_norm': 0.4477890729904175, 'learning_rate': 1.812525361828757e-06, 'epoch': 2.89}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.21, 'grad_norm': 0.4290620684623718, 'learning_rate': 1.53072726452951e-06, 'epoch': 2.91}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2128, 'grad_norm': 0.4764510691165924, 'learning_rate': 1.2489291672302628e-06, 'epoch': 2.93}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2101, 'grad_norm': 0.38757196068763733, 'learning_rate': 9.67131069931016e-07, 'epoch': 2.94}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2152, 'grad_norm': 0.3069728910923004, 'learning_rate': 6.853329726317688e-07, 'epoch': 2.96}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2085, 'grad_norm': 0.4379821717739105, 'learning_rate': 4.0353487533252175e-07, 'epoch': 2.98}


Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.214, 'grad_norm': 0.34743809700012207, 'learning_rate': 1.2173677803327473e-07, 'epoch': 2.99}


100%|█████████▉| 88715/88716 [1:37:01<00:00, 16.18it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For c

{'eval_loss': 0.22994141280651093, 'eval_rouge1': 0.17395024220231126, 'eval_rouge2': 0.0, 'eval_rougeL': 0.17388894913056713, 'eval_rougeLsum': 0.17393498375680405, 'eval_runtime': 40.1185, 'eval_samples_per_second': 124.631, 'eval_steps_per_second': 31.158, 'epoch': 3.0}
{'train_runtime': 5861.2596, 'train_samples_per_second': 60.543, 'train_steps_per_second': 15.136, 'train_loss': 0.23003953524107562, 'epoch': 3.0}





TrainOutput(global_step=88716, training_loss=0.23003953524107562, metrics={'train_runtime': 5861.2596, 'train_samples_per_second': 60.543, 'train_steps_per_second': 15.136, 'train_loss': 0.23003953524107562, 'epoch': 3.0})

In [16]:
trainer.save_model("models/ablation")

Non-default generation parameters: {'max_length': 256, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}
