In [1]:
import pandas as pd
from datasets import Dataset, DatasetDict, load_from_disk
import torchvision.transforms as transforms
import json
import os
import re
import datasets
import numpy as np
from transformers import VisionEncoderDecoderModel, AutoTokenizer, ViTModel, ViTImageProcessor, ViTFeatureExtractor
import wandb
from PIL import Image
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.cuda.is_available()

True

In [3]:
from PIL import PngImagePlugin
LARGE_ENOUGH_NUMBER = 1000
PngImagePlugin.MAX_TEXT_CHUNK = LARGE_ENOUGH_NUMBER * (1024**2)

In [5]:
raw_train_dataset = load_from_disk("processed_train")
raw_val_dataset = load_from_disk("processed_val")

In [6]:
train_dataset = raw_train_dataset
val_dataset = raw_val_dataset

ds = DatasetDict({'train':train_dataset, 'val':val_dataset})
ds

DatasetDict({
    train: Dataset({
        features: ['image_id', 'id', 'caption', 'img_path', 'is_file', 'raw_image'],
        num_rows: 118287
    })
    val: Dataset({
        features: ['image_id', 'id', 'caption', 'img_path', 'is_file', 'raw_image'],
        num_rows: 5000
    })
})

In [7]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')

ViT =  ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained("google/vit-base-patch16-224-in21k", "gpt2")

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.crossattention.c_attn.bias', 'h.0.crossattention.c_attn.weight', 'h.0.crossattention.c_proj.bias', 'h.0.crossattention.c_proj.weight', 'h.0.crossattention.q_attn.bias', 'h.0.crossattention.q_attn.weight', 'h.0.ln_cross_attn.bias', 'h.0.ln_cross_attn.weight', 'h.1.crossattention.c_attn.bias', 'h.1.crossattention.c_attn.weight', 'h.1.crossattention.c_proj.bias', 'h.1.crossattention.c_proj.weight', 'h.1.crossattention.q_attn.bias', 'h.1.crossattention.q_attn.weight', 'h.1.ln_cross_attn.bias', 'h.1.ln_cross_attn.weight', 'h.10.crossattention.c_attn.bias', 'h.10.crossattention.c_attn.weight', 'h.10.crossattention.c_proj.bias', 'h.10.crossattention.c_proj.weight', 'h.10.crossattention.q_attn.bias', 'h.10.crossattention.q_attn.weight', 'h.10.ln_cross_attn.bias', 'h.10.ln_cross_attn.weight', 'h.11.crossattention.c_attn.bias', 'h.11.crossattention.c_attn.weight', 'h.11.crossat

In [8]:
model.config.decoder_start_token_id = tokenizer.cls_token_id
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

model.config.vocab_size = model.config.decoder.vocab_size

model.config.eos_token_id = tokenizer.eos_token_id
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.max_length = 128
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.num_beams = 4

model.generation_config.pad_token_id = model.generation_config.eos_token_id

model.generation_config

GenerationConfig {
  "bos_token_id": 50256,
  "eos_token_id": 50256,
  "pad_token_id": 50256
}

In [9]:

def tokenization_fn(captions, max_target_length):
    labels = tokenizer(captions, 
                      padding="max_length", 
                      max_length=max_target_length, truncation=True).input_ids

    return labels

def feature_extraction_fn(images):
    encoder_inputs = feature_extractor(images=images, return_tensors="pt")
    return encoder_inputs.pixel_values

def preprocess_fn(examples, max_target_length, check_image = True):
    """Run tokenization + image feature extraction"""
    image_paths = examples['raw_image']
    captions = examples['caption']
    
    model_inputs = {}
    model_inputs['labels'] = tokenization_fn(captions, max_target_length)
    model_inputs['input_ids'] = model_inputs['labels']
    model_inputs['pixel_values'] = feature_extraction_fn(image_paths)

    return model_inputs

In [14]:
# processed_dataset = ds.map(
#     function=preprocess_fn,
#     batched=True,
#     fn_kwargs={"max_target_length": 128},
#     remove_columns=ds['train'].column_names
# )

# processed_dataset['val']

In [15]:
# processed_dataset.save_to_disk("processed_dataset")
processed_dataset = load_from_disk('processed_dataset')

In [16]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    output_dir="./image-captioning-output",
)


In [17]:
import evaluate
metric = evaluate.load("rouge")

Downloading builder script: 100%|██████████| 6.27k/6.27k [00:00<00:00, 6.26MB/s]


In [18]:
import nltk

nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
    preds, labels = eval_preds

    preds = [pred[0].tolist() for pred in preds]

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    assert len(decoded_preds) == len(decoded_labels)

    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    return result

In [19]:
from transformers import default_data_collator, DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=feature_extractor,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=processed_dataset['train'],
    eval_dataset=processed_dataset['val'],
    data_collator=default_data_collator,
)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [20]:
len(processed_dataset['val']['labels'])

5000

In [22]:
wandb.init(project='LLM_Project_few_shot')

trainer.train()


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.3361, 'grad_norm': 1.3982901573181152, 'learning_rate': 4.9718201902700756e-05, 'epoch': 0.02}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.279, 'grad_norm': 0.7383527755737305, 'learning_rate': 4.943640380540151e-05, 'epoch': 0.03}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.259, 'grad_norm': 0.6463015675544739, 'learning_rate': 4.915460570810226e-05, 'epoch': 0.05}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2515, 'grad_norm': 0.7266151905059814, 'learning_rate': 4.887280761080301e-05, 'epoch': 0.07}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2533, 'grad_norm': 0.5867251753807068, 'learning_rate': 4.8591009513503764e-05, 'epoch': 0.08}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2441, 'grad_norm': 0.6476061940193176, 'learning_rate': 4.830921141620452e-05, 'epoch': 0.1}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2421, 'grad_norm': 0.5616472959518433, 'learning_rate': 4.802741331890527e-05, 'epoch': 0.12}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.242, 'grad_norm': 0.572129487991333, 'learning_rate': 4.7745615221606025e-05, 'epoch': 0.14}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2424, 'grad_norm': 0.6156129837036133, 'learning_rate': 4.746381712430678e-05, 'epoch': 0.15}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2424, 'grad_norm': 0.5134930610656738, 'learning_rate': 4.718201902700753e-05, 'epoch': 0.17}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2368, 'grad_norm': 0.42386719584465027, 'learning_rate': 4.6900220929708286e-05, 'epoch': 0.19}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2354, 'grad_norm': 0.5927702188491821, 'learning_rate': 4.661842283240904e-05, 'epoch': 0.2}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2387, 'grad_norm': 0.5612953305244446, 'learning_rate': 4.633662473510979e-05, 'epoch': 0.22}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2324, 'grad_norm': 0.3923821747303009, 'learning_rate': 4.605482663781055e-05, 'epoch': 0.24}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2321, 'grad_norm': 0.5310181975364685, 'learning_rate': 4.57730285405113e-05, 'epoch': 0.25}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2314, 'grad_norm': 0.5603634715080261, 'learning_rate': 4.5491230443212054e-05, 'epoch': 0.27}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2283, 'grad_norm': 0.519157886505127, 'learning_rate': 4.52094323459128e-05, 'epoch': 0.29}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2287, 'grad_norm': 0.5082074999809265, 'learning_rate': 4.4927634248613555e-05, 'epoch': 0.3}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2262, 'grad_norm': 0.4193577468395233, 'learning_rate': 4.464583615131431e-05, 'epoch': 0.32}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2277, 'grad_norm': 0.46747201681137085, 'learning_rate': 4.436403805401506e-05, 'epoch': 0.34}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2247, 'grad_norm': 0.44232454895973206, 'learning_rate': 4.4082239956715816e-05, 'epoch': 0.36}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2254, 'grad_norm': 0.436957985162735, 'learning_rate': 4.380044185941657e-05, 'epoch': 0.37}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2227, 'grad_norm': 0.48230502009391785, 'learning_rate': 4.351864376211732e-05, 'epoch': 0.39}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2214, 'grad_norm': 0.4042932391166687, 'learning_rate': 4.323684566481807e-05, 'epoch': 0.41}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.221, 'grad_norm': 0.36510562896728516, 'learning_rate': 4.2955047567518824e-05, 'epoch': 0.42}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2268, 'grad_norm': 0.48114335536956787, 'learning_rate': 4.267324947021958e-05, 'epoch': 0.44}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.224, 'grad_norm': 0.4080374240875244, 'learning_rate': 4.239145137292033e-05, 'epoch': 0.46}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2212, 'grad_norm': 0.4371538460254669, 'learning_rate': 4.2109653275621085e-05, 'epoch': 0.47}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2175, 'grad_norm': 0.4427134394645691, 'learning_rate': 4.182785517832184e-05, 'epoch': 0.49}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2178, 'grad_norm': 0.486307829618454, 'learning_rate': 4.154605708102259e-05, 'epoch': 0.51}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2198, 'grad_norm': 0.3617169260978699, 'learning_rate': 4.126425898372334e-05, 'epoch': 0.52}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2193, 'grad_norm': 0.495869517326355, 'learning_rate': 4.098246088642409e-05, 'epoch': 0.54}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2239, 'grad_norm': 0.4173489511013031, 'learning_rate': 4.0700662789124846e-05, 'epoch': 0.56}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2197, 'grad_norm': 0.423566073179245, 'learning_rate': 4.04188646918256e-05, 'epoch': 0.57}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2237, 'grad_norm': 0.43673545122146606, 'learning_rate': 4.013706659452636e-05, 'epoch': 0.59}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2158, 'grad_norm': 0.4101102650165558, 'learning_rate': 3.9855268497227114e-05, 'epoch': 0.61}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2202, 'grad_norm': 0.4415844976902008, 'learning_rate': 3.957347039992786e-05, 'epoch': 0.63}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2135, 'grad_norm': 0.4533703327178955, 'learning_rate': 3.9291672302628614e-05, 'epoch': 0.64}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.217, 'grad_norm': 0.4737623631954193, 'learning_rate': 3.900987420532937e-05, 'epoch': 0.66}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2143, 'grad_norm': 0.4040898084640503, 'learning_rate': 3.872807610803012e-05, 'epoch': 0.68}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2175, 'grad_norm': 0.42051735520362854, 'learning_rate': 3.8446278010730875e-05, 'epoch': 0.69}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2183, 'grad_norm': 0.4339611530303955, 'learning_rate': 3.816447991343163e-05, 'epoch': 0.71}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2132, 'grad_norm': 0.4769306778907776, 'learning_rate': 3.788268181613238e-05, 'epoch': 0.73}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.216, 'grad_norm': 0.4850043058395386, 'learning_rate': 3.760088371883313e-05, 'epoch': 0.74}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2169, 'grad_norm': 0.5319119095802307, 'learning_rate': 3.731908562153388e-05, 'epoch': 0.76}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2123, 'grad_norm': 0.3597342371940613, 'learning_rate': 3.703728752423464e-05, 'epoch': 0.78}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2153, 'grad_norm': 0.4754571318626404, 'learning_rate': 3.675548942693539e-05, 'epoch': 0.79}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2187, 'grad_norm': 0.37053871154785156, 'learning_rate': 3.6473691329636144e-05, 'epoch': 0.81}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2161, 'grad_norm': 0.3678534924983978, 'learning_rate': 3.61918932323369e-05, 'epoch': 0.83}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2136, 'grad_norm': 0.4218001067638397, 'learning_rate': 3.591009513503765e-05, 'epoch': 0.85}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2175, 'grad_norm': 0.5089282393455505, 'learning_rate': 3.56282970377384e-05, 'epoch': 0.86}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2116, 'grad_norm': 0.38243356347084045, 'learning_rate': 3.534649894043915e-05, 'epoch': 0.88}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2105, 'grad_norm': 0.48003941774368286, 'learning_rate': 3.5064700843139906e-05, 'epoch': 0.9}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2139, 'grad_norm': 0.4613873064517975, 'learning_rate': 3.478290274584066e-05, 'epoch': 0.91}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2097, 'grad_norm': 0.4437251389026642, 'learning_rate': 3.450110464854141e-05, 'epoch': 0.93}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2135, 'grad_norm': 0.45312121510505676, 'learning_rate': 3.421930655124217e-05, 'epoch': 0.95}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.214, 'grad_norm': 0.41601482033729553, 'learning_rate': 3.393750845394292e-05, 'epoch': 0.96}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2129, 'grad_norm': 0.4991937279701233, 'learning_rate': 3.3655710356643674e-05, 'epoch': 0.98}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.2075, 'grad_norm': 0.33144691586494446, 'learning_rate': 3.337391225934443e-05, 'epoch': 1.0}


                                                         
 33%|███▎      | 29572/88716 [3:02:02<6:45:33,  2.43it/s]

{'eval_loss': 0.20669426023960114, 'eval_rouge1': 0.0, 'eval_rouge2': 0.0, 'eval_rougeL': 0.0, 'eval_rougeLsum': 0.0, 'eval_runtime': 530.3437, 'eval_samples_per_second': 9.428, 'eval_steps_per_second': 2.357, 'epoch': 1.0}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1901, 'grad_norm': 0.44997766613960266, 'learning_rate': 3.309211416204518e-05, 'epoch': 1.01}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1949, 'grad_norm': 0.4474417567253113, 'learning_rate': 3.2810316064745935e-05, 'epoch': 1.03}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1935, 'grad_norm': 0.44159775972366333, 'learning_rate': 3.252851796744669e-05, 'epoch': 1.05}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1888, 'grad_norm': 0.5440500378608704, 'learning_rate': 3.224671987014744e-05, 'epoch': 1.07}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1918, 'grad_norm': 0.4325104057788849, 'learning_rate': 3.196492177284819e-05, 'epoch': 1.08}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1881, 'grad_norm': 0.37610626220703125, 'learning_rate': 3.168312367554894e-05, 'epoch': 1.1}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1954, 'grad_norm': 0.41580885648727417, 'learning_rate': 3.14013255782497e-05, 'epoch': 1.12}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1868, 'grad_norm': 0.555474042892456, 'learning_rate': 3.111952748095045e-05, 'epoch': 1.13}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1897, 'grad_norm': 0.5138987898826599, 'learning_rate': 3.0837729383651204e-05, 'epoch': 1.15}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1933, 'grad_norm': 0.387850284576416, 'learning_rate': 3.055593128635196e-05, 'epoch': 1.17}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.191, 'grad_norm': 0.4712672531604767, 'learning_rate': 3.027413318905271e-05, 'epoch': 1.18}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1922, 'grad_norm': 0.5343433022499084, 'learning_rate': 2.9992335091753458e-05, 'epoch': 1.2}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.189, 'grad_norm': 0.37976306676864624, 'learning_rate': 2.9710536994454212e-05, 'epoch': 1.22}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1928, 'grad_norm': 0.43054255843162537, 'learning_rate': 2.9428738897154965e-05, 'epoch': 1.23}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1891, 'grad_norm': 0.4187493622303009, 'learning_rate': 2.9146940799855723e-05, 'epoch': 1.25}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1918, 'grad_norm': 0.42251357436180115, 'learning_rate': 2.8865142702556476e-05, 'epoch': 1.27}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1894, 'grad_norm': 0.468263179063797, 'learning_rate': 2.858334460525723e-05, 'epoch': 1.28}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1923, 'grad_norm': 0.47681182622909546, 'learning_rate': 2.8301546507957983e-05, 'epoch': 1.3}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1887, 'grad_norm': 0.43642184138298035, 'learning_rate': 2.801974841065873e-05, 'epoch': 1.32}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1899, 'grad_norm': 0.4537320137023926, 'learning_rate': 2.7737950313359484e-05, 'epoch': 1.34}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1891, 'grad_norm': 0.5203537344932556, 'learning_rate': 2.7456152216060238e-05, 'epoch': 1.35}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.193, 'grad_norm': 0.383407324552536, 'learning_rate': 2.717435411876099e-05, 'epoch': 1.37}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1889, 'grad_norm': 0.544633150100708, 'learning_rate': 2.6892556021461745e-05, 'epoch': 1.39}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1882, 'grad_norm': 0.41278672218322754, 'learning_rate': 2.66107579241625e-05, 'epoch': 1.4}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1947, 'grad_norm': 0.4779995083808899, 'learning_rate': 2.632895982686325e-05, 'epoch': 1.42}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.188, 'grad_norm': 0.38078010082244873, 'learning_rate': 2.6047161729564003e-05, 'epoch': 1.44}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1894, 'grad_norm': 0.43340277671813965, 'learning_rate': 2.5765363632264756e-05, 'epoch': 1.45}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1919, 'grad_norm': 0.5159637928009033, 'learning_rate': 2.548356553496551e-05, 'epoch': 1.47}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.188, 'grad_norm': 0.38519737124443054, 'learning_rate': 2.5201767437666264e-05, 'epoch': 1.49}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1918, 'grad_norm': 0.5754150748252869, 'learning_rate': 2.4919969340367014e-05, 'epoch': 1.5}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1928, 'grad_norm': 0.36399585008621216, 'learning_rate': 2.4638171243067768e-05, 'epoch': 1.52}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.188, 'grad_norm': 0.521181583404541, 'learning_rate': 2.435637314576852e-05, 'epoch': 1.54}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.187, 'grad_norm': 0.5914561748504639, 'learning_rate': 2.4074575048469275e-05, 'epoch': 1.56}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1881, 'grad_norm': 0.46966785192489624, 'learning_rate': 2.3792776951170025e-05, 'epoch': 1.57}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1938, 'grad_norm': 0.516670286655426, 'learning_rate': 2.351097885387078e-05, 'epoch': 1.59}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.19, 'grad_norm': 0.4267532527446747, 'learning_rate': 2.3229180756571532e-05, 'epoch': 1.61}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1858, 'grad_norm': 0.3348756730556488, 'learning_rate': 2.2947382659272286e-05, 'epoch': 1.62}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1855, 'grad_norm': 0.4969191551208496, 'learning_rate': 2.266558456197304e-05, 'epoch': 1.64}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1911, 'grad_norm': 0.3993709683418274, 'learning_rate': 2.2383786464673793e-05, 'epoch': 1.66}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1872, 'grad_norm': 0.8678402900695801, 'learning_rate': 2.2101988367374547e-05, 'epoch': 1.67}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1875, 'grad_norm': 0.4510308504104614, 'learning_rate': 2.1820190270075297e-05, 'epoch': 1.69}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1871, 'grad_norm': 0.4402177035808563, 'learning_rate': 2.153839217277605e-05, 'epoch': 1.71}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.192, 'grad_norm': 0.4306100010871887, 'learning_rate': 2.1256594075476805e-05, 'epoch': 1.72}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1886, 'grad_norm': 0.5275710225105286, 'learning_rate': 2.0974795978177555e-05, 'epoch': 1.74}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1881, 'grad_norm': 0.3837248384952545, 'learning_rate': 2.069299788087831e-05, 'epoch': 1.76}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1885, 'grad_norm': 0.5092670321464539, 'learning_rate': 2.0411199783579062e-05, 'epoch': 1.78}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1899, 'grad_norm': 0.38401299715042114, 'learning_rate': 2.0129401686279813e-05, 'epoch': 1.79}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.186, 'grad_norm': 0.3830064535140991, 'learning_rate': 1.9847603588980566e-05, 'epoch': 1.81}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1876, 'grad_norm': 0.5236574411392212, 'learning_rate': 1.9565805491681323e-05, 'epoch': 1.83}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.186, 'grad_norm': 0.4073202311992645, 'learning_rate': 1.9284007394382077e-05, 'epoch': 1.84}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1879, 'grad_norm': 0.46731412410736084, 'learning_rate': 1.9002209297082827e-05, 'epoch': 1.86}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1854, 'grad_norm': 0.49356305599212646, 'learning_rate': 1.872041119978358e-05, 'epoch': 1.88}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1845, 'grad_norm': 0.45437806844711304, 'learning_rate': 1.8438613102484335e-05, 'epoch': 1.89}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1862, 'grad_norm': 0.5413724780082703, 'learning_rate': 1.8156815005185085e-05, 'epoch': 1.91}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1871, 'grad_norm': 0.5776885747909546, 'learning_rate': 1.787501690788584e-05, 'epoch': 1.93}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1882, 'grad_norm': 0.592287540435791, 'learning_rate': 1.7593218810586592e-05, 'epoch': 1.94}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1904, 'grad_norm': 0.3963894546031952, 'learning_rate': 1.7311420713287342e-05, 'epoch': 1.96}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1878, 'grad_norm': 0.34138256311416626, 'learning_rate': 1.7029622615988096e-05, 'epoch': 1.98}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.184, 'grad_norm': 0.5244980454444885, 'learning_rate': 1.674782451868885e-05, 'epoch': 2.0}


                                                         
 67%|██████▋   | 59144/88716 [6:14:30<2:21:34,  3.48it/s]

{'eval_loss': 0.1983156055212021, 'eval_rouge1': 0.0, 'eval_rouge2': 0.0, 'eval_rougeL': 0.0, 'eval_rougeLsum': 0.0, 'eval_runtime': 563.1462, 'eval_samples_per_second': 8.879, 'eval_steps_per_second': 2.22, 'epoch': 2.0}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1745, 'grad_norm': 0.511502206325531, 'learning_rate': 1.6466026421389603e-05, 'epoch': 2.01}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1661, 'grad_norm': 0.5335368514060974, 'learning_rate': 1.6184228324090357e-05, 'epoch': 2.03}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1666, 'grad_norm': 0.5189568400382996, 'learning_rate': 1.590243022679111e-05, 'epoch': 2.05}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1663, 'grad_norm': 0.4113609194755554, 'learning_rate': 1.5620632129491864e-05, 'epoch': 2.06}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1654, 'grad_norm': 0.4923146367073059, 'learning_rate': 1.5338834032192615e-05, 'epoch': 2.08}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1688, 'grad_norm': 0.5579909682273865, 'learning_rate': 1.5057035934893368e-05, 'epoch': 2.1}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1631, 'grad_norm': 0.4670303463935852, 'learning_rate': 1.4775237837594122e-05, 'epoch': 2.11}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.165, 'grad_norm': 0.4938799738883972, 'learning_rate': 1.4493439740294872e-05, 'epoch': 2.13}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1677, 'grad_norm': 0.5732594728469849, 'learning_rate': 1.4211641642995626e-05, 'epoch': 2.15}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1646, 'grad_norm': 0.48316994309425354, 'learning_rate': 1.3929843545696381e-05, 'epoch': 2.16}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.166, 'grad_norm': 0.5659805536270142, 'learning_rate': 1.3648045448397135e-05, 'epoch': 2.18}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.167, 'grad_norm': 0.6050453186035156, 'learning_rate': 1.3366247351097885e-05, 'epoch': 2.2}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1664, 'grad_norm': 0.5606405735015869, 'learning_rate': 1.3084449253798639e-05, 'epoch': 2.21}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.165, 'grad_norm': 0.41840147972106934, 'learning_rate': 1.2802651156499393e-05, 'epoch': 2.23}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1659, 'grad_norm': 0.5842073559761047, 'learning_rate': 1.2520853059200144e-05, 'epoch': 2.25}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1681, 'grad_norm': 0.477468878030777, 'learning_rate': 1.2239054961900898e-05, 'epoch': 2.27}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.166, 'grad_norm': 0.3938906192779541, 'learning_rate': 1.1957256864601652e-05, 'epoch': 2.28}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1666, 'grad_norm': 0.42411988973617554, 'learning_rate': 1.1675458767302404e-05, 'epoch': 2.3}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1637, 'grad_norm': 0.405619740486145, 'learning_rate': 1.1393660670003156e-05, 'epoch': 2.32}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1638, 'grad_norm': 0.5985921621322632, 'learning_rate': 1.111186257270391e-05, 'epoch': 2.33}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1645, 'grad_norm': 0.5141904354095459, 'learning_rate': 1.0830064475404663e-05, 'epoch': 2.35}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1637, 'grad_norm': 0.45802581310272217, 'learning_rate': 1.0548266378105417e-05, 'epoch': 2.37}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1638, 'grad_norm': 0.5958636403083801, 'learning_rate': 1.0266468280806169e-05, 'epoch': 2.38}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1658, 'grad_norm': 0.4805629551410675, 'learning_rate': 9.98467018350692e-06, 'epoch': 2.4}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1626, 'grad_norm': 0.4302803575992584, 'learning_rate': 9.702872086207674e-06, 'epoch': 2.42}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1608, 'grad_norm': 0.5039312243461609, 'learning_rate': 9.421073988908426e-06, 'epoch': 2.43}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1676, 'grad_norm': 0.6907411813735962, 'learning_rate': 9.139275891609182e-06, 'epoch': 2.45}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1659, 'grad_norm': 0.771460235118866, 'learning_rate': 8.857477794309934e-06, 'epoch': 2.47}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1629, 'grad_norm': 0.4545251429080963, 'learning_rate': 8.575679697010686e-06, 'epoch': 2.49}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.163, 'grad_norm': 0.44772088527679443, 'learning_rate': 8.29388159971144e-06, 'epoch': 2.5}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1636, 'grad_norm': 0.40445712208747864, 'learning_rate': 8.012083502412191e-06, 'epoch': 2.52}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1642, 'grad_norm': 0.47454801201820374, 'learning_rate': 7.730285405112945e-06, 'epoch': 2.54}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.166, 'grad_norm': 0.45268696546554565, 'learning_rate': 7.448487307813698e-06, 'epoch': 2.55}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1642, 'grad_norm': 0.6417261362075806, 'learning_rate': 7.1666892105144505e-06, 'epoch': 2.57}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1646, 'grad_norm': 0.54203861951828, 'learning_rate': 6.884891113215204e-06, 'epoch': 2.59}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1651, 'grad_norm': 0.38989126682281494, 'learning_rate': 6.603093015915956e-06, 'epoch': 2.6}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.162, 'grad_norm': 0.39509254693984985, 'learning_rate': 6.321294918616711e-06, 'epoch': 2.62}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1614, 'grad_norm': 0.41618990898132324, 'learning_rate': 6.039496821317463e-06, 'epoch': 2.64}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1664, 'grad_norm': 0.7603390216827393, 'learning_rate': 5.757698724018215e-06, 'epoch': 2.65}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1637, 'grad_norm': 0.4695865213871002, 'learning_rate': 5.475900626718969e-06, 'epoch': 2.67}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1629, 'grad_norm': 0.4886704981327057, 'learning_rate': 5.194102529419722e-06, 'epoch': 2.69}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1655, 'grad_norm': 0.5467016696929932, 'learning_rate': 4.912304432120475e-06, 'epoch': 2.71}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1639, 'grad_norm': 0.5376049876213074, 'learning_rate': 4.6305063348212275e-06, 'epoch': 2.72}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1615, 'grad_norm': 0.4674364924430847, 'learning_rate': 4.34870823752198e-06, 'epoch': 2.74}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1638, 'grad_norm': 0.550269365310669, 'learning_rate': 4.066910140222734e-06, 'epoch': 2.76}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1633, 'grad_norm': 0.4826248288154602, 'learning_rate': 3.785112042923487e-06, 'epoch': 2.77}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1614, 'grad_norm': 0.4668891429901123, 'learning_rate': 3.503313945624239e-06, 'epoch': 2.79}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1649, 'grad_norm': 0.4618733823299408, 'learning_rate': 3.221515848324992e-06, 'epoch': 2.81}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1601, 'grad_norm': 0.4593704342842102, 'learning_rate': 2.9397177510257453e-06, 'epoch': 2.82}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1623, 'grad_norm': 0.5321261882781982, 'learning_rate': 2.6579196537264985e-06, 'epoch': 2.84}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1641, 'grad_norm': 0.5136831998825073, 'learning_rate': 2.376121556427251e-06, 'epoch': 2.86}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1611, 'grad_norm': 0.4757481813430786, 'learning_rate': 2.094323459128004e-06, 'epoch': 2.87}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1649, 'grad_norm': 0.543576180934906, 'learning_rate': 1.812525361828757e-06, 'epoch': 2.89}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1608, 'grad_norm': 0.44154834747314453, 'learning_rate': 1.53072726452951e-06, 'epoch': 2.91}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1637, 'grad_norm': 0.5305308699607849, 'learning_rate': 1.2489291672302628e-06, 'epoch': 2.93}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1614, 'grad_norm': 0.4517044126987457, 'learning_rate': 9.67131069931016e-07, 'epoch': 2.94}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1653, 'grad_norm': 0.49851664900779724, 'learning_rate': 6.853329726317688e-07, 'epoch': 2.96}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1599, 'grad_norm': 0.5124663710594177, 'learning_rate': 4.0353487533252175e-07, 'epoch': 2.98}


Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}


{'loss': 0.1644, 'grad_norm': 0.49795299768447876, 'learning_rate': 1.2173677803327473e-07, 'epoch': 2.99}


                                                       
100%|██████████| 88716/88716 [9:29:45<00:00,  2.60it/s]

{'eval_loss': 0.1996726244688034, 'eval_rouge1': 0.0, 'eval_rouge2': 0.0, 'eval_rougeL': 0.0, 'eval_rougeLsum': 0.0, 'eval_runtime': 562.1865, 'eval_samples_per_second': 8.894, 'eval_steps_per_second': 2.223, 'epoch': 3.0}
{'train_runtime': 34185.2513, 'train_samples_per_second': 10.381, 'train_steps_per_second': 2.595, 'train_loss': 0.19352276053579792, 'epoch': 3.0}





TrainOutput(global_step=88716, training_loss=0.19352276053579792, metrics={'train_runtime': 34185.2513, 'train_samples_per_second': 10.381, 'train_steps_per_second': 2.595, 'train_loss': 0.19352276053579792, 'epoch': 3.0})

In [23]:
trainer.save_model("models/full_train")

Non-default generation parameters: {'max_length': 128, 'early_stopping': True, 'num_beams': 4, 'length_penalty': 2.0, 'no_repeat_ngram_size': 3}
